Source code for dstk.utilities.clustering

"""
This module provides tools for organizing and grouping word embeddings into meaningful clusters.

For researchers in linguistics and digital humanities, 'word embeddings' are mathematical
representations of words where words with similar meanings are located close together in a
high-dimensional space. However, because these spaces are often too complex to analyze directly,
this module uses two powerful algorithms:
1. UMAP: Reduces the complexity (dimensionality) of the data while preserving the
   relationships between words.
2. HDBSCAN: Identifies "dense" regions in that simplified space to group similar words
   together into clusters.

The resulting output identifies groups of related terms and labels outliers as 'Noise'.
"""

from sklearn.cluster import HDBSCAN
from umap import UMAP

from typing import cast
from ..lib_types import ndarray, DataFrame



[docs]
def cluster_embeddings(
    embeddings: DataFrame,
    cluster_dimensions: int = 5,
    n_neighbors: int = 15,
    min_dist: float = 0.1,
    compression_metric: str = "cosine",
    approximate: int | None = None,
    min_cluster_size: int = 5,
    min_samples: int | None = None,
    cluster_metric: str = "euclidean",
) -> DataFrame:
    """
    Reduces dimensions and clusters word embeddings using UMAP and HDBSCAN.

    The embeddings are first compressed with UMAP and the compressed points are
    then grouped with HDBSCAN. Every row of the returned DataFrame receives a
    label in a new 'cluster' column: ``"cluster_<n>"`` for members of cluster
    ``n`` and ``"Noise"`` for points HDBSCAN labels ``-1``.

    :param embeddings: DataFrame containing the word embeddings. All columns are
        passed to UMAP via ``to_numpy()``. The input itself is not modified.
    :type embeddings: DataFrame
    :param cluster_dimensions: Number of dimensions for UMAP reduction.
    :type cluster_dimensions: int
    :param n_neighbors: Number of neighbors to consider for local structure.
        Capped at ``max(2, number_of_rows - 1)``.
    :type n_neighbors: int
    :param min_dist: Minimum distance between points in the reduced space.
    :type min_dist: float
    :param compression_metric: Metric used by UMAP (e.g., "cosine").
    :type compression_metric: str
    :param approximate: If set, the maximum number of rows to cluster. When the
        input has more rows than this, a reproducible random sample
        (``random_state=42``) of that size is clustered and ONLY the sampled
        rows are returned, in sample order. When the input already has this
        many rows or fewer, every row is used in its original order.
    :type approximate: int | None
    :param min_cluster_size: Minimum size required to form a cluster in HDBSCAN.
        Capped at ``max(2, number_of_rows // 2)``.
    :type min_cluster_size: int
    :param min_samples: Number of samples in a neighborhood to consider as core points.
    :type min_samples: int | None
    :param cluster_metric: Metric used by HDBSCAN (e.g., "euclidean").
    :type cluster_metric: str

    :return: A copy of the clustered rows with an additional 'cluster' column.
    :rtype: DataFrame
    :raises ValueError: If ``approximate`` is negative (raised by
        ``DataFrame.sample``).
    """

    # Down-sample BEFORE taking the working copy. The labels produced below are
    # positional, so the frame they are attached to must be exactly the frame
    # that was clustered (same rows, same order). Sampling is skipped when the
    # cap is not smaller than the data: nothing would be gained and the rows
    # would only be shuffled.
    if approximate and approximate < len(embeddings):
        embeddings = embeddings.sample(approximate, random_state=42)

    clustered_df: DataFrame = embeddings.copy()

    embeddings_array: ndarray = embeddings.to_numpy()

    n_words, _ = embeddings_array.shape
    # Keep the neighbourhood size below the number of rows (floor of 2) so that
    # small inputs do not request more neighbours than there are points.
    effective_neighbors: int = min(n_neighbors, max(2, n_words - 1))

    cluster_reducer: UMAP = UMAP(
        n_components=cluster_dimensions,
        n_neighbors=effective_neighbors,
        min_dist=min_dist,
        metric=compression_metric,
        low_memory=bool(approximate),
        random_state=42,
    )

    compressed_embeddings: ndarray = cast(
        ndarray, cluster_reducer.fit_transform(embeddings_array)
    )

    # Likewise shrink the minimum cluster size for small inputs: at most half
    # of the rows, but never below 2.
    effective_cluster_size: int = min(min_cluster_size, max(2, n_words // 2))

    clusterer: HDBSCAN = HDBSCAN(
        min_cluster_size=effective_cluster_size,
        min_samples=min_samples,
        metric=cluster_metric,
    )

    clusterer_labels: ndarray = clusterer.fit_predict(compressed_embeddings)

    # One label per clustered row, in row order; -1 is HDBSCAN's noise label.
    clustered_df["cluster"] = [
        "Noise" if label == -1 else f"cluster_{label}" for label in clusterer_labels
    ]

    return clustered_df