Source code for dstk.parameters.vector_similarity.geometric_measures.similarity

"""
This module implements geometric measures to quantify the semantic similarity between
lexemes based on their representation in a vector space. Following the geometric
approach, it treats word embeddings as points in an n-dimensional space where the
proximity of two vectors indicates the degree of similarity between their meanings.

Core functionalities include:

* Calculating cosine similarity to measure the distance between two specific
  distributional vectors (embeddings).
* Identifying the $k$ nearest neighbors for a target word using exact search
  algorithms, returning both the labels and their corresponding similarity scores.
* Performing approximate nearest neighbor searches to facilitate fast, memory-efficient
  queries in large datasets using HNSW (Hierarchical Navigable Small World) or
  IVF (Inverted File Index) methods.

The module is designed to provide researchers with tools for both precise pairwise
comparisons and scalable retrieval of semantically related terms within large
multidimensional embedding spaces.
"""

from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from faiss import IndexHNSWFlat, IndexIVFFlat, IndexFlatL2

from typing import Literal
from ....lib_types import ndarray, Series, DataFrame, Neighbors, Neighbor



[docs]
def cos_similarity(embeddings: DataFrame, first_word: str, second_word: str) -> float:
    """
    Computes the cosine similarity between the embeddings of two words.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param first_word: The first word in the pair.
    :type first_word: str
    :param second_word: The second word in the pair.
    :type second_word: str

    :returns: The cosine similarity between the first and second word.
    :rtype: float
    """

    first_word_vector: ndarray = np.array(embeddings.loc[first_word]).reshape(1, -1)
    second_word_vector: ndarray = np.array(embeddings.loc[second_word]).reshape(1, -1)

    cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)

    return cos_sim[0][0]




[docs]
def nearest_neighbors(
    embeddings: DataFrame, word: str, metric: str = "cosine", n_words: int = 5, **kwargs
) -> Neighbors:
    """
    Returns the top N most semantically similar words to a given target word, based on the specified distance or similarity metric.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param word: The target word to find neighbors for.
    :type word: str
    :param metric: The distance or similarity metric to use (e.g., 'cosine', 'euclidean'). Defaults to 'cosine'.
    :type metric: str
    :param n_words: Number of nearest neighbors to return. Defaults to 5.
    :type of n_words: int
    :param kwargs: Additional keyword arguments to pass to sklearn's NearestNeighbors. 
    
    For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html

    :returns: A list of `Neighbor` namedtuples, one for each word close to the target word.
    :rtype: Neighbors
    """

    embeddings_array: ndarray = embeddings.to_numpy()

    word_series: Series = embeddings.loc[word]
    word_vector: ndarray = np.array(word_series).reshape(1, -1)

    distances: ndarray
    indices: ndarray

    neighbors: NearestNeighbors = NearestNeighbors(
        n_neighbors=n_words, algorithm="auto", metric=metric, **kwargs
    )
    neighbors.fit(embeddings_array)

    distances, indices = neighbors.kneighbors(word_vector, n_neighbors=n_words + 1)

    neighbor_tuples = zip(indices[0], distances[0])

    results: Neighbors = [
        Neighbor(
            str(embeddings.index[index]),
            float(1 - distance) if metric == "cosine" else float(distance),
        )
        for index, distance in neighbor_tuples
        if embeddings.index[index] != word
    ]
    return results




[docs]
def approximate_nearest_neighbors(
    embeddings: DataFrame,
    word: str,
    metric: Literal["hnsw", "ivf"] = "ivf",
    n_words: int = 5,
    n_centroids: int = 100,
    clusters_to_search: int = 10,
    n_connections: int = 16,
    search_depth: int = 8,
    construction_depth: int = 64,
) -> Neighbors:
    """
    Find words with similar embeddings using a fast, memory-efficient approximate search.

    This function returns the closest words to a target word without checking every possible word directly. Instead, it uses structures that give very close results much faster than an exact search, especially on large embedding sets.

    :param embeddings: A dataframe containing the word embeddings.
    :type embeddings: DataFrame
    :param word: The target word to find neighbors for.
    :type word: str
    :param metric: The search methods to use (IVF or HNSW). IVF uses less memory; HNSW is more accurate but heavier. Defaults to 'ivf'.
    :type metric: str
    :param n_words: Number of nearest neighbors to return. Defaults to 5.
    :type of n_words: int
    :param n_centroids: The number of centroids IVF uses to group the embeddings. Equivalent to the number of clusters. Defaults to 100.
    :type n_centroids: int
    :param clusters_to_search: Controls how many clusters (centroids) IVF visits during the search. The higher, the slower, but more accurate. Defaults to 10.
    :type clusters_to_search: int
    :param n_connections: Number of connections of each node in HNSW. Defaults to 16.
    :type n_connections: int
    :param search_depth: How much of the network HNSW will search. Defaults to 8.
    :type search_depth: int
    :param construction_depth: How muhc of the network you will search during its construction (how accurately the network is built) (Won't make any difference in search time, so it is better to use higher numbers. However, it does have an effect in construction time). Defaults to 64.
    :type construction_depth: int

    :returns: A list of `Neighbor` namedtuples, one for each word close to the target word.
    :rtype: Neighbors
    """

    embeddings_array: ndarray = embeddings.to_numpy()
    n_dimensions: int = embeddings_array.shape[1]

    word_series: Series = embeddings.loc[word]
    word_vector: ndarray = np.array(word_series).reshape(1, -1)

    distances: ndarray
    indices: ndarray

    if metric == "hnsw":
        index = IndexHNSWFlat(n_dimensions, n_connections)
        index.hnsw.efSearch = search_depth
        index.hnsw.efConstruction = construction_depth
    elif metric == "ivf":
        quantizer: IndexFlatL2 = IndexFlatL2(n_dimensions)  # Clustering method.

        index = IndexIVFFlat(quantizer, n_dimensions, n_centroids)

        index.train(embeddings_array)  # Trains the index so it can create the clusters

        index.nprobe = clusters_to_search
    else:
        raise RuntimeError(
            f"Metric '{metric}' is not implemented in approximate_nearest_neighbors"
        )

    index.add(embeddings_array)

    distances, indices = index.search(word_vector, n_words)

    neighbor_tuples = zip(indices[0], distances[0])

    results: Neighbors = [
        Neighbor(str(embeddings.index[index]), float(distance))
        for index, distance in neighbor_tuples
        if embeddings.index[index] != word
    ]

    return results