# Source code for dstk.parameters.co_matrix.weighting.associative_measures

"""
This module provides functions to transform raw co-occurrence matrices into association measures
to better reflect the distributional information of lexemes.

While simple co-occurrence counts indicate how often two words appear together, they do not
always account for the "informativeness" of those occurrences. For example, a common verb
like "get" may co-occur with many different subjects, making it less informative about its
context than a specific verb like "bark."

Core functionalities include:
* Weighting co-occurrence matrices using Pointwise Mutual Information (PMI) to identify
  stronger distributional associations.
* Weighting co-occurrence matrices using Positive Pointwise Mutual Information (PPMI)
  to eliminate negative values and focus on positive associations.
* Handling sparse matrix data structures to ensure efficiency when processing large linguistic
  corpora.

The module is intended to help researchers move beyond simple frequency counts toward
meaningful distributional analysis in corpus linguistics.
"""

import pandas as pd
import numpy as np
from scipy.sparse import csr_array, coo_array
from ....utilities.dataframe_manipulation import is_sparse_dataframe

from ....lib_types import DataFrame, ndarray



# [docs]
def pmi(word_by_word_matrix: DataFrame, positive: bool = False) -> DataFrame:
    """
    Weights a co-occurrence matrix by PMI or PPMI.

    Every non-zero cell is replaced by ``log(observed / expected)``, where
    ``expected = row_total * column_total / grand_total``. Cells whose count
    is zero are not weighted and remain 0 in the result, regardless of whether
    the input stores those zeros explicitly or leaves them implicit.

    :param word_by_word_matrix: The co-occurrence matrix to be weighted. Both
        dense and sparse DataFrames are accepted.
    :type word_by_word_matrix: DataFrame
    :param positive: If True, weights by PPMI (negative scores are clipped to
        0 and dropped from the sparse storage); if False, weights by PMI.
        Defaults to False.
    :type positive: bool
    :return: Sparse co-occurrence matrix weighted by PMI or PPMI, carrying the
        index and columns of the input.
    :rtype: DataFrame
    """

    matrix: coo_array = (
        word_by_word_matrix.sparse.to_coo()
        if is_sparse_dataframe(word_by_word_matrix)
        else coo_array(word_by_word_matrix)
    )

    # ``np.asarray(...).ravel()`` flattens the marginals whether ``sum``
    # returns a 1-D array (sparse arrays) or a 2-D matrix (sparse matrices).
    col_totals: ndarray = np.asarray(matrix.sum(axis=0)).ravel()
    row_totals: ndarray = np.asarray(matrix.sum(axis=1)).ravel()
    total_sum: np.number = row_totals.sum()

    # Integer marginals are promoted to float before being multiplied: the
    # product of two large fixed-width integers would otherwise wrap around
    # silently and corrupt the expected counts.
    if np.issubdtype(row_totals.dtype, np.integer):
        row_totals = row_totals.astype(np.float64)
        col_totals = col_totals.astype(np.float64)

    # A sparse container may hold explicitly stored zeros. Weighting them
    # would evaluate log(0) (-inf, or NaN for an all-zero row/column) and make
    # the result depend on the storage layout instead of the matrix values,
    # so only genuinely non-zero entries are weighted.
    nonzero: ndarray = matrix.data != 0
    rows: ndarray = matrix.row[nonzero]
    cols: ndarray = matrix.col[nonzero]
    values: ndarray = matrix.data[nonzero]

    expected: ndarray = (row_totals[rows] * col_totals[cols]) / total_sum

    pmi_values: ndarray = np.log(values / expected)

    if positive:
        pmi_values = np.maximum(0, pmi_values)

    weighted_matrix: csr_array = csr_array(
        (pmi_values, (rows, cols)), shape=matrix.shape
    )

    if positive:
        # Clipped scores are now stored zeros; remove them to keep the
        # result as sparse as possible.
        weighted_matrix.eliminate_zeros()

    # ``fillna(0)`` maps any remaining NaN score (e.g. from invalid,
    # negative input values) to 0.
    return pd.DataFrame.sparse.from_spmatrix(
        weighted_matrix,
        index=word_by_word_matrix.index,
        columns=word_by_word_matrix.columns,
    ).fillna(0)