# Source code for dstk.utilities.context_extraction

"""
This module provides supplementary utilities for analyzing and summarizing
context extraction results. It complements the main context extraction module
by offering helper functions for computing descriptive statistics, transforming
extracted contexts into tabular representations, and performing other auxiliary
operations on collocation data.

Core functionalities include:

* Counting the frequency of words appearing in extracted collocates.
* Converting context extraction results into pandas DataFrames for analysis.
* Supporting exploratory analysis of collocation and context data.
* Providing miscellaneous helper functions related to context extraction outputs.

The module is intended to complement the context extraction workflow by
providing reusable utilities for post-processing and analyzing extracted
linguistic contexts.
"""

from collections import Counter
from ..lib_types import DataFrame, Collocates
import pandas as pd



# [docs]
def collocate_frequency(collocates: list[Collocates]) -> DataFrame:
    """
    Counts the frequency of words in a list of collocations and returns the result as a DataFrame.

    Every item of every collocation is read through its ``text`` attribute, so
    the counts are keyed by the textual form of each word.

    :param collocates: A list of collocations, where each collocation is an iterable of word objects exposing a ``text`` attribute.
    :type collocates: list[Collocates]

    :return: A DataFrame with two columns: "Word" and "Frequency", sorted by descending frequency. Words with equal frequency keep the order in which they were first encountered. An empty input yields an empty DataFrame with the same two columns.
    :rtype: DataFrame
    """

    # Feed the Counter directly from a generator; no intermediate list is needed.
    word_counts: Counter[str] = Counter(
        word.text for collocation in collocates for word in collocation
    )

    # ``most_common()`` returns (word, count) pairs ordered from the highest
    # count to the lowest, which delivers the ordering the docstring promises
    # (plain ``items()`` would only give first-seen order).
    word_counts_df: DataFrame = pd.DataFrame(
        word_counts.most_common(), columns=["Word", "Frequency"]
    )

    return word_counts_df