Source code for dstk.corpus.analysis

"""
This module provides standard tools for lexical and corpus linguistic analysis.
It simplifies common tasks such as calculating word frequencies, identifying
collocations (frequently occurring neighbor words), generating concordances to
examine context, and extracting unique vocabularies from annotated text data.

Core functionalities include:

* Frequency Analysis: Counting occurrences of words and returning them in a structured
  pandas DataFrame for easy analysis or visualization.
* Concordance Generation: Extracting specific search terms along with their surrounding
  context to study usage patterns.
* Collocation Extraction: Identifying statistically significant pairs or sequences of
  neighboring words within a specified window.
* Vocabulary Filtering: Isolating unique items from word sequences and organizing them
  alphabetically for systematic overview.

This module serves as a primary interface for transforming raw linguistic data into
quantifiable metrics and usable structures for research in digital humanities and
computational linguistics.
"""

from collections import Counter
from typing import Sequence
import pandas as pd
from nltk.text import Text
import nltk

from ..lib_types import DataFrame, Document, Word, ConcordanceLine, Concordance



[docs]
def word_frequency(words: list[Word]) -> DataFrame:
    """
    Tally how many times each word text occurs and tabulate the counts.

    Only the ``text`` attribute of each word is read. Rows appear in the
    order in which each distinct text is first encountered in ``words``.

    :param words: The word objects to count.
    :type words: list[Word]

    :return: A two-column DataFrame: "Word" holds the text and "Frequency"
        holds its number of occurrences.
    :rtype: DataFrame
    """

    tally: Counter[str] = Counter()
    for entry in words:
        tally[entry.text] += 1

    return pd.DataFrame(tally.items(), columns=["Word", "Frequency"])




[docs]
def get_concordances(document: Document, text: str, **kwargs) -> list[Concordance]:
    """
    Find the occurrences of a search term within a document and return their context.

    The document's tokens are wrapped in an ``nltk.Text`` and the lookup is
    delegated to ``nltk.Text.concordance_list``, so matching is done against
    tokens rather than arbitrary substrings.

    By default every occurrence is returned. ``concordance_list`` on its own
    stops after 25 lines; this function lifts that cap unless the caller
    supplies an explicit ``lines`` keyword argument.

    :param document: The Stanza Document to search.
    :type document: Document
    :param text: The term to search for.
    :type text: str
    :param kwargs: Additional arguments passed to the underlying
        nltk.Text.concordance_list method (e.g. ``width``, ``lines``).

    :return: A list of Concordance objects containing left and right context.
    :rtype: list[Concordance]
    """

    tokens: list[str] = [token.text for token in document.iter_tokens()]

    # There can never be more matches than tokens, so the token count is a
    # safe "no limit" value. setdefault keeps a caller-provided `lines` intact.
    kwargs.setdefault("lines", len(tokens))

    corpus: Text = Text(tokens)
    lines: list[ConcordanceLine] = corpus.concordance_list(text, **kwargs)

    return [
        Concordance(
            left_context=line.left_print,
            text=line.query,
            right_context=line.right_print,
        )
        for line in lines
    ]




[docs]
def get_collocations(
    words: list[Word], number: int = 20, window_size: int = 2
) -> list[tuple[str, str]]:
    """
    Return the top collocations found in a sequence of words.

    The word texts are wrapped in an ``nltk.Text`` and the selection is
    delegated to ``nltk.Text.collocation_list``.

    :param words: A sequence of word objects; only their ``text`` is used.
    :type words: list[Word]

    :param number: The maximum number of collocations to return.
    :type number: int

    :param window_size: The number of neighboring words to consider.
    :type window_size: int

    :return: A list of collocations.
    :rtype: list[tuple[str, str]]
    """
    # Make sure the nltk stopwords corpus is present before computing
    # collocations; download it quietly when the lookup fails.
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)

    tokens = [word.text for word in words]
    corpus: Text = Text(tokens)
    collocations = corpus.collocation_list(num=number, window_size=window_size)

    return collocations




[docs]
def get_vocabulary(words: Sequence[Word]) -> list[Word]:
    """
    Collect one word object per distinct text, ordered by that text.

    When several words share the same ``text``, the one appearing last in
    ``words`` is the one kept. Ordering uses plain comparison of the
    ``text`` values.

    :param words: A sequence of word objects.
    :type words: Sequence[Word]

    :return: The vocabulary of the sequence.
    :rtype: list[Word]
    """

    by_text: dict[str, Word] = {}
    for entry in words:
        # Later duplicates overwrite earlier ones.
        by_text[entry.text] = entry

    # Keys are exactly the texts of the stored words, so ordering the keys
    # orders the words.
    return [by_text[key] for key in sorted(by_text)]