# Source code for dstk.lib_types.dstk_types

from typing import (
    TypeAlias,
    Any,
    NamedTuple,
    Generator,
    Sequence, 
    TYPE_CHECKING,
    Union
)

from stanza import Document
from .stanza_types import Word, Sentence, Token
from .fasttext_types import FastText
from .gensim_types import Word2Vec

if TYPE_CHECKING:
    from ..hooks.tools import Hook

#: A dictionary mapping string identifiers to Stanza ``Document`` objects.
DocumentIndex: TypeAlias = dict[str, Document]

#: A single unit of analysis: either a ``Token`` or a ``Word``.
LexicalItem: TypeAlias = Token | Word

#: A homogeneous sequence of lexical items for processing: either all
#: ``Word`` objects or all ``Token`` objects (the alias does not describe a
#: sequence that mixes the two).
LexicalItemSequence: TypeAlias = Sequence[Word] | Sequence[Token]

#: Collections used in linguistic analysis: a sequence of ``Sentence``
#: objects, a sequence of ``Word`` sequences, or a sequence of ``Token``
#: sequences.
LinguisticSequences: TypeAlias = (
    Sequence[Sentence] | Sequence[Sequence[Word]] | Sequence[Sequence[Token]]
)

#: A ``(left, right)`` pair of ``Word`` lists representing the left and right
#: contexts of a target word.
Contexts: TypeAlias = tuple[list[Word], list[Word]]

#: A generator yielding ``Contexts`` pairs for iteration; it takes no sent
#: values and has no return value.
ContextGenerator: TypeAlias = Generator[Contexts, None, None]

#: A variable-length tuple of ``Word`` objects representing a group of
#: collocates.
Collocates: TypeAlias = tuple[Word, ...]


class Concordance(NamedTuple):
    """
    A record of a word's occurrence within its surrounding context.

    Being a ``NamedTuple``, instances are immutable and can be unpacked
    positionally as ``(left_context, text, right_context)``.

    :param left_context: The text appearing before the target word.
    :type left_context: str
    :param text: The specific word or phrase being analyzed.
    :type text: str
    :param right_context: The text appearing after the target word.
    :type right_context: str
    """

    left_context: str
    text: str
    right_context: str
class Bigram(NamedTuple):
    """
    Represents a pair of words occurring together.

    Being a ``NamedTuple``, instances are immutable and can be unpacked
    positionally as ``(collocate, target_word)``.

    :param collocate: The collocate word.
    :type collocate: Word
    :param target_word: The target word in the bigram.
    :type target_word: str
    """

    collocate: Word
    target_word: str
#: Directed collocates represented as a tuple of a word and a pair of
#: directional tags.
DirectedCollocates: TypeAlias = tuple[Word, tuple[str, str]]
class Neighbor(NamedTuple):
    """
    A tuple representing a neighboring word and associated statistical score.

    Being a ``NamedTuple``, instances are immutable and can be unpacked
    positionally as ``(word, score)``.

    :param word: The neighboring word found in the text.
    :type word: str
    :param score: The statistical weight or confidence of the neighbor.
    :type score: float
    """

    word: str
    score: float
#: A list of neighbors with their respective scores.
Neighbors: TypeAlias = list[Neighbor]

#: Supported neural language models (Word2Vec or FastText).
NeuralModels: TypeAlias = Word2Vec | FastText

#: A dictionary defining a method name and the keyword arguments passed to it.
MethodDict: TypeAlias = dict[str, dict[str, Any]]

#: A mapping of workflow stages to an ordered list of methods or a custom Hook.
#: ``"Hook"`` is a string forward reference because ``Hook`` is imported only
#: under ``TYPE_CHECKING``.
Workflow: TypeAlias = dict[str, Union[list[MethodDict], "Hook"]]
class ParameterResult(NamedTuple):
    """
    The result of an individual step within a workflow.

    Being a ``NamedTuple``, instances are immutable and can be unpacked
    positionally as ``(name, result)``.

    :param name: The name of the step.
    :type name: str
    :param result: The output produced by the step.
    :type result: Any
    """

    name: str
    result: Any
#: Generator yielding `ParameterResult` objects containing both name and data.
ReturnAllGenerator: TypeAlias = Generator[ParameterResult, None, None]

#: Generator yielding only the results of a parameter step without metadata.
ReturnParameterGenerator: TypeAlias = Generator[Any, None, None]