# Source code for dstk.utilities.typeguards

"""
This module provides type guard functions to verify the internal structure and
data types of linguistic objects within the library. These guards act as safety
checks, ensuring that data—such as collections of words, sentences, or
document indices—matches the expected format before it is processed by downstream
functions.

Core functionalities include:
* Validating document structures (e.g., checking for valid dictionaries or sequences of Stanza Documents)
* Verifying linguistic units (validating lists of sentences, words, or tokens)
* Checking complex data types like collocations and mixed linguistic sequences
* Validating workflow configurations to ensure they adhere to the required schema

By using these guards, the library ensures that errors are caught early when
processing large datasets, providing more reliable results for linguistic analysis.
"""

from ..hooks.tools import Hook

from typing import Any, TypeGuard, Sequence
from ..lib_types import (
    Workflow,
    Bigram,
    Word,
    Sentence,
    LinguisticSequences,
    Document,
    DocumentIndex,
    Token,
    Collocates,
)



[docs]
def is_document_index(index: Any) -> TypeGuard[DocumentIndex]:
    """
    Checks if the input is a non-empty dictionary mapping strings
    to Stanza Documents.

    :param index: The object to check.
    :type index: Any

    :return: True if `index` matches DocumentIndex, otherwise False.
    :rtype: bool
    """

    if not isinstance(index, dict) or not index:
        return False

    return all(
        isinstance(key, str) and isinstance(value, Document)
        for key, value in index.items()
    )




[docs]
def is_documents(documents: Any) -> TypeGuard[Sequence[Document]]:
    """
    Checks if the input is a non-empty sequence of Stanza Documents.

    :param documents: The object to check.
    :type documents: Any

    :return: True if `documents` matches Documents, otherwise False.
    :rtype: bool
    """

    if not isinstance(documents, Sequence) or not documents:
        return False

    return all(isinstance(document, Document) for document in documents)




[docs]
def is_sentences(sentences: Any) -> TypeGuard[Sequence[Sentence]]:
    """
    Checks if the input is a non-empty sequence of Sentence objects.

    :param sentences: The object to check.
    :type sentences: Any

    :return: True if `sentences` matches Sequence[Sentence], otherwise False.
    :rtype: bool
    """

    if not isinstance(sentences, Sequence) or not sentences:
        return False

    return all(isinstance(sentence, Sentence) for sentence in sentences)




[docs]
def is_words(words: Any) -> TypeGuard[Sequence[Word]]:
    """
    Checks if the input is a non-empty sequence of Word objects.

    :param words: The object to check.
    :type words: Any

    :return: True if `words` matches Sequence[Word], otherwise False.
    :rtype: bool
    """

    if not isinstance(words, Sequence) or not words:
        return False

    return all(isinstance(word, Word) for word in words)




[docs]
def is_tokens(tokens: Any) -> TypeGuard[Sequence[Token]]:
    """
    Checks if the input is a non-empty sequence of Token objects.

    :param tokens: The object to check.
    :type tokens: Any

    :return: True if `tokens` matches Sequence[Token], otherwise False.
    :rtype: bool
    """

    if not isinstance(tokens, Sequence) or not tokens:
        return False

    return all(isinstance(token, Token) for token in tokens)




[docs]
def is_collocates(collocates: Any) -> TypeGuard[list[Collocates]]:
    """
    Checks if the input is a non-empty list of tuples, where each
    tuple contains Word objects.

    :param collocates: The object to check.
    :type collocates: Any

    :return: True if `collocates` matches a list of Collocates, otherwise False.
    :rtype: bool
    """

    if not isinstance(collocates, list) or not collocates:
        return False

    return all(
        isinstance(collocate, tuple)
        and not isinstance(collocate, Bigram)
        and all(isinstance(word, Word) for word in collocate)
        for collocate in collocates
    )




[docs]
def is_sequences(sequences: Any) -> TypeGuard[LinguisticSequences]:
    """
    Checks if the input is a non-empty sequence of linguistic items
    (Sentences, Words, or Tokens).

    :param sequences: The object to check.
    :type sequences: Any
    :return: True if `sequences` matches LinguisticSequences, otherwise False.
    :rtype: bool
    """

    if not isinstance(sequences, Sequence) or not sequences:
        return False

    return all(
        isinstance(sequence, Sentence) or is_words(sequence) or is_tokens(sequence)
        for sequence in sequences
    )




[docs]
def is_workflow(workflow: Any) -> TypeGuard[Workflow]:
    """
    Checks if the input is a workflow structure, i.e., a non-empty list of dictionaries where each dictionary maps string method names to argument dictionaries with string keys.

    :param workflow: The object to check.
    :type workflow: Any

    :return: True if `workflow` matches the workflow structure, otherwise False.
    :rtype: bool
    """

    if not isinstance(workflow, dict) or not workflow:
        return False

    return all(
        isinstance(param, str)
        and (
            isinstance(value, list)
            and all(
                isinstance(method_dict, dict)
                and all(
                    isinstance(method_name, str)
                    and isinstance(kwargs, dict)
                    and all(isinstance(arg, str) for arg in kwargs.keys())
                    for method_name, kwargs in method_dict.items()
                )
                for method_dict in value
            )
        )
        or isinstance(value, Hook)
        for param, value in workflow.items()
    )