Source code for dstk.corpus.annotation

"""
This module provides tools for annotating and processing text corpora to extract linguistic information.
It acts as a unified interface for multi-language NLP tasks, supporting both Stanza and spaCy
back-ends to process raw text into structured data including part-of-speech (POS) tags,
lemmas, and word stems.

Core functionalities include:

* Processing raw text strings into structured Document objects containing linguistic metadata.
* Automatic stemming for a wide range of languages using the Snowball algorithm.
* Integration with Stanza and spaCy pipelines to perform tasks like NER, dependency parsing, and morphology.
* Exporting processed results into the standard CoNLL-U format for further research.
* Importing existing CoNLL-U files from local directories for analysis.

The module is designed to streamline the transition from raw text to structured linguistic
data for use in digital humanities and computational linguistics projects.
"""

import stanza
from stanza.utils.conll import CoNLL
from stanza.pipeline.processor import Processor, register_processor
from nltk.stem.snowball import SnowballStemmer
import spacy
import re
from pathlib import Path
import warnings

from typing import Optional

from ..lib_types.stanza_types import *
from ..lib_types.spacy_types import *
from ..lib_types.dstk_types import DocumentIndex, Word


def _stem_processor(language: str, document: Document) -> None:
    """
    Apply stemming to all words in a document based on the specified language.

    :param language: The ISO 639-1 language code (e.g., 'en', 'es').
    :type language: str
    :param document: The Stanza Document object to process.
    :type document: Document
    """

    language_map: dict[str, str] = {
        "ar": "arabic",
        "da": "danish",
        "nl": "dutch",
        "en": "english",
        "fi": "finnish",
        "fr": "french",
        "de": "german",
        "hu": "hungarian",
        "it": "italian",
        "nb": "norwegian",
        "pt": "portuguese",
        "ro": "romanian",
        "ru": "russian",
        "es": "spanish",
        "sv": "swedish",
    }

    snowball_stemmer: Optional[SnowballStemmer] = None

    if language in language_map:
        snowball_stemmer = SnowballStemmer(language_map[language])
    else:
        warnings.warn(
            f"There is currently not support for stemming {language}, so the stem will be the text form. Please read the list of supported languages."
        )

    for sentence in document.sentences:
        for word in sentence.words:
            stem: str = ""
            if snowball_stemmer is not None:
                stem = snowball_stemmer.stem(word.text)
            else:
                stem = word.text

            word.stem = stem

            if word.misc:
                word.misc += f"|stem={stem}"
            else:
                word.misc = f"stem={stem}"


def _stem_getter(self) -> str:
    """Get the stem of the word."""

    return self._stem


def _stem_setter(self, value: str) -> None:
    """Set the stem of the word."""

    self._stem = value


# Register ``stem`` as a property on Word, read and written through
# ``_stem_getter`` / ``_stem_setter``.
# NOTE(review): presumably this is what allows ``word.stem = ...`` in
# ``_stem_processor`` -- confirm Word is the class of the document's words.
Word.add_property("stem", getter=_stem_getter, setter=_stem_setter)



[docs]
@register_processor("stem")
class StemmerProcessor(Processor):
    """
    Pipeline processor, registered under the name ``"stem"``, that adds stems
    to the words of a document by delegating to ``_stem_processor``.
    """

    # Processor names this step declares as prerequisites and as its output.
    _requires = {"tokenize", "mwt", "pos"}
    _provides = {"stem"}

    def __init__(self, device, config, pipeline) -> None:
        # Only the pipeline's language is kept; ``device`` and ``config`` are
        # accepted but not used by this processor.
        pipeline_language: str = pipeline.lang
        self.lang = pipeline_language

    def process(self, doc: Document) -> Document:  # type: ignore
        """
        Run the stemming step on a document.

        The document is modified in place and the same object is returned.

        :param doc: The Document to be processed.
        :type doc: Document
        :return: The processed Document.
        :rtype: Document
        """

        _stem_processor(self.lang, doc)
        return doc





[docs]
def annotate_corpus(
    corpus: list[str],
    language_model: str,
    output_dir: str | None = None,
    document_names: list[str] | None = None,
    max_length: int = 1000000,
    processors: str = "tokenize,mwt,pos,lemma,stem,depparse,ner,sentiment,constituency",
    **kwargs,
) -> DocumentIndex:
    """
    Annotates a collection of texts using the specified language model.

    :param corpus: A list of raw strings to be processed.
    :type corpus: list[str]
    :param language_model: The identifier or instance of the language model.
    :type language_model: str
    :param output_dir: Optional path where results should be saved.
    :type output_dir: str | None
    :param document_names: Optional list of names for each item in the corpus.
    :type document_names: list[str] | None
    :param max_length: Maximum allowed character length per text.
    :type max_length: int
    :param processors: Comma-separated string of processing steps to include.
    :type processors: str
    :return: A mapping of document names to annotated Document objects.
    :rtype: DocumentIndex
    """

    if document_names and len(document_names) != len(corpus):
        raise ValueError(
            "The number of filenames must be equal to the number of your documents in the corpus."
        )

    documents: list[Document] = []
    nlp: Pipeline | Language

    try:
        stanza.download(language_model)
        nlp = stanza.Pipeline(language_model, processors=processors, **kwargs)
        corpus_documents: list[Document] = [
            stanza.Document([], text=document) for document in corpus
        ]

        documents = nlp(corpus_documents)
    except Exception as error:
        nlp = spacy.load(language_model, **kwargs)
        nlp.add_pipe("conll_formatter", config={"field_names": {}}, last=True)
        nlp.max_length = max_length
        corpus = [re.sub(r"\s+", " ", document) for document in corpus]

        documents = [
            CoNLL.conll2doc(input_str=nlp(document)._.conll_str) for document in corpus
        ]

        for document in documents:
            _stem_processor(language=language_model.split("_")[0], document=document)

    document_index: DocumentIndex = {}

    for index, document in enumerate(documents):
        name = document_names[index] if document_names else f"document_{index}"
        document_index[name] = document

        if output_dir:
            CoNLL.write_doc2conll(document, f"{output_dir}/{name}.conllu")

    return document_index




[docs]
def load_annotations(
    input_dir: str, filenames: list[str] | None = None
) -> DocumentIndex:
    """
    Load CoNLL-U files from a directory and map them to their filenames.

    :param input_dir: Path to the directory containing .conllu files.
    :type input_dir: str
    :param filenames: Optional list of specific filenames to include (filtered by filename excluding extension).
    :type filenames: list[str] | None
    :return: A dictionary mapping filenames to annotated Document objects.
    :rtype: DocumentIndex
    """

    documents: list[Document] = []
    names: list[str] = []

    for file in Path(input_dir).iterdir():
        if file.suffix != ".conllu":
            continue

        if filenames and file.stem not in filenames:
            continue

        documents.append(CoNLL.conll2doc(str(file)))
        names.append(file.stem)

    if filenames and len(documents) != len(filenames):
        names_not_found = [name for name in filenames if name not in names]
        warnings.warn(
            f"The following documents were not found: {names_not_found}. Returning only {names}"
        )

    return dict(zip(names, documents))