"""
This module provides a suite of preprocessing and filtering tools for linguistic data.
It is designed to help researchers and students in the digital humanities clean, normalize,
and filter sequences of words (Word objects) or tokens based on common linguistic criteria
such as part-of-speech tags, frequency thresholds, and lexical normalization.
Core functionalities include:
* Removing stop words using NLTK's multi-language support or custom word lists.
* Filtering text by Part-of-Speech (POS) tags (e.g., retaining only nouns or verbs).
* Normalizing sequences into base forms through lemmatization or stemming.
* Cleaning text by removing punctuation and converting characters to lowercase.
* Filtering words based on minimum frequency thresholds.
* Isolating specific Named Entities (NER) from token sequences.
The module is intended to simplify the preparation of raw text for more advanced
computational linguistic analysis and visualizations.
"""
from collections import Counter
from nltk.corpus import stopwords
import nltk
from functools import lru_cache
from copy import copy
from typing import Literal, Sequence, overload
from ....lib_types import Token, Word, Token
@lru_cache()
def _get_stop_words(language: str) -> set[str]:
    """
    Retrieve the set of NLTK stop words for a given language.

    The locally installed ``stopwords`` corpus is consulted first. The corpus is
    downloaded (and the lookup retried once) only when that first attempt fails,
    so the downloader is not invoked when the data is already available.
    Results are memoised per language by ``lru_cache``.

    :param language: The full name of the language (e.g., 'english').
    :type language: str
    :return: A set of stop words. The same set object is returned on every
        cached call, so callers should treat it as read-only.
    :rtype: set[str]
    """
    try:
        return set(stopwords.words(language))
    except (LookupError, OSError):
        # LookupError: the corpus is not installed yet.
        # OSError: the installed corpus has no file for this language; a fresh
        # download may bring in a newer corpus version that does.
        nltk.download("stopwords")
        # If the language is still unavailable this raises, as it did before.
        return set(stopwords.words(language))
# ISO 639-1 code -> language name understood by NLTK's stop-word corpus.
_STOP_WORD_LANGUAGES: dict[str, str] = {
    "sq": "albanian",
    "ar": "arabic",
    "az": "azerbaijani",
    "eu": "basque",
    "be": "belarusian",
    "bn": "bengali",
    "ca": "catalan",
    "zh": "chinese",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "el": "greek",
    "he": "hebrew",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "kk": "kazakh",
    "ne": "nepali",
    "nb": "norwegian bokmaal",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "sl": "slovene",
    "es": "spanish",
    "sv": "swedish",
    "tg": "tajik",
    "ta": "tamil",
    "tr": "turkish",
    "uz": "uzbek",
}


@overload
def remove_stop_words(
    words: Sequence[Word],
    *,
    language: str,
    custom_stop_words: None = None,
) -> list[Word]: ...
@overload
def remove_stop_words(
    words: Sequence[Word],
    *,
    language: None = None,
    custom_stop_words: list[str],
) -> list[Word]: ...
@overload
def remove_stop_words(
    words: Sequence[Word],
    *,
    language: str,
    custom_stop_words: list[str],
) -> list[Word]: ...
def remove_stop_words(
    words: Sequence[Word],
    *,
    language: str | None = None,
    custom_stop_words: list[str] | None = None,
) -> list[Word]:
    """
    Filters out stop words from a sequence of words based on language or a custom list.

    A word is dropped when its lower-cased lemma is in the combined stop-word set
    (the language's automatic list plus the lower-cased custom list). Words whose
    ``lemma`` is ``None`` are always kept. Every returned word is a shallow copy
    of the corresponding input word.

    If ``language`` has no automatic stop-word list but ``custom_stop_words`` is
    non-empty, only the custom list is applied.

    :param words: A sequence of Stanza Word objects.
    :type words: Sequence[Word]
    :param language: The two-letter ISO language code (e.g., 'en', 'es'). Defaults to None.
    :type language: str or None
    :param custom_stop_words: A user-defined list of stop words to filter out. Defaults to None.
    :type custom_stop_words: list[str] or None
    :return: A filtered list of Stanza Word objects.
    :rtype: list[Word]
    :raises ValueError: If neither a language nor a non-empty custom list is given,
        or if the language is unsupported and no custom list is given.
    """
    if not language and not custom_stop_words:
        raise ValueError(
            "You must provide either a language for your document or a list of custom stop words to filter."
        )

    # None when the language is missing or has no automatic stop-word list.
    corpus_name = _STOP_WORD_LANGUAGES.get(language) if language else None
    if corpus_name is None and not custom_stop_words:
        raise ValueError(
            f"Language {language} does not have automatic support implemented for stop words. Please enter your stop words manually as 'custom_stop_words'."
        )

    stop_words: set[str] = {entry.lower() for entry in custom_stop_words or ()}
    if corpus_name is not None:
        # Merge into the local set; the cached set from the helper is never mutated.
        stop_words |= _get_stop_words(corpus_name)

    return [
        copy(word)
        for word in words
        if word.lemma is None or word.lemma.lower() not in stop_words
    ]
[docs]
def filter_by_pos(words: Sequence[Word], allowed_pos: set[str]) -> list[Word]:
    """
    Keep only the words whose universal POS tag is in ``allowed_pos``.

    Input order is preserved and each retained word is returned as a shallow copy.

    :param words: A sequence of Stanza Word objects.
    :type words: Sequence[Word]
    :param allowed_pos: The universal POS tags to retain (e.g., {'NOUN', 'VERB'}).
    :type allowed_pos: set[str]
    :return: Copies of the words whose ``upos`` is in ``allowed_pos``.
    :rtype: list[Word]
    """
    kept: list[Word] = []
    for candidate in words:
        if candidate.upos in allowed_pos:
            kept.append(copy(candidate))
    return kept
[docs]
def filter_by_frequency(words: Sequence[Word], threshold: int = 50) -> list[Word]:
    """
    Keep only the words whose text occurs at least ``threshold`` times.

    Occurrences are counted on the exact ``text`` of each word within ``words``
    (no case folding). Every occurrence of a sufficiently frequent text is
    returned, in input order, as a shallow copy.

    :param words: A sequence of Stanza Word objects.
    :type words: Sequence[Word]
    :param threshold: The minimum occurrence count required to keep a word. Defaults to 50.
    :type threshold: int
    :return: Copies of the words that meet the frequency threshold.
    :rtype: list[Word]
    """
    # First pass: tally how often each surface form appears.
    occurrences: Counter[str] = Counter()
    for item in words:
        occurrences[item.text] += 1

    # Second pass: retain the words whose tally reaches the threshold.
    frequent: list[Word] = []
    for item in words:
        if occurrences[item.text] >= threshold:
            frequent.append(copy(item))
    return frequent
[docs]
def to_lower(words: Sequence[Word]) -> list[Word]:
    """
    Return lower-cased copies of the given words.

    Each word is shallow-copied and the copy's ``text`` is replaced by its
    lower-cased form; the input words themselves are left unchanged.

    :param words: A sequence of Stanza Word objects.
    :type words: Sequence[Word]
    :return: A list of copied words with lower-cased text, in input order.
    :rtype: list[Word]
    """

    def _lowered_copy(original: Word) -> Word:
        # Copy first so the assignment below never touches the caller's object.
        duplicate: Word = copy(original)
        duplicate.text = duplicate.text.lower()
        return duplicate

    return [_lowered_copy(entry) for entry in words]
[docs]
def remove_punctuation(words: Sequence[Word]) -> list[Word]:
    """
    Drop every word tagged as punctuation.

    A word counts as punctuation when its ``upos`` equals ``"PUNCT"``; all
    other words are returned, in input order, as shallow copies.

    :param words: A sequence of Stanza Word objects.
    :type words: Sequence[Word]
    :return: Copies of the words whose ``upos`` is not ``"PUNCT"``.
    :rtype: list[Word]
    """
    non_punctuation = (entry for entry in words if entry.upos != "PUNCT")
    return list(map(copy, non_punctuation))
[docs]
def filter_by_ner(tokens: Sequence[Token], allowed_ner: set[str]) -> list[Token]:
    """
    Keep only the tokens whose NER tag is in ``allowed_ner``.

    The check is an exact membership test of ``token.ner`` against
    ``allowed_ner``; the tag is not normalised in any way before the lookup.
    Retained tokens are returned, in input order, as shallow copies.

    :param tokens: A sequence of Stanza Token objects.
    :type tokens: Sequence[Token]
    :param allowed_ner: The NER tags to retain (e.g., {'PERSON', 'LOC'}).
    :type allowed_ner: set[str]
    :return: Copies of the tokens whose ``ner`` is in ``allowed_ner``.
    :rtype: list[Token]
    """
    selected: list[Token] = []
    for candidate in tokens:
        if candidate.ner in allowed_ner:
            selected.append(copy(candidate))
    return selected