# Source code for dstk.utilities.data_conversion

"""
This module provides utility functions for converting linguistic data between different
formats and representations common in computational linguistics and natural language
processing (NLP). It facilitates the movement of data between raw text, neural model
outputs, and structured tabular formats (pandas DataFrames), as well as standard
annotation formats like CoNLL-U.

Core functionalities include:
* Converting sequences of lexical items into space-separated strings for text processing.
* Transforming Word2Vec and FastText embeddings into pandas DataFrames for easier analysis.
* Parsing CoNLL-U files into DataFrames to allow for programmatic manipulation of
  linguistic features and metadata.
* Exporting processed DataFrames back into the standard CoNLL-U format for sharing or
  storage.

The module is intended to streamline the workflow of converting data between various
stages of a linguistic pipeline, ensuring compatibility between different tools and
data storage formats.
"""

import pandas as pd
import numpy as np
from conllu import parse

from typing import cast
from ..lib_types import (
    ndarray,
    DataFrame,
    Word2Vec,
    FastText,
    NeuralModels,
    SentenceList,
    ConlluToken,
    TokenList,
    LexicalItemSequence,
)


def sequence_to_string(items: LexicalItemSequence) -> str:
    """
    Builds a single string from the ``text`` attribute of every item, using one
    space as the separator.

    :param items: A sequence of word or token objects exposing a ``text`` attribute.
    :type items: LexicalItemSequence

    :return: The texts of all items, in order, separated by single spaces. An empty
        sequence yields an empty string.
    :rtype: str
    """
    texts = [item.text for item in items]
    return " ".join(texts)
def neural_model_to_dataframe(model: NeuralModels) -> DataFrame:
    """
    Converts a trained Word2Vec or FastText model into a DataFrame of word embeddings.

    Each row of the result holds the embedding of one vocabulary entry, and the row
    index is the corresponding word.

    :param model: A trained Word2Vec or FastText model.
    :type model: NeuralModels

    :return: A DataFrame containing the word embeddings, indexed by their words.
    :rtype: DataFrame

    :raises TypeError: If ``model`` is neither a Word2Vec nor a FastText instance.
    """
    word_vectors: ndarray
    labels: list[str]

    if isinstance(model, Word2Vec):
        # Indexing the keyed vectors with the full vocabulary list returns the
        # embeddings in vocabulary order, so rows and labels stay aligned.
        word_vectors = model.wv[model.wv.index_to_key]
        labels = list(model.wv.index_to_key)
    elif isinstance(model, FastText):
        words: list[str] = cast(list[str], model.words)
        word_vectors = np.array([model[word] for word in words])
        labels = words
    else:
        # Without this branch an unsupported model fell through to the return
        # statement and failed with an opaque UnboundLocalError.
        raise TypeError(
            f"Unsupported model type {type(model).__name__!r}; "
            "expected a Word2Vec or FastText model."
        )

    return pd.DataFrame(word_vectors, index=labels)
def conllu_to_df(path: str) -> DataFrame:
    """
    Parses a CoNLL-U file and converts it into a pandas DataFrame.

    Every token becomes one row. Besides the token fields produced by the parser,
    two columns are added:

    * ``sent_id``: the sentence's ``sent_id`` metadata value when present, otherwise
      the zero-based position of the sentence in the file.
    * ``sent_metadata``: the sentence metadata flattened to ``key=value`` pairs joined
      by ``|``, or ``"_"`` when the sentence has no metadata.

    Non-empty ``misc`` and ``feats`` fields are flattened to ``key=value`` pairs joined
    by ``|``; empty ones are left exactly as the parser returned them.

    :param path: The system path to the .conllu file.
    :type path: str

    :return: A DataFrame containing tokens, metadata, and features.
    :rtype: DataFrame
    """
    # CoNLL-U files are UTF-8 by specification; relying on the platform default
    # encoding breaks on systems whose default is not UTF-8.
    with open(path, "r", encoding="utf-8") as file:
        conllu = file.read()

    sentences: SentenceList = parse(conllu)
    tokens: list[ConlluToken] = []

    for sentence_id, sentence in enumerate(sentences):
        # Both values are constant for the whole sentence, so compute them once
        # instead of once per token.
        if sentence.metadata:
            sent_metadata = "|".join(
                f"{key}={value}" for key, value in sentence.metadata.items()
            )
            sent_id = sentence.metadata.get("sent_id", sentence_id)
        else:
            sent_metadata = "_"
            sent_id = sentence_id

        for token in sentence:
            # Flatten the dict-valued fields in place; falsy values (missing
            # features) are deliberately left untouched.
            for field in ("misc", "feats"):
                if token[field]:
                    token[field] = "|".join(
                        f"{key}={value}" for key, value in token[field].items()
                    )

            token["sent_id"] = sent_id
            token["sent_metadata"] = sent_metadata
            tokens.append(token)

    return pd.DataFrame(tokens)
def df_to_conllu(dataframe: DataFrame, path: str | None = None) -> str:
    """
    Converts a DataFrame of linguistic data into a CoNLL-U formatted string.

    The DataFrame must contain a ``sent_id`` column, used to group rows into
    sentences, and a ``sent_metadata`` column holding either ``"_"`` or the sentence
    metadata as ``key=value`` pairs joined by ``|``. Both columns are removed from
    the token rows before serialization; all remaining columns are serialized as
    token fields. Missing values are written as ``"_"``.

    Metadata values that themselves contain ``|`` cannot be represented in this
    flattened form and will not round-trip.

    :param dataframe: A pandas DataFrame containing token information and metadata.
    :type dataframe: DataFrame
    :param path: Optional file path to save the generated CoNLL-U content. When
        omitted or empty, nothing is written to disk.
    :type path: str | None

    :return: The resulting CoNLL-U formatted string.
    :rtype: str

    :raises ValueError: If a ``sent_metadata`` entry contains an item without ``=``.
    """
    filled_df: DataFrame = dataframe.fillna("_")

    # Numeric ids are normalised to int as before. Ids taken from treebank
    # metadata are usually non-numeric strings, which previously made this cast
    # (and therefore the whole export) fail; those are now kept as they are.
    try:
        filled_df["sent_id"] = filled_df["sent_id"].astype(int)
        numeric_ids = True
    except (ValueError, TypeError):
        numeric_ids = False

    chunks: list[str] = []

    # Numeric ids keep the original sorted grouping. Non-numeric ids are grouped
    # in order of first appearance, because a lexicographic sort would reorder
    # sentences (e.g. "s10" before "s2").
    for sentence_id, sentence_df in filled_df.groupby("sent_id", sort=numeric_ids):
        records: list[dict] = sentence_df.to_dict("records")
        metadata: str = records[0]["sent_metadata"]

        if metadata != "_":
            for item in metadata.split("|"):
                # Split on the first "=" only, so values such as
                # "text=a = b" keep their own "=" characters.
                key, value = item.split("=", 1)
                chunks.append(f"# {key} = {value}\n")
        else:
            # The trailing newline keeps the first token line from being glued
            # onto this comment line.
            chunks.append(f"# sent_id = {sentence_id}\n")

        tokens: list[dict] = [
            {
                column: value
                for column, value in record.items()
                if column not in ("sent_id", "sent_metadata")
            }
            for record in records
        ]

        token_list: TokenList = TokenList([ConlluToken(token) for token in tokens])
        chunks.append(token_list.serialize())

    conllu: str = "".join(chunks)

    if path:
        # CoNLL-U is UTF-8 by specification; do not depend on the platform default.
        with open(path, "w", encoding="utf-8") as file:
            file.write(conllu)

    return conllu