Source code for dstk.utilities.word2vec
"""
This module provides utility functions for managing neural word embedding models,
specifically supporting Word2Vec and FastText formats. It acts as a unified
interface to handle the loading and saving of these models, abstracting away
the differences between the underlying `gensim` and `fasttext` libraries.
Core functionalities include:
* Loading Word2Vec models from files with the .model extension.
* Loading FastText models from files with the .bin extension.
* Automatic detection of model types based on file extensions during loading.
* Saving both Word2Vec and FastText models to specified paths while
automatically applying the correct file format.
* Providing a consistent interface for handling pre-trained word vectors in
linguistic workflows.
The module is intended to simplify the integration of vector space models into
computational linguistics and digital humanities projects.
"""
from pathlib import Path
from gensim.models import Word2Vec
import fasttext
from dstk.lib_types import NeuralModels, FastText
[docs]
def load_neural_model(path: str) -> NeuralModels:
    """
    Load a saved embedding model, picking the loader from the file extension.

    A ``.model`` file is handed to ``Word2Vec.load`` and a ``.bin`` file to
    ``fasttext.load_model``. The extension is compared case-insensitively.

    :param path: Location of the saved model file.
    :type path: str
    :returns: The object produced by the selected loader.
    :rtype: NeuralModels
    :raises ValueError: If the extension is neither ``.model`` nor ``.bin``.
    """
    extension = Path(path).suffix.lower()

    # Reject unknown formats up front so the loader calls below stay flat.
    if extension not in (".model", ".bin"):
        raise ValueError(f"Model extension {extension} not recognized.")

    if extension == ".model":
        return Word2Vec.load(path)
    return fasttext.load_model(path)
[docs]
def save_neural_model(model: NeuralModels, path: str) -> str:
    """
    Save a trained model as .model (Word2Vec) or .bin (FastText), depending on its type.

    The extension is applied with ``Path.with_suffix``, so a suffix already
    present on ``path`` is replaced rather than appended to.

    :param model: A trained Word2Vec or FastText model.
    :type model: NeuralModels
    :param path: The path (without extension) where to save the model.
    :type path: str
    :returns: The resolved path of the file that was written, including the
        extension that was applied.
    :rtype: str
    :raises NotImplementedError: If ``model`` is neither a Word2Vec nor a
        FastText instance.
    """
    base_path = Path(path)

    if isinstance(model, Word2Vec):
        target = base_path.with_suffix(".model")
        model.save(str(target))
    elif isinstance(model, FastText):
        target = base_path.with_suffix(".bin")
        model.save_model(str(target))
    else:
        # Name the class of the model itself. Looking up ``__name__`` on the
        # instance would raise AttributeError and hide this error.
        raise NotImplementedError(
            f"Model identifier type {type(model).__name__} not yet supported"
        )

    # Return the file actually written (with its suffix) so the result can be
    # passed straight back to a loader.
    return str(target.resolve())