Source code for dstk.utilities.dataframe_manipulation

"""
This module provides helper functions to simplify the extraction of data from pandas DataFrames,
a common format used for organizing and storing linguistic datasets. It simplifies the process
of converting table-based structures into standard Python lists, making it easier to pass
data into various NLP pipelines or downstream analysis tools.

Core functionalities include:

* Extracting a specific row as a list by either its numerical position or its label name
* Extracting a specific column as a list for easy iteration and processing
* Checking if a DataFrame is stored in a sparse format (useful for managing memory when dealing with large datasets)

The module is intended to provide a simplified interface for data retrieval, ensuring that
data types are consistent when moving from tabular structures into standard Python lists.
"""

import pandas as pd

from ..lib_types import DataFrame, ndarray, Series


[docs] def get_row(dataframe: DataFrame, row: int | str) -> list: """ Returns the specified row from a dataframe as a list of values. :param dataframe: The dataframe where to extract the row. :type dataframe: DataFrame :param row: The index of the row to be extracted or its label. You can only extract by label when the datraframe contains no duplicates. Otherwise, it will raise a ValueError. :type row: int | str :raises ValueError: If the provided dataframe contains more than one row with the same name. """ row_index: int | slice | ndarray = ( row if isinstance(row, int) else dataframe.index.get_loc(row) ) if isinstance(row_index, ndarray): raise ValueError( f"The provided dataframe contains more than one row with the name '{row}'. Please enter the index number of the desired row instead." ) row_series: Series = dataframe.iloc[row_index] return row_series.to_numpy().tolist()
[docs] def get_column(dataframe: DataFrame, column: int | str) -> list: """ Returns the specified column from a dataframe as a list of values. :param dataframe: The dataframe where to extract the column. :type dataframe: DataFrame :param column: The index of the column to be extracted or its label. You can only extract by label when the datraframe contains no duplicates. Otherwise, it will raise a ValueError. :type column: int | str :raises ValueError: If the provided dataframe contains more than one column with the same name. """ column_series: Series | DataFrame = ( dataframe[column] if isinstance(column, str) else dataframe.iloc[:, column] ) if isinstance(column_series, DataFrame): raise ValueError( f"The provided dataframe contains more than one columnn with the name '{column}'. Please enter the index number of the desired column instead." ) return column_series.to_numpy().tolist()
def is_sparse_dataframe(dataframe: DataFrame) -> bool:
    """
    Checks if all columns in the DataFrame are of a sparse type.

    A dataframe without any columns is reported as sparse, because there is
    no column that fails the check.

    :param dataframe: The pandas DataFrame to check.
    :type dataframe: DataFrame

    :return: True if all columns are sparse, False otherwise.
    :rtype: bool
    """
    # ``pd.api.types.is_sparse`` is deprecated in recent pandas releases; the
    # documented replacement is an ``isinstance`` check against SparseDtype.
    return all(isinstance(dtype, pd.SparseDtype) for dtype in dataframe.dtypes)