Source code for dstk.utilities.dataframe_manipulation

"""
This module provides helper functions to simplify the extraction of data from pandas DataFrames,
a common format used for organizing and storing linguistic datasets. It simplifies the process
of converting table-based structures into standard Python lists, making it easier to pass
data into various NLP pipelines or downstream analysis tools.

Core functionalities include:

* Extracting a specific row as a list by either its numerical position or its label name
* Extracting a specific column as a list for easy iteration and processing
* Checking if a DataFrame is stored in a sparse format (useful for managing memory when dealing with large datasets)

The module is intended to provide a simplified interface for data retrieval, ensuring that
data types are consistent when moving from tabular structures into standard Python lists.
"""

import pandas as pd

from ..lib_types import DataFrame, ndarray, Series



[docs]
def get_row(dataframe: DataFrame, row: int | str) -> list:
    """
    Returns the specified row from a dataframe as a list of values.

    :param dataframe: The dataframe where to extract the row.
    :type dataframe: DataFrame
    :param row: The index of the row to be extracted or its label. You can only extract by label when the datraframe contains no duplicates. Otherwise, it will raise a ValueError.
    :type row: int | str

    :raises ValueError: If the provided dataframe contains more than one row with the same name.
    """

    row_index: int | slice | ndarray = (
        row if isinstance(row, int) else dataframe.index.get_loc(row)
    )

    if isinstance(row_index, ndarray):
        raise ValueError(
            f"The provided dataframe contains more than one row with the name '{row}'. Please enter the index number of the desired row instead."
        )

    row_series: Series = dataframe.iloc[row_index]

    return row_series.to_numpy().tolist()




[docs]
def get_column(dataframe: DataFrame, column: int | str) -> list:
    """
    Returns the specified column from a dataframe as a list of values.

    :param dataframe: The dataframe where to extract the column.
    :type dataframe: DataFrame
    :param column: The index of the column to be extracted or its label. You can only extract by label when the datraframe contains no duplicates. Otherwise, it will raise a ValueError.
    :type column: int | str

    :raises ValueError: If the provided dataframe contains more than one column with the same name.
    """

    column_series: Series | DataFrame = (
        dataframe[column] if isinstance(column, str) else dataframe.iloc[:, column]
    )

    if isinstance(column_series, DataFrame):
        raise ValueError(
            f"The provided dataframe contains more than one columnn with the name '{column}'. Please enter the index number of the desired column instead."
        )

    return column_series.to_numpy().tolist()




[docs]
def is_sparse_dataframe(dataframe: DataFrame) -> bool:
    """
    Checks if all columns in the DataFrame are of a sparse type.

    :param dataframe: The pandas DataFrame to check.
    :type dataframe: DataFrame

    :return: True if all columns are sparse, False otherwise. A DataFrame without columns also yields True, since there is no dense column to find.
    :rtype: bool
    """

    # ``pd.api.types.is_sparse`` is deprecated; checking the dtype against
    # ``pd.SparseDtype`` is the replacement pandas recommends and gives the
    # same answer for the dtype objects yielded by ``DataFrame.dtypes``.
    return all(isinstance(dtype, pd.SparseDtype) for dtype in dataframe.dtypes)