Source code for dstk.utilities.dataframe_manipulation
"""
This module provides helper functions to simplify the extraction of data from pandas DataFrames,
a common format used for organizing and storing linguistic datasets. It simplifies the process
of converting table-based structures into standard Python lists, making it easier to pass
data into various NLP pipelines or downstream analysis tools.
Core functionalities include:
* Extracting a specific row as a list by either its numerical position or its label name
* Extracting a specific column as a list for easy iteration and processing
* Checking if a DataFrame is stored in a sparse format (useful for managing memory when dealing with large datasets)
The module is intended to provide a simplified interface for data retrieval, ensuring that
data types are consistent when moving from tabular structures into standard Python lists.
"""
import pandas as pd
from ..lib_types import DataFrame, ndarray, Series
[docs]
def get_row(dataframe: DataFrame, row: int | str) -> list:
"""
Returns the specified row from a dataframe as a list of values.
:param dataframe: The dataframe where to extract the row.
:type dataframe: DataFrame
:param row: The index of the row to be extracted or its label. You can only extract by label when the datraframe contains no duplicates. Otherwise, it will raise a ValueError.
:type row: int | str
:raises ValueError: If the provided dataframe contains more than one row with the same name.
"""
row_index: int | slice | ndarray = (
row if isinstance(row, int) else dataframe.index.get_loc(row)
)
if isinstance(row_index, ndarray):
raise ValueError(
f"The provided dataframe contains more than one row with the name '{row}'. Please enter the index number of the desired row instead."
)
row_series: Series = dataframe.iloc[row_index]
return row_series.to_numpy().tolist()
[docs]
def get_column(dataframe: DataFrame, column: int | str) -> list:
"""
Returns the specified column from a dataframe as a list of values.
:param dataframe: The dataframe where to extract the column.
:type dataframe: DataFrame
:param column: The index of the column to be extracted or its label. You can only extract by label when the datraframe contains no duplicates. Otherwise, it will raise a ValueError.
:type column: int | str
:raises ValueError: If the provided dataframe contains more than one column with the same name.
"""
column_series: Series | DataFrame = (
dataframe[column] if isinstance(column, str) else dataframe.iloc[:, column]
)
if isinstance(column_series, DataFrame):
raise ValueError(
f"The provided dataframe contains more than one columnn with the name '{column}'. Please enter the index number of the desired column instead."
)
return column_series.to_numpy().tolist()
[docs]
def is_sparse_dataframe(dataframe: DataFrame) -> bool:
    """
    Checks if all columns in the DataFrame are of a sparse type.

    :param dataframe: The pandas DataFrame to check.
    :type dataframe: DataFrame
    :return: True if all columns are sparse, False otherwise. A DataFrame without columns yields True, since no column fails the check.
    :rtype: bool
    """
    # ``pd.api.types.is_sparse`` is deprecated in recent pandas releases; the
    # recommended replacement is a direct isinstance check against SparseDtype.
    return all(isinstance(dtype, pd.SparseDtype) for dtype in dataframe.dtypes)