# Source code for dstk.utilities.matrix_manipulation

"""
This module provides a collection of helper functions for manipulating numerical data
structures, specifically focusing on NumPy arrays and pandas DataFrames. It is designed
to simplify common data processing tasks such as normalization, scaling, and
transforming raw numerical outputs into formats suitable for analysis or visualization.

Core functionalities include:
* Scaling and standardizing matrices to ensure uniform variance across features.
* General utility functions for handling array dimensions and miscellaneous data transformations.

The module is intended as a toolkit for researchers who need to perform preprocessing
tasks on numerical datasets within a Python environment.
"""

from sklearn.preprocessing import StandardScaler
import pandas as pd

from ..lib_types import ndarray, DataFrame



# [docs]
def scale_matrix(matrix: DataFrame, **kwargs) -> DataFrame:
    """
    Standardizes every column of a DataFrame with scikit-learn's StandardScaler.

    The values of ``matrix`` are extracted as a NumPy array, a fresh
    StandardScaler is fitted on them and used to transform them in a single
    step, and the result is wrapped in a new DataFrame that carries the same
    index and column labels as the input. With StandardScaler's default
    settings each column (feature) ends up with a mean of 0 and a standard
    deviation of 1. The fitted scaler is not returned.

    :param matrix: The input data to scale.
    :type matrix: DataFrame
    :param kwargs: Additional keyword arguments forwarded unchanged to
        sklearn's StandardScaler constructor.

    For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

    :returns: A new DataFrame holding the scaled values, labelled with the
        index and columns of ``matrix``.
    :rtype: DataFrame
    """

    # Work on the raw values; the labels are re-attached after scaling.
    raw_values: ndarray = matrix.to_numpy()

    # Fit and apply in one pass; the scaler is only needed for this call.
    standardized: ndarray = StandardScaler(**kwargs).fit_transform(raw_values)

    return pd.DataFrame(
        data=standardized,
        columns=matrix.columns,
        index=matrix.index,
    )