Source code for abm_shape_collection.calculate_feature_statistics
import pandas as pd
from scipy.stats import ks_2samp
[docs]def calculate_feature_statistics(
features: list[str],
data: pd.DataFrame,
ref_data: pd.DataFrame,
) -> pd.DataFrame:
"""
Perform two-sample Kolmogorov-Smirnov test for goodness of fit on features.
Parameters
----------
features
List of features to perform test on.
data
Sample data, with features as columns.
ref_data : pd.DataFrame
References data, with features as columns.
Returns
-------
:
Kolmogorov-Smirnov statistics and p-values for each feature.
"""
statistics = []
for feature in features:
# Extract values for specific component.
ref_values = ref_data[feature].to_numpy()
values = data[feature].to_numpy()
# Calculate Kolmogorov-Smirnov statistic.
ks_result = ks_2samp(values, ref_values, mode="asymp")
statistics.append(
{
"FEATURE": feature.upper(),
"SIZE": len(values),
"KS_STATISTIC": ks_result.statistic,
"KS_PVALUE": ks_result.pvalue,
}
)
return pd.DataFrame(statistics)