Coverage for src/abm_shape_collection/calculate_feature_statistics.py: 100%
10 statements
« prev ^ index » next coverage.py v7.1.0, created at 2024-09-25 19:34 +0000
« prev ^ index » next coverage.py v7.1.0, created at 2024-09-25 19:34 +0000
1import pandas as pd
2from scipy.stats import ks_2samp
def calculate_feature_statistics(
    features: list[str],
    data: pd.DataFrame,
    ref_data: pd.DataFrame,
) -> pd.DataFrame:
    """
    Perform two-sample Kolmogorov-Smirnov test for goodness of fit on features.

    Each listed feature column in the sample data is compared against the
    column of the same name in the reference data.

    Parameters
    ----------
    features
        List of features to perform test on.
    data
        Sample data, with features as columns.
    ref_data
        Reference data, with features as columns.

    Returns
    -------
    :
        Kolmogorov-Smirnov statistics and p-values for each feature. One row
        per feature with columns FEATURE (upper-cased feature name), SIZE
        (number of sample values), KS_STATISTIC, and KS_PVALUE. The columns
        are present even when no features are given.
    """

    columns = ["FEATURE", "SIZE", "KS_STATISTIC", "KS_PVALUE"]
    statistics = []

    for feature in features:
        # Extract values for specific feature.
        ref_values = ref_data[feature].to_numpy()
        values = data[feature].to_numpy()

        # Calculate Kolmogorov-Smirnov statistic; the p-value is requested
        # from the asymptotic distribution.
        ks_result = ks_2samp(values, ref_values, mode="asymp")

        statistics.append(
            {
                "FEATURE": feature.upper(),
                "SIZE": len(values),
                "KS_STATISTIC": ks_result.statistic,
                "KS_PVALUE": ks_result.pvalue,
            }
        )

    # Naming the columns explicitly keeps the output schema stable when
    # `features` is empty; otherwise the frame would have no columns at all.
    return pd.DataFrame(statistics, columns=columns)