Coverage for src/abm_shape_collection/calculate_feature_statistics.py: 100%

10 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2024-09-25 19:34 +0000

1import pandas as pd 

2from scipy.stats import ks_2samp 

3 

4 

def calculate_feature_statistics(
    features: list[str],
    data: pd.DataFrame,
    ref_data: pd.DataFrame,
) -> pd.DataFrame:
    """
    Perform two-sample Kolmogorov-Smirnov test for goodness of fit on features.

    Each feature column in the sample data is compared against the column of
    the same name in the reference data.

    Parameters
    ----------
    features
        List of features to perform test on.
    data
        Sample data, with features as columns.
    ref_data
        Reference data, with features as columns.

    Returns
    -------
    :
        Kolmogorov-Smirnov statistics and p-values for each feature, one row
        per feature with columns FEATURE (upper-cased feature name), SIZE
        (number of values in the sample data column), KS_STATISTIC, and
        KS_PVALUE. The columns are present even when no features are given.
    """

    # Fix the output schema up front so that an empty feature list still
    # produces a dataframe with the expected (empty) columns.
    columns = ["FEATURE", "SIZE", "KS_STATISTIC", "KS_PVALUE"]
    statistics = []

    for feature in features:
        # Extract values for specific feature.
        ref_values = ref_data[feature].to_numpy()
        values = data[feature].to_numpy()

        # Calculate Kolmogorov-Smirnov statistic, with the p-value taken from
        # the asymptotic distribution of the test statistic.
        ks_result = ks_2samp(values, ref_values, mode="asymp")

        statistics.append(
            {
                "FEATURE": feature.upper(),
                "SIZE": len(values),
                "KS_STATISTIC": ks_result.statistic,
                "KS_PVALUE": ks_result.pvalue,
            }
        )

    return pd.DataFrame(statistics, columns=columns)