diff --git a/baselines/generate_figures.py b/baselines/generate_figures.py index 113cd70..8631b0f 100644 --- a/baselines/generate_figures.py +++ b/baselines/generate_figures.py @@ -37,7 +37,7 @@ import ptitprince as pt from functools import reduce -from scipy.stats import wilcoxon, normaltest, kruskal +from scipy.stats import wilcoxon, mannwhitneyu, normaltest, kruskal from statsmodels.stats.multitest import multipletests # Initialize logging @@ -440,7 +440,8 @@ def print_colorado_subjects_with_dice_0(df_concat): def compute_wilcoxon_test(df_concat, list_of_metrics): """ - Compute Wilcoxon signed-rank test (two related paired samples -- a same subject for nnunet_3d vs nnunet_2d) + Compute Wilcoxon signed-rank test (nonparametric, paired -- two related paired samples -- a same subject for + nnunet_3d vs nnunet_2d) https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html :param df_concat: dataframe containing all the data :param list_of_metrics: list of metrics to compute the Wilcoxon test for @@ -485,6 +486,38 @@ def compute_wilcoxon_test(df_concat, list_of_metrics): f'formatted p{format_pvalue(p)}, unformatted p={p:0.6f}') +def compute_mann_whitney_u_test(df_concat, list_of_metrics): + """ + Compute Mann-Whitney U test (nonparametric, independent samples) between site 1 and site 2 + https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html + :param df_concat: dataframe containing all the data + :param list_of_metrics: list of metrics to compute the Mann-Whitney U test for + :return: + """ + + logger.info('') + + # Remove 'NbTestedLesions' and 'VolTestedLesions' from the list of metrics + list_of_metrics = [metric for metric in list_of_metrics if metric not in ['NbTestedLesions', 'VolTestedLesions']] + + # Loop across nnunet_2d, nnunet_3d + for method in df_concat['method'].unique(): + # Loop across metrics + for metric in list_of_metrics: + # Prepare the data + df_site1_metric = df_concat[(df_concat['site'] == 'zurich') & (df_concat['method'] == method)][metric] + df_site2_metric = df_concat[(df_concat['site'] == 'colorado') & (df_concat['method'] == method)][metric] + + # Drop nan + df_site1_metric = df_site1_metric.dropna() + df_site2_metric = df_site2_metric.dropna() + + # Compute Mann-Whitney U test + stat, p = mannwhitneyu(df_site1_metric, df_site2_metric, alternative='two-sided') + logger.info(f'{metric}, {method}: Mann-Whitney U test between Zurich and Colorado: ' + f'formatted p{format_pvalue(p)}, unformatted p={p:0.6f}') + + def compute_kruskal_wallis_test(df_concat, list_of_metrics): """ Compute Kruskal-Wallis H-test (non-parametric version of ANOVA) @@ -634,6 +667,9 @@ def main(): # Concatenate the list of dataframes into a single dataframe df_concat = pd.concat(list_of_df, ignore_index=True) + # Drop filename column (to avoid error for the following '.groupby' and '.mean()' commands) + df_concat = df_concat.drop(columns=['filename']) + # If a participant_id is duplicated (because the test image is presented across multiple seeds), average the # metrics across seeds for the same subject. df_concat = df_concat.groupby(['participant_id', 'session_id', 'site', 'method']).mean().reset_index() @@ -648,9 +684,14 @@ def main(): # Print colorado subjects with Dice=0 print_colorado_subjects_with_dice_0(df_concat) - # For lesions, compute Wilcoxon signed-rank test test between nnunet_3d and nnunet_2d + # For lesions: + # - compute Wilcoxon signed-rank test (nonparametric, paired) between nnunet_3d and nnunet_2d + # - compute Mann-Whitney U test (nonparametric, independent samples) between site 1 and site 2 if pred_type == 'lesion': + # nnunet_3d and nnunet_2d compute_wilcoxon_test(df_concat, list_of_metrics) + # site 1 vs site 2 + compute_mann_whitney_u_test(df_concat, list_of_metrics) # For SC, compute Kruskal-Wallis H-test (we have 6 methods) else: compute_kruskal_wallis_test(df_concat, list_of_metrics)