uio-bmi · mmamica · Nov 23, 2023 · Nov 2, 2023 · Nov 23, 2023
diff --git a/scripts/cli_scripts/analyse_fdr_hacking.py b/scripts/cli_scripts/analyse_fdr_hacking.py
diff --git a/scripts/cli_scripts/execute_statistical_test.py b/scripts/cli_scripts/execute_statistical_test.py
@@ -0,0 +1,67 @@
+import argparse
+import glob
+import os
+import pandas as pd
+from scipy.stats import ttest_ind
+from fdr_hacking.data_generation import *
+import numpy as np
+import plotly.express as px
+
+
+def execute():
+    """CLI entry point: t-test every ``*.tsv`` dataset in ``--data_path`` and summarise the results.
+
+    Writes per-dataset outputs, a combined p-value ECDF plot and summary-statistic tables to ``--output``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_path', help='Path to the dataset files in tsv format', required=True)
+    parser.add_argument('--output', help='Path to the output directory, where all the results should be stored',
+                        required=True)
+    args = parser.parse_args()
+    # create the output directory up front so that none of the writes below fail on a missing path
+    os.makedirs(args.output, exist_ok=True)
+    # sorted: glob returns files in arbitrary order; sorting keeps column/legend order reproducible
+    dataset_list = sorted(glob.glob(os.path.join(args.data_path, "*.tsv")))
+    percentiles = [0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
+
+    p_values_summary_dict = {}
+    test_statistic_summary_dict = {}
+    p_values_dict = {}
+    for dataset in dataset_list:
+        dataset_name = os.path.basename(dataset)
+        p_values, test_statistic = execute_statistical_test_on_single_dataset(dataset, args.output)
+        p_values_dict[dataset_name] = p_values
+        p_values_summary_dict[dataset_name] = pd.Series(p_values).describe(percentiles=percentiles)
+        test_statistic_summary_dict[dataset_name] = pd.Series(test_statistic).describe(percentiles=percentiles)
+    p_values_summary_df = pd.DataFrame(p_values_summary_dict)
+    p_values_df = pd.DataFrame(p_values_dict)
+    # single chart with one ECDF line per dataset (each column of p_values_df is one dataset)
+    fig = px.ecdf(p_values_df, x=p_values_df.columns)
+    fig.write_image(os.path.join(args.output, "p_values_ecdf.png"))
+    test_statistic_df = pd.DataFrame(test_statistic_summary_dict)
+    p_values_summary_df.round(2).to_csv(os.path.join(args.output, "p_values_summary_statistics.tsv"), sep="\t")
+    test_statistic_df.round(2).to_csv(os.path.join(args.output, "test_statistic_summary_statistics.tsv"), sep="\t")
+
+
+def execute_statistical_test_on_single_dataset(data_path: str, output: str):
+    """Run a per-column two-sample t-test on one TSV dataset and save the results.
+
+    The first ``n_obs // 2`` rows form group 1 and the remaining rows form group 2.
+    Test statistics, p-values and their histograms are written to ``output``,
+    using the dataset file name (without a trailing ``.tsv``) as file prefix.
+    Returns the tuple ``(p_values, test_statistic)`` with one entry per column.
+    """
+    # ndmin=2 keeps single-row / single-column files two-dimensional
+    data = np.loadtxt(data_path, delimiter="\t", ndmin=2)
+    file_name = os.path.basename(data_path)
+    # strip only a trailing ".tsv"; str.replace would also remove ".tsv" inside the name
+    output_files_prefix = file_name[:-len(".tsv")] if file_name.endswith(".tsv") else file_name
+    group_size = data.shape[0] // 2
+    # one vectorised call along axis 0 tests every column at once
+    test_statistic, p_values = ttest_ind(data[:group_size], data[group_size:], axis=0)
+    np.savetxt(os.path.join(output, output_files_prefix + "_test_statistic.tsv"), test_statistic, delimiter="\t")
+    np.savetxt(os.path.join(output, output_files_prefix + "_p_values.tsv"), p_values, delimiter="\t")
+    px.histogram(p_values).write_image(os.path.join(output, output_files_prefix + "_p_values_histogram.png"))
+    px.histogram(test_statistic).write_image(
+        os.path.join(output, output_files_prefix + "_test_statistic_histogram.png"))
+    return p_values, test_statistic
diff --git a/scripts/cli_scripts/plot_histograms.py b/scripts/cli_scripts/plot_histograms.py
@@ -12,7 +12,9 @@ def execute():
     parser.add_argument('--aggregated_results', help='Path to the aggregated results file', required=True)
     parser.add_argument('--output_dir', help='Path to the output directory where the plots will be written',
                         required=True)
-    parser.add_argument('--with_title', help='Path to the output directory where the plots will be written',
+    parser.add_argument('--with_title', help='Whether to include the config in the title of the plot',
+                        required=False, action='store_true')
+    parser.add_argument('--remove_zero_bin', help='Whether to drop the first (zero) bin of each histogram before plotting',
                         required=False, action='store_true')
     args = parser.parse_args()
     df = pd.read_csv(args.aggregated_results, sep="\t", header=0, index_col=False)
@@ -21,6 +23,9 @@ def execute():
     if not os.path.exists(args.output_dir):
         os.mkdir(args.output_dir)
     for index, row in df.iterrows():
+        if args.remove_zero_bin is True:
+            row['reporting_histogram'] = row['reporting_histogram'][1:]
+            row['reporting_histogram_bins'] = row['reporting_histogram_bins'][1:]
         num_bins = len(row['reporting_histogram'])
         bin_edges = row['reporting_histogram_bins']
         custom_tick_labels = [bin_edges[0]]

diff --git a/setup.py b/setup.py
@@ -37,6 +37,7 @@
             'plot_histograms=scripts.cli_scripts.plot_histograms:execute',
             'plot_comparative_boxcharts=scripts.cli_scripts.plot_comparative_boxcharts:execute',
             'simulate_semi_real_world_data=scripts.cli_scripts.simulate_semi_real_world_data:execute',
+            'execute_statistical_test=scripts.cli_scripts.execute_statistical_test:execute'
         ],
     },
     install_requires=requirements,