Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Users/kanduric/execute statistical test script #6

Merged
merged 2 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions scripts/cli_scripts/analyse_fdr_hacking.py

This file was deleted.

67 changes: 67 additions & 0 deletions scripts/cli_scripts/execute_statistical_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import argparse
import glob
import os
import pandas as pd
from scipy.stats import ttest_ind
from fdr_hacking.data_generation import *
import numpy as np
import plotly.express as px


def execute():
    """Run the statistical test on every ``*.tsv`` dataset in a directory and summarise the results.

    Command-line arguments (read from ``sys.argv``):
        --data_path: directory containing the dataset files in tsv format.
        --output: directory where all results are stored; created if it does not exist.

    For each dataset, ``execute_statistical_test_on_single_dataset`` is called
    (it writes its own per-dataset files into ``--output``). In addition this
    function writes:
        * ``p_values_ecdf.png`` - one ECDF line of p-values per dataset,
        * ``p_values_summary_statistics.tsv`` - ``describe()`` of the p-values,
        * ``test_statistic_summary_statistics.tsv`` - ``describe()`` of the test statistics,
    with one column per dataset in the two summary tables (values rounded to 2 decimals).

    Exits via ``parser.error`` when ``--data_path`` contains no ``*.tsv`` file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', help='Path to the dataset files in tsv format', required=True)
    parser.add_argument('--output', help='Path to the output directory, where all the results should be stored',
                        required=True)
    args = parser.parse_args()

    # glob returns matches in arbitrary order; sort so that column order in the
    # summary tables and trace order in the chart are reproducible.
    dataset_list = sorted(glob.glob(os.path.join(args.data_path, "*.tsv")))
    if not dataset_list:
        parser.error(f"no .tsv files found in {args.data_path}")

    # Every writer below assumes the output directory already exists.
    os.makedirs(args.output, exist_ok=True)

    percentiles = [0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
    p_values_summary_dict = {}
    test_statistic_summary_dict = {}
    p_value_frames = []
    for dataset in dataset_list:
        dataset_name = os.path.basename(dataset)
        p_values, test_statistic = execute_statistical_test_on_single_dataset(dataset, args.output)
        # Long format (one row per p-value) so that datasets with different
        # numbers of columns can share one chart; a wide DataFrame built from
        # unequal-length arrays raises ValueError.
        p_value_frames.append(pd.DataFrame({"variable": dataset_name, "value": p_values}))
        p_values_summary_dict[dataset_name] = pd.Series(p_values).describe(percentiles=percentiles)
        test_statistic_summary_dict[dataset_name] = pd.Series(test_statistic).describe(percentiles=percentiles)

    # ECDF of the p-values, one line per dataset in a single chart. The column
    # names "variable"/"value" mirror the labels plotly express assigns to
    # wide-form input, so axis and legend titles stay as before.
    p_values_df = pd.concat(p_value_frames, ignore_index=True)
    fig = px.ecdf(p_values_df, x="value", color="variable")
    fig.write_image(os.path.join(args.output, "p_values_ecdf.png"))

    p_values_summary_df = pd.DataFrame(p_values_summary_dict)
    test_statistic_df = pd.DataFrame(test_statistic_summary_dict)
    p_values_summary_df.round(2).to_csv(os.path.join(args.output, "p_values_summary_statistics.tsv"), sep="\t")
    test_statistic_df.round(2).to_csv(os.path.join(args.output, "test_statistic_summary_statistics.tsv"), sep="\t")


def execute_statistical_test_on_single_dataset(data_path: str, output: str):
    """Run an independent two-sample t-test on every column of one dataset.

    The rows of the tab-separated numeric file at ``data_path`` are split in
    file order into two groups: the first ``n_obs // 2`` rows and the
    remaining rows (which holds one extra row when ``n_obs`` is odd). Each
    column is then tested between the two groups with ``scipy.stats.ttest_ind``.

    Files written into ``output`` (which must already exist), each prefixed
    with the input file name without its extension:
        * ``<prefix>_test_statistic.tsv`` and ``<prefix>_p_values.tsv``,
        * ``<prefix>_p_values_histogram.png`` and ``<prefix>_test_statistic_histogram.png``.

    Args:
        data_path: path to the tab-separated dataset file.
        output: directory receiving the result files.

    Returns:
        Tuple ``(p_values, test_statistic)`` of 1-D arrays with one entry per
        column of the dataset.
    """
    # ndmin=2 keeps a single-column file two-dimensional; without it loadtxt
    # returns a 1-D array and the per-column handling below breaks.
    data = np.loadtxt(data_path, delimiter="\t", ndmin=2)

    # splitext removes only the trailing extension, whereas str.replace would
    # also strip a ".tsv" occurring in the middle of the file name.
    output_files_prefix = os.path.splitext(os.path.basename(data_path))[0]

    group_size = data.shape[0] // 2
    group1_data = data[:group_size]
    group2_data = data[group_size:]

    # axis=0 tests all columns in one call instead of a Python loop per column.
    test_statistic, p_values = ttest_ind(group1_data, group2_data, axis=0)

    np.savetxt(os.path.join(output, output_files_prefix + "_test_statistic.tsv"), test_statistic, delimiter="\t")
    np.savetxt(os.path.join(output, output_files_prefix + "_p_values.tsv"), p_values, delimiter="\t")
    px.histogram(p_values).write_image(os.path.join(output, output_files_prefix + "_p_values_histogram.png"))
    px.histogram(test_statistic).write_image(
        os.path.join(output, output_files_prefix + "_test_statistic_histogram.png"))
    return p_values, test_statistic
7 changes: 6 additions & 1 deletion scripts/cli_scripts/plot_histograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ def execute():
parser.add_argument('--aggregated_results', help='Path to the aggregated results file', required=True)
parser.add_argument('--output_dir', help='Path to the output directory where the plots will be written',
required=True)
parser.add_argument('--with_title', help='Path to the output directory where the plots will be written',
parser.add_argument('--with_title', help='Whether to include the config in the title of the plot',
required=False, action='store_true')
parser.add_argument('--remove_zero_bin', help='Whether to include the config in the title of the plot',
required=False, action='store_true')
args = parser.parse_args()
df = pd.read_csv(args.aggregated_results, sep="\t", header=0, index_col=False)
Expand All @@ -21,6 +23,9 @@ def execute():
if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
for index, row in df.iterrows():
if args.remove_zero_bin is True:
row['reporting_histogram'] = row['reporting_histogram'][1:]
row['reporting_histogram_bins'] = row['reporting_histogram_bins'][1:]
num_bins = len(row['reporting_histogram'])
bin_edges = row['reporting_histogram_bins']
custom_tick_labels = [bin_edges[0]]
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
'plot_histograms=scripts.cli_scripts.plot_histograms:execute',
'plot_comparative_boxcharts=scripts.cli_scripts.plot_comparative_boxcharts:execute',
'simulate_semi_real_world_data=scripts.cli_scripts.simulate_semi_real_world_data:execute',
'execute_statistical_test=scripts.cli_scripts.execute_statistical_test:execute'
],
},
install_requires=requirements,
Expand Down
Loading