diff --git a/.gitignore b/.gitignore
index e8b9312..248259c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,7 +159,6 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
 
-docs
 
 examples/tmp
 examples/simulated_data
diff --git a/documentation/docs/api/cpm_regression.md b/documentation/docs/api/cpm_regression.md
new file mode 100644
index 0000000..8c9c312
--- /dev/null
+++ b/documentation/docs/api/cpm_regression.md
@@ -0,0 +1,3 @@
+# CPM Regression
+
+::: cpm.cpm_analysis
\ No newline at end of file
diff --git a/documentation/docs/api/edge_selection.md b/documentation/docs/api/edge_selection.md
new file mode 100644
index 0000000..56ec5b5
--- /dev/null
+++ b/documentation/docs/api/edge_selection.md
@@ -0,0 +1,3 @@
+# Edge Selection
+
+::: cpm.edge_selection
\ No newline at end of file
diff --git a/documentation/docs/api/fold.md b/documentation/docs/api/fold.md
new file mode 100644
index 0000000..958cb04
--- /dev/null
+++ b/documentation/docs/api/fold.md
@@ -0,0 +1,3 @@
+# Fold
+
+::: cpm.fold
\ No newline at end of file
diff --git a/documentation/docs/api/models.md b/documentation/docs/api/models.md
new file mode 100644
index 0000000..c9a779c
--- /dev/null
+++ b/documentation/docs/api/models.md
@@ -0,0 +1,3 @@
+# Predictive Models
+
+::: cpm.models
\ No newline at end of file
diff --git a/documentation/docs/assets/img/CCCPM.pdf b/documentation/docs/assets/img/CCCPM.pdf
new file mode 100644
index 0000000..879d541
Binary files /dev/null and b/documentation/docs/assets/img/CCCPM.pdf differ
diff --git a/documentation/docs/assets/img/CCCPM.png b/documentation/docs/assets/img/CCCPM.png
new file mode 100644
index 0000000..5c60a52
Binary files /dev/null and b/documentation/docs/assets/img/CCCPM.png differ
diff --git a/documentation/docs/assets/img/CCCPM_large.png b/documentation/docs/assets/img/CCCPM_large.png
new file mode 100644
index 0000000..c80de98
Binary files /dev/null and b/documentation/docs/assets/img/CCCPM_large.png differ
diff --git a/documentation/docs/assets/img/CCCPM_medium.png b/documentation/docs/assets/img/CCCPM_medium.png
new file mode 100644
index 0000000..ac98324
Binary files /dev/null and b/documentation/docs/assets/img/CCCPM_medium.png differ
diff --git a/documentation/docs/examples/human_connectome_project.md b/documentation/docs/examples/human_connectome_project.md
new file mode 100644
index 0000000..974d1b2
--- /dev/null
+++ b/documentation/docs/examples/human_connectome_project.md
@@ -0,0 +1,3 @@
+```python
+--8<-- "examples/hcp_analysis.py"
+```
\ No newline at end of file
diff --git a/documentation/docs/examples/simulated_data.md b/documentation/docs/examples/simulated_data.md
new file mode 100644
index 0000000..542c4b7
--- /dev/null
+++ b/documentation/docs/examples/simulated_data.md
@@ -0,0 +1,3 @@
+```python
+--8<-- "examples/example_simulated_data.py"
+```
\ No newline at end of file
diff --git a/documentation/docs/stylesheets/extra.css b/documentation/docs/stylesheets/extra.css
new file mode 100644
index 0000000..79b16f8
--- /dev/null
+++ b/documentation/docs/stylesheets/extra.css
@@ -0,0 +1,12 @@
+:root {
+    --md-primary-fg-color: #257180;
+    --md-primary-fg-color--light: #257180;
+    --md-primary-fg-color--dark: #257180;
+    --md-accent-fg-color: #FD8B51;
+    --md-accent-fg-color--transparent: #FD8B51;
+    --md-accent-bg-color: #FD8B51;
+    --md-accent-bg-color--light: #FD8B51;
+}
+:root > * {
+    --md-code-bg-color: rgb(242, 229, 191, 0.3);
+}
\ No newline at end of file
diff --git a/examples/example_simulated_data3.py b/examples/example_simulated_data3.py
new file mode 100644
index 0000000..2b63b0c
--- /dev/null
+++ b/examples/example_simulated_data3.py
@@ -0,0 +1,32 @@
+from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold
+
+from cpm import CPMRegression
+from cpm.simulate_data import simulate_regression_data_2
+from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
+
+
+link_types = ['no_link',
+              'no_no_link',
+              'direct_link',
+              'weak_link'
+              ]
+edge_statistics = ['pearson', 'pearson_partial']
+results_folder = '/spm-data/vault-data3/mmll/projects/confound_corrected_cpm/results'
+
+for link in link_types:
+    for edge_statistic in edge_statistics:
+        X, y, covariates = simulate_regression_data_2(n_features=1225, n_informative_features=50, link_type=link)
+
+        univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=[edge_statistic],
+                                                            edge_selection=[PThreshold(threshold=[0.05],
+                                                                                       correction=['fdr_by'])])
+        cpm = CPMRegression(results_directory=f'{results_folder}/simulated_data_{link}_{edge_statistic}',
+                            cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42),
+                            edge_selection=univariate_edge_selection,
+                            inner_cv=ShuffleSplit(n_splits=3, test_size=0.2, random_state=42),
+                            add_edge_filter=True,
+                            n_permutations=2)
+        cpm.estimate(X=X, y=y, covariates=covariates)
+
+        #cpm._calculate_permutation_results('./tmp/example_simulated_data2')
+
diff --git a/examples/hannah.py b/examples/hannah.py
deleted file mode 100644
index 476ff68..0000000
--- a/examples/hannah.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import numpy as np
-import pandas as pd
-
-from sklearn.model_selection import KFold
-
-from cpm import CPMRegression
-import scipy.io
-
-from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
-from cpm.reporting.plots.utils import matrix_to_vector_3d
-
-
-# Load the .mat file
-covariates_file = '/spm-data/vault-data3/TIP-Studie/09_Connectome/0_Projekte/FürNils/design_matrix_SAD_age_sex_med_01000.mat'
-covariates = scipy.io.loadmat(covariates_file)['design_matrix_SAD_age_sex_med_01000']
-y = covariates[:, 1]
-covs = covariates[:, 2:]
-
-brain_file = '/spm-data/vault-data3/TIP-Studie/09_Connectome/0_Projekte/FürNils/Connectivity_dti_nos_Aparc_stacked.mat'
-connectome = scipy.io.loadmat(brain_file)['connectivityAparcstacked']
-connectome = np.moveaxis(connectome, -1, 0)
-X = matrix_to_vector_3d(connectome)
-
-atlas_file = "/spm-data/vault-data3/TIP-Studie/09_Connectome/0_Projekte/FürNils/brainRegions_aparc.csv"
-
-p_threshold = PThreshold(threshold=[0.01], correction=[None])
-univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'],
-                                                    edge_selection=[p_threshold])
-
-cpm_regression = CPMRegression(results_directory='./tmp/hannah_sad_pearson',
-                               cv=KFold(n_splits=10, shuffle=True, random_state=42),
-                               edge_selection=univariate_edge_selection,
-                               #cv_edge_selection=KFold(n_splits=2, shuffle=True, random_state=42),
-                               add_edge_filter=True,
-                               n_permutations=1000,
-                               atlas_labels=atlas_file)
-results = cpm_regression.estimate(X=X, y=y, covariates=covs)
-#cpm_regression._calculate_permutation_results()
diff --git a/examples/hcp_analysis.py b/examples/hcp_analysis.py
deleted file mode 100644
index 2707368..0000000
--- a/examples/hcp_analysis.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import os
-import pandas as pd
-import numpy as np
-from cpm.reporting.plots.utils import matrix_to_vector_3d
-from sklearn.model_selection import RepeatedKFold, KFold
-
-from cpm import CPMRegression
-from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
-
-
-data_folder = "/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/"
-X_file = "functional_connectivity.npy"
-ids_file = "subject_ids.csv"
-meta_file = "RESTRICTED_NilsWinter_11_26_2024_8_3_14.csv"
-unres_meta_file = "unrestricted_NilsWinter_12_4_2024_10_11_45.csv"
-
-X = np.load(os.path.join(data_folder, X_file))
-ids = pd.read_csv(os.path.join(data_folder, ids_file))
-ids['Subject'] = ids['Subject_ID']
-df = pd.read_csv(os.path.join(data_folder, meta_file))
-df_unres = pd.read_csv(os.path.join(data_folder, unres_meta_file))
-
-df = pd.merge(ids, df, how="left", on="Subject")
-df = pd.merge(df, df_unres, how="left", on="Subject")
-
-df['Sex'] = pd.get_dummies(df['Gender'], drop_first=True, dtype='int').to_numpy()
-
-X = matrix_to_vector_3d(X)
-#target = "BMI"
-target = "SSAGA_Income"
-#target = "Age_in_Yrs"
-
-X = X[~df[target].isna()]
-df = df[~df[target].isna()]
-
-#X = X[~df['BMI'].isna()]
-#df = df[~df['BMI'].isna()]
-
-#edge_statistics = ['pearson', 'pearson_partial']
-edge_statistics = ['spearman']
-
-#covariates = df['BMI'].to_numpy().reshape(-1, 1)
-covariates = df[['Age_in_Yrs', 'Sex']].to_numpy()
-
-
-for edge_statistic in edge_statistics:
-    univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=[edge_statistic],
-                                                        edge_selection=[
-                                                            PThreshold(threshold=[0.1, 0.05, 0.01, 0.005],
-                                                                       correction=[None]),
-                                                            # PThreshold(threshold=[0.05],
-                                                            #            correction=['fdr_by'])
-                                                        ])
-    cpm = CPMRegression(results_directory=f'{data_folder}/results/hcp_{target}_{edge_statistic}',
-                        cv=KFold(n_splits=10, shuffle=True, random_state=42),
-                        edge_selection=univariate_edge_selection,
-                        inner_cv=KFold(n_splits=10, shuffle=True, random_state=42),
-                        select_stable_edges=True,
-                        stability_threshold=0,
-                        n_permutations=20)
-    cpm.estimate(X=X, y=df[target].to_numpy(), covariates=covariates)
-print()
\ No newline at end of file
diff --git a/examples/human_connectome_project.py b/examples/human_connectome_project.py
deleted file mode 100644
index d7b3da8..0000000
--- a/examples/human_connectome_project.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import os
-import pandas as pd
-import numpy as np
-
-# Define the folder containing the .txt files
-folder_path = ('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/3T_HCP1200_MSMAll_d100_ts2')
-
-# Initialize a list to store correlation matrices
-correlation_matrices = []
-
-# Initialize a list to store subject IDs
-subject_ids = []
-
-file_list = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
-
-# Sort the filenames by subject ID (assuming subject IDs are in the filename before .txt)
-file_list.sort()  # This will sort alphabetically, which works if subject IDs are formatted consistently
-
-# Loop through all files in the specified folder
-for filename in file_list:
-    print(filename)
-    # Construct the full file path
-    file_path = os.path.join(folder_path, filename)
-
-    # Read the text file into a DataFrame
-    df = pd.read_csv(file_path, sep='\s+', header=None)
-
-    # Calculate the correlation matrix
-    correlation_matrix = df.corr()
-
-    # Append the correlation matrix to the list
-    correlation_matrices.append(correlation_matrix.values)
-
-    # Extract subject ID from the filename (remove the .txt extension)
-    subject_id = filename[:-4]  # Exclude the last 4 characters ('.txt')
-    subject_ids.append(subject_id)
-
-# Convert the list of correlation matrices into a 3D numpy array
-correlation_array = np.array(correlation_matrices)
-
-# Create a DataFrame for subject IDs
-subject_ids_df = pd.DataFrame(subject_ids, columns=['Subject_ID'])
-
-# Save the numpy array and DataFrame to disk
-np.save('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/functional_connectivity.npy', correlation_array)  # Save as .npy file
-subject_ids_df.to_csv('/spm-data/vault-data3/mmll/data/HCP/100_nodes_times_series/subject_ids.csv', index=False)  # Save as .csv file
-
-
-print()
\ No newline at end of file
diff --git a/examples/macs.py b/examples/macs.py
deleted file mode 100644
index 748e189..0000000
--- a/examples/macs.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import numpy as np
-import pandas as pd
-
-from sklearn.model_selection import KFold
-
-from cpm import CPMRegression
-
-
-#X = np.load('/spm-data/vault-data3/mmll/projects/macs_datahub_example/AnalysisReady/all/FunctionalConnectome/X.npy')
-#df = pd.read_csv('/spm-data/vault-data3/mmll/projects/macs_datahub_example/AnalysisReady/all/'
-#                 'FunctionalConnectome/sample.csv',
-#                 na_values=-99)
-X = np.load('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/X.npy')
-df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv',
-                 na_values=-99)
-
-X = X[df['Group'] <= 1]
-df = df[df['Group'] <= 1]
-
-
-#X = X[~df['CTQ_Sum'].isna()]
-#df = df[~df['CTQ_Sum'].isna()]
-X = X[~df['IQ'].isna()]
-df = df[~df['IQ'].isna()]
-
-#X = X[~df['NEOFFI_Neurotizismus'].isna()]
-#df = df[~df['NEOFFI_Neurotizismus'].isna()]
-
-#X = X[~df['FSozU_Sum'].isna()]
-#df = df[~df['FSozU_Sum'].isna()]
-
-#X = X[~df['BDI_Sum'].isna()]
-#df = df[~df['BDI_Sum'].isna()]
-covs = df[['Alter', 'Geschlecht', 'Site']].to_numpy()
-#covs = df[['Geschlecht', 'Site']].to_numpy()
-#covs = df[['Geschlecht']].to_numpy()
-#y = df['BDI_Sum'].to_numpy()
-#y = df['FSozU_Sum'].to_numpy()
-#y = df['Alter'].to_numpy()
-y = df['IQ'].to_numpy()
-
-#y = df['NEOFFI_Neurotizismus'].to_numpy()
-#y = df['CTQ_Sum'].to_numpy()
-#covs = df[['Geschlecht']].to_numpy()
-#y = df['Alter'].to_numpy()
-
-
-from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
-p_threshold = PThreshold(threshold=[0.1, 0.05, 0.01, 0.005, 0.001], correction=[None])
-univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'],
-                                                    edge_selection=[p_threshold])
-
-cpm = CPMRegression(results_directory='./tmp/macs_IQ_partial',
-                    cv=KFold(n_splits=10, shuffle=True, random_state=42),
-                    edge_selection=univariate_edge_selection,
-                    cv_edge_selection=KFold(n_splits=10, shuffle=True, random_state=42),
-                    add_edge_filter=True,
-                    n_permutations=20)
-results = cpm.estimate(X=X, y=y, covariates=covs)
-#print(results)
-#cpm._calculate_permutation_results()
-
diff --git a/examples/plot_simulation_results2.py b/examples/plot_simulation_results2.py
new file mode 100644
index 0000000..16bd60c
--- /dev/null
+++ b/examples/plot_simulation_results2.py
@@ -0,0 +1,42 @@
+import os
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+results_folder = "simulated_data"
+edge_statistics = ['pearson', 'pearson_partial']
+
+
+for scenario in ['A', 'B']:
+    for edge_statistic in edge_statistics:
+        results = {f"scenario_{scenario}1": os.path.join(results_folder, f"scenario_{scenario}1", "results", edge_statistic),
+                   f"scenario_{scenario}2": os.path.join(results_folder, f"scenario_{scenario}2", "results", edge_statistic),
+                   f"scenario_{scenario}3": os.path.join(results_folder, f"scenario_{scenario}3", "results", edge_statistic),
+                   f"scenario_{scenario}4": os.path.join(results_folder, f"scenario_{scenario}4", "results", edge_statistic),
f"scenario_{scenario}4", "results", edge_statistic), + } + + dfs = [] + for link_type, folder in results.items(): + df = pd.read_csv(os.path.join(folder, 'cv_results.csv')) + df['link_type'] = link_type + dfs.append(df) + + concatenated_df = pd.concat(dfs, ignore_index=True) + + concatenated_df = concatenated_df[concatenated_df['network'] == 'both'] + + + g = sns.FacetGrid(concatenated_df, col="link_type", margin_titles=True, despine=True, + height=2.5, hue="model") + g.map(plt.axvline, x=0, color='grey', linewidth=0.5, zorder=-1) + g.map(sns.violinplot, "pearson_score", "model", inner=None, split=True,# hue="model",# hue_order=[1, 2], + density_norm='count', dodge=True, palette="Blues_r", fill=True) + #g.map(sns.boxplot, "pearson_score", "model", dodge=True, #hue=1, hue_order=[2, 1] + # ) + g.set_titles(col_template="{col_name}", size=7) + g.set_xlabels("Pearson correlation", size=8) + plt.suptitle(edge_statistic) + plt.show() + print() diff --git a/examples/save_config.py b/examples/save_config.py index 6bf1aca..f888955 100644 --- a/examples/save_config.py +++ b/examples/save_config.py @@ -17,7 +17,7 @@ cpm = CPMRegression(results_directory='./tmp/example_simulated_data2', cv=KFold(n_splits=5, shuffle=True, random_state=42), edge_selection=univariate_edge_selection, - cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42), + inner_cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42), add_edge_filter=True, n_permutations=100) cpm.save_configuration(config_filename='./config.pkl') diff --git a/examples/simulate_data_3.py b/examples/simulate_data_3.py index 6a6a40e..bb4a3b9 100644 --- a/examples/simulate_data_3.py +++ b/examples/simulate_data_3.py @@ -1,7 +1,15 @@ +import os from sklearn.linear_model import LinearRegression +from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold from sklearn.metrics import r2_score import numpy as np import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +from sklearn.model_selection import cross_val_score +from cpm import CPMRegression +from cpm.simulate_data import simulate_regression_data_2 +from cpm.edge_selection import PThreshold, UnivariateEdgeSelection def generate_scenario_data(scenario, n_samples=1000, n_features=10, n_informative=3, noise_level=0.1): @@ -9,39 +17,58 @@ def generate_scenario_data(scenario, n_samples=1000, n_features=10, n_informativ # Generate feature matrix X X = np.random.normal(0, 1, (n_samples, n_features)) + z = np.random.normal(0, 1, n_samples) + y = np.random.normal(0, 1, n_samples) # Generate ground truth coefficients for X's influence on z and y - z_coefficients = np.zeros(n_features) - y_coefficients = np.zeros(n_features) - - if scenario == 1: - # Scenario 1: y and z are influenced by X independently - z_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) # Decreasing influence - y_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) # Decreasing influence - z = np.dot(X, z_coefficients) + np.random.normal(0, noise_level, n_samples) - y = np.dot(X, y_coefficients) + np.random.normal(0, noise_level, n_samples) - - elif scenario == 2: - # Scenario 2: z influences both X and y, inducing spurious association - z = np.random.normal(0, 1, n_samples) - z_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative) - y = 2 * z + np.random.normal(0, noise_level, n_samples) + z_coefficients = np.empty(n_features) + y_coefficients = np.empty(n_features) + + z_coefficients[:n_informative] = np.linspace(0.3, 0.1, n_informative) + 
+    y_coefficients[:n_informative] = np.linspace(0.3, 0.1, n_informative)
+
+    if scenario == "A1":
+        pass
+
+    elif scenario == "A2":
+        for i in range(n_features):
+            X[:, i] = X[:, i] + 0.8 * z * z_coefficients[i]
+
+    elif scenario == "A3":
+        y = y + 0.8 * z
+
+    elif scenario == "A4":
+        for i in range(n_features):
+            X[:, i] = X[:, i] + 0.8 * z * z_coefficients[i]
+        y = y + 0.8 * z
+
+    elif scenario == "B1":
+        for i in range(n_features):
+            X[:, i] = X[:, i] + y * y_coefficients[i]
+
+    elif scenario == "B2":
         for i in range(n_features):
-            X[:, i] = z * z_coefficients[i] + np.random.normal(0, noise_level, n_samples)
-
-    elif scenario == 3:
-        # Scenario 3: y is influenced by both X and z, with z partially mediating
-        z_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative)
-        y_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative)
-        z = np.dot(X, z_coefficients) + np.random.normal(0, noise_level, n_samples)
-        y = np.dot(X, y_coefficients) + 0.5 * z + np.random.normal(0, noise_level, n_samples)
-    elif scenario == 4:
-        y_coefficients[:n_informative] = np.linspace(1, 0.1, n_informative)
-        z = np.random.normal(0, noise_level, n_samples)
-        y = np.dot(X, y_coefficients) + np.random.normal(0, noise_level, n_samples)
+            X[:, i] = X[:, i] + y * y_coefficients[i]
+        model = LinearRegression()
+        model.fit(y.reshape(-1, 1), X[:, 0])
+        resid = X[:, 0] - model.predict(y.reshape(-1, 1))
+        z = z + 2 * resid
+
+    elif scenario == "B3":
+        for i in range(n_features):
+            X[:, i] = X[:, i] + y * y_coefficients[i]
+        model = LinearRegression()
+        model.fit(X[:, 0].reshape(-1, 1), y)
+        resid = y - model.predict(X[:, 0].reshape(-1, 1))
+        z = z + 2 * resid
+
+    elif scenario == "B4":
+        z = 1.5 * z + y
+        for i in range(n_features):
+            X[:, i] = X[:, i] + 2 * y * y_coefficients[i] + 0.1 * z * z_coefficients[i]
     else:
-        raise ValueError("Invalid scenario. Choose 1, 2, or 3.")
Choose 1, 2, or 3.") + raise NotImplementedError("Invalid scenario.") return X, y, z @@ -69,28 +96,59 @@ def calculate_explained_variance_y_given_z(y, z): return r2 +def residualize_X(X, z): + """Regress out the effect of z from each column of X.""" + z = z.reshape(-1, 1) # Reshape z for regression + residualized_X = np.zeros_like(X) + + for i in range(X.shape[1]): # Iterate over columns (features) + model = LinearRegression() + model.fit(z, X[:, i]) + # Compute residuals + residualized_X[:, i] = X[:, i] - model.predict(z) + + return residualized_X + + +def calculate_explained_variance_y_with_residualized_X(X, y, z): + """Calculate explained variance of y by X after removing z's influence on X.""" + X_residualized = residualize_X(X, z) + return calculate_explained_variance(X_residualized, y) + + +#edge_statistics = ['pearson', 'pearson_partial'] +edge_statistics = ['pearson'] + # Generate data for each scenario and calculate explained variances -for scenario in range(1, 5): - X, y, z = generate_scenario_data(scenario, noise_level=1) - - # Calculate explained variances - explained_variance_X_y = calculate_explained_variance(X, y) - explained_variance_X_z = calculate_explained_variance(X, z) - explained_variance_y_given_z = calculate_explained_variance_y_given_z(y, z) - - print(f"Scenario {scenario}:") - print(f" Explained Variance (R^2) of X for y: {explained_variance_X_y:.2f}") - print(f" Explained Variance (R^2) of X for z: {explained_variance_X_z:.2f}") - print(f" Explained Variance (R^2) of y for z: {explained_variance_y_given_z:.2f}") - print(f" Corr(X[:, 0], y): {np.corrcoef(X[:, 0], y)[0, 1]:.2f}") - print(f" Corr(X[:, 0], z): {np.corrcoef(X[:, 0], z)[0, 1]:.2f}") - print(f" Corr(y, z): {np.corrcoef(y, z)[0, 1]:.2f}") - - # Plot y vs z for visualization - plt.figure() - plt.scatter(y, z, alpha=0.5, label="y vs z") - plt.title(f"Scenario {scenario}: y vs z") - plt.xlabel("y") - plt.ylabel("z") - plt.legend() - plt.show() +for scenario in ["A1", "A2", "A3", "A4", "B1", "B2", "B3", "B4"]: + for edge_statistic in edge_statistics: + print(f"Scenario {scenario}:") + X, y, z = generate_scenario_data(scenario, noise_level=1, n_samples=1000, n_features=105, n_informative=10) + folder = f"simulated_data/scenario_{scenario}" + os.makedirs(folder, exist_ok=True) + np.save(f"{folder}/X.npy", X) + np.save(f"{folder}/y.npy", y) + np.save(f"{folder}/covariates.npy", z) + + + univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=[edge_statistic], + edge_selection=[PThreshold(threshold=[0.05], + correction=['fdr_by'])]) + cpm = CPMRegression(results_directory=os.path.join(folder, 'results', f'{edge_statistic}'), + cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42), + edge_selection=univariate_edge_selection, + # cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42), + add_edge_filter=True, + n_permutations=2) + cpm.estimate(X=X, y=y, covariates=z.reshape(-1, 1)) + + # Calculate explained variances + explained_variance_X_y = calculate_explained_variance(X[:,0].reshape(-1, 1), y) + explained_variance_X_z = calculate_explained_variance(X[:,0].reshape(-1, 1), z) + explained_variance_y_given_z = calculate_explained_variance_y_given_z(y, z) + explained_variance_X_y_residualized = calculate_explained_variance_y_with_residualized_X(X[:,0].reshape(-1, 1), y, z) + + print(f" Explained Variance (R^2) in y given X: {explained_variance_X_y:.2f}") + print(f" Explained Variance (R^2) in y given X controlling for z: {explained_variance_X_y_residualized:.2f}") + print(f" 
+        print(f" Explained Variance (R^2) in z given X: {explained_variance_X_z:.2f}")
+        print(f" Explained Variance (R^2) in y given z: {explained_variance_y_given_z:.2f}")
diff --git a/examples/trap_kongress.py b/examples/trap_kongress.py
deleted file mode 100644
index f0b2e19..0000000
--- a/examples/trap_kongress.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import numpy as np
-import pandas as pd
-
-from sklearn.model_selection import KFold
-
-from cpm import CPMRegression
-from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
-
-
-X = np.load('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/X.npy')
-df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv',
-                 na_values=-99)
-
-X = X[~df['Haushaltsnetto'].isna()]
-df = df[~df['Haushaltsnetto'].isna()]
-covs = df[['Alter', 'Geschlecht', 'Site']].to_numpy()
-y = df['Haushaltsnetto'].to_numpy()
-
-# define edge selection
-p_threshold = PThreshold(threshold=[0.001], correction=[None])
-p_fdr = PThreshold(threshold=[0.05], correction=['fdr_by'])
-univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'], edge_selection=[p_threshold])
-
-# run cpm regression
-cpm = CPMRegression(results_directory='/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto',
-                    cv=KFold(n_splits=10, shuffle=True, random_state=42),
-                    edge_selection=univariate_edge_selection,
-                    cv_edge_selection=KFold(n_splits=10, shuffle=True, random_state=42),
-                    add_edge_filter=True,
-                    n_permutations=1000)
-results = cpm.estimate(X=X, y=y, covariates=covs)
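
Reviewer note: the one user-facing API change in this patch is the rename of CPMRegression's `cv_edge_selection` argument to `inner_cv` (see the examples/save_config.py hunk above). As a quick orientation, a minimal usage sketch against the renamed argument, assembled from examples/example_simulated_data3.py in this patch; the results directory is a placeholder, and the `cpm` package with `simulate_regression_data_2` is assumed to be installed from this repository:

    from sklearn.model_selection import KFold, ShuffleSplit

    from cpm import CPMRegression
    from cpm.simulate_data import simulate_regression_data_2
    from cpm.edge_selection import PThreshold, UnivariateEdgeSelection

    # Simulated data as in the examples above (1225 features, e.g. the
    # upper triangle of a 50-node connectivity matrix).
    X, y, covariates = simulate_regression_data_2(n_features=1225,
                                                  n_informative_features=50,
                                                  link_type='direct_link')

    # Univariate edge selection with an FDR-corrected p-threshold.
    edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'],
                                             edge_selection=[PThreshold(threshold=[0.05],
                                                                        correction=['fdr_by'])])

    cpm = CPMRegression(results_directory='./tmp/inner_cv_demo',  # placeholder path
                        cv=KFold(n_splits=10, shuffle=True, random_state=42),
                        edge_selection=edge_selection,
                        inner_cv=ShuffleSplit(n_splits=3, test_size=0.2, random_state=42),  # was: cv_edge_selection
                        add_edge_filter=True,
                        n_permutations=100)
    cpm.estimate(X=X, y=y, covariates=covariates)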