WIP

wwu-mmll · Sep 27, 2024 · da5457e · da5457e
1 parent e587bcb
commit da5457e
Show file tree

Hide file tree

Showing 9 changed files with 190 additions and 24 deletions.
diff --git a/cpm/cpm_analysis.py b/cpm/cpm_analysis.py
@@ -162,12 +162,12 @@ def estimate(self,
         self.logger.info(f"Starting estimation with {self.n_permutations} permutations.")
 
         # Estimate models on actual data
-        self._estimate(X=X, y=y, covariates=covariates, perm_run=0)
+        self._estimate(X=X, y=np.squeeze(y), covariates=covariates, perm_run=0)
         self.logger.info("=" * 50)
 
         # Estimate models on permuted data
         for perm_id in range(1, self.n_permutations + 1):
-            self._estimate(X=X, y=y, covariates=covariates, perm_run=perm_id)
+            self._estimate(X=X, y=np.squeeze(y), covariates=covariates, perm_run=perm_id)
 
         self._calculate_permutation_results()
         self.logger.info("Estimation completed.")
@@ -396,7 +396,7 @@ def _update_predictions(self, y_pred, y_true, best_params, fold):
         preds = (pd.DataFrame.from_dict(y_pred).stack().explode().reset_index().rename(
             {'level_0': 'network', 'level_1': 'model', 0: 'y_pred'}, axis=1).set_index(['network', 'model']))
         n_network_model = ModelDict.n_models() * NetworkDict.n_networks()
-        preds['y_true'] = np.repeat(y_true, n_network_model)
+        preds['y_true'] = np.tile(y_true, n_network_model)
         preds['params'] = [best_params] * y_true.shape[0] * n_network_model
         preds['fold'] = [fold] * y_true.shape[0] * n_network_model
         return preds

diff --git a/cpm/edge_selection.py b/cpm/edge_selection.py
@@ -178,13 +178,20 @@ def _generate_config_grid(self):
         return ParameterGrid(grid_elements)
 
     def fit_transform(self, X, y=None, covariates=None):
-        _, p_values = one_sample_t_test(X, 0)
-        valid_edges = p_values < 0.05
+        t_test_filter = False
+        if t_test_filter:
+            _, p_values = one_sample_t_test(X, 0)
+            valid_edges = p_values < 0.05
+        else:
+            valid_edges = np.bool(np.ones(X.shape[1]))
+
+        #r_edges, p_edges = np.zeros(X.shape[1]), np.ones(X.shape[1])
+        #r_edges_masked, p_edges_masked = self.compute_edge_statistics(X=X[:, valid_edges], y=y, covariates=covariates)
+        #r_edges[valid_edges] = r_edges_masked
+        #p_edges[valid_edges] = p_edges_masked
+
+        r_edges, p_edges = self.compute_edge_statistics(X=X, y=y, covariates=covariates)
 
-        r_edges, p_edges = np.zeros(X.shape[1]), np.ones(X.shape[1])
-        r_edges_masked, p_edges_masked = self.compute_edge_statistics(X=X[:, valid_edges], y=y, covariates=covariates)
-        r_edges[valid_edges] = r_edges_masked
-        p_edges[valid_edges] = p_edges_masked
         edges = self.edge_selection.select(r=r_edges, p=p_edges)
         return edges
 

diff --git a/cpm/reporting/report.py b/cpm/reporting/report.py
@@ -14,7 +14,12 @@ def main():
         #st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/example_simulated_data2/'
         #st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/hannah_sad/'
         #st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/hannah_sad_pearson/'
-        st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_ctq_hc/'
+        #st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto'
+        #st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_partial'
+        #st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_05_fdr'
+        st.session_state[
+            'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/simulated_data_no_no_link/'
+
     st.session_state['df'] = pd.read_csv(os.path.join(st.session_state.results_directory, 'cv_results.csv'))
     st.session_state['df_main'] = load_results_from_folder(st.session_state.results_directory, 'cv_results_mean_std.csv')
     st.session_state['df_predictions'] = load_data_from_folder(st.session_state.results_directory, 'cv_predictions.csv')

diff --git a/cpm/simulate_data.py b/cpm/simulate_data.py
@@ -44,14 +44,14 @@ def simulate_regression_data(n_samples: int = 500,
                            random_state=random_state, shuffle=False)
 
     # Introduce some variability in features
-    X[:, np.arange(0, X.shape[1], 2)] *= -1
+    #X[:, np.arange(0, X.shape[1], 2)] *= -1
 
     # Generate initial random covariates
     covariates = np.random.rand(n_samples, n_covariates)
 
     # Define coefficients for covariates based on features to introduce correlation
     feature_to_covariate_coeff = np.random.rand(n_features, n_covariates)
-    covariates += np.dot(X, feature_to_covariate_coeff) * 0.1  # Introducing correlation with features
+    covariates += np.dot(X, feature_to_covariate_coeff) * 0.2  # Introducing correlation with features
 
     # Standardize the covariates after introducing correlation
     scaler_cov = StandardScaler()
@@ -70,3 +70,58 @@ def simulate_regression_data(n_samples: int = 500,
     y = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()
 
     return X, y, covariates
+
+
+def simulate_regression_data_2(n_samples: int = 500,
+                               n_features: int = 4950,
+                               n_informative_features: int = 200,
+                               link_type: str = 'no_link',
+                               random_state: int = 42):
+    """
+    Simulates data with specified parameters for regression problems.
+
+    This function generates a dataset consisting of features (with some informative),
+    covariates, and a target variable. The generated dataset has correlations between
+    the features, covariates, and the target variable as per input arguments.
+
+    Args:
+        n_samples (int): Number of samples in the dataset. Default is 500.
+        n_features (int): Total number of features including irrelevant ones. Default is 4950.
+        n_informative_features (int): Number of informative features affecting the target. Default is 200.
+        random_state (int): Seed value used for reproducibility. Default is 42.
+
+    Returns:
+        tuple: Tuple containing three NumPy arrays - X, y, and z representing features, target, and additional variables respectively.
+             - X: Array-like shape=(n_samples, n_features).
+             - y: Array-like shape=(n_samples, ).
+             - z: Array-like shape=(n_samples, ).
+    """
+
+    # Set seed for reproducibility
+    np.random.seed(random_state)
+
+    mu, sigma = 0, 1.0  # mean and standard deviation
+    x_rand = np.random.normal(mu, sigma, [n_samples, n_features])
+    y_rand = np.random.normal(mu, sigma, n_samples).reshape(-1, 1)
+    z_rand = np.random.normal(mu, sigma, n_samples).reshape(-1, 1)
+    noise = np.random.normal(mu, sigma, n_samples).reshape(-1, 1)
+    X = np.copy(x_rand)
+    y = np.copy(y_rand)
+
+    if link_type == 'no_link':
+        z = 1 * y_rand + z_rand
+        X[:, :n_informative_features] = x_rand[:, :n_informative_features] + z
+    elif link_type == 'no_no_link':
+        z = 0.8 * y_rand + z_rand
+    elif link_type == 'direct_link':
+        z = 0.8 * y_rand + z_rand
+        X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.2 * (noise * y_rand) + y_rand) + z
+    elif link_type == 'weak_link':
+        z = 0.3 * y_rand + z_rand
+        X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.8 * (noise * y_rand) + y_rand) + z
+    else:
+        raise NotImplemented(f'Link type {link_type} not implemented')
+
+    X[:, :n_informative_features:2] = X[:, :n_informative_features:2] * (-1)
+
+    return X, y, z
diff --git a/examples/example_simulated_data.py b/examples/example_simulated_data.py
@@ -5,20 +5,20 @@
 from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
 
 
-X, y, covariates = simulate_regression_data(n_features=1225, n_informative_features=20,
-                                            covariate_effect_size=50,
-                                            feature_effect_size=0.01,
-                                            noise_level=5)
+X, y, covariates = simulate_regression_data(n_features=1225, n_informative_features=50,
+                                            covariate_effect_size=0.2,
+                                            feature_effect_size=100,
+                                            noise_level=0.1)
 
 univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'],
-                                                    edge_selection=[PThreshold(threshold=[0.1],
+                                                    edge_selection=[PThreshold(threshold=[0.05],
                                                                                correction=[None])])
 cpm = CPMRegression(results_directory='./tmp/example_simulated_data2',
                     cv=KFold(n_splits=5, shuffle=True, random_state=42),
                     edge_selection=univariate_edge_selection,
                     #cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
                     add_edge_filter=True,
-                    n_permutations=100)
+                    n_permutations=10)
 cpm.estimate(X=X, y=y, covariates=covariates)
 
 #cpm._calculate_permutation_results('./tmp/example_simulated_data2')

diff --git a/examples/example_simulated_data2.py b/examples/example_simulated_data2.py
@@ -0,0 +1,29 @@
+from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold
+
+from cpm import CPMRegression
+from cpm.simulate_data import simulate_regression_data_2
+from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
+
+
+link_types = [#'no_link',
+              #'no_no_link',
+              'direct_link',
+              'weak_link'
+            ]
+
+for link in link_types:
+    X, y, covariates = simulate_regression_data_2(n_features=1225, n_informative_features=50, link_type=link)
+
+    univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'],
+                                                        edge_selection=[PThreshold(threshold=[0.05],
+                                                                                   correction=[None])])
+    cpm = CPMRegression(results_directory=f'./tmp/simulated_data_{link}',
+                        cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42),
+                        edge_selection=univariate_edge_selection,
+                        #cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
+                        add_edge_filter=True,
+                        n_permutations=2)
+    cpm.estimate(X=X, y=y, covariates=covariates)
+
+    #cpm._calculate_permutation_results('./tmp/example_simulated_data2')
+
diff --git a/examples/macs.py b/examples/macs.py
@@ -10,12 +10,12 @@
 #df = pd.read_csv('/spm-data/vault-data3/mmll/projects/macs_datahub_example/AnalysisReady/all/'
 #                 'FunctionalConnectome/sample.csv',
 #                 na_values=-99)
-X = np.load('/spm-data/vault-data3/mmll/projects/microbiome_mdd_eggerthella/data/AnalysisReady/sample_leon/DTI_fractional_anisotropy/X.npy')
-df = pd.read_csv('/spm-data/vault-data3/mmll/projects/microbiome_mdd_eggerthella/data/AnalysisReady/sample_leon/DTI_fractional_anisotropy/subjects.csv',
+X = np.load('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/X.npy')
+df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv',
                  na_values=-99)
 
-X = X[df['Group'] == 1]
-df = df[df['Group'] == 1]
+#X = X[df['Group'] == 1]
+#df = df[df['Group'] == 1]
 
 
 X = X[~df['CTQ_Sum'].isna()]
@@ -36,12 +36,12 @@
 univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'],
                                                     edge_selection=[p_threshold])
 
-cpm = CPMRegression(results_directory='./tmp/macs_ctq_hc',
+cpm = CPMRegression(results_directory='./tmp/macs_ctq',
                     cv=KFold(n_splits=10, shuffle=True, random_state=42),
                     edge_selection=univariate_edge_selection,
                     #cv_edge_selection=KFold(n_splits=2, shuffle=True, random_state=42),
                     add_edge_filter=True,
-                    n_permutations=1000)
+                    n_permutations=100)
 results = cpm.estimate(X=X, y=y, covariates=covs)
 #print(results)
 cpm._calculate_permutation_results()

diff --git a/examples/plot_simulation_results.py b/examples/plot_simulation_results.py
@@ -0,0 +1,39 @@
+import os
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+results_folder = "/home/nwinter/PycharmProjects/cpm_python/examples/tmp/"
+
+results = {"brain not associated with y\n brain not associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_no_link"),
+           "brain not associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_link"),
+           "brain weakly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_weak_link"),
+           "brain strongly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_direct_link"),
+
+           }
+
+dfs = []
+for link_type, folder in results.items():
+    df = pd.read_csv(os.path.join(folder, 'cv_results.csv'))
+    df['link_type'] = link_type
+    dfs.append(df)
+
+concatenated_df = pd.concat(dfs, ignore_index=True)
+
+concatenated_df = concatenated_df[concatenated_df['network'] == 'both']
+
+concatenated_df['model'] = concatenated_df['model'].replace({"covariates": "confound only", "full": "connectome + confound",
+                                 "connectome": "connectome only"})
+
+g = sns.FacetGrid(concatenated_df, col="link_type", margin_titles=True, despine=True,
+                  height=2.5)
+g.map(plt.axvline, x=0, color='grey', linewidth=0.5, zorder=-1)
+g.map(sns.violinplot, "pearson_score", "model", inner=None, split=True, hue=1, hue_order=[1, 2],
+      density_norm='count', dodge=True, palette="Blues_r")
+g.map(sns.boxplot, "pearson_score", "model", dodge=True, hue=1, hue_order=[2, 1])
+g.set_titles(col_template="{col_name}", size=7)
+g.set_xlabels("Pearson correlation", size=8)
+plt.show()
+print()
diff --git a/examples/trap_kongress.py b/examples/trap_kongress.py
@@ -0,0 +1,31 @@
+import numpy as np
+import pandas as pd
+
+from sklearn.model_selection import KFold
+
+from cpm import CPMRegression
+from cpm.edge_selection import PThreshold, UnivariateEdgeSelection
+
+
+X = np.load('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/X.npy')
+df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv',
+                 na_values=-99)
+
+X = X[~df['Haushaltsnetto'].isna()]
+df = df[~df['Haushaltsnetto'].isna()]
+covs = df[['Alter', 'Geschlecht', 'Site']].to_numpy()
+y = df['Haushaltsnetto'].to_numpy()
+
+# define edge selection
+p_threshold = PThreshold(threshold=[0.001], correction=[None])
+p_fdr = PThreshold(threshold=[0.05], correction=['fdr_by'])
+univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'], edge_selection=[p_threshold])
+
+# run cpm regression
+cpm = CPMRegression(results_directory='/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto',
+                    cv=KFold(n_splits=10, shuffle=True, random_state=42),
+                    edge_selection=univariate_edge_selection,
+                    cv_edge_selection=KFold(n_splits=10, shuffle=True, random_state=42),
+                    add_edge_filter=True,
+                    n_permutations=1000)
+results = cpm.estimate(X=X, y=y, covariates=covs)