Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
NilsWinter committed Sep 27, 2024
1 parent e587bcb commit da5457e
Show file tree
Hide file tree
Showing 9 changed files with 190 additions and 24 deletions.
6 changes: 3 additions & 3 deletions cpm/cpm_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,12 +162,12 @@ def estimate(self,
self.logger.info(f"Starting estimation with {self.n_permutations} permutations.")

# Estimate models on actual data
self._estimate(X=X, y=y, covariates=covariates, perm_run=0)
self._estimate(X=X, y=np.squeeze(y), covariates=covariates, perm_run=0)
self.logger.info("=" * 50)

# Estimate models on permuted data
for perm_id in range(1, self.n_permutations + 1):
self._estimate(X=X, y=y, covariates=covariates, perm_run=perm_id)
self._estimate(X=X, y=np.squeeze(y), covariates=covariates, perm_run=perm_id)

self._calculate_permutation_results()
self.logger.info("Estimation completed.")
Expand Down Expand Up @@ -396,7 +396,7 @@ def _update_predictions(self, y_pred, y_true, best_params, fold):
preds = (pd.DataFrame.from_dict(y_pred).stack().explode().reset_index().rename(
{'level_0': 'network', 'level_1': 'model', 0: 'y_pred'}, axis=1).set_index(['network', 'model']))
n_network_model = ModelDict.n_models() * NetworkDict.n_networks()
preds['y_true'] = np.repeat(y_true, n_network_model)
preds['y_true'] = np.tile(y_true, n_network_model)
preds['params'] = [best_params] * y_true.shape[0] * n_network_model
preds['fold'] = [fold] * y_true.shape[0] * n_network_model
return preds
Expand Down
19 changes: 13 additions & 6 deletions cpm/edge_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,13 +178,20 @@ def _generate_config_grid(self):
return ParameterGrid(grid_elements)

def fit_transform(self, X, y=None, covariates=None):
_, p_values = one_sample_t_test(X, 0)
valid_edges = p_values < 0.05
t_test_filter = False
if t_test_filter:
_, p_values = one_sample_t_test(X, 0)
valid_edges = p_values < 0.05
else:
valid_edges = np.bool(np.ones(X.shape[1]))

#r_edges, p_edges = np.zeros(X.shape[1]), np.ones(X.shape[1])
#r_edges_masked, p_edges_masked = self.compute_edge_statistics(X=X[:, valid_edges], y=y, covariates=covariates)
#r_edges[valid_edges] = r_edges_masked
#p_edges[valid_edges] = p_edges_masked

r_edges, p_edges = self.compute_edge_statistics(X=X, y=y, covariates=covariates)

r_edges, p_edges = np.zeros(X.shape[1]), np.ones(X.shape[1])
r_edges_masked, p_edges_masked = self.compute_edge_statistics(X=X[:, valid_edges], y=y, covariates=covariates)
r_edges[valid_edges] = r_edges_masked
p_edges[valid_edges] = p_edges_masked
edges = self.edge_selection.select(r=r_edges, p=p_edges)
return edges

Expand Down
7 changes: 6 additions & 1 deletion cpm/reporting/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ def main():
#st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/example_simulated_data2/'
#st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/hannah_sad/'
#st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/hannah_sad_pearson/'
st.session_state['results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/macs_ctq_hc/'
#st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto'
#st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_partial'
#st.session_state['results_directory'] = '/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto_05_fdr'
st.session_state[
'results_directory'] = '/home/nwinter/PycharmProjects/cpm_python/examples/tmp/simulated_data_no_no_link/'

st.session_state['df'] = pd.read_csv(os.path.join(st.session_state.results_directory, 'cv_results.csv'))
st.session_state['df_main'] = load_results_from_folder(st.session_state.results_directory, 'cv_results_mean_std.csv')
st.session_state['df_predictions'] = load_data_from_folder(st.session_state.results_directory, 'cv_predictions.csv')
Expand Down
59 changes: 57 additions & 2 deletions cpm/simulate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ def simulate_regression_data(n_samples: int = 500,
random_state=random_state, shuffle=False)

# Introduce some variability in features
X[:, np.arange(0, X.shape[1], 2)] *= -1
#X[:, np.arange(0, X.shape[1], 2)] *= -1

# Generate initial random covariates
covariates = np.random.rand(n_samples, n_covariates)

# Define coefficients for covariates based on features to introduce correlation
feature_to_covariate_coeff = np.random.rand(n_features, n_covariates)
covariates += np.dot(X, feature_to_covariate_coeff) * 0.1 # Introducing correlation with features
covariates += np.dot(X, feature_to_covariate_coeff) * 0.2 # Introducing correlation with features

# Standardize the covariates after introducing correlation
scaler_cov = StandardScaler()
Expand All @@ -70,3 +70,58 @@ def simulate_regression_data(n_samples: int = 500,
y = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

return X, y, covariates


def simulate_regression_data_2(n_samples: int = 500,
n_features: int = 4950,
n_informative_features: int = 200,
link_type: str = 'no_link',
random_state: int = 42):
"""
Simulates data with specified parameters for regression problems.
This function generates a dataset consisting of features (with some informative),
covariates, and a target variable. The generated dataset has correlations between
the features, covariates, and the target variable as per input arguments.
Args:
n_samples (int): Number of samples in the dataset. Default is 500.
n_features (int): Total number of features including irrelevant ones. Default is 4950.
n_informative_features (int): Number of informative features affecting the target. Default is 200.
random_state (int): Seed value used for reproducibility. Default is 42.
Returns:
tuple: Tuple containing three NumPy arrays - X, y, and z representing features, target, and additional variables respectively.
- X: Array-like shape=(n_samples, n_features).
- y: Array-like shape=(n_samples, ).
- z: Array-like shape=(n_samples, ).
"""

# Set seed for reproducibility
np.random.seed(random_state)

mu, sigma = 0, 1.0 # mean and standard deviation
x_rand = np.random.normal(mu, sigma, [n_samples, n_features])
y_rand = np.random.normal(mu, sigma, n_samples).reshape(-1, 1)
z_rand = np.random.normal(mu, sigma, n_samples).reshape(-1, 1)
noise = np.random.normal(mu, sigma, n_samples).reshape(-1, 1)
X = np.copy(x_rand)
y = np.copy(y_rand)

if link_type == 'no_link':
z = 1 * y_rand + z_rand
X[:, :n_informative_features] = x_rand[:, :n_informative_features] + z
elif link_type == 'no_no_link':
z = 0.8 * y_rand + z_rand
elif link_type == 'direct_link':
z = 0.8 * y_rand + z_rand
X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.2 * (noise * y_rand) + y_rand) + z
elif link_type == 'weak_link':
z = 0.3 * y_rand + z_rand
X[:, :n_informative_features] = x_rand[:, :n_informative_features] + (0.8 * (noise * y_rand) + y_rand) + z
else:
raise NotImplemented(f'Link type {link_type} not implemented')

X[:, :n_informative_features:2] = X[:, :n_informative_features:2] * (-1)

return X, y, z
12 changes: 6 additions & 6 deletions examples/example_simulated_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@
from cpm.edge_selection import PThreshold, UnivariateEdgeSelection


X, y, covariates = simulate_regression_data(n_features=1225, n_informative_features=20,
covariate_effect_size=50,
feature_effect_size=0.01,
noise_level=5)
X, y, covariates = simulate_regression_data(n_features=1225, n_informative_features=50,
covariate_effect_size=0.2,
feature_effect_size=100,
noise_level=0.1)

univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'],
edge_selection=[PThreshold(threshold=[0.1],
edge_selection=[PThreshold(threshold=[0.05],
correction=[None])])
cpm = CPMRegression(results_directory='./tmp/example_simulated_data2',
cv=KFold(n_splits=5, shuffle=True, random_state=42),
edge_selection=univariate_edge_selection,
#cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
add_edge_filter=True,
n_permutations=100)
n_permutations=10)
cpm.estimate(X=X, y=y, covariates=covariates)

#cpm._calculate_permutation_results('./tmp/example_simulated_data2')
Expand Down
29 changes: 29 additions & 0 deletions examples/example_simulated_data2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from sklearn.model_selection import KFold, ShuffleSplit, RepeatedKFold

from cpm import CPMRegression
from cpm.simulate_data import simulate_regression_data_2
from cpm.edge_selection import PThreshold, UnivariateEdgeSelection


link_types = [#'no_link',
#'no_no_link',
'direct_link',
'weak_link'
]

for link in link_types:
X, y, covariates = simulate_regression_data_2(n_features=1225, n_informative_features=50, link_type=link)

univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson'],
edge_selection=[PThreshold(threshold=[0.05],
correction=[None])])
cpm = CPMRegression(results_directory=f'./tmp/simulated_data_{link}',
cv=RepeatedKFold(n_splits=10, n_repeats=5, random_state=42),
edge_selection=univariate_edge_selection,
#cv_edge_selection=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
add_edge_filter=True,
n_permutations=2)
cpm.estimate(X=X, y=y, covariates=covariates)

#cpm._calculate_permutation_results('./tmp/example_simulated_data2')

12 changes: 6 additions & 6 deletions examples/macs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
#df = pd.read_csv('/spm-data/vault-data3/mmll/projects/macs_datahub_example/AnalysisReady/all/'
# 'FunctionalConnectome/sample.csv',
# na_values=-99)
X = np.load('/spm-data/vault-data3/mmll/projects/microbiome_mdd_eggerthella/data/AnalysisReady/sample_leon/DTI_fractional_anisotropy/X.npy')
df = pd.read_csv('/spm-data/vault-data3/mmll/projects/microbiome_mdd_eggerthella/data/AnalysisReady/sample_leon/DTI_fractional_anisotropy/subjects.csv',
X = np.load('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/X.npy')
df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv',
na_values=-99)

X = X[df['Group'] == 1]
df = df[df['Group'] == 1]
#X = X[df['Group'] == 1]
#df = df[df['Group'] == 1]


X = X[~df['CTQ_Sum'].isna()]
Expand All @@ -36,12 +36,12 @@
univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'],
edge_selection=[p_threshold])

cpm = CPMRegression(results_directory='./tmp/macs_ctq_hc',
cpm = CPMRegression(results_directory='./tmp/macs_ctq',
cv=KFold(n_splits=10, shuffle=True, random_state=42),
edge_selection=univariate_edge_selection,
#cv_edge_selection=KFold(n_splits=2, shuffle=True, random_state=42),
add_edge_filter=True,
n_permutations=1000)
n_permutations=100)
results = cpm.estimate(X=X, y=y, covariates=covs)
#print(results)
cpm._calculate_permutation_results()
Expand Down
39 changes: 39 additions & 0 deletions examples/plot_simulation_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


results_folder = "/home/nwinter/PycharmProjects/cpm_python/examples/tmp/"

results = {"brain not associated with y\n brain not associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_no_link"),
"brain not associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_no_link"),
"brain weakly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_weak_link"),
"brain strongly associated with y\n brain associated with confound\n confound associated with y": os.path.join(results_folder, "simulated_data_direct_link"),

}

dfs = []
for link_type, folder in results.items():
df = pd.read_csv(os.path.join(folder, 'cv_results.csv'))
df['link_type'] = link_type
dfs.append(df)

concatenated_df = pd.concat(dfs, ignore_index=True)

concatenated_df = concatenated_df[concatenated_df['network'] == 'both']

concatenated_df['model'] = concatenated_df['model'].replace({"covariates": "confound only", "full": "connectome + confound",
"connectome": "connectome only"})

g = sns.FacetGrid(concatenated_df, col="link_type", margin_titles=True, despine=True,
height=2.5)
g.map(plt.axvline, x=0, color='grey', linewidth=0.5, zorder=-1)
g.map(sns.violinplot, "pearson_score", "model", inner=None, split=True, hue=1, hue_order=[1, 2],
density_norm='count', dodge=True, palette="Blues_r")
g.map(sns.boxplot, "pearson_score", "model", dodge=True, hue=1, hue_order=[2, 1])
g.set_titles(col_template="{col_name}", size=7)
g.set_xlabels("Pearson correlation", size=8)
plt.show()
print()
31 changes: 31 additions & 0 deletions examples/trap_kongress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold

from cpm import CPMRegression
from cpm.edge_selection import PThreshold, UnivariateEdgeSelection


X = np.load('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/X.npy')
df = pd.read_csv('/spm-data/vault-data3/mmll/projects/cpm_macs_example/datahub/AnalysisReady/hc/DTI_fractional_anisotropy/subjects.csv',
na_values=-99)

X = X[~df['Haushaltsnetto'].isna()]
df = df[~df['Haushaltsnetto'].isna()]
covs = df[['Alter', 'Geschlecht', 'Site']].to_numpy()
y = df['Haushaltsnetto'].to_numpy()

# define edge selection
p_threshold = PThreshold(threshold=[0.001], correction=[None])
p_fdr = PThreshold(threshold=[0.05], correction=['fdr_by'])
univariate_edge_selection = UnivariateEdgeSelection(edge_statistic=['pearson_partial'], edge_selection=[p_threshold])

# run cpm regression
cpm = CPMRegression(results_directory='/spm-data/vault-data3/mmll/projects/cpm_macs_example/haushaltsnetto',
cv=KFold(n_splits=10, shuffle=True, random_state=42),
edge_selection=univariate_edge_selection,
cv_edge_selection=KFold(n_splits=10, shuffle=True, random_state=42),
add_edge_filter=True,
n_permutations=1000)
results = cpm.estimate(X=X, y=y, covariates=covs)

0 comments on commit da5457e

Please sign in to comment.