
Commit

Add lots of documentation
NilsWinter committed Dec 10, 2024
1 parent e3d4fa3 commit 751bcc9
Showing 6 changed files with 347 additions and 61 deletions.
40 changes: 27 additions & 13 deletions cpm/cpm_analysis.py
@@ -41,13 +41,22 @@ def __init__(self,
"""
Initialize the CPMRegression object.
:param results_directory: Directory to save results.
:param cv: Outer cross-validation strategy.
:param inner_cv: Inner cross-validation strategy for edge selection.
:param edge_selection: Method for edge selection.
:param impute_missing_values: Whether to impute missing values.
:param n_permutations: Number of permutations to run for permutation testing.
:param atlas_labels: CSV file containing atlas and regions labels.
Parameters
----------
results_directory: str
Directory to save results.
cv: Union[BaseCrossValidator, BaseShuffleSplit]
Outer cross-validation strategy.
inner_cv: Union[BaseCrossValidator, BaseShuffleSplit]
Inner cross-validation strategy for edge selection.
edge_selection: UnivariateEdgeSelection
Method for edge selection.
impute_missing_values: bool
Whether to impute missing values.
n_permutations: int
Number of permutations to run for permutation testing.
atlas_labels: str
CSV file containing atlas and region labels.
"""
self.results_directory = results_directory
self.cv = cv
@@ -94,9 +103,11 @@ def _copy_atlas_labels(self):

def save_configuration(self, config_filename: str):
"""
Save the current configuration to a file.
Saves the current configuration settings to a file in Pickle format. All attributes related to the configuration of the object
are serialized and stored in a file with the same base name as the provided filename, but with a .pkl extension.
:param config_filename: Path to the configuration file.
:param config_filename: The base name of the file where the configuration will be saved.
:return: None
"""
config_path = os.path.splitext(config_filename)[0] + '.pkl'
config_data = {
@@ -138,11 +149,14 @@ def estimate(self,
y: Union[pd.Series, pd.DataFrame, np.ndarray],
covariates: Union[pd.Series, pd.DataFrame, np.ndarray]):
"""
Estimate the CPM Regression models and run permutation tests.
Estimates a model using the provided data and conducts permutation testing. This method first fits the model to the actual data and subsequently performs estimation on permuted data for a specified number of permutations. Finally, it calculates permutation results.
Parameters
----------
X: Feature data used for the model. Can be a pandas DataFrame or a NumPy array.
y: Target variable used in the estimation process. Can be a pandas Series, DataFrame, or a NumPy array.
covariates: Additional covariate data to include in the model. Can be a pandas Series, DataFrame, or a NumPy array.
:param X: Features (predictors).
:param y: Labels (target variable).
:param covariates: Covariates to control for.
"""
self.logger.info(f"Starting estimation with {self.n_permutations} permutations.")

69 changes: 69 additions & 0 deletions cpm/models.py
@@ -23,12 +23,62 @@ def n_models():


class LinearCPMModel:
"""
Linear Connectome-based Predictive Modeling (CPM) implementation.
This class implements a linear CPM model, allowing for fitting and prediction
based on connectome data, covariates, and residuals.
Attributes
----------
models : ModelDict
A dictionary containing the fitted models for different networks and data types
(connectome, covariates, residuals, and full model).
models_residuals : dict
A dictionary storing linear regression models used to calculate residuals
for connectome data, controlling for covariates.
edges : dict
A dictionary defining the edges (features) used for each network (e.g., 'positive', 'negative').
Parameters
----------
edges : dict
Dictionary containing indices of edges for 'positive' and 'negative' networks.
"""
def __init__(self, edges):
"""
Initialize the LinearCPMModel.
Parameters
----------
edges : dict
Dictionary containing indices of edges for 'positive' and 'negative' networks.
"""
self.models = ModelDict()
self.models_residuals = {}
self.edges = edges

def fit(self, X, y, covariates):
"""
Fit the CPM model.
This method fits multiple linear regression models for the connectome, covariates,
residuals, and full model using the provided data.
Parameters
----------
X : numpy.ndarray
A 2D array of shape (n_samples, n_features) representing the connectome data.
y : numpy.ndarray
A 1D array of shape (n_samples,) representing the target variable.
covariates : numpy.ndarray
A 2D array of shape (n_samples, n_covariates) representing the covariates.
Returns
-------
LinearCPMModel
The fitted CPM model instance.
"""
connectome = {}
residuals = {}
for network in ['positive', 'negative']:
@@ -49,6 +99,25 @@ def fit(self, X, y, covariates):
return self

def predict(self, X, covariates):
"""
Predict using the fitted CPM model.
This method generates predictions for the target variable using the
connectome, covariates, residuals, and full models.
Parameters
----------
X : numpy.ndarray
A 2D array of shape (n_samples, n_features) representing the connectome data.
covariates : numpy.ndarray
A 2D array of shape (n_samples, n_covariates) representing the covariates.
Returns
-------
ModelDict
A dictionary containing predictions for each network and model type
(connectome, covariates, residuals, and full model).
"""
connectome = {}
residuals = {}
for network in ['positive', 'negative']:
123 changes: 120 additions & 3 deletions documentation/docs/getting_started.md
@@ -1,4 +1,121 @@
# Getting started
# Getting Started

In order to get started, you will need connectivity matrices or data that is already in graph form.
For this introduction we will assume that you are using connectivity matrices.
This guide will help you get started with running an analysis using the `CPMRegression` class. It provides a step-by-step description of how to set up, configure, and execute an analysis, along with explanations of the inputs and parameters.

---

## Step 1: Prepare Your Data

To run an analysis, you need the following inputs:

- **Connectome Data (`X`)**: A 2D array (numpy array or pandas DataFrame) of shape `(n_samples, n_features)` containing connectome edge values for each subject.
- **Target Variable (`y`)**: A 1D array or pandas Series of shape `(n_samples,)` containing the outcome variable (e.g., clinical scores, behavioral measures).
- **Covariates**: A 2D array or pandas DataFrame of shape `(n_samples, n_covariates)` containing variables to control for (e.g., age, sex).

Ensure that all inputs have consistent sample sizes (`n_samples`).
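
A minimal sketch of what these inputs might look like, using synthetic numpy/pandas data purely for illustration (shapes and variable names are placeholders, not toolbox requirements):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n_samples, n_features, n_covariates = 100, 4950, 2  # e.g., 4950 edges of a 100-node connectome

# Connectome edge values, one row per subject
X = rng.normal(size=(n_samples, n_features))

# Target variable, e.g., a clinical score
y = pd.Series(rng.normal(size=n_samples), name="score")

# Covariates to control for, e.g., age and sex
covariates = pd.DataFrame({
    "age": rng.integers(18, 65, size=n_samples),
    "sex": rng.integers(0, 2, size=n_samples),
})

assert X.shape[0] == len(y) == len(covariates)  # consistent sample sizes
```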

---

## Step 2: Configure the Analysis

### **Cross-Validation**
The `CPMRegression` class uses an outer cross-validation loop for performance evaluation and an optional inner cross-validation loop for hyperparameter optimization.

- **Outer CV (`cv`)**: Defines the cross-validation strategy (e.g., `KFold`).
- **Inner CV (`inner_cv`)**: Used for optimizing hyperparameters during edge selection. Can be left as `None` if not needed.

Example:

```python
from sklearn.model_selection import KFold

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Optional inner loop for hyperparameter optimization; use inner_cv = None to skip it
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
```

### Edge Selection
The toolbox implements univariate edge selection, allowing users to specify the method for evaluating and selecting edges based on statistical tests.

#### Edge Statistics
Choose from the following methods for computing edge statistics:

- **pearson**: Pearson correlation
- **pearson_partial**: Pearson partial correlation (controlling for covariates)
- **spearman**: Spearman rank correlation
- **spearman_partial**: Spearman partial correlation (controlling for covariates)

#### p-Thresholds
- Set a single value (e.g., 0.05) or provide multiple values (e.g., [0.01, 0.05, 0.1]).
- If multiple thresholds are specified, the toolbox will optimize for the best p-threshold during inner cross-validation.

#### FDR Correction
- Optional FDR correction for multiple comparisons can be applied using `correction='fdr_by'`.


Example:

```python
from cpm.edge_selection import UnivariateEdgeSelection, PThreshold

edge_statistic = 'pearson'
univariate_edge_selection = UnivariateEdgeSelection(
edge_statistic=[edge_statistic],
edge_selection=[PThreshold(threshold=[0.05], correction=['fdr_by'])]
)
```
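
If you want the inner cross-validation to choose among several p-thresholds, the same interface shown above should accept a list of candidate values (a sketch based on the parameters described in this guide):

```python
from cpm.edge_selection import UnivariateEdgeSelection, PThreshold

# Several candidate thresholds; the best one is selected during inner cross-validation
univariate_edge_selection = UnivariateEdgeSelection(
    edge_statistic=['pearson'],
    edge_selection=[PThreshold(threshold=[0.01, 0.05, 0.1], correction=['fdr_by'])]
)
```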

## Step 3: Set Up the CPMRegression Object
Create an instance of the `CPMRegression` class with the required inputs:

```python
from cpm.cpm_analysis import CPMRegression

cpm = CPMRegression(
results_directory="results/",
cv=outer_cv,
inner_cv=inner_cv, # Optional
edge_selection=univariate_edge_selection,
select_stable_edges=True,
stability_threshold=0.8,
impute_missing_values=True,
n_permutations=100
)
```
### Key Parameters
- **results_directory**: Directory where results will be saved.
- **cv**: Outer cross-validation strategy.
- **inner_cv**: Inner cross-validation strategy for hyperparameter optimization (optional).
- **edge_selection**: Configuration for univariate edge selection.
- **select_stable_edges**: Whether to select stable edges across folds (True or False).
- **stability_threshold**: Minimum proportion of folds in which an edge must be selected to be considered stable.
- **impute_missing_values**: Whether to impute missing values (True or False).
- **n_permutations**: Number of permutations for permutation testing.

## Step 4: Run the Analysis
Call the `estimate` method to perform the analysis:

```python
X = ... # Load your connectome data (numpy array or pandas DataFrame)
y = ... # Load your target variable (numpy array or pandas Series)
covariates = ... # Load your covariates (numpy array or pandas DataFrame)

cpm.estimate(X=X, y=y, covariates=covariates)
```

This will:

1. Perform edge selection based on the specified method and thresholds.
2. Train and evaluate models for each cross-validation fold.
3. Save results, including predictions, metrics, and permutation-based significance tests, to the `results_directory`.


## Step 5: Review Results
After the analysis, you can find the results in the `results_directory`, including:

- Cross-validation metrics (e.g., mean absolute error, R²).
- Model predictions for each fold.
- Edge stability and significance.

You can load and inspect these results for further analysis.
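
For example, assuming the cross-validation metrics are written as a CSV file inside the results directory (the exact file names depend on the toolbox version; `cv_metrics.csv` below is only a placeholder), they can be inspected with pandas:

```python
import os
import pandas as pd

results_directory = "results/"

# Hypothetical file name; check your results directory for the actual output files
metrics = pd.read_csv(os.path.join(results_directory, "cv_metrics.csv"))
print(metrics.describe())
```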

---
By following these steps, you can quickly set up and execute a connectome-based predictive modeling analysis using the CPMRegression class. For further customization, refer to the API documentation.
72 changes: 60 additions & 12 deletions documentation/docs/index.md
@@ -1,18 +1,66 @@
# Connectome-Based Predictive Modeling (CPM)
![Logo](assets/img/CCCPM_medium.png)
# Confound-Corrected Connectome-Based Predictive Modeling (CCCPM)
[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/wwu-mmll/cpm_python/cpm/python-test)](https://github.com/wwu-mmll/cpm_python/actions)
[![Coverage Status](https://coveralls.io/repos/github/wwu-mmll/cpm_python/badge.svg?branch=main)](https://coveralls.io/github/wwu-mmll/cpm_python?branch=main)
[![Github Contributors](https://img.shields.io/github/contributors-anon/wwu-mmll/cpm_python?color=blue)](https://github.com/wwu-mmll/cpm_python/graphs/contributors)
[![Github Commits](https://img.shields.io/github/commit-activity/y/wwu-mmll/cpm_python)](https://github.com/wwu-mmll/cpm_python/commits/main)

`cpm_model` is a Python package for connectome-based predictive modeling. This package provides tools for preprocessing, feature extraction, model training, and evaluation specific to connectome data.
CCCPM is a newly developed Python toolbox designed specifically for researchers in psychiatry and neuroscience to
perform connectome-based predictive modeling. This package offers a comprehensive framework for building predictive
models from structural and functional connectome data, with a strong focus on methodological rigor, interpretability,
confound control, and statistical robustness.

## Features
---
## Background

- **Preprocessing**: Load and clean connectome data.
- **Feature Extraction**: Extract features from connectomes.
- **Model Training**: Train predictive models on connectome data.
- **Evaluation**: Evaluate model performance.
Network-based approaches are increasingly recognized as essential for understanding the complex relationships in brain connectivity that underlie behavior, cognition, and mental health. In psychiatry and neuroscience, analyzing structural and functional networks can reveal patterns associated with mental disorders, support individualized predictions, and improve our understanding of brain function. However, these analyses require robust tools that account for the unique challenges of connectome data, such as high dimensionality, variability, and the influence of confounding factors.

## Installation
Despite the growing importance of connectome-based predictive modeling (CPM), there is currently no fully developed software package for performing these analyses. Existing options are limited to a few MATLAB scripts, which lack the flexibility, transparency, and rigor required to foster replicable research. CCCPM addresses this gap by providing a Python-based, flexible, and rigorously designed toolbox that encourages replicable analyses while allowing researchers to tailor their workflows to specific research questions.

You can install the package using pip:
---

```sh
pip install git+https://github.com/mmll/cpm_python.git
```
## Overview

CCCPM was developed to address key challenges in connectome-based analyses, including optimizing model hyperparameters, controlling for confounding variables, and assessing the reliability of selected network features. This toolbox introduces novel methods, such as stability metrics for selected edges, and integrates well-established practices like nested cross-validation and permutation-based significance testing. By doing so, CCCPM provides a powerful and transparent tool for researchers aiming to explore brain networks' contributions to predictive models.

### Key Features

- **Hyperparameter Optimization**: Fine-tune model parameters, such as p-thresholds for edge selection, to achieve better predictive performance.
- **Confound Adjustment**: Use partial correlation methods during edge selection to rigorously control for covariates and confounding variables.
- **Residualization**: Remove the influence of confounds from connectome strengths to ensure cleaner data inputs.
- **Statistical Validation**: Assess model and edge-level significance using permutation-based testing, ensuring that findings are statistically robust.
- **Stability Metrics**: Evaluate the reliability of selected edges across iterations, improving the interpretability and reproducibility of identified networks.
- **Model Increment Analysis**: Quantify the unique contribution of connectome data to predictive models, helping to clarify their added value in prediction tasks.

---

## Why CCCPM?

Unlike existing CPM implementations, which are limited in scope and flexibility, CCCPM is designed to foster rigorous and replicable research. Its Python-based architecture ensures accessibility and compatibility with modern data science workflows, while its features address the specific challenges of connectome-based analyses. By offering a robust and transparent framework, CCCPM enables researchers to conduct analyses that are not only flexible and customizable but also reproducible and scientifically sound.

---

## Features in Detail

### **Data Imputation**
CCCPM includes methods to handle missing data effectively, ensuring that datasets with incomplete connectome information can still be utilized without introducing biases.
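
As a generic illustration of the concept (not necessarily the strategy CCCPM uses internally), missing edge values could be replaced by the per-edge mean across subjects:

```python
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[0.2, np.nan, 0.5],
              [0.1, 0.4,    0.6],
              [np.nan, 0.3, 0.7]])

# Replace missing edge values with the per-edge mean across subjects
X_imputed = SimpleImputer(strategy="mean").fit_transform(X)
```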

### **Nested Cross-Validation**
A nested cross-validation scheme is implemented to separate hyperparameter tuning from model evaluation. This ensures that the reported model performance is unbiased and reflects its true generalization capability.
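
In scikit-learn terms this corresponds to an outer loop for performance estimation and an inner loop for hyperparameter selection; the following is a generic sketch of the pattern, not the toolbox's internal code:

```python
import numpy as np
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X, y = rng.normal(size=(60, 10)), rng.normal(size=60)
outer_cv, inner_cv = KFold(n_splits=5), KFold(n_splits=3)
candidate_thresholds = [0.01, 0.05, 0.1]

for train_idx, test_idx in outer_cv.split(X):
    X_train, y_train = X[train_idx], y[train_idx]
    # Inner loop: score each candidate p-threshold using only the training fold
    mean_scores = []
    for threshold in candidate_thresholds:
        scores = []
        for inner_train, inner_val in inner_cv.split(X_train):
            score = 0.0  # placeholder: fit with `threshold` on inner_train, evaluate on inner_val
            scores.append(score)
        mean_scores.append(np.mean(scores))
    best_threshold = candidate_thresholds[int(np.argmax(mean_scores))]
    # Outer loop: refit with best_threshold on the full training fold, evaluate on test_idx
```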

### **Threshold Optimization**
The toolbox automates the optimization of p-thresholds, which determine which edges in the connectome are selected for model building. This allows researchers to identify thresholds that balance performance and interpretability.

### **Confound Adjustment**
By implementing partial correlations, CCCPM allows researchers to account for confounding variables during edge selection, ensuring that identified networks represent genuine relationships rather than artifacts.
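
To illustrate the idea (this is not CCCPM's internal implementation), a Pearson partial correlation between a single edge and the target, controlling for covariates, can be computed by regressing the covariates out of both variables and correlating the residuals:

```python
import numpy as np
from scipy.stats import pearsonr

def partial_pearson(edge, y, covariates):
    """Correlate edge and y after removing the linear effect of the covariates."""
    C = np.column_stack([np.ones(len(y)), covariates])  # design matrix with intercept
    edge_res = edge - C @ np.linalg.lstsq(C, edge, rcond=None)[0]
    y_res = y - C @ np.linalg.lstsq(C, y, rcond=None)[0]
    return pearsonr(edge_res, y_res)

rng = np.random.default_rng(0)
edge, y = rng.normal(size=50), rng.normal(size=50)
covariates = rng.normal(size=(50, 2))
r, p = partial_pearson(edge, y, covariates)
```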

### **Statistical Significance**
Permutation-based testing is provided to evaluate the significance of both model performance and selected edges, adding rigor to findings and reducing the risk of false-positive results.
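
Conceptually, the permutation p-value compares the score obtained on the real data with the distribution of scores obtained after shuffling the target; a minimal sketch (again an illustration, not the toolbox's exact routine) for a score where higher is better:

```python
import numpy as np

def permutation_p_value(true_score, permuted_scores):
    """Fraction of permuted scores at least as good as the true score (plus-one correction)."""
    permuted_scores = np.asarray(permuted_scores)
    return (np.sum(permuted_scores >= true_score) + 1) / (len(permuted_scores) + 1)

# e.g., a true correlation of 0.35 compared against 100 permutation scores
p = permutation_p_value(0.35, np.random.default_rng(0).normal(0.0, 0.1, size=100))
```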

### **Edge Stability**
CCCPM introduces a stability metric for selected edges, helping researchers evaluate the consistency of their findings across multiple iterations. This enhances the reliability of results and their potential for replication.
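
Assuming stability is defined as the fraction of cross-validation folds in which an edge is selected (an assumption made for this sketch), the metric can be expressed as:

```python
import numpy as np

# selected_edges[f, e] is True if edge e was selected in fold f
selected_edges = np.array([
    [True,  True,  False, True],
    [True,  False, False, True],
    [True,  True,  False, True],
])

stability = selected_edges.mean(axis=0)       # per-edge selection frequency across folds
stable_edges = np.where(stability >= 0.8)[0]  # e.g., edges selected in at least 80% of folds
```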

### **Model Increment Analysis**
Assess the added predictive value of connectome data by calculating the incremental contribution of network features to overall model performance.
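
One way to express this increment is the difference in explained variance between the full model (connectome plus covariates) and a covariates-only model; the sketch below uses scikit-learn's `r2_score` as an illustration, not the toolbox's own routine:

```python
from sklearn.metrics import r2_score

def connectome_increment(y_true, y_pred_full, y_pred_covariates_only):
    """Added explained variance of the connectome over the covariates alone."""
    return r2_score(y_true, y_pred_full) - r2_score(y_true, y_pred_covariates_only)

# Example: the full model explains more variance than the covariates alone
increment = connectome_increment([1.0, 2.0, 3.0, 4.0],
                                 [1.1, 1.9, 3.2, 3.8],
                                 [2.0, 2.2, 2.5, 3.0])
```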

---
