
Commit

Add lots of documentation
NilsWinter committed Dec 10, 2024
1 parent e3d4fa3 commit 751bcc9
Showing 6 changed files with 347 additions and 61 deletions.
40 changes: 27 additions & 13 deletions cpm/cpm_analysis.py
@@ -41,13 +41,22 @@ def __init__(self,
"""
Initialize the CPMRegression object.
:param results_directory: Directory to save results.
:param cv: Outer cross-validation strategy.
:param inner_cv: Inner cross-validation strategy for edge selection.
:param edge_selection: Method for edge selection.
:param impute_missing_values: Whether to impute missing values.
:param n_permutations: Number of permutations to run for permutation testing.
:param atlas_labels: CSV file containing atlas and regions labels.
Parameters
----------
results_directory: str
Directory to save results.
cv: Union[BaseCrossValidator, BaseShuffleSplit]
Outer cross-validation strategy.
inner_cv: Union[BaseCrossValidator, BaseShuffleSplit]
Inner cross-validation strategy for edge selection.
edge_selection: UnivariateEdgeSelection
Method for edge selection.
impute_missing_values: bool
Whether to impute missing values.
n_permutations: int
Number of permutations to run for permutation testing.
atlas_labels: str
CSV file containing atlas and region labels.
"""
self.results_directory = results_directory
self.cv = cv
@@ -94,9 +103,11 @@ def _copy_atlas_labels(self):

def save_configuration(self, config_filename: str):
"""
Save the current configuration to a file.
Saves the current configuration settings to a file in Pickle format. All attributes related to the configuration of the object
are serialized and stored in a file with the same base name as the provided filename, but with a .pkl extension.
:param config_filename: Path to the configuration file.
:param config_filename: The base name of the file where the configuration will be saved.
:return: None
"""
config_path = os.path.splitext(config_filename)[0] + '.pkl'
config_data = {
@@ -138,11 +149,14 @@ def estimate(self,
y: Union[pd.Series, pd.DataFrame, np.ndarray],
covariates: Union[pd.Series, pd.DataFrame, np.ndarray]):
"""
Estimate the CPM Regression models and run permutation tests.
Estimates a model using the provided data and conducts permutation testing. This method first fits the model to the actual data and subsequently performs estimation on permuted data for a specified number of permutations. Finally, it calculates permutation results.
Parameters
----------
X: Feature data used for the model. Can be a pandas DataFrame or a NumPy array.
y: Target variable used in the estimation process. Can be a pandas Series, DataFrame, or a NumPy array.
covariates: Additional covariate data to include in the model. Can be a pandas Series, DataFrame, or a NumPy array.
:param X: Features (predictors).
:param y: Labels (target variable).
:param covariates: Covariates to control for.
"""
self.logger.info(f"Starting estimation with {self.n_permutations} permutations.")

69 changes: 69 additions & 0 deletions cpm/models.py
@@ -23,12 +23,62 @@ def n_models():


class LinearCPMModel:
"""
Linear Connectome-based Predictive Modeling (CPM) implementation.
This class implements a linear CPM model, allowing for fitting and prediction
based on connectome data, covariates, and residuals.
Attributes
----------
models : ModelDict
A dictionary containing the fitted models for different networks and data types
(connectome, covariates, residuals, and full model).
models_residuals : dict
A dictionary storing linear regression models used to calculate residuals
for connectome data, controlling for covariates.
edges : dict
A dictionary defining the edges (features) used for each network (e.g., 'positive', 'negative').
Parameters
----------
edges : dict
Dictionary containing indices of edges for 'positive' and 'negative' networks.
"""
def __init__(self, edges):
"""
Initialize the LinearCPMModel.
Parameters
----------
edges : dict
Dictionary containing indices of edges for 'positive' and 'negative' networks.
"""
self.models = ModelDict()
self.models_residuals = {}
self.edges = edges

def fit(self, X, y, covariates):
"""
Fit the CPM model.
This method fits multiple linear regression models for the connectome, covariates,
residuals, and full model using the provided data.
Parameters
----------
X : numpy.ndarray
A 2D array of shape (n_samples, n_features) representing the connectome data.
y : numpy.ndarray
A 1D array of shape (n_samples,) representing the target variable.
covariates : numpy.ndarray
A 2D array of shape (n_samples, n_covariates) representing the covariates.
Returns
-------
LinearCPMModel
The fitted CPM model instance.
"""
connectome = {}
residuals = {}
for network in ['positive', 'negative']:
@@ -49,6 +99,25 @@ def fit(self, X, y, covariates):
return self

def predict(self, X, covariates):
"""
Predict using the fitted CPM model.
This method generates predictions for the target variable using the
connectome, covariates, residuals, and full models.
Parameters
----------
X : numpy.ndarray
A 2D array of shape (n_samples, n_features) representing the connectome data.
covariates : numpy.ndarray
A 2D array of shape (n_samples, n_covariates) representing the covariates.
Returns
-------
ModelDict
A dictionary containing predictions for each network and model type
(connectome, covariates, residuals, and full model).
"""
connectome = {}
residuals = {}
for network in ['positive', 'negative']:
123 changes: 120 additions & 3 deletions documentation/docs/getting_started.md
@@ -1,4 +1,121 @@
# Getting started
# Getting Started

In order to get started, you will need connectivity matrices or data that is already in graph form.
For this introduction we will assume that you are using connectivity matrices.
This guide will help you get started with running an analysis using the `CPMRegression` class. It provides a step-by-step description of how to set up, configure, and execute an analysis, along with explanations of the inputs and parameters.

---

## Step 1: Prepare Your Data

To run an analysis, you need the following inputs:

- **Connectome Data (`X`)**: A 2D array (numpy array or pandas DataFrame) of shape `(n_samples, n_features)` containing connectome edge values for each subject.
- **Target Variable (`y`)**: A 1D array or pandas Series of shape `(n_samples,)` containing the outcome variable (e.g., clinical scores, behavioral measures).
- **Covariates**: A 2D array or pandas DataFrame of shape `(n_samples, n_covariates)` containing variables to control for (e.g., age, sex).

Ensure that all inputs have consistent sample sizes (`n_samples`).
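
A minimal sketch of what these inputs might look like, using synthetic numpy/pandas data purely for illustration (shapes and variable names are placeholders, not toolbox requirements):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n_samples, n_features, n_covariates = 100, 4950, 2  # e.g., 4950 edges of a 100-node connectome

# Connectome edge values, one row per subject
X = rng.normal(size=(n_samples, n_features))

# Target variable, e.g., a clinical score
y = pd.Series(rng.normal(size=n_samples), name="score")

# Covariates to control for, e.g., age and sex
covariates = pd.DataFrame({
    "age": rng.integers(18, 65, size=n_samples),
    "sex": rng.integers(0, 2, size=n_samples),
})

assert X.shape[0] == len(y) == len(covariates)  # consistent sample sizes
```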

---

## Step 2: Configure the Analysis

### **Cross-Validation**
The `CPMRegression` class uses an outer cross-validation loop for performance evaluation and an optional inner cross-validation loop for hyperparameter optimization.

- **Outer CV (`cv`)**: Defines the cross-validation strategy (e.g., `KFold`).
- **Inner CV (`inner_cv`)**: Used for optimizing hyperparameters during edge selection. Can be left as `None` if not needed.

Example:

```python
from sklearn.model_selection import KFold

outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Optional inner loop for hyperparameter optimization; use inner_cv = None to skip it
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
```

### Edge Selection
The toolbox implements univariate edge selection, allowing users to specify the method for evaluating and selecting edges based on statistical tests.

#### Edge Statistics
Choose from the following methods for computing edge statistics:

- **pearson**: Pearson correlation
- **pearson_partial**: Pearson partial correlation (controlling for covariates)
- **spearman**: Spearman rank correlation
- **spearman_partial**: Spearman partial correlation (controlling for covariates)

#### p-Thresholds
- Set a single value (e.g., 0.05) or provide multiple values (e.g., [0.01, 0.05, 0.1]).
- If multiple thresholds are specified, the toolbox will optimize for the best p-threshold during inner cross-validation.

#### FDR Correction
- Optional FDR correction for multiple comparisons can be applied using `correction='fdr_by'`.


Example:

```python
from cpm.edge_selection import UnivariateEdgeSelection, PThreshold

edge_statistic = 'pearson'
univariate_edge_selection = UnivariateEdgeSelection(
edge_statistic=[edge_statistic],
edge_selection=[PThreshold(threshold=[0.05], correction=['fdr_by'])]
)
```
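
If you want the inner cross-validation to choose among several p-thresholds, the same interface shown above should accept a list of candidate values (a sketch based on the parameters described in this guide):

```python
from cpm.edge_selection import UnivariateEdgeSelection, PThreshold

# Several candidate thresholds; the best one is selected during inner cross-validation
univariate_edge_selection = UnivariateEdgeSelection(
    edge_statistic=['pearson'],
    edge_selection=[PThreshold(threshold=[0.01, 0.05, 0.1], correction=['fdr_by'])]
)
```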

## Step 3: Set Up the CPMRegression Object
Create an instance of the `CPMRegression` class with the required inputs:

```python
from cpm.cpm_analysis import CPMRegression

cpm = CPMRegression(
results_directory="results/",
cv=outer_cv,
inner_cv=inner_cv, # Optional
edge_selection=univariate_edge_selection,
select_stable_edges=True,
stability_threshold=0.8,
impute_missing_values=True,
n_permutations=100
)
```
### Key Parameters
- **results_directory**: Directory where results will be saved.
- **cv**: Outer cross-validation strategy.
- **inner_cv**: Inner cross-validation strategy for hyperparameter optimization (optional).
- **edge_selection**: Configuration for univariate edge selection.
- **select_stable_edges**: Whether to select stable edges across folds (True or False).
- **stability_threshold**: Minimum proportion of folds in which an edge must be selected to be considered stable.
- **impute_missing_values**: Whether to impute missing values (True or False).
- **n_permutations**: Number of permutations for permutation testing.

## Step 4: Run the Analysis
Call the `estimate` method to perform the analysis:

```python
X = ... # Load your connectome data (numpy array or pandas DataFrame)
y = ... # Load your target variable (numpy array or pandas Series)
covariates = ... # Load your covariates (numpy array or pandas DataFrame)

cpm.estimate(X=X, y=y, covariates=covariates)
```

This will:

1. Perform edge selection based on the specified method and thresholds.
2. Train and evaluate models for each cross-validation fold.
3. Save results, including predictions, metrics, and permutation-based significance tests, to the `results_directory`.


## Step 5: Review Results
After the analysis, you can find the results in the `results_directory`, including:

- Cross-validation metrics (e.g., mean absolute error, R²).
- Model predictions for each fold.
- Edge stability and significance.

You can load and inspect these results for further analysis.
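
For example, assuming the cross-validation metrics are written as a CSV file inside the results directory (the exact file names depend on the toolbox version; `cv_metrics.csv` below is only a placeholder), they can be inspected with pandas:

```python
import os
import pandas as pd

results_directory = "results/"

# Hypothetical file name; check your results directory for the actual output files
metrics = pd.read_csv(os.path.join(results_directory, "cv_metrics.csv"))
print(metrics.describe())
```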

---
By following these steps, you can quickly set up and execute a connectome-based predictive modeling analysis using the CPMRegression class. For further customization, refer to the API documentation.
72 changes: 60 additions & 12 deletions documentation/docs/index.md
@@ -1,18 +1,66 @@
# Connectome-Based Predictive Modeling (CPM)
![Logo](assets/img/CCCPM_medium.png)
# Confound-Corrected Connectome-Based Predictive Modeling (CCCPM)
[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/wwu-mmll/cpm_python/cpm/python-test)](https://github.com/wwu-mmll/cpm_python/actions)
[![Coverage Status](https://coveralls.io/repos/github/wwu-mmll/cpm_python/badge.svg?branch=main)](https://coveralls.io/github/wwu-mmll/cpm_python?branch=main)
[![Github Contributors](https://img.shields.io/github/contributors-anon/wwu-mmll/cpm_python?color=blue)](https://github.com/wwu-mmll/cpm_python/graphs/contributors)
[![Github Commits](https://img.shields.io/github/commit-activity/y/wwu-mmll/cpm_python)](https://github.com/wwu-mmll/cpm_python/commits/main)

`cpm_model` is a Python package for connectome-based predictive modeling. This package provides tools for preprocessing, feature extraction, model training, and evaluation specific to connectome data.
CCCPM is a newly developed Python toolbox designed specifically for researchers in psychiatry and neuroscience to
perform connectome-based predictive modeling. This package offers a comprehensive framework for building predictive
models from structural and functional connectome data, with a strong focus on methodological rigor, interpretability,
confound control, and statistical robustness.

## Features
---
## Background

- **Preprocessing**: Load and clean connectome data.
- **Feature Extraction**: Extract features from connectomes.
- **Model Training**: Train predictive models on connectome data.
- **Evaluation**: Evaluate model performance.
Network-based approaches are increasingly recognized as essential for understanding the complex relationships in brain connectivity that underlie behavior, cognition, and mental health. In psychiatry and neuroscience, analyzing structural and functional networks can reveal patterns associated with mental disorders, support individualized predictions, and improve our understanding of brain function. However, these analyses require robust tools that account for the unique challenges of connectome data, such as high dimensionality, variability, and the influence of confounding factors.

## Installation
Despite the growing importance of connectome-based predictive modeling (CPM), there is currently no fully developed software package for performing these analyses. Existing options are limited to a few MATLAB scripts, which lack the flexibility, transparency, and rigor required to foster replicable research. CCCPM addresses this gap by providing a Python-based, flexible, and rigorously designed toolbox that encourages replicable analyses while allowing researchers to tailor their workflows to specific research questions.

You can install the package using pip:
---

```sh
pip install git+https://github.com/mmll/cpm_python.git
```
## Overview

CCCPM was developed to address key challenges in connectome-based analyses, including optimizing model hyperparameters, controlling for confounding variables, and assessing the reliability of selected network features. This toolbox introduces novel methods, such as stability metrics for selected edges, and integrates well-established practices like nested cross-validation and permutation-based significance testing. By doing so, CCCPM provides a powerful and transparent tool for researchers aiming to explore brain networks' contributions to predictive models.

### Key Features

- **Hyperparameter Optimization**: Fine-tune model parameters, such as p-thresholds for edge selection, to achieve better predictive performance.
- **Confound Adjustment**: Use partial correlation methods during edge selection to rigorously control for covariates and confounding variables.
- **Residualization**: Remove the influence of confounds from connectome strengths to ensure cleaner data inputs.
- **Statistical Validation**: Assess model and edge-level significance using permutation-based testing, ensuring that findings are statistically robust.
- **Stability Metrics**: Evaluate the reliability of selected edges across iterations, improving the interpretability and reproducibility of identified networks.
- **Model Increment Analysis**: Quantify the unique contribution of connectome data to predictive models, helping to clarify their added value in prediction tasks.

---

## Why CCCPM?

Unlike existing CPM implementations, which are limited in scope and flexibility, CCCPM is designed to foster rigorous and replicable research. Its Python-based architecture ensures accessibility and compatibility with modern data science workflows, while its features address the specific challenges of connectome-based analyses. By offering a robust and transparent framework, CCCPM enables researchers to conduct analyses that are not only flexible and customizable but also reproducible and scientifically sound.

---

## Features in Detail

### **Data Imputation**
CCCPM includes methods to handle missing data effectively, ensuring that datasets with incomplete connectome information can still be utilized without introducing biases.
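
As a generic illustration of the concept (not necessarily the strategy CCCPM uses internally), missing edge values could be replaced by the per-edge mean across subjects:

```python
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[0.2, np.nan, 0.5],
              [0.1, 0.4,    0.6],
              [np.nan, 0.3, 0.7]])

# Replace missing edge values with the per-edge mean across subjects
X_imputed = SimpleImputer(strategy="mean").fit_transform(X)
```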

### **Nested Cross-Validation**
A nested cross-validation scheme is implemented to separate hyperparameter tuning from model evaluation. This ensures that the reported model performance is unbiased and reflects its true generalization capability.
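
In scikit-learn terms this corresponds to an outer loop for performance estimation and an inner loop for hyperparameter selection; the following is a generic sketch of the pattern, not the toolbox's internal code:

```python
import numpy as np
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X, y = rng.normal(size=(60, 10)), rng.normal(size=60)
outer_cv, inner_cv = KFold(n_splits=5), KFold(n_splits=3)
candidate_thresholds = [0.01, 0.05, 0.1]

for train_idx, test_idx in outer_cv.split(X):
    X_train, y_train = X[train_idx], y[train_idx]
    # Inner loop: score each candidate p-threshold using only the training fold
    mean_scores = []
    for threshold in candidate_thresholds:
        scores = []
        for inner_train, inner_val in inner_cv.split(X_train):
            score = 0.0  # placeholder: fit with `threshold` on inner_train, evaluate on inner_val
            scores.append(score)
        mean_scores.append(np.mean(scores))
    best_threshold = candidate_thresholds[int(np.argmax(mean_scores))]
    # Outer loop: refit with best_threshold on the full training fold, evaluate on test_idx
```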

### **Threshold Optimization**
The toolbox automates the optimization of p-thresholds, which determine which edges in the connectome are selected for model building. This allows researchers to identify thresholds that balance performance and interpretability.

### **Confound Adjustment**
By implementing partial correlations, CCCPM allows researchers to account for confounding variables during edge selection, ensuring that identified networks represent genuine relationships rather than artifacts.
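
To illustrate the idea (this is not CCCPM's internal implementation), a Pearson partial correlation between a single edge and the target, controlling for covariates, can be computed by regressing the covariates out of both variables and correlating the residuals:

```python
import numpy as np
from scipy.stats import pearsonr

def partial_pearson(edge, y, covariates):
    """Correlate edge and y after removing the linear effect of the covariates."""
    C = np.column_stack([np.ones(len(y)), covariates])  # design matrix with intercept
    edge_res = edge - C @ np.linalg.lstsq(C, edge, rcond=None)[0]
    y_res = y - C @ np.linalg.lstsq(C, y, rcond=None)[0]
    return pearsonr(edge_res, y_res)

rng = np.random.default_rng(0)
edge, y = rng.normal(size=50), rng.normal(size=50)
covariates = rng.normal(size=(50, 2))
r, p = partial_pearson(edge, y, covariates)
```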

### **Statistical Significance**
Permutation-based testing is provided to evaluate the significance of both model performance and selected edges, adding rigor to findings and reducing the risk of false-positive results.
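
Conceptually, the permutation p-value compares the score obtained on the real data with the distribution of scores obtained after shuffling the target; a minimal sketch (again an illustration, not the toolbox's exact routine) for a score where higher is better:

```python
import numpy as np

def permutation_p_value(true_score, permuted_scores):
    """Fraction of permuted scores at least as good as the true score (plus-one correction)."""
    permuted_scores = np.asarray(permuted_scores)
    return (np.sum(permuted_scores >= true_score) + 1) / (len(permuted_scores) + 1)

# e.g., a true correlation of 0.35 compared against 100 permutation scores
p = permutation_p_value(0.35, np.random.default_rng(0).normal(0.0, 0.1, size=100))
```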

### **Edge Stability**
CCCPM introduces a stability metric for selected edges, helping researchers evaluate the consistency of their findings across multiple iterations. This enhances the reliability of results and their potential for replication.
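
Assuming stability is defined as the fraction of cross-validation folds in which an edge is selected (an assumption made for this sketch), the metric can be expressed as:

```python
import numpy as np

# selected_edges[f, e] is True if edge e was selected in fold f
selected_edges = np.array([
    [True,  True,  False, True],
    [True,  False, False, True],
    [True,  True,  False, True],
])

stability = selected_edges.mean(axis=0)       # per-edge selection frequency across folds
stable_edges = np.where(stability >= 0.8)[0]  # e.g., edges selected in at least 80% of folds
```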

### **Model Increment Analysis**
Assess the added predictive value of connectome data by calculating the incremental contribution of network features to overall model performance.
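
One way to express this increment is the difference in explained variance between the full model (connectome plus covariates) and a covariates-only model; the sketch below uses scikit-learn's `r2_score` as an illustration, not the toolbox's own routine:

```python
from sklearn.metrics import r2_score

def connectome_increment(y_true, y_pred_full, y_pred_covariates_only):
    """Added explained variance of the connectome over the covariates alone."""
    return r2_score(y_true, y_pred_full) - r2_score(y_true, y_pred_covariates_only)

# Example: the full model explains more variance than the covariates alone
increment = connectome_increment([1.0, 2.0, 3.0, 4.0],
                                 [1.1, 1.9, 3.2, 3.8],
                                 [2.0, 2.2, 2.5, 3.0])
```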

---
