workflow4metabolomics · etiennejls · Jul 11, 2025 · Jul 16, 2025 · Jul 16, 2025 · Jul 16, 2025
diff --git a/tools/parsec/.shed.yml b/tools/parsec/.shed.yml
@@ -0,0 +1,7 @@
+categories: [Metabolomics]
+description: 'Batch correction for extracted metabolomics data based on mixed models'
+homepage_url: http://workflow4metabolomics.org
+remote_repository_url: https://github.com/workflow4metabolomics/tools-metabolomics
+long_description: 'This Galaxy tool is designed to correct batch and cohort effects in intensity measurements from metabolomic studies. Using a mixed-model approach, it adjusts intensity values while accounting for batch and injection order effects.'
+name: parsec
+owner: workflow4metabolomics
diff --git a/tools/parsec/README.md b/tools/parsec/README.md
@@ -0,0 +1,113 @@
+# Galaxy Tool Documentation: Batch Cohort Correction
+
+## Overview
+This Galaxy tool is designed to correct batch and cohort effects in intensity measurements from scientific studies. Using a mixed-model approach, it adjusts intensity values while accounting for batch (random) and injection order (fixed) effects.
+
+---
+
+## Table of Contents
+1. [Overview](#overview)
+2. [Prerequisites](#prerequisites)
+3. [Installation](#installation)
+4. [Inputs](#inputs)
+5. [Outputs](#outputs)
+6. [Usage Example](#usage-example)
+7. [Commands Executed by Galaxy](#commands-executed-by-galaxy)
+8. [Important Notes](#important-notes)
+9. [Contributing](#contributing)
+10. [License](#license)
+11. [About](#about)
+
+---
+
+## Prerequisites
+- **Galaxy Platform**: Ensure access to a functional Galaxy instance.  
+- **R version 4.2.2**: The tool relies on R for computations.  
+- Required R packages: `r-optparse`, `r-dplyr`, `r-lme4`.
+
+---
+
+## Installation
+Download the tool from the Galaxy repository or install it directly on your Galaxy instance:
+
+```bash
+git clone https://github.com/workflow4metabolomics/tools-metabolomics.git
+```
+
+---
+
+## Inputs
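+The tool takes three tab-separated files, each with a header line and identifiers in the first column: a data matrix (`--dataMatrix`, variables in rows and samples in columns), a sample metadata file (`--sampleMData`, one row per sample) and a variable metadata file (`--variableMData`, one row per variable). They are merged by sample identifier into a single table with the following columns:
+- **batch**: Batch identifier, read from the sample metadata (mandatory, exact column name).
+- **Sample identifier**: First column of the sample metadata; it must match the column headers of the data matrix.
+- **injectionOrder**: Injection order, read from the sample metadata (mandatory, exact column name).
+- **Ion1, Ion2, ...**: Intensity columns to be corrected, one per row of the variable metadata.
+
+**Sample Input (merged view, shown comma-separated for readability):**
+```csv
+SampleID,batch,injectionOrder,Ion1,Ion2
+1,1,5,500,300
+2,1,15,520,310
+3,2,25,490,290
+4,2,35,505,295
+```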
+
+---
+
+## Outputs
+The output is a tab-separated file holding only the corrected intensity columns, with one row per sample; sample identifiers are written as row names and values are rounded to 10 decimal places.
+
+**Sample Output File (shown comma-separated for readability):**
+```csv
+SampleID,Ion1,Ion2
+1,-0.2464,-0.2464
+2,1.3362,1.3362
+3,-0.5720,-0.5719
+4,0.3269,0.3268
+```
+
+---
+
+## Usage Example
+1. Upload your three tabular files (data matrix, sample metadata, variable metadata) to Galaxy.  
+2. Select the **Batch Cohort Correction** tool in your workflow.  
+3. Specify the three input files and set a name for the output file.  
+4. Run the job and retrieve the corrected output file.
+
+---
+
+## Commands Executed by Galaxy
+The process will run the following command:
+
+```bash
+Rscript $__tool_directory__/parsec.R --dataMatrix $dataMatrix --sampleMData $sampleMetadata --variableMData $variableMetadata --output $output
+```
+
+---
+
+## Important Notes
+- **injectionOrder** and **batch**: Mandatory columns of the sample metadata; the tool stops with an error if either is missing.  
+- **Tabular format**: Ensure the files are properly formatted with columns separated by tabs.  
+- Malformed or improperly formatted files will result in explicit errors.
+
+---
+
+## Contributing
+1. Fork the repository.  
+2. Create a branch for your updates.  
+3. Submit a pull request.  
+4. Report bugs or suggest improvements in the Issues section.
+
+---
+
+## License
+_here we will write about the_ `LICENSE`
+
+---
+
+## About
+### Authors:  
+- **Elfried Salanon**  
+📅 **Date:** 2025  
+- **Marie Lefebvre**  
+📅 **Date:** 2025  
diff --git a/tools/parsec/parsec.R b/tools/parsec/parsec.R
@@ -0,0 +1,154 @@
+options(warn = -1)
+# --- IMPORT PACKAGES ---
+suppressPackageStartupMessages({
+    library(optparse)
+    library(lme4)
+})
+
+# --- MAIN FUNCTION (CORRECTION) ---
+#' Correct batch and injection-order effects on intensity columns.
+#'
+#' Each intensity column is stripped of whitespace and converted to numeric,
+#' transformed with `log1p`, centred and scaled within every batch, then
+#' replaced by the residuals of the mixed model
+#' `intensity ~ injection_order + (1 | batch)`. `expm1` is finally applied
+#' to those residuals.
+#'
+#' @param data Data frame containing the batch, injection-order and
+#'   intensity columns.
+#' @param batch_col Name of the batch column (random intercept).
+#' @param injection_order_col Name of the injection-order column (fixed
+#'   effect).
+#' @param intensity_cols Character vector naming the columns to correct.
+#' @return `data` with `intensity_cols` replaced by the corrected values.
+#'   A row left out of a fit because of a missing value gets `NA` in the
+#'   corresponding column. An error is raised when a required column is
+#'   absent from `data`.
+batch_cohort_correction <- function(
+    data,
+    batch_col,
+    injection_order_col,
+    intensity_cols) {
+    # Checks for mandatory columns
+    missing_cols <- setdiff(
+        c(batch_col, intensity_cols, injection_order_col),
+        colnames(data)
+    )
+    if (length(missing_cols) > 0) {
+        stop(
+            paste(
+                "❌  missing columns",
+                paste(missing_cols, collapse = ", ")
+            )
+        )
+    }
+
+    # Back-quote a column name so that non-syntactic names (leading digit,
+    # "+", spaces, ...) stay a single symbol once pasted into a formula.
+    quote_name <- function(name) paste0("`", name, "`")
+
+    # Cleaning the data and conversion to numeric values
+    data[intensity_cols] <- lapply(data[intensity_cols], function(x) {
+        x <- gsub("\\s+", "", as.character(x))
+        as.numeric(x)
+    })
+
+    # 1. Log-transform
+    data[intensity_cols] <- lapply(data[intensity_cols], log1p)
+
+    # 2. Batch standardisation (rows are put back in their original order)
+    # A batch holding a single sample or constant values yields NA/NaN here.
+    data <- unsplit(
+        lapply(
+            split(data, data[[batch_col]]),
+            function(df) {
+                df[intensity_cols] <- lapply(
+                    df[intensity_cols],
+                    function(col) as.numeric(scale(col))
+                )
+                df
+            }
+        ),
+        data[[batch_col]]
+    )
+
+    # 3. Linear mixed model
+    model_rhs <- paste0(
+        quote_name(injection_order_col),
+        " + (1|", quote_name(batch_col), ")"
+    )
+    for (col in intensity_cols) {
+        model <- lmer(
+            as.formula(paste0(quote_name(col), " ~ ", model_rhs)),
+            data = data,
+            REML = TRUE,
+            # na.exclude pads the residuals with NA for rows dropped from
+            # the fit, so they always line up with the rows of `data`.
+            na.action = na.exclude,
+            control = lmerControl(check.conv.singular = "ignore")
+        )
+        data[[col]] <- residuals(model)
+    }
+
+    # 4. Inverse transform
+    data[intensity_cols] <- lapply(data[intensity_cols], expm1)
+
+    data
+}
+
+# --- CLI ARGUMENTS ---
+# Every option is a plain character string: three input paths and one
+# output path. Each entry pairs the short/long flags with its help text.
+option_list <- lapply(
+    list(
+        list(flags = c("-d", "--dataMatrix"), help = "Data matrix"),
+        list(flags = c("-s", "--sampleMData"), help = "Sample metadata"),
+        list(flags = c("-v", "--variableMData"), help = "Variable metadata"),
+        list(flags = c("-o", "--output"), help = "Output file")
+    ),
+    function(spec) {
+        make_option(spec$flags, type = "character", help = spec$help)
+    }
+)
+
+# Parse the command line into the named list `opt`.
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# --- FILE LOADINGS ---
+# Fail early, with one explicit message, when an input option is missing
+# from the command line or points to a file that does not exist.
+# `[[` is used on purpose: a mistyped option name would otherwise evaluate
+# to NULL and be silently dropped from the existence check.
+input_paths <- vapply(
+    c("dataMatrix", "sampleMData", "variableMData"),
+    function(arg) {
+        path <- opt[[arg]]
+        if (is.null(path)) NA_character_ else path
+    },
+    character(1)
+)
+if (anyNA(input_paths) || !all(file.exists(input_paths))) {
+    stop("❌ At least one of the input files could not be found !")
+}
+
+# Reads one tab-separated table: the first line is the header and the
+# first column holds the row names. `check.names = FALSE` keeps the
+# identifiers exactly as written in the file, so sample names read from
+# the data-matrix header still match the row names of the sample metadata.
+read_tabular <- function(path) {
+    read.csv(
+        path,
+        header = TRUE,
+        sep = "\t",
+        row.names = 1,
+        check.names = FALSE
+    )
+}
+
+data_matrix <- read_tabular(input_paths[["dataMatrix"]])
+sample_metadata <- read_tabular(input_paths[["sampleMData"]])
+variable_metadata <- read_tabular(input_paths[["variableMData"]])
+
+
+# --- DATA TRANSPOSE ---
+# Samples become rows and variables become columns.
+data_t <- as.data.frame(t(data_matrix))
+
+if (ncol(data_t) != nrow(variable_metadata)) {
+    stop("❌ Incompatibility: nb of variables ≠ nb of variableMetadata")
+}
+
+# --- COLUMNS SUBJECTED TO CORRECTION ---
+intensity_cols <- rownames(variable_metadata)
+
+# --- DM+SM FUSION ---
+# Join on row names (sample identifiers); merge() exposes them in a
+# "Row.names" column and keeps only samples present in both tables.
+data_set <- merge(sample_metadata, data_t, by = 0)
+if (nrow(data_set) == 0) {
+    stop("❌ No common sample between the data matrix and the sample metadata")
+}
+# Restore the sample identifiers as row names without rebuilding the data
+# frame, so that column names are not altered and still match
+# `intensity_cols`.
+rownames(data_set) <- as.character(data_set[["Row.names"]])
+data_set[["Row.names"]] <- NULL
+
+# --- CORRECTION---
+# Run the correction on the merged table; the batch and injection-order
+# column names are fixed by this script.
+corrected_data <- batch_cohort_correction(
+    data = data_set,
+    intensity_cols = intensity_cols,
+    injection_order_col = "injectionOrder",
+    batch_col = "batch"
+)
+
+# --- FINAL EXPORT ---
+# Only the corrected intensity columns are exported, rounded to 10 decimals,
+# as a tab-separated file with the row names in the first field.
+rounded_intensities <- round(corrected_data[intensity_cols], digits = 10)
+write.table(
+    rounded_intensities,
+    sep = "\t",
+    quote = TRUE,
+    row.names = TRUE,
+    file = opt$output
+)