add optional ms2rescore step #362

Merged — 23 commits, May 7, 2024
Changes from 13 commits
51 changes: 51 additions & 0 deletions bin/extract_sample.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python

import argparse
import errno
import os
import sys
from pathlib import Path
import pandas as pd


def parse_args(args=None):
    Description = "Extract sample information from an experiment design file"
    Epilog = "Example usage: python extract_sample.py <EXP>"

    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
    parser.add_argument("EXP", help="Expdesign file to be extracted")
    return parser.parse_args(args)


def extract_sample(expdesign):
    data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str)
    fTable = data.dropna()

    # The OpenMS experiment design is a two-table format: a file table and a
    # sample table separated by an empty line.
    with open(expdesign, "r") as f:
        lines = f.readlines()
        empty_row = lines.index("\n")
        s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1:]][1:]
        s_header = lines[empty_row + 1].replace("\n", "").split("\t")
        s_DataFrame = pd.DataFrame(s_table, columns=s_header)

    if "MSstats_Mixture" not in s_DataFrame.columns:
        fTable = fTable[["Spectra_Filepath", "Sample"]]
        fTable.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False)
    else:
        fTable.drop_duplicates(subset=["Spectra_Filepath"], inplace=True)
        rows = []
        for _, row in fTable.iterrows():
            # Map each spectrum file to the MSstats mixture of its sample (first match).
            mixture_id = s_DataFrame.loc[s_DataFrame["Sample"] == row["Sample"], "MSstats_Mixture"].iloc[0]
            rows.append({"Spectra_Filepath": row["Spectra_Filepath"], "Sample": mixture_id})
        # DataFrame.append was removed in pandas 2.x, so collect rows and build the frame once.
        sample_dt = pd.DataFrame(rows, columns=["Spectra_Filepath", "Sample"])
        sample_dt.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False)


def main(args=None):
    args = parse_args(args)
    extract_sample(args.EXP)


if __name__ == "__main__":
    sys.exit(main())
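
For reference, here is a minimal sketch of the two-table experiment design that `extract_sample.py` splits on the blank line. The column names mirror the script above; the concrete values and file names are made up for illustration, and the snippet only demonstrates the file/sample table layout rather than calling the script itself.

```python
import pandas as pd
from io import StringIO

# Hypothetical two-table design: a file table, an empty line, then a sample table.
design = (
    "Fraction_Group\tFraction\tSpectra_Filepath\tLabel\tSample\n"
    "1\t1\tfile1.mzML\t1\t1\n"
    "\n"
    "Sample\tMSstats_Condition\tMSstats_BioReplicate\n"
    "1\tcontrol\t1\n"
)

lines = design.splitlines(keepends=True)
empty_row = lines.index("\n")                      # boundary between the two tables

file_table = pd.read_csv(StringIO("".join(lines[:empty_row])), sep="\t", dtype=str)
sample_table = pd.read_csv(StringIO("".join(lines[empty_row + 1:])), sep="\t", dtype=str)

print(file_table[["Spectra_Filepath", "Sample"]])  # what the script writes when MSstats_Mixture is absent
print(sample_table)
```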
176 changes: 176 additions & 0 deletions bin/ms2rescore_cli.py
@@ -0,0 +1,176 @@
#!/usr/bin/env python

import sys
import click
import importlib.resources
import json
import logging
from typing import List

import pandas as pd

from ms2rescore import rescore, package_data
from psm_utils.io.idxml import IdXMLReader, IdXMLWriter
from psm_utils import PSMList
import pyopenms as oms

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


def parse_cli_arguments_to_config(**kwargs):
    """Update default MS²Rescore config with CLI arguments"""
    config = json.load(importlib.resources.open_text(package_data, "config_default.json"))

    for key, value in kwargs.items():
        # Skip these arguments since they need to be set in a nested dict of feature_generators
        if key in ["ms2pip_model", "ms2_tolerance", "rng", "calibration_set_size"]:
            continue

        elif key == "feature_generators":
            feature_generators = value.split(",")
            # Reset feature generator dict since there might be default generators we don't want
            config["ms2rescore"]["feature_generators"] = {}
            if "basic" in feature_generators:
                config["ms2rescore"]["feature_generators"]["basic"] = {}
            if "ms2pip" in feature_generators:
                config["ms2rescore"]["feature_generators"]["ms2pip"] = {
                    "model": kwargs["ms2pip_model"],
                    "ms2_tolerance": kwargs["ms2_tolerance"],
                }
            if "deeplc" in feature_generators:
                config["ms2rescore"]["feature_generators"]["deeplc"] = {
                    "deeplc_retrain": False,
                    "calibration_set_size": kwargs["calibration_set_size"],
                }
            if "maxquant" in feature_generators:
                config["ms2rescore"]["feature_generators"]["maxquant"] = {}
            if "ionmob" in feature_generators:
                config["ms2rescore"]["feature_generators"]["ionmob"] = {}

        elif key == "rescoring_engine":
            # Reset the rescoring engine dict; we want to allow computing features only
            config["ms2rescore"]["rescoring_engine"] = {}
            if value == "mokapot":
                config["ms2rescore"]["rescoring_engine"]["mokapot"] = {
                    "write_weights": True,
                    "write_txt": False,
                    "write_flashlfq": False,
                    "rng": kwargs["rng"],
                    "test_fdr": kwargs["test_fdr"],
                    "max_workers": kwargs["processes"],
                }
            if value == "percolator":
                logging.info(
                    "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step."
                )
                continue

        else:
            config["ms2rescore"][key] = value

    return config


def rescore_idxml(input_file, output_file, config) -> None:
    """Rescore PSMs in an idXML file and keep other information unchanged."""
    # Read PSMs
    reader = IdXMLReader(input_file)
    psm_list = reader.read_file()

    # Rescore
    rescore(config, psm_list)

    # Filter out PeptideHits within PeptideIdentification(s) that could not be processed by all feature generators
    peptide_ids_filtered = filter_out_artifact_psms(psm_list, reader.peptide_ids)

    # Write
    writer = IdXMLWriter(output_file, reader.protein_ids, peptide_ids_filtered)
    writer.write_file(psm_list)


def filter_out_artifact_psms(
    psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification]
) -> List[oms.PeptideIdentification]:
    """Filter out PeptideHits that could not be processed by all feature generators"""
    num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list])
    new_psm_list = PSMList(psm_list=[psm for psm in psm_list if len(psm.rescoring_features) == num_mandatory_features])

    # get differing peptidoforms of both psm lists
    psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in psm_list])
    new_psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in new_psm_list])
    not_supported_peptides = psm_list_peptides - new_psm_list_peptides

    # no need to filter if all peptides are supported
    if len(not_supported_peptides) == 0:
        return peptide_ids

    # Create new peptide ids and filter out not supported peptides
    new_peptide_ids = []
    for peptide_id in peptide_ids:
        new_hits = []
        for hit in peptide_id.getHits():
            if hit.getSequence().toString() in not_supported_peptides:
                continue
            new_hits.append(hit)
        if len(new_hits) == 0:
            continue
        peptide_id.setHits(new_hits)
        new_peptide_ids.append(peptide_id)
    logging.info(
        f"Removed {len(psm_list_peptides) - len(new_psm_list_peptides)} PSMs. Peptides not supported: {not_supported_peptides}"
    )
    return new_peptide_ids


@click.command()
@click.option(
    "-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True
)
@click.option(
    "-s",
    "--spectrum_path",
    help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)",
    required=True,
)
@click.option(
    "-o", "--output_path", help="Path and stem for output file names (default: derived from identification file)"
)
@click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info")
@click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=16)
@click.option("-f", "--fasta_file", help="Path to FASTA file")
@click.option(
    "-t",
    "--test_fdr",
    help="The false-discovery rate threshold at which to evaluate the learned models (default: 0.05)",
    default=0.05,
)
@click.option(
    "-fg",
    "--feature_generators",
    help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See the MS²Rescore documentation for further information",
    default="",
)
@click.option("-pipm", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", type=str, default="Immuno-HCD")
@click.option(
    "-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da] (default: `0.02`)", type=float, default=0.02
)
@click.option(
    "-cs",
    "--calibration_set_size",
    help="Fraction of PSMs to use as calibration set for DeepLC (default: `0.15`)",
    default=0.15,
)
@click.option("-re", "--rescoring_engine", help="Either mokapot or percolator (default: `mokapot`)", default="mokapot")
@click.option(
    "-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711
)
@click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `^DECOY_`)", default="^DECOY_")
@click.option(
    "-lsb",
    "--lower_score_is_better",
    help="Interpretation of primary search engine score (default: True)",
    default=True,
)
def main(**kwargs):
    config = parse_cli_arguments_to_config(**kwargs)
    logging.info("MS²Rescore config:")
    logging.info(config)
    rescore_idxml(kwargs["psm_file"], kwargs["output_path"], config)


if __name__ == "__main__":
    sys.exit(main())
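
As a quick illustration of what `parse_cli_arguments_to_config` builds, the sketch below reproduces the merge logic with a plain dictionary standing in for MS²Rescore's `config_default.json`. The nested keys are taken from the function above, but the stand-in default config is an assumption for illustration, not the real package data.

```python
# Stand-in for MS²Rescore's default config (illustrative only, not the real config_default.json).
def build_config(feature_generators="ms2pip,deeplc", ms2pip_model="Immuno-HCD",
                 ms2_tolerance=0.02, calibration_set_size=0.15):
    config = {"ms2rescore": {"feature_generators": {}, "rescoring_engine": {}}}
    for fg in feature_generators.split(","):
        if fg == "ms2pip":
            config["ms2rescore"]["feature_generators"]["ms2pip"] = {
                "model": ms2pip_model,
                "ms2_tolerance": ms2_tolerance,
            }
        elif fg == "deeplc":
            config["ms2rescore"]["feature_generators"]["deeplc"] = {
                "deeplc_retrain": False,
                "calibration_set_size": calibration_set_size,
            }
    return config

print(build_config())
# {'ms2rescore': {'feature_generators': {'ms2pip': {'model': 'Immuno-HCD', 'ms2_tolerance': 0.02},
#                                        'deeplc': {'deeplc_retrain': False, 'calibration_set_size': 0.15}},
#                 'rescoring_engine': {}}}
```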
8 changes: 6 additions & 2 deletions bin/psm_conversion.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+
 import numpy as np
 import pyopenms as oms
 import pandas as pd
@@ -63,8 +64,8 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):

         if isinstance(spectra_df, pd.DataFrame):
             spectra = spectra_df[spectra_df["scan"] == scan_number]
-            mz_array = spectra["mz"].values[0]
-            intensity_array = spectra["intensity"].values[0]
+            mz_array = spectra["mz"].values
+            intensity_array = spectra["intensity"].values
             num_peaks = len(mz_array)

         for hit in peptide_id.getHits():
@@ -84,6 +85,9 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):
             elif search_engines == "Sage":
                 id_scores = ["Sage:hyperscore: " + str(hit.getScore())]

+            if hit.metaValueExists("MS:1001491"):
+                global_qvalue = hit.getMetaValue("MS:1001491")
+
             charge = hit.getCharge()
             peptidoform = hit.getSequence().toString()
             modifications = mods_position(peptidoform)
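
A side note on the `.values[0]` → `.values` change in the hunk above: if `spectra_df` is in long format with one row per peak, filtering by scan and taking `.values` yields the full m/z array for that scan, whereas `.values[0]` returns only the first entry. The toy frame below assumes that long format purely for illustration; it is not taken from the pipeline code.

```python
import pandas as pd

# Assumed long format: one row per peak (scan, mz, intensity) -- illustrative only.
spectra_df = pd.DataFrame({
    "scan": [1, 1, 1, 2],
    "mz": [100.1, 200.2, 300.3, 150.5],
    "intensity": [10.0, 20.0, 30.0, 5.0],
})

spectra = spectra_df[spectra_df["scan"] == 1]
mz_array = spectra["mz"].values      # array([100.1, 200.2, 300.3]) -- whole spectrum
first_mz = spectra["mz"].values[0]   # 100.1 -- just the first peak
print(len(mz_array), first_mz)       # 3 100.1
```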
43 changes: 43 additions & 0 deletions conf/modules.config
@@ -270,4 +270,47 @@ process {
        ]
    }

    // ID ONLY
    withName: 'MS2RESCORE' {
        ext.args = [
            "--ms2pip_model ${params.ms2pip_model}",
            "--rescoring_engine ${params.posterior_probabilities}",
"--calibration_set_size ${params.calibration_set_size}",
"--test_fdr ${params.test_FDR}",
params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : ''
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*:DDA_ID:IDSCORESWITCHER' {
ext.args = [
"-new_score_orientation lower_better",
"-new_score_type \"Posterior Error Probability\"",
"-debug $params.idscoreswitcher_debug"
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/idscoreswitcherforluciphor" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*:DDA_ID:PSMFDRCONTROL:IDSCORESWITCHER' {
ext.args = [
"-new_score_orientation lower_better",
"-old_score \"Posterior Error Probability\"",
"-new_score_type q-value",
"-debug $params.idscoreswitcher_debug"
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/idscoreswitcher" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

}
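
To see what the `MS2RESCORE` `ext.args` block above evaluates to, here is an equivalent sketch in Python; the parameter values are placeholders, not the pipeline defaults.

```python
# Placeholder parameter values -- the real ones come from the pipeline configuration.
params = {
    "ms2pip_model": "Immuno-HCD",
    "posterior_probabilities": "mokapot",
    "calibration_set_size": 0.15,
    "test_FDR": 0.01,
    "feature_generators": "",
}

args = " ".join(filter(None, [
    f"--ms2pip_model {params['ms2pip_model']}",
    f"--rescoring_engine {params['posterior_probabilities']}",
    f"--calibration_set_size {params['calibration_set_size']}",
    f"--test_fdr {params['test_FDR']}",
    f"--feature_generators {params['feature_generators']}" if params["feature_generators"].strip() else "",
])).strip()

print(args)
# --ms2pip_model Immuno-HCD --rescoring_engine mokapot --calibration_set_size 0.15 --test_fdr 0.01
```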
31 changes: 31 additions & 0 deletions modules/local/extract_sample/main.nf
@@ -0,0 +1,31 @@
process GETSAMPLE {
    tag "$design.Name"
    label 'process_low'

    conda "bioconda::sdrf-pipelines=0.0.25"
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.25--pyhdfd78af_0"
    } else {
        container "biocontainers/sdrf-pipelines:0.0.25--pyhdfd78af_0"
    }

    input:
    path design

    output:
    path "*_sample.csv", emit: ch_expdesign_sample
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    """
    extract_sample.py "${design}" 2>&1 | tee extract_sample.log

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        sdrf-pipelines: \$(parse_sdrf --version 2>&1 | awk -F ' ' '{print \$2}')
    END_VERSIONS
    """
}
27 changes: 27 additions & 0 deletions modules/local/extract_sample/meta.yml
@@ -0,0 +1,27 @@
name: GETSAMPLE
description: A module to extract sample information from an experimental design file
keywords:
  - sample
  - conversion
tools:
  - custom:
      description: |
        A custom module for sample extraction.
      homepage: https://github.com/bigbio/quantms
      documentation: https://github.com/bigbio/quantms/tree/readthedocs
input:
  - design:
      type: file
      description: experimental design file
      pattern: "*.csv"
output:
  - ch_expdesign_sample:
      type: file
      description: sample csv file
      pattern: "*_sample.csv"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
authors:
  - "@daichengxin"