From 76f73be736942785ba677dfb61d4b7eefba94258 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 24 Mar 2024 17:55:18 +0800 Subject: [PATCH 01/19] add ms2score --- bin/ms2rescore_cli.py | 174 ++++++++++++++++++ bin/psm_conversion.py | 5 +- conf/modules.config | 14 ++ modules/local/ms2rescore/main.nf | 55 ++++++ modules/local/ms2rescore/meta.yml | 38 ++++ .../local/openms/extractpsmfeatures/main.nf | 4 +- nextflow.config | 6 + subworkflows/local/dda_id.nf | 20 +- subworkflows/local/psmrescoring.nf | 2 +- 9 files changed, 308 insertions(+), 10 deletions(-) create mode 100644 bin/ms2rescore_cli.py create mode 100644 modules/local/ms2rescore/main.nf create mode 100644 modules/local/ms2rescore/meta.yml diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py new file mode 100644 index 00000000..1298dd38 --- /dev/null +++ b/bin/ms2rescore_cli.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python + +import sys +import click +import importlib.resources +import json +import logging +from typing import List + +import pandas as pd + +from ms2rescore import rescore, package_data +from psm_utils.io.idxml import IdXMLReader, IdXMLWriter +from psm_utils import PSMList +import pyopenms as oms + +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + + +def parse_cli_arguments_to_config(**kwargs): + """Update default MS²Rescore config with CLI arguments""" + config = json.load(importlib.resources.open_text(package_data, "config_default.json")) + + for key, value in kwargs.items(): + # Skip these arguments since they need to set in a nested dict of feature_generators + if key in ["ms2pip_model", "ms2_tolerance", "rng", "calibration_set_size"]: + continue + + elif key == "feature_generators": + feature_generators = value.split(",") + # Reset feature generator dict since there might be default generators we don't want + config["ms2rescore"]["feature_generators"] = {} + if "basic" in feature_generators: + config["ms2rescore"]["feature_generators"]["basic"] = {} + if "ms2pip" in feature_generators: + config["ms2rescore"]["feature_generators"]["ms2pip"] = { + "model": kwargs["ms2pip_model"], + "ms2_tolerance": kwargs["ms2_tolerance"], + } + if "deeplc" in feature_generators: + config["ms2rescore"]["feature_generators"]["deeplc"] = { + "deeplc_retrain": False, + "calibration_set_size": kwargs["calibration_set_size"], + } + if "maxquant" in feature_generators: + config["ms2rescore"]["feature_generators"]["maxquant"] = {} + if "ionmob" in feature_generators: + config["ms2rescore"]["feature_generators"]["ionmob"] = {} + + elif key == "rescoring_engine": + # Reset rescoring engine dict we want to allow only computing features + config["ms2rescore"]["rescoring_engine"] = {} + if value == "mokapot": + config["ms2rescore"]["rescoring_engine"]["mokapot"] = { + "write_weights": True, + "write_txt": False, + "write_flashlfq": False, + "rng": kwargs["rng"], + "max_workers": kwargs["processes"], + } + if value == "percolator": + logging.info( + "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step." + ) + continue + + else: + config["ms2rescore"][key] = value + + return config + + +def rescore_idxml(input_file, output_file, config) -> None: + """Rescore PSMs in an idXML file and keep other information unchanged.""" + # Read PSMs + reader = IdXMLReader(input_file) + psm_list = reader.read_file() + + # Rescore + rescore(config, psm_list) + + # Filter out PeptideHits within PeptideIdentification(s) that could not be processed by all feature generators + peptide_ids_filtered = filter_out_artifact_psms(psm_list, reader.peptide_ids) + + # Write + writer = IdXMLWriter(output_file, reader.protein_ids, peptide_ids_filtered) + writer.write_file(psm_list) + + +def filter_out_artifact_psms( + psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification] +) -> List[oms.PeptideIdentification]: + """Filter out PeptideHits that could not be processed by all feature generators""" + num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list]) + new_psm_list = PSMList(psm_list=[psm for psm in psm_list if len(psm.rescoring_features) == num_mandatory_features]) + + # get differing peptidoforms of both psm lists + psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in psm_list]) + new_psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in new_psm_list]) + not_supported_peptides = psm_list_peptides - new_psm_list_peptides + + # no need to filter if all peptides are supported + if len(not_supported_peptides) == 0: + return peptide_ids + # Create new peptide ids and filter out not supported peptides + new_peptide_ids = [] + for peptide_id in peptide_ids: + new_hits = [] + for hit in peptide_id.getHits(): + if hit.getSequence().toString() in not_supported_peptides: + continue + new_hits.append(hit) + if len(new_hits) == 0: + continue + peptide_id.setHits(new_hits) + new_peptide_ids.append(peptide_id) + logging.info( + f"Removed {len(psm_list_peptides) - len(new_psm_list_peptides)} PSMs. Peptides not supported: {not_supported_peptides}" + ) + return new_peptide_ids + + +@click.command() +@click.option( + "-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True +) +@click.option( + "-s", + "--spectrum_path", + help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)", + required=True, +) +@click.option( + "-o", "--output_path", help="Path and stem for output file names (default: derive from identification file)" +) +@click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info") +@click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=16) +@click.option("-f", "--fasta_file", help="Path to FASTA file") +@click.option( + "-fg", + "--feature_generators", + help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See ms2rescore doc for further information", + default="", +) +@click.option("-pipm", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", type=str, default="Immuno-HCD") +@click.option( + "-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da](default: `0.02`)", type=float, default=0.02 +) +@click.option( + "-cs", + "--calibration_set_size", + help="Percentage of number of calibration set for DeepLC (default: `0.15`)", + default=0.15, +) +@click.option("-re", "--rescoring_engine", help="Either mokapot or percolator (default: `mokapot`)", default="mokapot") +@click.option( + "-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711 +) +@click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `DECOY_`)", default="^DECOY_") +@click.option( + "-lsb", + "--lower_score_is_better", + help="Interpretation of primary search engine score (default: True)", + default=True, +) +def main(**kwargs): + config = parse_cli_arguments_to_config(**kwargs) + logging.info("MS²Rescore config:") + logging.info(config) + rescore_idxml(kwargs["psm_file"], kwargs["output_path"], config) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/psm_conversion.py b/bin/psm_conversion.py index c122a24f..75793d0c 100644 --- a/bin/psm_conversion.py +++ b/bin/psm_conversion.py @@ -1,4 +1,5 @@ #!/usr/bin/env python + import numpy as np import pyopenms as oms import pandas as pd @@ -63,8 +64,8 @@ def convert_psm(idxml, spectra_file, export_decoy_psm): if isinstance(spectra_df, pd.DataFrame): spectra = spectra_df[spectra_df["scan"] == scan_number] - mz_array = spectra["mz"].values[0] - intensity_array = spectra["intensity"].values[0] + mz_array = spectra["mz"].values + intensity_array = spectra["intensity"].values num_peaks = len(mz_array) for hit in peptide_id.getHits(): diff --git a/conf/modules.config b/conf/modules.config index c220be03..c1d1fc5b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -270,4 +270,18 @@ process { ] } + withName: 'MS2RESCORE' { + ext.args = [ + "--ms2_tolerance ${2 * params.fragment_mass_tolerance}", + "--ms2pip_model ${params.ms2pip_model}", + "--rescoring_engine ${params.posterior_probabilities}", + params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : '' + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } diff --git a/modules/local/ms2rescore/main.nf b/modules/local/ms2rescore/main.nf new file mode 100644 index 00000000..a1e049a7 --- /dev/null +++ b/modules/local/ms2rescore/main.nf @@ -0,0 +1,55 @@ +process MS2RESCORE { + tag "$meta.mzml_id" + label 'process_high' + + conda "bioconda::ms2rescore=3.0.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.1--pyhdfd78af_2': + 'biocontainers/ms2rescore:3.0.1--pyhdfd78af_2' }" + + // userEmulation settings when docker is specified + containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : '' + + input: + tuple val(meta), path(idxml), path(mzml) + + output: + tuple val(meta), path("*ms2rescore.idXML") , emit: idxml + tuple val(meta), path("*feature_names.tsv"), emit: feature_names + tuple val(meta), path("*.html" ) , optional:true, emit: html + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.mzml_id}_ms2rescore" + + """ + ms2rescore_cli.py \\ + --psm_file $idxml \\ + --spectrum_path . \\ + --output_path ${idxml.baseName}_ms2rescore.idXML \\ + --processes $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' )) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_ms2rescore" + + """ + touch ${prefix}.idXML + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' )) + END_VERSIONS + """ +} diff --git a/modules/local/ms2rescore/meta.yml b/modules/local/ms2rescore/meta.yml new file mode 100644 index 00000000..887c3a04 --- /dev/null +++ b/modules/local/ms2rescore/meta.yml @@ -0,0 +1,38 @@ +name: MS2RESCORE +description: A module to perform MS2 rescoring step +keywords: + - MS2 + - rescoring +tools: + - custom: + description: | + A custom module for MS2 rescoring. + homepage: https://github.com/bigbio/quantms + documentation: https://github.com/bigbio/quantms/tree/readthedocs +input: + - idxml_file: + type: file + description: idXML identification file + pattern: "*.idXML" + - mzml: + type: file + description: spectrum data file + pattern: "*.mzML" + - meta: + type: map + description: Groovy Map containing sample information +output: + - idxml: + type: file + description: idXML identification file after MS2 rescoring + pattern: "*.idXML" + - feature_names: + type: file + description: File containing feature names + pattern: "*feature_names.tsv" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/openms/extractpsmfeatures/main.nf b/modules/local/openms/extractpsmfeatures/main.nf index a9ff73b1..b430fc2c 100644 --- a/modules/local/openms/extractpsmfeatures/main.nf +++ b/modules/local/openms/extractpsmfeatures/main.nf @@ -10,7 +10,7 @@ process EXTRACTPSMFEATURES { 'biocontainers/openms-thirdparty:3.1.0--h9ee0642_1' }" input: - tuple val(meta), path(id_file) + tuple val(meta), path(id_file), path(extra_feat) output: tuple val(meta), path("${id_file.baseName}_feat.idXML"), emit: id_files_feat @@ -20,12 +20,14 @@ process EXTRACTPSMFEATURES { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}" + def feature = (params.ms2rescore == true) && (params.id_only == true) ? "-extra ${extra_feat}" : "" """ PSMFeatureExtractor \\ -in ${id_file} \\ -out ${id_file.baseName}_feat.idXML \\ -threads $task.cpus \\ + -extra ${feature} \\ $args \\ 2>&1 | tee ${id_file.baseName}_extract_psm_feature.log diff --git a/nextflow.config b/nextflow.config index 8cd10ce8..05d9436f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -101,6 +101,12 @@ params { min_fr_mz = null max_fr_mz = null + // MSRESCORE flags + ms2rescore = false + ms2pip_model = 'HCD2021' + feature_generators = 'deeplc,ms2pip' + deeplc_calibration_set_size = 0.15 + // PeptideIndexer flags IL_equivalent = true unmatched_action = "warn" diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index c98b1c55..119ceefa 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -8,6 +8,7 @@ include { PERCOLATOR } from '../../modules/local/openms/thirdparty/perco include { FALSEDISCOVERYRATE as FDRIDPEP } from '../../modules/local/openms/falsediscoveryrate/main' include { IDPEP } from '../../modules/local/openms/idpep/main' include { PSMCONVERSION } from '../../modules/local/extract_psm/main' +include { MS2RESCORE } from '../../modules/local/ms2rescore/main' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -39,7 +40,7 @@ workflow DDA_ID { sage: filename.name.contains('sage') return [meta, filename] nosage: true - return [meta, filename] + return [meta, filename, []] }.set{ch_id_files_branched} @@ -48,9 +49,18 @@ workflow DDA_ID { // if (params.skip_rescoring == false) { if (params.posterior_probabilities == 'percolator') { - EXTRACTPSMFEATURES(ch_id_files_branched.nosage) - ch_id_files_feats = ch_id_files_branched.sage.mix(EXTRACTPSMFEATURES.out.id_files_feat) - ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) + if (params.ms2rescore == true) { + MS2RESCORE(ch_id_files.combine(ch_file_preparation_results, by: 0)) + ch_software_versions = ch_software_versions.mix(MS2RESCORE.out.versions) + EXTRACTPSMFEATURES(MS2RESCORE.out.idxml.join(MS2RESCORE.out.feature_names)) + ch_id_files_feats = EXTRACTPSMFEATURES.out.id_files_feat + ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) + } else { + EXTRACTPSMFEATURES(ch_id_files_branched.nosage) + ch_id_files_feats = ch_id_files_branched.sage.mix(EXTRACTPSMFEATURES.out.id_files_feat) + ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) + } + PERCOLATOR(ch_id_files_feats) ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version) ch_consensus_input = PERCOLATOR.out.id_files_perc @@ -90,8 +100,6 @@ workflow DDA_ID { // // Extract PSMs and export parquet format // - ch_spectrum_data.view() - PSMFDRCONTROL.out.id_filtered.view() PSMCONVERSION(PSMFDRCONTROL.out.id_filtered.combine(ch_spectrum_data, by: 0)) } else { diff --git a/subworkflows/local/psmrescoring.nf b/subworkflows/local/psmrescoring.nf index 7bc89d98..9f243c87 100644 --- a/subworkflows/local/psmrescoring.nf +++ b/subworkflows/local/psmrescoring.nf @@ -21,7 +21,7 @@ workflow PSMRESCORING { sage: filename.name.contains('sage') return [meta, filename] nosage: true - return [meta, filename] + return [meta, filename, []] }.set{ch_id_files_branched} EXTRACTPSMFEATURES(ch_id_files_branched.nosage) ch_id_files_feats = ch_id_files_branched.sage.mix(EXTRACTPSMFEATURES.out.id_files_feat) From 192ad038a2654326e6599b32a1379a75ce559bfa Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 24 Mar 2024 18:24:24 +0800 Subject: [PATCH 02/19] fixed --- nextflow.config | 1 - nextflow_schema.json | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 05d9436f..876659e4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -105,7 +105,6 @@ params { ms2rescore = false ms2pip_model = 'HCD2021' feature_generators = 'deeplc,ms2pip' - deeplc_calibration_set_size = 0.15 // PeptideIndexer flags IL_equivalent = true diff --git a/nextflow_schema.json b/nextflow_schema.json index 8375cfa3..0248a2ce 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -434,6 +434,24 @@ "default": false, "fa_icon": "far fa-check-square" }, + "ms2rescore": { + "type": "boolean", + "description": "Whether performing peptide identification rescoring with LC-MS predictors such as MS²PIP and DeepLC.", + "default": false, + "fa_icon": "far fa-check-square" + }, + "ms2pip_model": { + "type": "string", + "description": "Which deep learning model to generate feature.", + "fa_icon": "fas fa-font", + "default": "HCD2021" + }, + "feature_generators": { + "type": "string", + "description": "Which feature generator to generate feature.", + "fa_icon": "fas fa-font", + "default": "deeplc,ms2pip" + }, "posterior_probabilities": { "type": "string", "description": "How to calculate posterior probabilities for PSMs:\n\n* 'percolator' = Re-score based on PSM-feature-based SVM and transform distance\n to hyperplane for posteriors\n* 'fit_distributions' = Fit positive and negative distributions to scores\n (similar to PeptideProphet)", From f4e3919e4b60d33249981dc0252dabda12142d88 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:17:38 +0800 Subject: [PATCH 03/19] add mokapot --- conf/modules.config | 16 +++++++++++++++- modules/local/ms2rescore/main.nf | 20 ++++++++++++++++---- modules/local/openms/consensusid/main.nf | 2 +- nextflow.config | 1 + nextflow_schema.json | 10 ++++++++-- subworkflows/local/dda_id.nf | 12 ++++++++---- workflows/quantms.nf | 6 +++++- 7 files changed, 54 insertions(+), 13 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c1d1fc5b..4480740f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -270,11 +270,12 @@ process { ] } + // ID ONLY withName: 'MS2RESCORE' { ext.args = [ - "--ms2_tolerance ${2 * params.fragment_mass_tolerance}", "--ms2pip_model ${params.ms2pip_model}", "--rescoring_engine ${params.posterior_probabilities}", + "--calibration_set_size ${params.calibration_set_size}", params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : '' ].join(' ').trim() publishDir = [ @@ -284,4 +285,17 @@ process { ] } + withName: '.*:DDA_ID:IDSCORESWITCHER' { + ext.args = [ + "-new_score_orientation lower_better", + "-new_score_type \"Posterior Error Probability\"", + "-debug $params.idscoreswitcher_debug" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/idscoreswitcherforluciphor" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } diff --git a/modules/local/ms2rescore/main.nf b/modules/local/ms2rescore/main.nf index a1e049a7..ffa26a4f 100644 --- a/modules/local/ms2rescore/main.nf +++ b/modules/local/ms2rescore/main.nf @@ -2,10 +2,10 @@ process MS2RESCORE { tag "$meta.mzml_id" label 'process_high' - conda "bioconda::ms2rescore=3.0.1" + conda "bioconda::ms2rescore=3.0.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.1--pyhdfd78af_2': - 'biocontainers/ms2rescore:3.0.1--pyhdfd78af_2' }" + 'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.2--pyhdfd78af_0': + 'biocontainers/ms2rescore:3.0.2--pyhdfd78af_0' }" // userEmulation settings when docker is specified containerOptions = (workflow.containerEngine == 'docker') ? '-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : '' @@ -18,6 +18,7 @@ process MS2RESCORE { tuple val(meta), path("*feature_names.tsv"), emit: feature_names tuple val(meta), path("*.html" ) , optional:true, emit: html path "versions.yml" , emit: versions + path "*.log" , emit: log when: task.ext.when == null || task.ext.when @@ -26,13 +27,24 @@ process MS2RESCORE { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}_ms2rescore" + + // ms2rescore only supports Da unit. https://ms2rescore.readthedocs.io/en/v3.0.2/userguide/configuration/ + if (meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('da')) { + ms2_tolerence = 2 * meta['fragmentmasstolerance'] + } else { + log.info "Warning: MS2Rescore only supports Da unit. Set default ms2 tolerance as 0.02!" + ms2_tolerence = 0.02 + } + """ ms2rescore_cli.py \\ --psm_file $idxml \\ --spectrum_path . \\ + --ms2_tolerance $ms2_tolerence \\ --output_path ${idxml.baseName}_ms2rescore.idXML \\ --processes $task.cpus \\ - $args + $args \\ + 2>&1 | tee ${meta.mzml_id}_ms2rescore.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/openms/consensusid/main.nf b/modules/local/openms/consensusid/main.nf index 5264701d..ad8a20f3 100644 --- a/modules/local/openms/consensusid/main.nf +++ b/modules/local/openms/consensusid/main.nf @@ -31,7 +31,7 @@ process CONSENSUSID { -filter:considered_hits $params.consensusid_considered_top_hits \\ -debug $params.consensusid_debug \\ $args \\ - 2>&1 | tee ${meta.id}_consensusID.log + 2>&1 | tee ${meta.mzml_id}_consensusID.log cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 876659e4..904d2f55 100644 --- a/nextflow.config +++ b/nextflow.config @@ -105,6 +105,7 @@ params { ms2rescore = false ms2pip_model = 'HCD2021' feature_generators = 'deeplc,ms2pip' + calibration_set_size = 0.15 // PeptideIndexer flags IL_equivalent = true diff --git a/nextflow_schema.json b/nextflow_schema.json index 0248a2ce..2b35e9f7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -452,12 +452,18 @@ "fa_icon": "fas fa-font", "default": "deeplc,ms2pip" }, + "calibration_set_size": { + "type": "number", + "description": "Percentage of number of calibration set for DeepLC", + "default": 0.15, + "fa_icon": "fas fa-filter" + }, "posterior_probabilities": { "type": "string", - "description": "How to calculate posterior probabilities for PSMs:\n\n* 'percolator' = Re-score based on PSM-feature-based SVM and transform distance\n to hyperplane for posteriors\n* 'fit_distributions' = Fit positive and negative distributions to scores\n (similar to PeptideProphet)", + "description": "How to calculate posterior probabilities for PSMs:\n\n* 'percolator' = Re-score based on PSM-feature-based SVM and transform distance\n to hyperplane for posteriors\n* 'fit_distributions' = Fit positive and negative distributions to scores\n (similar to PeptideProphet)\n\n* 'mokapot' = Re-score based on PSM-feature-based semi-supervised learning algorithm introduced by Percolator", "fa_icon": "fas fa-list-ol", "default": "percolator", - "enum": ["percolator", "fit_distributions"] + "enum": ["percolator", "fit_distributions", "mokapot"] }, "run_fdr_cutoff": { "type": "number", diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index 119ceefa..7dd234a6 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -9,6 +9,7 @@ include { FALSEDISCOVERYRATE as FDRIDPEP } from '../../modules/local/openms/fals include { IDPEP } from '../../modules/local/openms/idpep/main' include { PSMCONVERSION } from '../../modules/local/extract_psm/main' include { MS2RESCORE } from '../../modules/local/ms2rescore/main' +include { IDSCORESWITCHER } from '../../modules/local/openms/idscoreswitcher/main' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -64,10 +65,13 @@ workflow DDA_ID { PERCOLATOR(ch_id_files_feats) ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version) ch_consensus_input = PERCOLATOR.out.id_files_perc - } - - - if (params.posterior_probabilities != 'percolator') { + } else if (params.posterior_probabilities == 'mokapot') { + MS2RESCORE(ch_id_files.combine(ch_file_preparation_results, by: 0)) + ch_software_versions = ch_software_versions.mix(MS2RESCORE.out.versions) + IDSCORESWITCHER(MS2RESCORE.out.idxml.combine(Channel.value("PEP"))) + ch_software_versions = ch_software_versions.mix(IDSCORESWITCHER.out.version) + ch_consensus_input = IDSCORESWITCHER.out.id_score_switcher.combine(Channel.value("MS:1001491")) + } else { ch_fdridpep = Channel.empty() if (params.search_engines.split(",").size() == 1) { FDRIDPEP(ch_id_files) diff --git a/workflows/quantms.nf b/workflows/quantms.nf index e0772204..f61d6d62 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -104,7 +104,11 @@ workflow QUANTMS { ch_versions = ch_versions.mix(DECOYDATABASE.out.version.ifEmpty(null)) } - if (params.id_only) { + // This rescoring engine currently only is supported in id_only subworkflows via ms2rescore. + if (params.id_only | params.posterior_probabilities == "mokapot") { + if (params.id_only == false) { + log.warn "The mokapot rescoring engine currently only is supported in id_only subworkflow via ms2rescore." + } DDA_ID( FILE_PREPARATION.out.results, ch_searchengine_in_db, FILE_PREPARATION.out.spectrum_data) ch_versions = ch_versions.mix(DDA_ID.out.version.ifEmpty(null)) } else { From 18d0571863082ffefa3c3eb43b97c2e32ca42118 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:20:41 +0800 Subject: [PATCH 04/19] Update modules.config --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 4480740f..4778ada8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -285,8 +285,8 @@ process { ] } - withName: '.*:DDA_ID:IDSCORESWITCHER' { - ext.args = [ + withName: '.*:DDA_ID:IDSCORESWITCHER' { + ext.args = [ "-new_score_orientation lower_better", "-new_score_type \"Posterior Error Probability\"", "-debug $params.idscoreswitcher_debug" From c7b415e0d2ab1d88a68599a90e905f93514ed80e Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:22:24 +0800 Subject: [PATCH 05/19] Update modules.config --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 4778ada8..a1065abc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -296,6 +296,6 @@ process { mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - } + } } From fe840a999ca299b043a7c20c0acd9fac5566aa54 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 31 Mar 2024 13:13:10 +0800 Subject: [PATCH 06/19] fixed --- bin/ms2rescore_cli.py | 2 ++ bin/psm_conversion.py | 3 +++ conf/modules.config | 15 +++++++++++++++ modules/local/ms2rescore/main.nf | 4 ++-- subworkflows/local/dda_id.nf | 12 ++++++++++-- 5 files changed, 32 insertions(+), 4 deletions(-) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py index 1298dd38..61e517e9 100644 --- a/bin/ms2rescore_cli.py +++ b/bin/ms2rescore_cli.py @@ -56,6 +56,7 @@ def parse_cli_arguments_to_config(**kwargs): "write_txt": False, "write_flashlfq": False, "rng": kwargs["rng"], + "test_fdr": kwargs["test_fdr"], "max_workers": kwargs["processes"], } if value == "percolator": @@ -136,6 +137,7 @@ def filter_out_artifact_psms( @click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info") @click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=16) @click.option("-f", "--fasta_file", help="Path to FASTA file") +@click.option("-t", "--test_fdr", help="The false-discovery rate threshold at which to evaluate the learned models. (default: 0.05)", default=0.05) @click.option( "-fg", "--feature_generators", diff --git a/bin/psm_conversion.py b/bin/psm_conversion.py index 75793d0c..767859a0 100644 --- a/bin/psm_conversion.py +++ b/bin/psm_conversion.py @@ -85,6 +85,9 @@ def convert_psm(idxml, spectra_file, export_decoy_psm): elif search_engines == "Sage": id_scores = ["Sage:hyperscore: " + str(hit.getScore())] + if hit.metaValueExists("MS:1001491"): + global_qvalue = hit.getMetaValue("MS:1001491") + charge = hit.getCharge() peptidoform = hit.getSequence().toString() modifications = mods_position(peptidoform) diff --git a/conf/modules.config b/conf/modules.config index a1065abc..b5a05b30 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -276,6 +276,7 @@ process { "--ms2pip_model ${params.ms2pip_model}", "--rescoring_engine ${params.posterior_probabilities}", "--calibration_set_size ${params.calibration_set_size}", + "--test_fdr ${params.test_FDR}", params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : '' ].join(' ').trim() publishDir = [ @@ -298,4 +299,18 @@ process { ] } + withName: '.*:DDA_ID:PSMFDRCONTROL:IDSCORESWITCHER' { + ext.args = [ + "-new_score_orientation lower_better", + "-old_score \"Posterior Error Probability\"", + "-new_score_type q-value", + "-debug $params.idscoreswitcher_debug" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/idscoreswitcher" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } diff --git a/modules/local/ms2rescore/main.nf b/modules/local/ms2rescore/main.nf index ffa26a4f..6667337a 100644 --- a/modules/local/ms2rescore/main.nf +++ b/modules/local/ms2rescore/main.nf @@ -30,7 +30,7 @@ process MS2RESCORE { // ms2rescore only supports Da unit. https://ms2rescore.readthedocs.io/en/v3.0.2/userguide/configuration/ if (meta['fragmentmasstoleranceunit'].toLowerCase().endsWith('da')) { - ms2_tolerence = 2 * meta['fragmentmasstolerance'] + ms2_tolerence = meta['fragmentmasstolerance'] } else { log.info "Warning: MS2Rescore only supports Da unit. Set default ms2 tolerance as 0.02!" ms2_tolerence = 0.02 @@ -54,7 +54,7 @@ process MS2RESCORE { stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}_ms2rescore" + def prefix = task.ext.prefix ?: "${meta.mzml_id}_ms2rescore" """ touch ${prefix}.idXML diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index 7dd234a6..3a595735 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -53,8 +53,16 @@ workflow DDA_ID { if (params.ms2rescore == true) { MS2RESCORE(ch_id_files.combine(ch_file_preparation_results, by: 0)) ch_software_versions = ch_software_versions.mix(MS2RESCORE.out.versions) - EXTRACTPSMFEATURES(MS2RESCORE.out.idxml.join(MS2RESCORE.out.feature_names)) - ch_id_files_feats = EXTRACTPSMFEATURES.out.id_files_feat + + MS2RESCORE.out.idxml.join(MS2RESCORE.out.feature_names).branch{ meta, idxml, feature_name -> + sage: idxml.name.contains('sage') + return [meta, idxml] + nosage: true + return [meta, idxml, feature_name] + }.set{ch_ms2rescore_branched} + + EXTRACTPSMFEATURES(ch_ms2rescore_branched.nosage) + ch_id_files_feats = EXTRACTPSMFEATURES.out.id_files_feat.mix(ch_ms2rescore_branched.sage) ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) } else { EXTRACTPSMFEATURES(ch_id_files_branched.nosage) From 6bc25fe00a316d23b36b72cd65cff475d5f64fcf Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 28 Apr 2024 10:35:55 +0800 Subject: [PATCH 07/19] add rescore range parameter --- bin/extract_sample.py | 51 +++++++++++ nextflow.config | 1 + nextflow_schema.json | 7 ++ subworkflows/local/dda_id.nf | 158 +++++++++++++++++++++++++++++++++-- workflows/quantms.nf | 2 +- 5 files changed, 213 insertions(+), 6 deletions(-) create mode 100644 bin/extract_sample.py diff --git a/bin/extract_sample.py b/bin/extract_sample.py new file mode 100644 index 00000000..0fb2c2bd --- /dev/null +++ b/bin/extract_sample.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import argparse +import errno +import os +import sys +from pathlib import Path +import pandas as pd + + +def parse_args(args=None): + Description = "Extract sample information from an experiment design file" + Epilog = "Example usage: python extract_sample.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("EXP", help="Expdesign file to be extracted") + return parser.parse_args(args) + + +def extract_sample(expdesign): + data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str) + fTable = data.dropna() + + # two table format + with open(expdesign, "r") as f: + lines = f.readlines() + empty_row = lines.index("\n") + s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1:]][1:] + s_header = lines[empty_row + 1].replace("\n", "").split("\t") + s_DataFrame = pd.DataFrame(s_table, columns=s_header) + + sample_dt = pd.DataFrame() + if "MSstats_Mixture" not in s_DataFrame.columns: + fTable = fTable[["Spectra_Filepath", "Sample"]] + fTable.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) + else: + fTable.drop_duplicates(subset=["Spectra_Filepath"], inplace=True) + for _, row in fTable.iterrows(): + mixture_id = s_DataFrame[s_DataFrame["Sample"] == row["Sample"]]["MSstats_Mixture"] + sample_dt = sample_dt.append({"Spectra_Filepath": row["Spectra_Filepath"], "Sample": mixture_id}, + ignore_index=True) + sample_dt.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False) + + +def main(args=None): + args = parse_args(args) + extract_sample(args.EXP) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/nextflow.config b/nextflow.config index 32a1584b..fb41a50c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -105,6 +105,7 @@ params { // MSRESCORE flags ms2rescore = false + rescore_range = 'independent_run' ms2pip_model = 'HCD2021' feature_generators = 'deeplc,ms2pip' calibration_set_size = 0.15 diff --git a/nextflow_schema.json b/nextflow_schema.json index f736fd51..de62b9bc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -447,6 +447,13 @@ "default": false, "fa_icon": "far fa-check-square" }, + "rescore_range": { + "type": "string", + "description": "Rescoring for independent run, Sample or whole experiments", + "fa_icon": "fas fa-font", + "default": "independent_run", + "enum": ["independent_run", "by_sample", "by_project"] + }, "ms2pip_model": { "type": "string", "description": "Which deep learning model to generate feature.", diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index 3a595735..ac43e570 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -5,11 +5,14 @@ include { DECOYDATABASE } from '../../modules/local/openms/decoydatabase/main' include { CONSENSUSID } from '../../modules/local/openms/consensusid/main' include { EXTRACTPSMFEATURES } from '../../modules/local/openms/extractpsmfeatures/main' include { PERCOLATOR } from '../../modules/local/openms/thirdparty/percolator/main' +include { IDMERGER } from '../../modules/local/openms/idmerger/main' +include { IDRIPPER } from '../../modules/local/openms/idripper/main' include { FALSEDISCOVERYRATE as FDRIDPEP } from '../../modules/local/openms/falsediscoveryrate/main' include { IDPEP } from '../../modules/local/openms/idpep/main' include { PSMCONVERSION } from '../../modules/local/extract_psm/main' include { MS2RESCORE } from '../../modules/local/ms2rescore/main' include { IDSCORESWITCHER } from '../../modules/local/openms/idscoreswitcher/main' +include { EXTRACT_SAMPLE } from '../../modules/local/extract_sample/main' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -22,6 +25,7 @@ workflow DDA_ID { ch_file_preparation_results ch_database_wdecoy ch_spectrum_data + ch_expdesign main: @@ -70,9 +74,77 @@ workflow DDA_ID { ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version) } - PERCOLATOR(ch_id_files_feats) - ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version) - ch_consensus_input = PERCOLATOR.out.id_files_perc + // Rescoring for independent run, Sample or whole experiments + if (params.rescore_range == "independent_run") { + PERCOLATOR(ch_id_files_feats) + ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version) + ch_consensus_input = PERCOLATOR.out.id_files_perc + } else if (params.rescore_range == "by_sample") { + // Sample map + EXTRACT_SAMPLE(ch_expdesign) + ch_expdesign_sample = EXTRACT_SAMPLE.out.ch_expdesign_sample + ch_expdesign_sample.splitCsv(header: true, sep: '\t') + .map { get_sample_map(it) }.set{ sample_map_idv } + + sample_map = sample_map_idv.collect().map{ all_sample_map( it ) } + + // Group by search_engines and convert meta + ch_id_files_feats.combine( sample_map ).branch{ meta, filename, sample_map -> + sage: filename.name.contains('sage') + return [comvert_exp_meta(meta, "sample_id", filename, sample_map), filename] + msgf: filename.name.contains('msgf') + return [comvert_exp_meta(meta, "sample_id", filename, sample_map), filename] + comet: filename.name.contains('comet') + return [comvert_exp_meta(meta, "sample_id", filename, sample_map), filename] + }.set{ch_id_files_feat_branched} + + // IDMERGER for samples group + IDMERGER(ch_id_files_feat_branched.comet.groupTuple(by: 0) + .mix(ch_id_files_feat_branched.msgf.groupTuple(by: 0)) + .mix(ch_id_files_feat_branched.sage.groupTuple(by: 0))) + ch_software_versions = ch_software_versions.mix(IDMERGER.out.version) + + PERCOLATOR(IDMERGER.out.id_merged) + ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version) + + // Currently only ID runs on exactly one mzML file are supported in CONSENSUSID. Split idXML by runs + IDRIPPER(PERCOLATOR.out.id_files_perc) + IDRIPPER.out.meta.first().combine(IDRIPPER.out.id_rippers.flatten()) + .map{ [comvert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } + .set{ ch_consensus_input } + ch_consensus_input.view() + ch_software_versions = ch_software_versions.mix(IDRIPPER.out.version) + + } else if (params.rescore_range == "by_project"){ + // Split ch_id_files_feats by search_engines + ch_id_files_feats.branch{ meta, filename -> + sage: filename.name.contains('sage') + return [comvert_exp_meta(meta, "experiment_id", filename, ""), filename] + msgf: filename.name.contains('msgf') + return [comvert_exp_meta(meta, "experiment_id", filename, ""), filename] + comet: filename.name.contains('comet') + return [comvert_exp_meta(meta, "experiment_id", filename, ""), filename] + }.set{ch_id_files_feat_branched} + + // IDMERGER for whole experiments + IDMERGER(ch_id_files_feat_branched.comet.groupTuple(by: 0) + .mix(ch_id_files_feat_branched.msgf.groupTuple(by: 0)) + .mix(ch_id_files_feat_branched.sage.groupTuple(by: 0))) + ch_software_versions = ch_software_versions.mix(IDMERGER.out.version) + + PERCOLATOR(IDMERGER.out.id_merged) + ch_software_versions = ch_software_versions.mix(PERCOLATOR.out.version) + + // Currently only ID runs on exactly one mzML file are supported in CONSENSUSID. Split idXML by runs + IDRIPPER(PERCOLATOR.out.id_files_perc) + IDRIPPER.out.meta.first().combine(IDRIPPER.out.id_rippers.flatten()) + .map{ [comvert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } + .set{ ch_consensus_input } + ch_software_versions = ch_software_versions.mix(IDRIPPER.out.version) + + } + + } else if (params.posterior_probabilities == 'mokapot') { MS2RESCORE(ch_id_files.combine(ch_file_preparation_results, by: 0)) ch_software_versions = ch_software_versions.mix(MS2RESCORE.out.versions) @@ -109,9 +181,8 @@ workflow DDA_ID { PSMFDRCONTROL(ch_psmfdrcontrol) ch_software_versions = ch_software_versions.mix(PSMFDRCONTROL.out.version.ifEmpty(null)) - // + // Extract PSMs and export parquet format - // PSMCONVERSION(PSMFDRCONTROL.out.id_filtered.combine(ch_spectrum_data, by: 0)) } else { @@ -122,3 +193,80 @@ workflow DDA_ID { emit: version = ch_software_versions } + +// Function to group by mzML/sample/experiment +def comvert_exp_meta(Map meta, value, file_name, sample_map) { + def exp_meta = [:] + + if (value == "experiment_id") { + exp_meta.mzml_id = meta.experiment_id + } else if (value == "mzml_id") { + position = file(file_name).name.lastIndexOf('_sage_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_comet_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_msgf_perc.idXML') + } + } + exp_meta.mzml_id = file(file_name).name.take(position) + } else if (value == "sample_id") { + tag = file(file_name).name.lastIndexOf('_perc.idXML') + if (tag == -1) { + position = file(file_name).name.lastIndexOf('_sage.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_comet_feat.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_msgf_feat.idXML') + } + } + } else { + position = file(file_name).name.lastIndexOf('_sage_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_comet_perc.idXML') + if (position == -1) { + position = file(file_name).name.lastIndexOf('_msgf_perc.idXML') + } + } + } + + file_name = file(file_name).name.take(position) + exp_meta.mzml_id = sample_map[file_name] + } + + + exp_meta.experiment_id = meta.experiment_id + exp_meta.labelling_type = meta.labelling_type + exp_meta.dissociationmethod = meta.dissociationmethod + exp_meta.fixedmodifications = meta.fixedmodifications + exp_meta.variablemodifications = meta.variablemodifications + exp_meta.precursormasstolerance = meta.precursormasstolerance + exp_meta.precursormasstoleranceunit = meta.precursormasstoleranceunit + exp_meta.fragmentmasstolerance = meta.fragmentmasstolerance + exp_meta.fragmentmasstoleranceunit = meta.fragmentmasstoleranceunit + exp_meta.enzyme = meta.enzyme + exp_meta.acquisition_method = meta.acquisition_method + print(exp_meta) + return exp_meta +} + +// Function to get sample map +def get_sample_map(LinkedHashMap row) { + def sample_map = [:] + + filestr = row.Spectra_Filepath + file_name = file(filestr).name.take(file(filestr).name.lastIndexOf('.')) + sample = row.Sample + sample_map[file_name] = sample + + return sample_map + +} + +def all_sample_map(sample_list) { + res = [:] + sample_list.each { + res = res + it + } + + return res +} diff --git a/workflows/quantms.nf b/workflows/quantms.nf index f61d6d62..82c9a870 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -109,7 +109,7 @@ workflow QUANTMS { if (params.id_only == false) { log.warn "The mokapot rescoring engine currently only is supported in id_only subworkflow via ms2rescore." } - DDA_ID( FILE_PREPARATION.out.results, ch_searchengine_in_db, FILE_PREPARATION.out.spectrum_data) + DDA_ID( FILE_PREPARATION.out.results, ch_searchengine_in_db, FILE_PREPARATION.out.spectrum_data, CREATE_INPUT_CHANNEL.out.ch_expdesign) ch_versions = ch_versions.mix(DDA_ID.out.version.ifEmpty(null)) } else { TMT(ch_fileprep_result.iso, CREATE_INPUT_CHANNEL.out.ch_expdesign, ch_searchengine_in_db) From a877cee97dd08662d8ab8ab8570baac9f7498131 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 28 Apr 2024 10:39:09 +0800 Subject: [PATCH 08/19] Update dda_id.nf --- subworkflows/local/dda_id.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index ac43e570..983decd7 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -89,7 +89,7 @@ workflow DDA_ID { sample_map = sample_map_idv.collect().map{ all_sample_map( it ) } // Group by search_engines and convert meta - ch_id_files_feats.combine( sample_map ).branch{ meta, filename, sample_map -> + ch_id_files_feats.combine( sample_map ).branch{ meta, filename, sample_map -> sage: filename.name.contains('sage') return [comvert_exp_meta(meta, "sample_id", filename, sample_map), filename] msgf: filename.name.contains('msgf') From 434a60d585e68f539f23435e8a35dd120fc59f98 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 28 Apr 2024 10:56:00 +0800 Subject: [PATCH 09/19] add new modules --- modules/local/extract_sample/main.nf | 31 +++++++++++ modules/local/extract_sample/meta.yml | 27 +++++++++ modules/local/openms/idmerger/main.nf | 55 +++++++++++++++++++ modules/local/openms/idmerger/meta.yml | 33 +++++++++++ modules/local/openms/idripper/main.nf | 53 ++++++++++++++++++ modules/local/openms/idripper/meta.yml | 33 +++++++++++ .../openms/thirdparty/percolator/main.nf | 2 +- subworkflows/local/dda_id.nf | 4 +- 8 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 modules/local/extract_sample/main.nf create mode 100644 modules/local/extract_sample/meta.yml create mode 100644 modules/local/openms/idmerger/main.nf create mode 100644 modules/local/openms/idmerger/meta.yml create mode 100644 modules/local/openms/idripper/main.nf create mode 100644 modules/local/openms/idripper/meta.yml diff --git a/modules/local/extract_sample/main.nf b/modules/local/extract_sample/main.nf new file mode 100644 index 00000000..6a7210f4 --- /dev/null +++ b/modules/local/extract_sample/main.nf @@ -0,0 +1,31 @@ +process GETSAMPLE { + tag "$design.Name" + label 'process_low' + + conda "bioconda::sdrf-pipelines=0.0.25" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.25--pyhdfd78af_0" + } else { + container "biocontainers/sdrf-pipelines:0.0.25--pyhdfd78af_0" + } + + input: + path design + + output: + path "*_sample.csv", emit: ch_expdesign_sample + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + extract_sample.py "${design}" 2>&1 | tee extract_sample.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sdrf-pipelines: \$(parse_sdrf --version 2>&1 | awk -F ' ' '{print \$2}') + END_VERSIONS + """ +} diff --git a/modules/local/extract_sample/meta.yml b/modules/local/extract_sample/meta.yml new file mode 100644 index 00000000..548484ae --- /dev/null +++ b/modules/local/extract_sample/meta.yml @@ -0,0 +1,27 @@ +name: GETSAMPLE +description: A module to extract sample information from experimental design file +keywords: + - sample + - conversion +tools: + - custom: + description: | + A custom module for sample extraction. + homepage: https://github.com/bigbio/quantms + documentation: https://github.com/bigbio/quantms/tree/readthedocs +input: + - design: + type: file + description: experimental design file + pattern: "*.csv" +output: + - ch_expdesign_sample: + type: file + description: sample csv file + pattern: "*_sample.csv" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/openms/idmerger/main.nf b/modules/local/openms/idmerger/main.nf new file mode 100644 index 00000000..9fde67e3 --- /dev/null +++ b/modules/local/openms/idmerger/main.nf @@ -0,0 +1,55 @@ +process IDMERGER { + tag "$meta.mzml_id" + label 'process_medium' + label 'openms' + + conda "bioconda::openms-thirdparty=3.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/openms-thirdparty:3.1.0--h9ee0642_1' : + 'biocontainers/openms-thirdparty:3.1.0--h9ee0642_1' }" + + input: + tuple val(meta), path(id_files) + + output: + tuple val(meta), path("*_merged.idXML"), emit: id_merged + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + if (params.rescore_range == "by_project") { + if (id_files[0].baseName.contains('sage')){ + prefix = "${meta.experiment_id}_sage" + } else if (id_files[0].baseName.contains('comet')){ + prefix = "${meta.experiment_id}_comet" + } else { + prefix = "${meta.experiment_id}_msgf" + } + } else if (params.rescore_range == "by_sample") { + if (id_files[0].baseName.contains('sage')){ + prefix = "${meta.mzml_id}_sage" + } else if (id_files[0].baseName.contains('comet')){ + prefix = "${meta.mzml_id}_comet" + } else { + prefix = "${meta.mzml_id}_msgf" + } + } + + """ + IDMerger \\ + -in ${id_files.join(' ')} \\ + -threads $task.cpus \\ + -out ${prefix}_merged.idXML \\ + -merge_proteins_add_PSMs \\ + $args \\ + 2>&1 | tee ${prefix}_merged.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + IDMerger: \$(IDMerger 2>&1 | grep -E '^Version(.*)' | sed 's/Version: //g' | cut -d ' ' -f 1) + END_VERSIONS + """ +} diff --git a/modules/local/openms/idmerger/meta.yml b/modules/local/openms/idmerger/meta.yml new file mode 100644 index 00000000..94b5caf5 --- /dev/null +++ b/modules/local/openms/idmerger/meta.yml @@ -0,0 +1,33 @@ +name: idmerger +description: Merges several idXML files into one file. +keywords: + - merge + - idXML + - OpenMS +tools: + - IDMerger: + description: | + Merges several idXML files into one file. + homepage: https://www.openms.org/documentation/html/TOPP_IDMerger.html + documentation: https://www.openms.org/documentation/html/TOPP_IDMerger.html +input: + - id_files: + type: file + description: | + Input files separated by blank. + pattern: "*.{idXML}" +output: + - id_merge: + type: file + description: Output Merged file + pattern: "*.{idXML}" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/openms/idripper/main.nf b/modules/local/openms/idripper/main.nf new file mode 100644 index 00000000..61eee8d9 --- /dev/null +++ b/modules/local/openms/idripper/main.nf @@ -0,0 +1,53 @@ +process IDRIPPER { + tag "$meta.mzml_id" + label 'process_medium' + label 'openms' + + conda "bioconda::openms-thirdparty=3.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/openms-thirdparty:3.1.0--h9ee0642_1' : + 'biocontainers/openms-thirdparty:3.1.0--h9ee0642_1' }" + + input: + tuple val(meta), path(id_file), val(qval_score) + + output: + val(meta), emit: meta + path("*.idXML"), emit: id_rippers + val("MS:1001491"), emit: qval_score + path "versions.yml", emit: version + path "*.log", emit: log + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.mzml_id}" + + if (id_file.baseName.contains('sage')){ + pattern = "_sage_perc.idXML" + } else if (id_file.baseName.contains('comet')){ + pattern = "_comet_perc.idXML" + } else { + pattern = "_msgf_perc.idXML" + } + + """ + IDRipper \\ + -in ${id_file} \\ + -threads $task.cpus \\ + -out ./ \\ + -split_ident_runs \\ + $args \\ + 2>&1 | tee ${prefix}_idripper.log + + for i in `ls | grep -v \"_perc.idXML\$\" | grep \".idXML\$\"` + do + mv \$i `ls \"\$i\" |awk -F \".\" \'{print \$1\"${pattern}\"}\'` + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + IDRipper: \$(IDRipper 2>&1 | grep -E '^Version(.*)' | sed 's/Version: //g' | cut -d ' ' -f 1) + END_VERSIONS + """ +} +// diff --git a/modules/local/openms/idripper/meta.yml b/modules/local/openms/idripper/meta.yml new file mode 100644 index 00000000..81cc525f --- /dev/null +++ b/modules/local/openms/idripper/meta.yml @@ -0,0 +1,33 @@ +name: idripper +description: Splits the protein/peptide identifications of an idXML file into several idXML files according their annotated file origin. +keywords: + - split + - idXML + - OpenMS +tools: + - IDMerger: + description: | + IDRipper splits the protein/peptide identifications of an idXML file into several idXML files according their annotated file origin. + homepage: https://www.openms.org/documentation/html/TOPP_IDRipper.html + documentation: https://www.openms.org/documentation/html/TOPP_IDRipper.html +input: + - id_file: + type: file + description: | + Input file, in which the protein/peptide identifications must be tagged with 'file_origin' + pattern: "*.{idXML}" +output: + - id_rippers: + type: file + description: Output split files + pattern: "*.{idXML}" + - log: + type: file + description: log file + pattern: "*.log" + - version: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@daichengxin" diff --git a/modules/local/openms/thirdparty/percolator/main.nf b/modules/local/openms/thirdparty/percolator/main.nf index 7a99dd11..af166e96 100644 --- a/modules/local/openms/thirdparty/percolator/main.nf +++ b/modules/local/openms/thirdparty/percolator/main.nf @@ -12,7 +12,7 @@ process PERCOLATOR { tuple val(meta), path(id_file) output: - tuple val(meta), path("${id_file.baseName}_perc.idXML"), val("MS:1001491"), emit: id_files_perc + tuple val(meta), path("*_perc.idXML"), val("MS:1001491"), emit: id_files_perc path "versions.yml", emit: version path "*.log", emit: log diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index 983decd7..d6b57b55 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -12,7 +12,7 @@ include { IDPEP } from '../../modules/local/openms/idpe include { PSMCONVERSION } from '../../modules/local/extract_psm/main' include { MS2RESCORE } from '../../modules/local/ms2rescore/main' include { IDSCORESWITCHER } from '../../modules/local/openms/idscoreswitcher/main' -include { EXTRACT_SAMPLE } from '../../modules/local/extract_sample/main' +include { GETSAMPLE } from '../../modules/local/extract_sample/main' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -81,7 +81,7 @@ workflow DDA_ID { ch_consensus_input = PERCOLATOR.out.id_files_perc } else if (params.rescore_range == "by_sample") { // Sample map - EXTRACT_SAMPLE(ch_expdesign) + GETSAMPLE(ch_expdesign) ch_expdesign_sample = EXTRACT_SAMPLE.out.ch_expdesign_sample ch_expdesign_sample.splitCsv(header: true, sep: '\t') .map { get_sample_map(it) }.set{ sample_map_idv } From b648f57d023b879c108f4428c94262df0a726708 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 28 Apr 2024 19:41:12 +0100 Subject: [PATCH 10/19] Update subworkflows/local/dda_id.nf Co-authored-by: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> --- subworkflows/local/dda_id.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index d6b57b55..0d6c8c83 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -195,7 +195,7 @@ workflow DDA_ID { } // Function to group by mzML/sample/experiment -def comvert_exp_meta(Map meta, value, file_name, sample_map) { +def convert_exp_meta(Map meta, value, file_name, sample_map) { def exp_meta = [:] if (value == "experiment_id") { From cc01bd226e611cadf423198f994cbc020782f030 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 28 Apr 2024 19:41:19 +0100 Subject: [PATCH 11/19] Update subworkflows/local/dda_id.nf Co-authored-by: Jonas Scheid <43858870+jonasscheid@users.noreply.github.com> --- subworkflows/local/dda_id.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index 0d6c8c83..a4a83c33 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -112,7 +112,6 @@ workflow DDA_ID { IDRIPPER.out.meta.first().combine(IDRIPPER.out.id_rippers.flatten()) .map{ [comvert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } .set{ ch_consensus_input } - ch_consensus_input.view() ch_software_versions = ch_software_versions.mix(IDRIPPER.out.version) } else if (params.rescore_range == "by_project"){ From 4fbbd0c5c9bee113ba95e8b9a1a11e8b16f35bf3 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 6 May 2024 19:18:08 +0800 Subject: [PATCH 12/19] fixed --- conf/modules.config | 7 ++++--- .../local/openms/extractpsmfeatures/main.nf | 8 ++++++-- nextflow.config | 1 + nextflow_schema.json | 7 +++++++ subworkflows/local/dda_id.nf | 18 +++++++++--------- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a168d867..52e477a6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -68,7 +68,8 @@ if (params.add_decoys) { if (params.posterior_probabilities == "percolator") { process { // EXTRACTPSMFEATURE - withName: '.*:ID:PSMRESCORING:EXTRACTPSMFEATURES' { + withName: '.*:EXTRACTPSMFEATURES' { + ext.args = "-debug $params.extractpsmfeature_debug" publishDir = [ path: { "${params.outdir}/extractpsmfeature" }, mode: params.publish_dir_mode, @@ -78,14 +79,14 @@ if (params.posterior_probabilities == "percolator") { } //PERCOLATOR - withName: '.*:ID:PSMRESCORING:PERCOLATOR' { + withName: '.*:PERCOLATOR' { ext.args = "-debug $params.percolator_debug" } } } else { process { // IDPEP - withName: '.*:ID:PSMRESCORING:IDPEP' { + withName: '.*:IDPEP' { ext.args = "-debug $params.idpep_debug" } } diff --git a/modules/local/openms/extractpsmfeatures/main.nf b/modules/local/openms/extractpsmfeatures/main.nf index b430fc2c..503aff55 100644 --- a/modules/local/openms/extractpsmfeatures/main.nf +++ b/modules/local/openms/extractpsmfeatures/main.nf @@ -20,14 +20,18 @@ process EXTRACTPSMFEATURES { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.mzml_id}" - def feature = (params.ms2rescore == true) && (params.id_only == true) ? "-extra ${extra_feat}" : "" + + feature = "" + if (params.ms2rescore && params.id_only) { + feature = "-extra \$(awk 'NR > 1 && \$1 !~ /psm_file/ {printf \"%s \", \$2}' ${extra_feat})" + } """ PSMFeatureExtractor \\ -in ${id_file} \\ -out ${id_file.baseName}_feat.idXML \\ -threads $task.cpus \\ - -extra ${feature} \\ + ${feature} \\ $args \\ 2>&1 | tee ${id_file.baseName}_extract_psm_feature.log diff --git a/nextflow.config b/nextflow.config index fb41a50c..ad30cb2e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -33,6 +33,7 @@ params { // Debug level decoydatabase_debug = 0 pp_debug = 0 + extractpsmfeature_debug = 0 idfilter_debug = 0 idscoreswitcher_debug = 0 iso_debug = 0 diff --git a/nextflow_schema.json b/nextflow_schema.json index de62b9bc..1ad00b40 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -485,6 +485,13 @@ "default": 0.1, "fa_icon": "fas fa-filter" }, + "extractpsmfeature_debug": { + "type": "integer", + "description": "Debug level when running the PSMFeatureExtractor step. Increase for verbose logging", + "fa_icon": "fas fa-bug", + "hidden": true, + "default": 0 + }, "idfilter_debug": { "type": "integer", "description": "Debug level when running the IDFilter step. Increase for verbose logging", diff --git a/subworkflows/local/dda_id.nf b/subworkflows/local/dda_id.nf index a4a83c33..e04f851a 100644 --- a/subworkflows/local/dda_id.nf +++ b/subworkflows/local/dda_id.nf @@ -91,11 +91,11 @@ workflow DDA_ID { // Group by search_engines and convert meta ch_id_files_feats.combine( sample_map ).branch{ meta, filename, sample_map -> sage: filename.name.contains('sage') - return [comvert_exp_meta(meta, "sample_id", filename, sample_map), filename] + return [convert_exp_meta(meta, "sample_id", filename, sample_map), filename] msgf: filename.name.contains('msgf') - return [comvert_exp_meta(meta, "sample_id", filename, sample_map), filename] + return [convert_exp_meta(meta, "sample_id", filename, sample_map), filename] comet: filename.name.contains('comet') - return [comvert_exp_meta(meta, "sample_id", filename, sample_map), filename] + return [convert_exp_meta(meta, "sample_id", filename, sample_map), filename] }.set{ch_id_files_feat_branched} // IDMERGER for samples group @@ -110,7 +110,7 @@ workflow DDA_ID { // Currently only ID runs on exactly one mzML file are supported in CONSENSUSID. Split idXML by runs IDRIPPER(PERCOLATOR.out.id_files_perc) IDRIPPER.out.meta.first().combine(IDRIPPER.out.id_rippers.flatten()) - .map{ [comvert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } + .map{ [convert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } .set{ ch_consensus_input } ch_software_versions = ch_software_versions.mix(IDRIPPER.out.version) @@ -118,11 +118,11 @@ workflow DDA_ID { // Split ch_id_files_feats by search_engines ch_id_files_feats.branch{ meta, filename -> sage: filename.name.contains('sage') - return [comvert_exp_meta(meta, "experiment_id", filename, ""), filename] + return [convert_exp_meta(meta, "experiment_id", filename, ""), filename] msgf: filename.name.contains('msgf') - return [comvert_exp_meta(meta, "experiment_id", filename, ""), filename] + return [convert_exp_meta(meta, "experiment_id", filename, ""), filename] comet: filename.name.contains('comet') - return [comvert_exp_meta(meta, "experiment_id", filename, ""), filename] + return [convert_exp_meta(meta, "experiment_id", filename, ""), filename] }.set{ch_id_files_feat_branched} // IDMERGER for whole experiments @@ -137,7 +137,7 @@ workflow DDA_ID { // Currently only ID runs on exactly one mzML file are supported in CONSENSUSID. Split idXML by runs IDRIPPER(PERCOLATOR.out.id_files_perc) IDRIPPER.out.meta.first().combine(IDRIPPER.out.id_rippers.flatten()) - .map{ [comvert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } + .map{ [convert_exp_meta(it[0], "mzml_id", it[1], ""), it[1], "MS:1001491"] } .set{ ch_consensus_input } ch_software_versions = ch_software_versions.mix(IDRIPPER.out.version) @@ -244,7 +244,7 @@ def convert_exp_meta(Map meta, value, file_name, sample_map) { exp_meta.fragmentmasstoleranceunit = meta.fragmentmasstoleranceunit exp_meta.enzyme = meta.enzyme exp_meta.acquisition_method = meta.acquisition_method - print(exp_meta) + return exp_meta } From 4330b09c32027ec8642ccd00ccb9c25b69ed0d53 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 6 May 2024 19:40:22 +0800 Subject: [PATCH 13/19] Update main.nf --- subworkflows/nf-core/utils_nfcore_pipeline/main.nf | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index a8b55d6f..98e4b691 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -65,9 +65,15 @@ def checkProfileProvided(nextflow_cli_args) { // Citation string for pipeline // def workflowCitation() { + def temp_doi_ref = "" + String[] manifest_doi = workflow.manifest.doi.tokenize(",") + // Using a loop to handle multiple DOIs + // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers + // Removing ` ` since the manifest.doi is a string and not a proper list + for (String doi_ref: manifest_doi) temp_doi_ref += " https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n" return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + - " ${workflow.manifest.doi}\n\n" + + temp_doi_ref + "\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + @@ -437,4 +443,4 @@ def imNotification(summary_params, hook_url) { if (! postRC.equals(200)) { log.warn(post.getErrorStream().getText()); } -} +} \ No newline at end of file From 7cd22afd51140f5ec2f4667a98cea8119fcfe64c Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 6 May 2024 20:21:46 +0800 Subject: [PATCH 14/19] Update ms2rescore_cli.py --- bin/ms2rescore_cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/ms2rescore_cli.py b/bin/ms2rescore_cli.py index 61e517e9..7c607bc4 100644 --- a/bin/ms2rescore_cli.py +++ b/bin/ms2rescore_cli.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# Written by Jonas Scheid under the MIT license import sys import click From 77b2b2b7c6de8b875ed3a23a39914b511fecdfce Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 6 May 2024 20:35:55 +0800 Subject: [PATCH 15/19] Update modules.json --- modules.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules.json b/modules.json index beb0405b..8bd8a9c6 100644 --- a/modules.json +++ b/modules.json @@ -26,7 +26,7 @@ }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { From d3cab9dc08f8abe312230c34949c306c7582ff09 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 6 May 2024 21:07:27 +0800 Subject: [PATCH 16/19] Update main.nf --- subworkflows/nf-core/utils_nfcore_pipeline/main.nf | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index 98e4b691..a8b55d6f 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -65,15 +65,9 @@ def checkProfileProvided(nextflow_cli_args) { // Citation string for pipeline // def workflowCitation() { - def temp_doi_ref = "" - String[] manifest_doi = workflow.manifest.doi.tokenize(",") - // Using a loop to handle multiple DOIs - // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers - // Removing ` ` since the manifest.doi is a string and not a proper list - for (String doi_ref: manifest_doi) temp_doi_ref += " https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n" return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + - temp_doi_ref + "\n" + + " ${workflow.manifest.doi}\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + @@ -443,4 +437,4 @@ def imNotification(summary_params, hook_url) { if (! postRC.equals(200)) { log.warn(post.getErrorStream().getText()); } -} \ No newline at end of file +} From 1d978f3a8fdd4be0a050241f967563f5c04d0176 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 6 May 2024 21:25:51 +0800 Subject: [PATCH 17/19] try --- subworkflows/nf-core/utils_nfcore_pipeline/main.nf | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index a8b55d6f..14558c39 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -65,9 +65,15 @@ def checkProfileProvided(nextflow_cli_args) { // Citation string for pipeline // def workflowCitation() { + def temp_doi_ref = "" + String[] manifest_doi = workflow.manifest.doi.tokenize(",") + // Using a loop to handle multiple DOIs + // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers + // Removing ` ` since the manifest.doi is a string and not a proper list + for (String doi_ref: manifest_doi) temp_doi_ref += " https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n" return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + - " ${workflow.manifest.doi}\n\n" + + temp_doi_ref + "\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + From c41f628378bfc76a829d0a1b926b3789b2f7aaca Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 7 May 2024 00:30:50 +0800 Subject: [PATCH 18/19] fixed --- conf/modules.config | 5 ++++- nextflow.config | 2 +- nextflow_schema.json | 8 ++++---- workflows/quantms.nf | 3 +++ 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 52e477a6..89d3bc4b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -80,7 +80,10 @@ if (params.posterior_probabilities == "percolator") { //PERCOLATOR withName: '.*:PERCOLATOR' { - ext.args = "-debug $params.percolator_debug" + ext.args = [ + "-debug $params.percolator_debug", + (params.fdr_level != 'psm_level_fdrs') ? "-" + params.fdr_level : "" + ].join(' ').trim() } } } else { diff --git a/nextflow.config b/nextflow.config index ad30cb2e..2461e68e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -124,7 +124,7 @@ params { // Percolator flags train_FDR = 0.05 test_FDR = 0.05 - FDR_level = 'peptide-level-fdrs' + fdr_level = 'peptide_level_fdrs' klammer = false description_correct_features = 0 subset_max_train = 300000 diff --git a/nextflow_schema.json b/nextflow_schema.json index 1ad00b40..746e312c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -522,12 +522,12 @@ "description": "In the following you can find help for the Percolator specific options that are only used if [`--posterior_probabilities`](#posterior_probabilities) was set to 'percolator'.\nNote that there are currently some restrictions to the original options of Percolator:\n\n* no Percolator protein FDR possible (currently OpenMS' FDR is used on protein level)\n* no support for separate target and decoy databases (i.e. no min-max q-value calculation or target-decoy competition strategy)\n* no support for combined or experiment-wide peptide re-scoring. Currently search results per input file are submitted to Percolator independently.", "default": "", "properties": { - "FDR_level": { + "fdr_level": { "type": "string", - "description": "Calculate FDR on PSM ('psm-level-fdrs') or peptide level ('peptide-level-fdrs')?", - "default": "peptide-level-fdrs", + "description": "Calculate FDR on PSM ('psm_level_fdrs') or peptide level ('peptide_level_fdrs')?", + "default": "peptide_level_fdrs", "fa_icon": "fas fa-list-ol", - "enum": ["peptide-level-fdrs", "psm-level-fdrs"] + "enum": ["peptide_level_fdrs", "psm_level_fdrs"] }, "train_FDR": { "type": "number", diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 82c9a870..7645dfb4 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -109,6 +109,9 @@ workflow QUANTMS { if (params.id_only == false) { log.warn "The mokapot rescoring engine currently only is supported in id_only subworkflow via ms2rescore." } + if (params.posterior_probabilities == "mokapot" && params.FDR_level == "peptide-level-fdrs") { + log.warn "The rescoring engine is set to mokapot. This rescoring engine currently only supports psm-level-fdr via ms2rescore." + } DDA_ID( FILE_PREPARATION.out.results, ch_searchengine_in_db, FILE_PREPARATION.out.spectrum_data, CREATE_INPUT_CHANNEL.out.ch_expdesign) ch_versions = ch_versions.mix(DDA_ID.out.version.ifEmpty(null)) } else { From bae2e3e43aea202818adf79e2de3c23ee77f193b Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 7 May 2024 11:22:20 +0800 Subject: [PATCH 19/19] fixed typo --- conf/test_full_lfq.config | 2 +- conf/test_full_tmt.config | 2 +- conf/test_localize.config | 1 + workflows/quantms.nf | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/test_full_lfq.config b/conf/test_full_lfq.config index fe17eb44..4e4684f1 100644 --- a/conf/test_full_lfq.config +++ b/conf/test_full_lfq.config @@ -29,5 +29,5 @@ params { add_decoys = true add_triqler_output = true protein_level_fdr_cutoff = 0.01 - psm_pep_fdr_cutoff = 0.01 + psm_level_fdr_cutoff = 0.01 } diff --git a/conf/test_full_tmt.config b/conf/test_full_tmt.config index d4b8469f..11eea647 100644 --- a/conf/test_full_tmt.config +++ b/conf/test_full_tmt.config @@ -27,7 +27,7 @@ params { posterior_probabilities = "percolator" search_engines = "comet,msgf" protein_level_fdr_cutoff = 0.01 - psm_pep_fdr_cutoff = 0.01 + psm_level_fdr_cutoff = 0.01 add_decoys = true protocol = 'TMT' } diff --git a/conf/test_localize.config b/conf/test_localize.config index 3ed7a152..ef32bddd 100644 --- a/conf/test_localize.config +++ b/conf/test_localize.config @@ -30,4 +30,5 @@ params { psm_level_fdr_cutoff = 0.50 skip_post_msstats = true quantify_decoys = true + fdr_level = "psm_level_fdrs" } diff --git a/workflows/quantms.nf b/workflows/quantms.nf index 7645dfb4..e45a6c64 100644 --- a/workflows/quantms.nf +++ b/workflows/quantms.nf @@ -109,7 +109,7 @@ workflow QUANTMS { if (params.id_only == false) { log.warn "The mokapot rescoring engine currently only is supported in id_only subworkflow via ms2rescore." } - if (params.posterior_probabilities == "mokapot" && params.FDR_level == "peptide-level-fdrs") { + if (params.posterior_probabilities == "mokapot" && params.fdr_level == "peptide_level_fdrs") { log.warn "The rescoring engine is set to mokapot. This rescoring engine currently only supports psm-level-fdr via ms2rescore." } DDA_ID( FILE_PREPARATION.out.results, ch_searchengine_in_db, FILE_PREPARATION.out.spectrum_data, CREATE_INPUT_CHANNEL.out.ch_expdesign)