add optional ms2rescore step #362

Merged — 23 commits, May 7, 2024
Changes from 13 commits
51 changes: 51 additions & 0 deletions bin/extract_sample.py
@@ -0,0 +1,51 @@
#!/usr/bin/env python

import argparse
import errno
import os
import sys
from pathlib import Path
import pandas as pd


def parse_args(args=None):
    Description = "Extract sample information from an experiment design file"
    Epilog = "Example usage: python extract_sample.py <EXP>"

    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
    parser.add_argument("EXP", help="Expdesign file to be extracted")
    return parser.parse_args(args)


def extract_sample(expdesign):
    data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str)
    fTable = data.dropna()

    # The OpenMS experiment design is a two-table format: a file table and a
    # sample table separated by an empty line.
    with open(expdesign, "r") as f:
        lines = f.readlines()
        empty_row = lines.index("\n")
        s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1:]][1:]
        s_header = lines[empty_row + 1].replace("\n", "").split("\t")
        s_DataFrame = pd.DataFrame(s_table, columns=s_header)

    if "MSstats_Mixture" not in s_DataFrame.columns:
        fTable = fTable[["Spectra_Filepath", "Sample"]]
        fTable.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False)
    else:
        fTable.drop_duplicates(subset=["Spectra_Filepath"], inplace=True)
        rows = []
        for _, row in fTable.iterrows():
            # Map each spectrum file to the MSstats mixture of its sample (first match).
            mixture_id = s_DataFrame.loc[s_DataFrame["Sample"] == row["Sample"], "MSstats_Mixture"].iloc[0]
            rows.append({"Spectra_Filepath": row["Spectra_Filepath"], "Sample": mixture_id})
        # DataFrame.append was removed in pandas 2.x, so collect rows and build the frame once.
        sample_dt = pd.DataFrame(rows, columns=["Spectra_Filepath", "Sample"])
        sample_dt.to_csv(f"{Path(expdesign).stem}_sample.csv", sep="\t", index=False)


def main(args=None):
    args = parse_args(args)
    extract_sample(args.EXP)


if __name__ == "__main__":
    sys.exit(main())
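
For reference, here is a minimal sketch of the two-table experiment design that `extract_sample.py` splits on the blank line. The column names mirror the script above; the concrete values and file names are made up for illustration, and the snippet only demonstrates the file/sample table layout rather than calling the script itself.

```python
import pandas as pd
from io import StringIO

# Hypothetical two-table design: a file table, an empty line, then a sample table.
design = (
    "Fraction_Group\tFraction\tSpectra_Filepath\tLabel\tSample\n"
    "1\t1\tfile1.mzML\t1\t1\n"
    "\n"
    "Sample\tMSstats_Condition\tMSstats_BioReplicate\n"
    "1\tcontrol\t1\n"
)

lines = design.splitlines(keepends=True)
empty_row = lines.index("\n")                      # boundary between the two tables

file_table = pd.read_csv(StringIO("".join(lines[:empty_row])), sep="\t", dtype=str)
sample_table = pd.read_csv(StringIO("".join(lines[empty_row + 1:])), sep="\t", dtype=str)

print(file_table[["Spectra_Filepath", "Sample"]])  # what the script writes when MSstats_Mixture is absent
print(sample_table)
```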
176 changes: 176 additions & 0 deletions bin/ms2rescore_cli.py
@@ -0,0 +1,176 @@
#!/usr/bin/env python

import sys
import click
import importlib.resources
import json
import logging
from typing import List

import pandas as pd

from ms2rescore import rescore, package_data
from psm_utils.io.idxml import IdXMLReader, IdXMLWriter
from psm_utils import PSMList
import pyopenms as oms

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


def parse_cli_arguments_to_config(**kwargs):
    """Update default MS²Rescore config with CLI arguments"""
    config = json.load(importlib.resources.open_text(package_data, "config_default.json"))

    for key, value in kwargs.items():
        # Skip these arguments since they need to be set in a nested dict of feature_generators
        if key in ["ms2pip_model", "ms2_tolerance", "rng", "calibration_set_size"]:
            continue

        elif key == "feature_generators":
            feature_generators = value.split(",")
            # Reset feature generator dict since there might be default generators we don't want
            config["ms2rescore"]["feature_generators"] = {}
            if "basic" in feature_generators:
                config["ms2rescore"]["feature_generators"]["basic"] = {}
            if "ms2pip" in feature_generators:
                config["ms2rescore"]["feature_generators"]["ms2pip"] = {
                    "model": kwargs["ms2pip_model"],
                    "ms2_tolerance": kwargs["ms2_tolerance"],
                }
            if "deeplc" in feature_generators:
                config["ms2rescore"]["feature_generators"]["deeplc"] = {
                    "deeplc_retrain": False,
                    "calibration_set_size": kwargs["calibration_set_size"],
                }
            if "maxquant" in feature_generators:
                config["ms2rescore"]["feature_generators"]["maxquant"] = {}
            if "ionmob" in feature_generators:
                config["ms2rescore"]["feature_generators"]["ionmob"] = {}

        elif key == "rescoring_engine":
            # Reset the rescoring engine dict; we want to allow computing features only
            config["ms2rescore"]["rescoring_engine"] = {}
            if value == "mokapot":
                config["ms2rescore"]["rescoring_engine"]["mokapot"] = {
                    "write_weights": True,
                    "write_txt": False,
                    "write_flashlfq": False,
                    "rng": kwargs["rng"],
                    "test_fdr": kwargs["test_fdr"],
                    "max_workers": kwargs["processes"],
                }
            if value == "percolator":
                logging.info(
                    "Percolator rescoring engine has been specified. Use the idXML containing rescoring features and run Percolator in a separate step."
                )
                continue

        else:
            config["ms2rescore"][key] = value

    return config


def rescore_idxml(input_file, output_file, config) -> None:
    """Rescore PSMs in an idXML file and keep other information unchanged."""
    # Read PSMs
    reader = IdXMLReader(input_file)
    psm_list = reader.read_file()

    # Rescore
    rescore(config, psm_list)

    # Filter out PeptideHits within PeptideIdentification(s) that could not be processed by all feature generators
    peptide_ids_filtered = filter_out_artifact_psms(psm_list, reader.peptide_ids)

    # Write
    writer = IdXMLWriter(output_file, reader.protein_ids, peptide_ids_filtered)
    writer.write_file(psm_list)


def filter_out_artifact_psms(
    psm_list: PSMList, peptide_ids: List[oms.PeptideIdentification]
) -> List[oms.PeptideIdentification]:
    """Filter out PeptideHits that could not be processed by all feature generators"""
    num_mandatory_features = max([len(psm.rescoring_features) for psm in psm_list])
    new_psm_list = PSMList(psm_list=[psm for psm in psm_list if len(psm.rescoring_features) == num_mandatory_features])

    # get differing peptidoforms of both psm lists
    psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in psm_list])
    new_psm_list_peptides = set([next(iter(psm.provenance_data.items()))[1] for psm in new_psm_list])
    not_supported_peptides = psm_list_peptides - new_psm_list_peptides

    # no need to filter if all peptides are supported
    if len(not_supported_peptides) == 0:
        return peptide_ids

    # Create new peptide ids and filter out not supported peptides
    new_peptide_ids = []
    for peptide_id in peptide_ids:
        new_hits = []
        for hit in peptide_id.getHits():
            if hit.getSequence().toString() in not_supported_peptides:
                continue
            new_hits.append(hit)
        if len(new_hits) == 0:
            continue
        peptide_id.setHits(new_hits)
        new_peptide_ids.append(peptide_id)
    logging.info(
        f"Removed {len(psm_list_peptides) - len(new_psm_list_peptides)} PSMs. Peptides not supported: {not_supported_peptides}"
    )
    return new_peptide_ids


@click.command()
@click.option(
    "-p", "--psm_file", help="Path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML, idXML)", required=True
)
@click.option(
    "-s",
    "--spectrum_path",
    help="Path to MGF/mzML spectrum file or directory with spectrum files (default: derived from identification file)",
    required=True,
)
@click.option(
    "-o", "--output_path", help="Path and stem for output file names (default: derived from identification file)"
)
@click.option("-l", "--log_level", help="Logging level (default: `info`)", default="info")
@click.option("-n", "--processes", help="Number of parallel processes available to MS²Rescore", type=int, default=16)
@click.option("-f", "--fasta_file", help="Path to FASTA file")
@click.option(
    "-t",
    "--test_fdr",
    help="The false-discovery rate threshold at which to evaluate the learned models (default: 0.05)",
    default=0.05,
)
@click.option(
    "-fg",
    "--feature_generators",
    help="Comma-separated list of feature generators to use (default: `ms2pip,deeplc`). See the MS²Rescore documentation for further information",
    default="",
)
@click.option("-pipm", "--ms2pip_model", help="MS²PIP model (default: `Immuno-HCD`)", type=str, default="Immuno-HCD")
@click.option(
    "-ms2tol", "--ms2_tolerance", help="Fragment mass tolerance [Da] (default: `0.02`)", type=float, default=0.02
)
@click.option(
    "-cs",
    "--calibration_set_size",
    help="Fraction of PSMs to use as calibration set for DeepLC (default: `0.15`)",
    default=0.15,
)
@click.option("-re", "--rescoring_engine", help="Either mokapot or percolator (default: `mokapot`)", default="mokapot")
@click.option(
    "-rng", "--rng", help="Seed for mokapot's random number generator (default: `4711`)", type=int, default=4711
)
@click.option("-d", "--id_decoy_pattern", help="Regex decoy pattern (default: `^DECOY_`)", default="^DECOY_")
@click.option(
    "-lsb",
    "--lower_score_is_better",
    help="Interpretation of primary search engine score (default: True)",
    default=True,
)
def main(**kwargs):
    config = parse_cli_arguments_to_config(**kwargs)
    logging.info("MS²Rescore config:")
    logging.info(config)
    rescore_idxml(kwargs["psm_file"], kwargs["output_path"], config)


if __name__ == "__main__":
    sys.exit(main())
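
As a quick illustration of what `parse_cli_arguments_to_config` builds, the sketch below reproduces the merge logic with a plain dictionary standing in for MS²Rescore's `config_default.json`. The nested keys are taken from the function above, but the stand-in default config is an assumption for illustration, not the real package data.

```python
# Stand-in for MS²Rescore's default config (illustrative only, not the real config_default.json).
def build_config(feature_generators="ms2pip,deeplc", ms2pip_model="Immuno-HCD",
                 ms2_tolerance=0.02, calibration_set_size=0.15):
    config = {"ms2rescore": {"feature_generators": {}, "rescoring_engine": {}}}
    for fg in feature_generators.split(","):
        if fg == "ms2pip":
            config["ms2rescore"]["feature_generators"]["ms2pip"] = {
                "model": ms2pip_model,
                "ms2_tolerance": ms2_tolerance,
            }
        elif fg == "deeplc":
            config["ms2rescore"]["feature_generators"]["deeplc"] = {
                "deeplc_retrain": False,
                "calibration_set_size": calibration_set_size,
            }
    return config

print(build_config())
# {'ms2rescore': {'feature_generators': {'ms2pip': {'model': 'Immuno-HCD', 'ms2_tolerance': 0.02},
#                                        'deeplc': {'deeplc_retrain': False, 'calibration_set_size': 0.15}},
#                 'rescoring_engine': {}}}
```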
8 changes: 6 additions & 2 deletions bin/psm_conversion.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+
 import numpy as np
 import pyopenms as oms
 import pandas as pd
@@ -63,8 +64,8 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):

         if isinstance(spectra_df, pd.DataFrame):
             spectra = spectra_df[spectra_df["scan"] == scan_number]
-            mz_array = spectra["mz"].values[0]
-            intensity_array = spectra["intensity"].values[0]
+            mz_array = spectra["mz"].values
+            intensity_array = spectra["intensity"].values
             num_peaks = len(mz_array)

         for hit in peptide_id.getHits():
@@ -84,6 +85,9 @@ def convert_psm(idxml, spectra_file, export_decoy_psm):
             elif search_engines == "Sage":
                 id_scores = ["Sage:hyperscore: " + str(hit.getScore())]

+            if hit.metaValueExists("MS:1001491"):
+                global_qvalue = hit.getMetaValue("MS:1001491")
+
             charge = hit.getCharge()
             peptidoform = hit.getSequence().toString()
             modifications = mods_position(peptidoform)
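
A side note on the `.values[0]` → `.values` change in the hunk above: if `spectra_df` is in long format with one row per peak, filtering by scan and taking `.values` yields the full m/z array for that scan, whereas `.values[0]` returns only the first entry. The toy frame below assumes that long format purely for illustration; it is not taken from the pipeline code.

```python
import pandas as pd

# Assumed long format: one row per peak (scan, mz, intensity) -- illustrative only.
spectra_df = pd.DataFrame({
    "scan": [1, 1, 1, 2],
    "mz": [100.1, 200.2, 300.3, 150.5],
    "intensity": [10.0, 20.0, 30.0, 5.0],
})

spectra = spectra_df[spectra_df["scan"] == 1]
mz_array = spectra["mz"].values      # array([100.1, 200.2, 300.3]) -- whole spectrum
first_mz = spectra["mz"].values[0]   # 100.1 -- just the first peak
print(len(mz_array), first_mz)       # 3 100.1
```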
43 changes: 43 additions & 0 deletions conf/modules.config
@@ -270,4 +270,47 @@ process {
        ]
    }

    // ID ONLY
    withName: 'MS2RESCORE' {
        ext.args = [
            "--ms2pip_model ${params.ms2pip_model}",
            "--rescoring_engine ${params.posterior_probabilities}",
"--calibration_set_size ${params.calibration_set_size}",
"--test_fdr ${params.test_FDR}",
params.feature_generators.trim() ? "--feature_generators ${params.feature_generators}" : ''
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*:DDA_ID:IDSCORESWITCHER' {
ext.args = [
"-new_score_orientation lower_better",
"-new_score_type \"Posterior Error Probability\"",
"-debug $params.idscoreswitcher_debug"
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/idscoreswitcherforluciphor" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: '.*:DDA_ID:PSMFDRCONTROL:IDSCORESWITCHER' {
ext.args = [
"-new_score_orientation lower_better",
"-old_score \"Posterior Error Probability\"",
"-new_score_type q-value",
"-debug $params.idscoreswitcher_debug"
].join(' ').trim()
publishDir = [
path: { "${params.outdir}/idscoreswitcher" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

}
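
To see what the `MS2RESCORE` `ext.args` block above evaluates to, here is an equivalent sketch in Python; the parameter values are placeholders, not the pipeline defaults.

```python
# Placeholder parameter values -- the real ones come from the pipeline configuration.
params = {
    "ms2pip_model": "Immuno-HCD",
    "posterior_probabilities": "mokapot",
    "calibration_set_size": 0.15,
    "test_FDR": 0.01,
    "feature_generators": "",
}

args = " ".join(filter(None, [
    f"--ms2pip_model {params['ms2pip_model']}",
    f"--rescoring_engine {params['posterior_probabilities']}",
    f"--calibration_set_size {params['calibration_set_size']}",
    f"--test_fdr {params['test_FDR']}",
    f"--feature_generators {params['feature_generators']}" if params["feature_generators"].strip() else "",
])).strip()

print(args)
# --ms2pip_model Immuno-HCD --rescoring_engine mokapot --calibration_set_size 0.15 --test_fdr 0.01
```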
31 changes: 31 additions & 0 deletions modules/local/extract_sample/main.nf
@@ -0,0 +1,31 @@
process GETSAMPLE {
    tag "$design.Name"
    label 'process_low'

    conda "bioconda::sdrf-pipelines=0.0.25"
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/sdrf-pipelines:0.0.25--pyhdfd78af_0"
    } else {
        container "biocontainers/sdrf-pipelines:0.0.25--pyhdfd78af_0"
    }

    input:
    path design

    output:
    path "*_sample.csv", emit: ch_expdesign_sample
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    """
    extract_sample.py "${design}" 2>&1 | tee extract_sample.log

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        sdrf-pipelines: \$(parse_sdrf --version 2>&1 | awk -F ' ' '{print \$2}')
    END_VERSIONS
    """
}
27 changes: 27 additions & 0 deletions modules/local/extract_sample/meta.yml
@@ -0,0 +1,27 @@
name: GETSAMPLE
description: A module to extract sample information from an experimental design file
keywords:
  - sample
  - conversion
tools:
  - custom:
      description: |
        A custom module for sample extraction.
      homepage: https://github.com/bigbio/quantms
      documentation: https://github.com/bigbio/quantms/tree/readthedocs
input:
  - design:
      type: file
      description: experimental design file
      pattern: "*.csv"
output:
  - ch_expdesign_sample:
      type: file
      description: sample csv file
      pattern: "*_sample.csv"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
authors:
  - "@daichengxin"