-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #29 from hgb-bin-proteomics/develop
add xiFdrExporter
- Loading branch information
Showing
9 changed files
with
191 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
File renamed without changes
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes
File renamed without changes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Exporter of MS Annika CSM Results to xiFDR input format | ||
# 2024 (c) Micha Johannes Birklbauer | ||
# https://github.com/michabirklbauer/ | ||
# micha.birklbauer@gmail.com | ||
|
||
import argparse | ||
import pandas as pd | ||
|
||
# Version string of this script; printed by the --version command line flag.
__version = "1.0.1"
# Date stamp of this version (presumably formatted as YYYYMMDD).
__date = "20240505"
|
||
""" | ||
DESCRIPTION: | ||
A script to export MS Annika CSM results (.xlsx) to a xiFDR input file (.csv). | ||
CSMs should be unfiltered, therefore include decoys and not be validated for any | ||
FDR. | ||
Warning: This exporter currently only reports one/the first protein for | ||
ambiguous peptides that are found in more than one protein! | ||
USAGE: | ||
xiFdrExporter_msannika.py f [f] | ||
[-o OUTPUT] | ||
[-h] | ||
[--version] | ||
positional arguments: | ||
f Crosslink-Spectrum-Matches (CSMs) exported from | ||
MS Annika in Microsoft Excel (.xlsx) format. | ||
optional arguments: | ||
-o OUTPUT, --output OUTPUT | ||
Prefix of the output file. | ||
-h, --help show this help message and exit | ||
--version show program's version number and exit | ||
""" | ||
|
||
# Exporter class with constructor that takes one MS Annika CSM result file as | ||
# input. CSMs should not be in any way filtered and exported to Microsoft Excel | ||
# .xlsx format from Proteome Discoverer. | ||
class MSAnnika_Exporter:
    """Convert MS Annika CSM results (.xlsx) into the xiFDR input format (.csv).

    The CSMs are expected to be unfiltered. For ambiguous peptides that are
    found in more than one protein only the first accession and the first
    peptide position are reported.

    Attributes:
        input_file: Name/path of the MS Annika CSM result file passed to the
            constructor.
    """

    # Maps the MS Annika column names to the column names written for xiFDR.
    _COLUMN_MAP = {
        "Spectrum File": "run",
        "First Scan": "scan",
        "Sequence A": "peptide1",
        "Sequence B": "peptide2",
        "Crosslinker Position A": "peptide link 1",
        "Crosslinker Position B": "peptide link 2",
        "Charge": "precursor charge",
        "Combined Score": "score",
        "Score Alpha": "peptide1 score",
        "Score Beta": "peptide2 score",
        "Accession A": "accession1",
        "Accession B": "accession2",
        "A in protein": "peptide position 1",
        "B in protein": "peptide position 2",
    }

    def __init__(self, input_file: str):
        """Store the name/path of the CSM result file to export."""
        self.input_file = input_file

    @staticmethod
    def _first_position(value) -> int:
        """Return the first ';'-separated position of *value*, plus one.

        Cells holding a single position may be read from Excel as numbers
        instead of strings, so non-string values are converted directly.
        The increment presumably maps 0-based MS Annika positions to 1-based
        xiFDR positions - TODO confirm.
        """
        if isinstance(value, str):
            value = value.split(";")[0]
        return int(value) + 1

    @staticmethod
    def _is_decoy(value) -> str:
        """Return "false" if the T/D label contains a "t", otherwise "true"."""
        return "false" if "t" in str(value).lower() else "true"

    @staticmethod
    def _default_prefix(input_file: str) -> str:
        """Return *input_file* without its extension (unchanged if it has none)."""
        # Local import so that this class does not depend on a new file-level
        # import. Unlike splitting on ".", splitext only removes a real file
        # extension and never touches dots in directory names.
        import os

        return os.path.splitext(input_file)[0]

    @staticmethod
    def convert_df(df: pd.DataFrame) -> pd.DataFrame:
        """Convert a dataframe of MS Annika CSMs to the xiFDR layout.

        Args:
            df: CSMs as read from the MS Annika result file. The given
                dataframe is not modified.

        Returns:
            A new dataframe with the xiFDR column names, the first accession
            and first (incremented) peptide position per peptide, and the
            added columns "is decoy 1" and "is decoy 2".

        Raises:
            KeyError: If one of the expected MS Annika columns is missing.
        """
        converted = df.rename(columns=MSAnnika_Exporter._COLUMN_MAP, errors="raise")

        # TODO: xiFDR input here only carries one protein per peptide. If
        # ambiguous peptides (more than one protein) can be denoted, report all
        # accessions and all positions instead of only the first ones.
        for side, target_decoy_column in (("1", "Alpha T/D"), ("2", "Beta T/D")):
            accession = "accession" + side
            position = "peptide position " + side
            converted[accession] = converted[accession].apply(
                lambda x: x.split(";")[0]
            )
            converted["is decoy " + side] = df[target_decoy_column].apply(
                MSAnnika_Exporter._is_decoy
            )
            converted[position] = converted[position].apply(
                MSAnnika_Exporter._first_position
            )

        return converted

    @staticmethod
    def generate_df(input_file: str) -> pd.DataFrame:
        """Read an MS Annika CSM file and return the xiFDR export dataframe.

        Static so that it can be used without a class instance.

        Args:
            input_file: Name/path of the CSM result file (.xlsx).

        Returns:
            The converted dataframe, see ``convert_df``.
        """
        print("Warning: This exporter currently only reports one/the first protein for ambiguous peptides that are found in more than one protein!")

        return MSAnnika_Exporter.convert_df(pd.read_excel(input_file))

    def __generate_csv_df(self) -> pd.DataFrame:
        """Instance-level wrapper around ``generate_df`` using ``self.input_file``."""
        return self.generate_df(self.input_file)

    def export(self, output_file: str = None) -> pd.DataFrame:
        """Write the xiFDR csv file and return the exported dataframe.

        Args:
            output_file: Prefix of the generated file; "_xiFDR.csv" is
                appended to it. Defaults to the input file name without its
                extension.

        Returns:
            The dataframe that was written to disk.
        """
        csv = self.__generate_csv_df()

        if output_file is None:
            output_file = self._default_prefix(self.input_file)

        csv.to_csv(output_file + "_xiFDR.csv", index = False)

        return csv
|
||
# initialize exporter and export xiFDR csv file | ||
def main() -> None:
    """Parse the command line and export the given CSM file as xiFDR csv."""
    cli = argparse.ArgumentParser()

    # positional argument: exactly one MS Annika result file
    cli.add_argument(
        metavar="f",
        dest="file",
        type=str,
        nargs=1,
        help="Name/Path of the MS Annika CSM result file (in .xlsx format) to process.",
    )
    # optional prefix of the written csv file
    cli.add_argument(
        "-o",
        "--output",
        dest="output",
        type=str,
        default=None,
        help="Prefix of the output file.",
    )
    cli.add_argument("--version", action="version", version=__version)

    options = cli.parse_args()

    # nargs=1 yields a one-element list, so the file name is its first item
    MSAnnika_Exporter(options.file[0]).export(options.output)
|
||
# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()