diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 0000000..66eefa6 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,35 @@ +# This workflow will lint with Python 3.9 +# e.g. check for syntax errors and undefined names +# Reference workflow provided by (c) GitHub +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: scout_extensions + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics diff --git a/README.md b/README.md index b114729..8468314 100644 --- a/README.md +++ b/README.md @@ -1 +1,55 @@ -# Scout_extensions \ No newline at end of file +# Scout Extensions + +Extensions to use [Scout](https://github.com/diogobor/Scout) with other software. + +## Requirements + +Python 3.7+ and the following packages are required: +- Install [pandas](https://pandas.pydata.org/): `pip install pandas` +- Install [xlsxwriter](https://xlsxwriter.readthedocs.io/): `pip install xlsxwriter` + +## Scout to [IMP-X-FDR](https://github.com/fstanek/imp-x-fdr) converter + +The main purpose of this script is to convert Scout output files to MS Annika format - which are usable with the [IMP-X-FDR](https://github.com/fstanek/imp-x-fdr) tool. This way Scout can be benchmarked on synthetic peptide libraries. + +``` +DESCRIPTION: +A script to convert Scout *.csv result files to MS Annika format as +Microsoft Excel worksheets for usage with IMP-X-FDR (v1.1.0). + +USAGE: +scoutToIMPXFDR.py f [f ...] + [-o OUTPUT] + [-xl CROSSLINKER] + [-xlmod CROSSLINKER_MODIFICATION] + [-h] + [--version] + +positional arguments: + f Scout result file to process, if second filename + is given it will be used as the output name! + +optional arguments: + -h, --help show this help message and exit. + -o OUTPUT, --output OUTPUT + Name of the output file. + -xl CROSSLINKER, --crosslinker CROSSLINKER + Name of the Crosslinker e.g. DSSO. + -xlmod CROSSLINKER_MODIFICATION, --crosslinker-modification CROSSLINKER_MODIFICATION + Residue that the Crosslinker binds to e.g. K for DSSO. + --version show program's version number and exit. +``` + +Example Usage: + +``` +python scoutToIMPXFDR.py my_scout_results.csv -o my_scout_results_converted -xl DSSO -xlmod K +``` + +## License + +[MIT License](https://github.com/hgb-bin-proteomics/MaXLinker_extensions/blob/master/LICENSE) + +## Contact + +[micha.birklbauer@fh-hagenberg.at](mailto:micha.birklbauer@fh-hagenberg.at) diff --git a/scoutToIMPXFDR.py b/scoutToIMPXFDR.py new file mode 100644 index 0000000..ff2394e --- /dev/null +++ b/scoutToIMPXFDR.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 + +# Scout Result file to MS Annika Result file converter +# 2022 (c) Micha Johannes Birklbauer +# https://github.com/michabirklbauer/ +# micha.birklbauer@gmail.com + +import argparse +import pandas as pd +import traceback as tb + +__version = "1.0.0" +__date = "2023-10-17" + +""" +DESCRIPTION: +A script to convert Scout *.csv result files to MS Annika format as +Microsoft Excel worksheets for usage with IMP-X-FDR (v1.1.0). + +USAGE: +scoutToIMPXFDR.py f [f ...] + [-o OUTPUT] + [-xl CROSSLINKER] + [-xlmod CROSSLINKER_MODIFICATION] + [-h] + [--version] + +positional arguments: + f Scout result file to process, if second filename + is given it will be used as the output name! + +optional arguments: + -h, --help show this help message and exit. + -o OUTPUT, --output OUTPUT + Name of the output file. + -xl CROSSLINKER, --crosslinker CROSSLINKER + Name of the Crosslinker e.g. DSSO. + -xlmod CROSSLINKER_MODIFICATION, --crosslinker-modification CROSSLINKER_MODIFICATION + Residue that the Crosslinker binds to e.g. K for DSSO. + --version show program's version number and exit. +""" + +#### MS Annika Result columns mapping #### +# Checked: (bool) TRUE | FALSE -> create with FALSE +# Crosslinker: (string) e.g. DSSO -> create with crosslinker name +# Crosslink Type: (string selection) Intra | Inter -> mapped "Link-Type" +# # CSMs: (int) -> mapped "CSM count" +# # Proteins: (int) -> create with zeros +# Sequence A: (string) e.g. [K]SSAAR -> mapped "Alpha peptide" +# Accession A: (string) e.g. P0A7X3 -> mapped "Alpha protein mapping(s)" +# Position A: int -> mapped "Alpha peptide position" +# Sequence B: (string) -> ^ +# Accession B: (string) -> ^ +# Position B: (int) -> ^ +# Protein Descriptions A: (string) -> mapped "Alpha protein mapping(s)" +# Protein Descriptions B: (string) -> ^ +# Best CSM Score: (double) -> mapped "Score" +# In protein A: (int) -> create with zeros +# In protein B: (int) -> create with zeros +# Decoy: (bool) TRUE | FALSE -> create with FALSE +# Modifications A: (string) e.g. K1(DSSO);M1(Oxidation) -> create with xl name and modification +# Modifications B: (string) -> create with xl name and modification +# Confidence: (string selection) High | Medium | Low -> create with High + +# function that returns pandas dataframe in annika format +def create_annika_result(scout_filename: str, crosslinker: str = "DSSO", crosslinker_aa: str = "K") -> pd.DataFrame: + + if len(crosslinker_aa) != 1: + raise Exception("Crosslinker modifications that affect more than one amino acid are not supported! Exiting...") + + # load file + scout_df = pd.read_csv(scout_filename) + nrows = scout_df.shape[0] + + # columns + Checked = ["FALSE" for i in range(nrows)] + Crosslinker = [crosslinker for i in range(nrows)] + Crosslink_Type = scout_df["Link-Type"].apply(lambda x: "Intra" if "intra" in x.lower() else "Inter").tolist() + CSMs = scout_df["CSM count"].tolist() + Proteins = [0 for i in range(nrows)] + Sequence_A = scout_df["Alpha peptide"].apply(lambda x: x.replace(" ", "")).tolist() + Accession_A = scout_df["Alpha protein mapping(s)"].tolist() + Position_A = scout_df["Alpha peptide position"].tolist() + Sequence_B = scout_df["Beta peptide"].apply(lambda x: x.replace(" ", "")).tolist() + Accession_B = scout_df["Beta protein mapping(s)"].tolist() + Position_B = scout_df["Beta peptide position"].tolist() + Protein_Descriptions_A = scout_df["Alpha protein mapping(s)"].tolist() + Protein_Descriptions_B = scout_df["Beta protein mapping(s)"].tolist() + Best_CSM_Score = scout_df["Score"].tolist() + In_protein_A = [0 for i in range(nrows)] + In_protein_B = [0 for i in range(nrows)] + Decoy = ["FALSE" for i in range(nrows)] + Modifications_A = scout_df["Alpha peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist() + Modifications_B = scout_df["Beta peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist() + Confidence = ["High" for i in range(nrows)] + + # create annika dataframe + annika_df = pd.DataFrame({"Checked": Checked, + "Crosslinker": Crosslinker, + "Crosslink Type": Crosslink_Type, + "# CSMs": CSMs, + "# Proteins": Proteins, + "Sequence A": Sequence_A, + "Accession A": Accession_A, + "Position A": Position_A, + "Sequence B": Sequence_B, + "Accession B": Accession_B, + "Position B": Position_B, + "Protein Descriptions A": Protein_Descriptions_A, + "Protein Descriptions B": Protein_Descriptions_B, + "Best CSM Score": Best_CSM_Score, + "In protein A": In_protein_A, + "In protein B": In_protein_B, + "Decoy": Decoy, + "Modifications A": Modifications_A, + "Modifications B": Modifications_B, + "Confidence": Confidence}) + + return annika_df + +# read Scout result and write MS Annika result (in xlsx format) +def main() -> pd.DataFrame: + + parser = argparse.ArgumentParser() + parser.add_argument(metavar = "f", + dest = "files", + help = "Scout result file to process, if second filename is given it will be used as the output name!", + type = str, + nargs = "+") + parser.add_argument("-o", "--output", + dest = "output", + default = None, + help = "Name of the output file.", + type = str) + parser.add_argument("-xl", "--crosslinker", + dest = "crosslinker", + default = "DSSO", + help = "Name of the Crosslinker e.g. DSSO.", + type = str) + parser.add_argument("-xlmod", "--crosslinker-modification", + dest = "crosslinker_modification", + default = "K", + help = "Residue that the Crosslinker binds to e.g. K for DSSO.", + type = str) + parser.add_argument("--version", + action = "version", + version = __version) + args = parser.parse_args() + + input_file = args.files[0] + output_file = args.files[0].split(".csv")[0] + ".xlsx" + + if len(args.files) > 1: + output_file = args.files[1].split(".xlsx")[0] + ".xlsx" + + if args.output is not None: + output_file = args.output.split(".xlsx")[0] + ".xlsx" + + scout_resultdf = create_annika_result(input_file, args.crosslinker, args.crosslinker_modification) + scout_resultdf.to_excel(output_file, sheet_name = "Crosslinks", index = False, engine = "xlsxwriter") + + return scout_resultdf + +if __name__ == "__main__": + + scout_df = main()