Merge pull request #1 from hgb-bin-proteomics/develop

v1.0.0
hgb-bin-proteomics · Oct 18, 2023 · 3afebf2 · 3afebf2
2 parents 5a6dc11 + 97e93f7
commit 3afebf2
Show file tree

Hide file tree

Showing 3 changed files with 256 additions and 1 deletion.
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -0,0 +1,35 @@
+# This workflow lints the repository with flake8 on Python 3.9,
+# i.e. it checks for syntax errors and undefined names (pytest is installed but no tests are run)
+# Reference workflow provided by (c) GitHub
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: scout_extensions
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.9
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
diff --git a/README.md b/README.md
@@ -1 +1,55 @@
-# Scout_extensions
+# Scout Extensions
+
+Extensions to use [Scout](https://github.com/diogobor/Scout) with other software.
+
+## Requirements
+
+Python 3.7+ and the following packages are required:
+- Install [pandas](https://pandas.pydata.org/): `pip install pandas`
+- Install [xlsxwriter](https://xlsxwriter.readthedocs.io/): `pip install xlsxwriter`
+
+## Scout to [IMP-X-FDR](https://github.com/fstanek/imp-x-fdr) converter
+
+The main purpose of this script is to convert Scout output files to MS Annika format, which can be used with the [IMP-X-FDR](https://github.com/fstanek/imp-x-fdr) tool. This way Scout can be benchmarked on synthetic peptide libraries.
+
+```
+DESCRIPTION:
+A script to convert Scout *.csv result files to MS Annika format as
+Microsoft Excel worksheets for usage with IMP-X-FDR (v1.1.0).
+
+USAGE:
+scoutToIMPXFDR.py f [f ...]
+                    [-o OUTPUT]
+                    [-xl CROSSLINKER]
+                    [-xlmod CROSSLINKER_MODIFICATION]
+                    [-h]
+                    [--version]
+
+positional arguments:
+  f                     Scout result file to process, if second filename
+                        is given it will be used as the output name!
+
+optional arguments:
+  -h, --help            show this help message and exit.
+  -o OUTPUT, --output OUTPUT
+                        Name of the output file.
+  -xl CROSSLINKER, --crosslinker CROSSLINKER
+                        Name of the Crosslinker e.g. DSSO.
+  -xlmod CROSSLINKER_MODIFICATION, --crosslinker-modification CROSSLINKER_MODIFICATION
+                        Residue that the Crosslinker binds to e.g. K for DSSO.
+  --version             show program's version number and exit.
+```
+
+Example Usage:
+
+```
+python scoutToIMPXFDR.py my_scout_results.csv -o my_scout_results_converted -xl DSSO -xlmod K
+```
+
+## License
+
+[MIT License](https://github.com/hgb-bin-proteomics/MaXLinker_extensions/blob/master/LICENSE)
+
+## Contact
+
+[micha.birklbauer@fh-hagenberg.at](mailto:micha.birklbauer@fh-hagenberg.at)
diff --git a/scoutToIMPXFDR.py b/scoutToIMPXFDR.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+
+# Scout Result file to MS Annika Result file converter
+# 2022 (c) Micha Johannes Birklbauer
+# https://github.com/michabirklbauer/
+# micha.birklbauer@gmail.com
+
+import argparse
+import pandas as pd
+import traceback as tb
+
+__version = "1.0.0"  # version string printed by the --version command line flag
+__date = "2023-10-17"  # date string; not referenced anywhere else in the visible source
+
+"""
+DESCRIPTION:
+A script to convert Scout *.csv result files to MS Annika format as
+Microsoft Excel worksheets for usage with IMP-X-FDR (v1.1.0).
+
+USAGE:
+scoutToIMPXFDR.py f [f ...]
+                    [-o OUTPUT]
+                    [-xl CROSSLINKER]
+                    [-xlmod CROSSLINKER_MODIFICATION]
+                    [-h]
+                    [--version]
+
+positional arguments:
+  f                     Scout result file to process, if second filename
+                        is given it will be used as the output name!
+
+optional arguments:
+  -h, --help            show this help message and exit.
+  -o OUTPUT, --output OUTPUT
+                        Name of the output file.
+  -xl CROSSLINKER, --crosslinker CROSSLINKER
+                        Name of the Crosslinker e.g. DSSO.
+  -xlmod CROSSLINKER_MODIFICATION, --crosslinker-modification CROSSLINKER_MODIFICATION
+                        Residue that the Crosslinker binds to e.g. K for DSSO.
+  --version             show program's version number and exit.
+"""
+
+#### MS Annika Result columns mapping ####
+# Checked: (bool) TRUE | FALSE                             -> create with FALSE
+# Crosslinker:  (string) e.g. DSSO                         -> create with crosslinker name
+# Crosslink Type: (string selection) Intra | Inter         -> mapped "Link-Type"
+# # CSMs: (int)                                            -> mapped "CSM count"
+# # Proteins: (int)                                        -> create with zeros
+# Sequence A: (string) e.g. [K]SSAAR                       -> mapped "Alpha peptide"
+# Accession A: (string) e.g. P0A7X3                        -> mapped "Alpha protein mapping(s)"
+# Position A: (int)                                        -> mapped "Alpha peptide position"
+# Sequence B: (string)                                     -> ^
+# Accession B: (string)                                    -> ^
+# Position B: (int)                                        -> ^
+# Protein Descriptions A: (string)                         -> mapped "Alpha protein mapping(s)"
+# Protein Descriptions B: (string)                         -> ^
+# Best CSM Score: (double)                                 -> mapped "Score"
+# In protein A: (int)                                      -> create with zeros
+# In protein B: (int)                                      -> create with zeros
+# Decoy: (bool) TRUE | FALSE                               -> create with FALSE
+# Modifications A: (string) e.g. K1(DSSO);M1(Oxidation)    -> create with xl name and modification
+# Modifications B: (string)                                -> create with xl name and modification
+# Confidence: (string selection) High | Medium | Low       -> create with High
+
+# function that returns pandas dataframe in annika format
+def create_annika_result(scout_filename: str, crosslinker: str = "DSSO", crosslinker_aa: str = "K") -> pd.DataFrame:
+
+    if len(crosslinker_aa) != 1:
+        raise Exception("Crosslinker modifications that affect more than one amino acid are not supported! Exiting...")
+
+    # load file
+    scout_df = pd.read_csv(scout_filename)
+    nrows = scout_df.shape[0]
+
+    # columns
+    Checked = ["FALSE" for i in range(nrows)]
+    Crosslinker = [crosslinker for i in range(nrows)]
+    Crosslink_Type = scout_df["Link-Type"].apply(lambda x: "Intra" if "intra" in x.lower() else "Inter").tolist()
+    CSMs = scout_df["CSM count"].tolist()
+    Proteins = [0 for i in range(nrows)]
+    Sequence_A = scout_df["Alpha peptide"].apply(lambda x: x.replace(" ", "")).tolist()
+    Accession_A = scout_df["Alpha protein mapping(s)"].tolist()
+    Position_A = scout_df["Alpha peptide position"].tolist()
+    Sequence_B = scout_df["Beta peptide"].apply(lambda x: x.replace(" ", "")).tolist()
+    Accession_B = scout_df["Beta protein mapping(s)"].tolist()
+    Position_B = scout_df["Beta peptide position"].tolist()
+    Protein_Descriptions_A = scout_df["Alpha protein mapping(s)"].tolist()
+    Protein_Descriptions_B = scout_df["Beta protein mapping(s)"].tolist()
+    Best_CSM_Score = scout_df["Score"].tolist()
+    In_protein_A = [0 for i in range(nrows)]
+    In_protein_B = [0 for i in range(nrows)]
+    Decoy = ["FALSE" for i in range(nrows)]
+    Modifications_A = scout_df["Alpha peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist()
+    Modifications_B = scout_df["Beta peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist()
+    Confidence = ["High" for i in range(nrows)]
+
+    # create annika dataframe
+    annika_df = pd.DataFrame({"Checked": Checked,
+                              "Crosslinker": Crosslinker,
+                              "Crosslink Type": Crosslink_Type,
+                              "# CSMs": CSMs,
+                              "# Proteins": Proteins,
+                              "Sequence A": Sequence_A,
+                              "Accession A": Accession_A,
+                              "Position A": Position_A,
+                              "Sequence B": Sequence_B,
+                              "Accession B": Accession_B,
+                              "Position B": Position_B,
+                              "Protein Descriptions A": Protein_Descriptions_A,
+                              "Protein Descriptions B": Protein_Descriptions_B,
+                              "Best CSM Score": Best_CSM_Score,
+                              "In protein A": In_protein_A,
+                              "In protein B": In_protein_B,
+                              "Decoy": Decoy,
+                              "Modifications A": Modifications_A,
+                              "Modifications B": Modifications_B,
+                              "Confidence": Confidence})
+
+    return annika_df
+
+# read Scout result and write MS Annika result (in xlsx format)
+def main() -> pd.DataFrame:
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(metavar = "f",
+                        dest = "files",
+                        help = "Scout result file to process, if second filename is given it will be used as the output name!",
+                        type = str,
+                        nargs = "+")
+    parser.add_argument("-o", "--output",
+                        dest = "output",
+                        default = None,
+                        help = "Name of the output file.",
+                        type = str)
+    parser.add_argument("-xl", "--crosslinker",
+                        dest = "crosslinker",
+                        default = "DSSO",
+                        help = "Name of the Crosslinker e.g. DSSO.",
+                        type = str)
+    parser.add_argument("-xlmod", "--crosslinker-modification",
+                        dest = "crosslinker_modification",
+                        default = "K",
+                        help = "Residue that the Crosslinker binds to e.g. K for DSSO.",
+                        type = str)
+    parser.add_argument("--version",
+                        action = "version",
+                        version = __version)
+    args = parser.parse_args()
+
+    input_file = args.files[0]
+    output_file = args.files[0].split(".csv")[0] + ".xlsx"
+
+    if len(args.files) > 1:
+        output_file = args.files[1].split(".xlsx")[0] + ".xlsx"
+
+    if args.output is not None:
+        output_file = args.output.split(".xlsx")[0] + ".xlsx"
+
+    scout_resultdf = create_annika_result(input_file, args.crosslinker, args.crosslinker_modification)
+    scout_resultdf.to_excel(output_file, sheet_name = "Crosslinks", index = False, engine = "xlsxwriter")
+
+    return scout_resultdf
+
+if __name__ == "__main__":
+
+    scout_df = main()  # run the converter; the converted dataframe stays bound at module level