-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from hgb-bin-proteomics/develop
v1.0.0
- Loading branch information
Showing
3 changed files
with
256 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# This workflow will lint with Python 3.9 | ||
# e.g. check for syntax errors and undefined names | ||
# Reference workflow provided by (c) GitHub | ||
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions | ||
|
||
name: scout_extensions | ||
|
||
on: | ||
push: | ||
branches: [ master ] | ||
pull_request: | ||
branches: [ master ] | ||
|
||
jobs: | ||
build: | ||
|
||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- uses: actions/checkout@v2 | ||
- name: Set up Python 3.9 | ||
uses: actions/setup-python@v2 | ||
with: | ||
python-version: 3.9 | ||
- name: Install dependencies | ||
run: | | ||
python -m pip install --upgrade pip | ||
pip install flake8 pytest | ||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi | ||
- name: Lint with flake8 | ||
run: | | ||
# stop the build if there are Python syntax errors or undefined names | ||
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics | ||
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide | ||
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,55 @@ | ||
# Scout_extensions | ||
# Scout Extensions | ||
|
||
Extensions to use [Scout](https://github.com/diogobor/Scout) with other software. | ||
|
||
## Requirements | ||
|
||
Python 3.7+ and the following packages are required: | ||
- Install [pandas](https://pandas.pydata.org/): `pip install pandas` | ||
- Install [xlsxwriter](https://xlsxwriter.readthedocs.io/): `pip install xlsxwriter` | ||
|
||
## Scout to [IMP-X-FDR](https://github.com/fstanek/imp-x-fdr) converter | ||
|
||
The main purpose of this script is to convert Scout output files to MS Annika format - which are usable with the [IMP-X-FDR](https://github.com/fstanek/imp-x-fdr) tool. This way Scout can be benchmarked on synthetic peptide libraries. | ||
|
||
``` | ||
DESCRIPTION: | ||
A script to convert Scout *.csv result files to MS Annika format as | ||
Microsoft Excel worksheets for usage with IMP-X-FDR (v1.1.0). | ||
USAGE: | ||
scoutToIMPXFDR.py f [f ...] | ||
[-o OUTPUT] | ||
[-xl CROSSLINKER] | ||
[-xlmod CROSSLINKER_MODIFICATION] | ||
[-h] | ||
[--version] | ||
positional arguments: | ||
f Scout result file to process, if second filename | ||
is given it will be used as the output name! | ||
optional arguments: | ||
-h, --help show this help message and exit. | ||
-o OUTPUT, --output OUTPUT | ||
Name of the output file. | ||
-xl CROSSLINKER, --crosslinker CROSSLINKER | ||
Name of the Crosslinker e.g. DSSO. | ||
-xlmod CROSSLINKER_MODIFICATION, --crosslinker-modification CROSSLINKER_MODIFICATION | ||
Residue that the Crosslinker binds to e.g. K for DSSO. | ||
--version show program's version number and exit. | ||
``` | ||
|
||
Example Usage: | ||
|
||
``` | ||
python scoutToIMPXFDR.py my_scout_results.csv -o my_scout_results_converted -xl DSSO -xlmod K | ||
``` | ||
|
||
## License | ||
|
||
[MIT License](https://github.com/hgb-bin-proteomics/MaXLinker_extensions/blob/master/LICENSE) | ||
|
||
## Contact | ||
|
||
[micha.birklbauer@fh-hagenberg.at](mailto:micha.birklbauer@fh-hagenberg.at) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Scout Result file to MS Annika Result file converter | ||
# 2022 (c) Micha Johannes Birklbauer | ||
# https://github.com/michabirklbauer/ | ||
# micha.birklbauer@gmail.com | ||
|
||
import argparse | ||
import pandas as pd | ||
import traceback as tb | ||
|
||
__version = "1.0.0" | ||
__date = "2023-10-17" | ||
|
||
""" | ||
DESCRIPTION: | ||
A script to convert Scout *.csv result files to MS Annika format as | ||
Microsoft Excel worksheets for usage with IMP-X-FDR (v1.1.0). | ||
USAGE: | ||
scoutToIMPXFDR.py f [f ...] | ||
[-o OUTPUT] | ||
[-xl CROSSLINKER] | ||
[-xlmod CROSSLINKER_MODIFICATION] | ||
[-h] | ||
[--version] | ||
positional arguments: | ||
f Scout result file to process, if second filename | ||
is given it will be used as the output name! | ||
optional arguments: | ||
-h, --help show this help message and exit. | ||
-o OUTPUT, --output OUTPUT | ||
Name of the output file. | ||
-xl CROSSLINKER, --crosslinker CROSSLINKER | ||
Name of the Crosslinker e.g. DSSO. | ||
-xlmod CROSSLINKER_MODIFICATION, --crosslinker-modification CROSSLINKER_MODIFICATION | ||
Residue that the Crosslinker binds to e.g. K for DSSO. | ||
--version show program's version number and exit. | ||
""" | ||
|
||
#### MS Annika Result columns mapping #### | ||
# Checked: (bool) TRUE | FALSE -> create with FALSE | ||
# Crosslinker: (string) e.g. DSSO -> create with crosslinker name | ||
# Crosslink Type: (string selection) Intra | Inter -> mapped "Link-Type" | ||
# # CSMs: (int) -> mapped "CSM count" | ||
# # Proteins: (int) -> create with zeros | ||
# Sequence A: (string) e.g. [K]SSAAR -> mapped "Alpha peptide" | ||
# Accession A: (string) e.g. P0A7X3 -> mapped "Alpha protein mapping(s)" | ||
# Position A: int -> mapped "Alpha peptide position" | ||
# Sequence B: (string) -> ^ | ||
# Accession B: (string) -> ^ | ||
# Position B: (int) -> ^ | ||
# Protein Descriptions A: (string) -> mapped "Alpha protein mapping(s)" | ||
# Protein Descriptions B: (string) -> ^ | ||
# Best CSM Score: (double) -> mapped "Score" | ||
# In protein A: (int) -> create with zeros | ||
# In protein B: (int) -> create with zeros | ||
# Decoy: (bool) TRUE | FALSE -> create with FALSE | ||
# Modifications A: (string) e.g. K1(DSSO);M1(Oxidation) -> create with xl name and modification | ||
# Modifications B: (string) -> create with xl name and modification | ||
# Confidence: (string selection) High | Medium | Low -> create with High | ||
|
||
# function that returns pandas dataframe in annika format | ||
def create_annika_result(scout_filename: str, crosslinker: str = "DSSO", crosslinker_aa: str = "K") -> pd.DataFrame: | ||
|
||
if len(crosslinker_aa) != 1: | ||
raise Exception("Crosslinker modifications that affect more than one amino acid are not supported! Exiting...") | ||
|
||
# load file | ||
scout_df = pd.read_csv(scout_filename) | ||
nrows = scout_df.shape[0] | ||
|
||
# columns | ||
Checked = ["FALSE" for i in range(nrows)] | ||
Crosslinker = [crosslinker for i in range(nrows)] | ||
Crosslink_Type = scout_df["Link-Type"].apply(lambda x: "Intra" if "intra" in x.lower() else "Inter").tolist() | ||
CSMs = scout_df["CSM count"].tolist() | ||
Proteins = [0 for i in range(nrows)] | ||
Sequence_A = scout_df["Alpha peptide"].apply(lambda x: x.replace(" ", "")).tolist() | ||
Accession_A = scout_df["Alpha protein mapping(s)"].tolist() | ||
Position_A = scout_df["Alpha peptide position"].tolist() | ||
Sequence_B = scout_df["Beta peptide"].apply(lambda x: x.replace(" ", "")).tolist() | ||
Accession_B = scout_df["Beta protein mapping(s)"].tolist() | ||
Position_B = scout_df["Beta peptide position"].tolist() | ||
Protein_Descriptions_A = scout_df["Alpha protein mapping(s)"].tolist() | ||
Protein_Descriptions_B = scout_df["Beta protein mapping(s)"].tolist() | ||
Best_CSM_Score = scout_df["Score"].tolist() | ||
In_protein_A = [0 for i in range(nrows)] | ||
In_protein_B = [0 for i in range(nrows)] | ||
Decoy = ["FALSE" for i in range(nrows)] | ||
Modifications_A = scout_df["Alpha peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist() | ||
Modifications_B = scout_df["Beta peptide position"].apply(lambda x: crosslinker_aa + str(x) + "(" + crosslinker + ")").tolist() | ||
Confidence = ["High" for i in range(nrows)] | ||
|
||
# create annika dataframe | ||
annika_df = pd.DataFrame({"Checked": Checked, | ||
"Crosslinker": Crosslinker, | ||
"Crosslink Type": Crosslink_Type, | ||
"# CSMs": CSMs, | ||
"# Proteins": Proteins, | ||
"Sequence A": Sequence_A, | ||
"Accession A": Accession_A, | ||
"Position A": Position_A, | ||
"Sequence B": Sequence_B, | ||
"Accession B": Accession_B, | ||
"Position B": Position_B, | ||
"Protein Descriptions A": Protein_Descriptions_A, | ||
"Protein Descriptions B": Protein_Descriptions_B, | ||
"Best CSM Score": Best_CSM_Score, | ||
"In protein A": In_protein_A, | ||
"In protein B": In_protein_B, | ||
"Decoy": Decoy, | ||
"Modifications A": Modifications_A, | ||
"Modifications B": Modifications_B, | ||
"Confidence": Confidence}) | ||
|
||
return annika_df | ||
|
||
# read Scout result and write MS Annika result (in xlsx format) | ||
def main() -> pd.DataFrame: | ||
|
||
parser = argparse.ArgumentParser() | ||
parser.add_argument(metavar = "f", | ||
dest = "files", | ||
help = "Scout result file to process, if second filename is given it will be used as the output name!", | ||
type = str, | ||
nargs = "+") | ||
parser.add_argument("-o", "--output", | ||
dest = "output", | ||
default = None, | ||
help = "Name of the output file.", | ||
type = str) | ||
parser.add_argument("-xl", "--crosslinker", | ||
dest = "crosslinker", | ||
default = "DSSO", | ||
help = "Name of the Crosslinker e.g. DSSO.", | ||
type = str) | ||
parser.add_argument("-xlmod", "--crosslinker-modification", | ||
dest = "crosslinker_modification", | ||
default = "K", | ||
help = "Residue that the Crosslinker binds to e.g. K for DSSO.", | ||
type = str) | ||
parser.add_argument("--version", | ||
action = "version", | ||
version = __version) | ||
args = parser.parse_args() | ||
|
||
input_file = args.files[0] | ||
output_file = args.files[0].split(".csv")[0] + ".xlsx" | ||
|
||
if len(args.files) > 1: | ||
output_file = args.files[1].split(".xlsx")[0] + ".xlsx" | ||
|
||
if args.output is not None: | ||
output_file = args.output.split(".xlsx")[0] + ".xlsx" | ||
|
||
scout_resultdf = create_annika_result(input_file, args.crosslinker, args.crosslinker_modification) | ||
scout_resultdf.to_excel(output_file, sheet_name = "Crosslinks", index = False, engine = "xlsxwriter") | ||
|
||
return scout_resultdf | ||
|
||
if __name__ == "__main__": | ||
|
||
scout_df = main() |