Skip to content

Commit

Permalink
This is the version that the paper results come from
Browse files Browse the repository at this point in the history
  • Loading branch information
mostafakalhor committed Oct 15, 2024
1 parent 100afb2 commit f689768
Show file tree
Hide file tree
Showing 6 changed files with 342 additions and 2 deletions.
1 change: 1 addition & 0 deletions spectrum_io/search_result/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .msfragger import MSFragger
from .sage import Sage
from .xisearch import Xisearch
from .scout import Scout
236 changes: 236 additions & 0 deletions spectrum_io/search_result/scout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
import glob
import logging
import os
import re
from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import xisearch_to_internal

from .search_results import SearchResults

logger = logging.getLogger(__name__)


class Scout(SearchResults):
    """Handle search results from the Scout cross-linking search engine."""

    def read_result(self, tmt_labeled: str = "") -> pd.DataFrame:
        """
        Read a csv of CSMs and perform some basic formatting.

        :param tmt_labeled: tmt label as str
        :raises NotImplementedError: if a tmt label is provided
        :return: pd.DataFrame with the formatted data
        """
        if tmt_labeled != "":
            raise NotImplementedError("TMT is not supported for Scout")

        logger.info("Reading search results file...")
        columns_to_read = [
            "ScanNumber",
            "Charge",
            "ExperimentalMZ",
            "AlphaPeptide",
            "BetaPeptide",
            "AlphaPos",
            "BetaPos",
            "AlphaMappings",
            "BetaMappings",
            "ClassificationScore",
            "FileName",
        ]

        df = pd.read_csv(self.path, usecols=columns_to_read)
        # use logging instead of print for the CSM count
        logger.info("Finished reading search results file (%d CSMs).", df.shape[0])
        # Standardize column names for downstream xl-prosit processing.
        df = Scout.update_columns_for_prosit(df)
        df = Scout.filter_valid_prosit_sequences(df)
        return df

    @staticmethod
    def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Keep the CSM with the higher score and remove duplicates (only top ranks).

        :param df: df to filter
        :return: filtered df as pd.DataFrame
        """
        key_cols = ["ScanNumber", "RAW_FILE"]
        # rows that share a (scan, raw file) key with at least one other row
        repetitive_combinations = df[df.duplicated(subset=key_cols, keep=False)]
        # within each duplicated group keep the best-scoring CSM
        filtered_df = repetitive_combinations.groupby(key_cols).apply(
            lambda x: x.loc[x["ClassificationScore"].idxmax()]
        )
        filtered_df.reset_index(drop=True, inplace=True)
        final_df = pd.concat([df.drop_duplicates(subset=key_cols, keep=False), filtered_df])
        final_df.reset_index(drop=True, inplace=True)
        return final_df

    @staticmethod
    def extract_modifications(peptide_seq: str) -> str:
        """
        Extract variable modifications from a Scout peptide string.

        Modifications are encoded inline, e.g. ``M(+15.9949)``; only
        C ('cm', carbamidomethylation) and M ('ox', oxidation) are recognized.

        :param peptide_seq: peptide sequence as reported by Scout
        :return: semicolon-separated modification codes, in sequence order
        """
        modifications = []
        for mod, _ in re.findall(r"([CM])\(\+([\d.]+)\)", peptide_seq):
            if mod == "C":
                modifications.append("cm")
            elif mod == "M":
                modifications.append("ox")
        return ";".join(modifications)

    @staticmethod
    def extract_modification_positions(peptide_seq: str) -> str:
        """
        Determine 1-based residue positions of inline modifications.

        :param peptide_seq: peptide sequence as reported by Scout
        :return: semicolon-separated 1-based positions of modified residues
        """
        pattern = r"([A-Z])(\(\+\d+\.\d+\))?"
        split_peptide = []
        for amino_acid, modification in re.findall(pattern, peptide_seq):
            split_peptide.append(amino_acid + (modification or ""))
        positions = [str(i + 1) for i, component in enumerate(split_peptide) if "+" in component]
        return ";".join(positions)

    @staticmethod
    def self_or_between_mp(df: pd.DataFrame) -> pd.DataFrame:
        """
        Flag each CSM as intra-protein ('self') or inter-protein ('between').

        A CSM counts as 'self' if any ';'-separated protein mapping of the
        alpha peptide equals any mapping of the beta peptide.

        :param df: df with 'AlphaMappings' and 'BetaMappings' columns
        :return: df with added 'fdr_group', 'self' and 'tmp_id' columns
        """
        df["tmp_id"] = df.index
        df_expl = df.copy()
        df_expl.loc[:, "AlphaMappings"] = df_expl["AlphaMappings"].str.split(";")
        df_expl.loc[:, "BetaMappings"] = df_expl["BetaMappings"].str.split(";")
        df_expl = df_expl.explode("AlphaMappings")
        df_expl = df_expl.explode("BetaMappings")
        df_expl.loc[:, "self"] = False
        df_expl.loc[df_expl["AlphaMappings"] == df_expl["BetaMappings"], "self"] = True
        # a CSM is 'self' if any alpha/beta mapping combination matched
        id_to_self = df_expl.groupby("tmp_id", dropna=False).agg({"self": "max"}).reset_index()
        df = df.drop(["self"], axis=1, errors="ignore").merge(id_to_self, on=["tmp_id"], validate="1:1")
        df.loc[:, "fdr_group"] = df["self"].apply(lambda x: "self" if x else "between")
        return df

    @staticmethod
    def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame:
        """
        Update columns of df to work with xl-prosit.

        :param df: df to modify
        :return: modified df as pd.DataFrame
        """
        # Keep only CSMs where both peptides contain a cross-linkable lysine (K);
        # .copy() avoids SettingWithCopy warnings on the assignments below.
        df = df[(df["AlphaPeptide"].str.contains("K")) & (df["BetaPeptide"].str.contains("K"))].copy()
        logger.info("%d CSMs with K in both peptides", df.shape[0])
        df["decoy_p1"] = df["AlphaMappings"].str.contains("Reverse").astype(bool)
        df["decoy_p2"] = df["BetaMappings"].str.contains("Reverse").astype(bool)
        df["protein_p1"] = df["AlphaMappings"]
        df["protein_p2"] = df["BetaMappings"]
        # a CSM is a decoy if either peptide maps to a reversed protein
        df["decoy"] = df["decoy_p1"] | df["decoy_p2"]
        df["REVERSE"] = df["decoy"]
        # FileName is a Windows path; keep only the basename
        df["RAW_FILE"] = df["FileName"].apply(lambda x: x.split("\\")[-1])
        df["MASS"] = df["ExperimentalMZ"]
        df["PRECURSOR_CHARGE"] = df["Charge"]
        df["CROSSLINKER_TYPE"] = "DSSO"
        df["crosslinker_name"] = "DSSO"
        df["linked_aa_p1"] = "K"
        df["linked_aa_p2"] = "K"
        df["linear"] = "False"
        # BUGFIX: copy the score column, not the literal string "ClassificationScore"
        df["match_score"] = df["ClassificationScore"]
        df["SCORE"] = df["ClassificationScore"]
        df["SCAN_NUMBER"] = df["ScanNumber"]
        # plain sequences with inline modification annotations stripped
        df["SEQUENCE_A"] = df["AlphaPeptide"].apply(lambda x: re.sub(r"\([^)]*\)", "", x))
        df["SEQUENCE_B"] = df["BetaPeptide"].apply(lambda x: re.sub(r"\([^)]*\)", "", x))
        df["base_sequence_p1"] = df["SEQUENCE_A"]
        df["base_sequence_p2"] = df["SEQUENCE_B"]
        # sanity filter: the reported (0-based) link position must be a lysine
        df = df[df.apply(lambda row: row["SEQUENCE_A"][row["AlphaPos"]] == "K", axis=1)]
        df = df[df.apply(lambda row: row["SEQUENCE_B"][row["BetaPos"]] == "K", axis=1)].copy()
        df["Modifications_A"] = df["AlphaPeptide"].apply(Scout.extract_modifications)
        df["Modifications_B"] = df["BetaPeptide"].apply(Scout.extract_modifications)
        df["mods_p1"] = df["Modifications_A"]
        df["mods_p2"] = df["Modifications_B"]
        df["ModificationPositions1"] = df["AlphaPeptide"].apply(Scout.extract_modification_positions)
        df["ModificationPositions2"] = df["BetaPeptide"].apply(Scout.extract_modification_positions)
        # convert 0-based Scout positions to 1-based positions
        df["CROSSLINKER_POSITION_A"] = df["AlphaPos"] + 1
        df["CROSSLINKER_POSITION_B"] = df["BetaPos"] + 1
        df["mod_pos_p1"] = df["AlphaPos"] + 1
        df["mod_pos_p2"] = df["BetaPos"] + 1
        df["link_pos_p1"] = df["AlphaPos"] + 1
        df["link_pos_p2"] = df["BetaPos"] + 1
        df["PEPTIDE_LENGTH_A"] = df["SEQUENCE_A"].apply(len)
        df["PEPTIDE_LENGTH_B"] = df["SEQUENCE_B"].apply(len)
        df["aa_len_p1"] = df["PEPTIDE_LENGTH_A"]
        df["aa_len_p2"] = df["PEPTIDE_LENGTH_B"]
        # 'self' when both peptides map to the same protein (ignoring decoy prefix),
        # otherwise 'between'. (The previous explode-based computation was redundant:
        # its result was immediately overwritten by this assignment.)
        df["fdr_group"] = np.where(
            df["AlphaMappings"].str.replace("Reverse_", "")
            == df["BetaMappings"].str.replace("Reverse_", ""),
            "self",
            "between",
        )
        logger.info("Converting Scout peptide sequence to internal format...")
        df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
        df["MODIFIED_SEQUENCE_A"] = df.apply(
            lambda row: xisearch_to_internal(
                xl=row["CROSSLINKER_TYPE"],
                seq=row["SEQUENCE_A"],
                mod=row["Modifications_A"],
                crosslinker_position=row["CROSSLINKER_POSITION_A"],
                mod_positions=row["ModificationPositions1"],
            ),
            axis=1,
            result_type="expand",
        )
        df["MODIFIED_SEQUENCE_B"] = df.apply(
            lambda row: xisearch_to_internal(
                xl=row["CROSSLINKER_TYPE"],
                seq=row["SEQUENCE_B"],
                mod=row["Modifications_B"],
                crosslinker_position=row["CROSSLINKER_POSITION_B"],
                mod_positions=row["ModificationPositions2"],
            ),
            axis=1,
            result_type="expand",
        )
        new_column_names = {
            "FileName": "run_name",
            "ScanNumber": "scan_number",
            "ExperimentalMZ": "precursor_mass",
            "Charge": "precursor_charge",
        }
        df = df.rename(columns=new_column_names)
        return df

    @staticmethod
    def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
        """
        Filter valid Prosit sequences.

        Prosit supports peptides of length 6-30, charge up to 6, and only
        standard amino acids.

        :param df: df to filter
        :return: df after filtration
        """
        logger.info(f"#sequences before filtering for valid prosit sequences: {len(df.index)}")

        df = df[(df["PEPTIDE_LENGTH_A"] <= 30)]
        df = df[df["PEPTIDE_LENGTH_A"] >= 6]
        df = df[(df["PEPTIDE_LENGTH_B"] <= 30)]
        df = df[df["PEPTIDE_LENGTH_B"] >= 6]
        # drop sequences with non-standard residues or leftover annotation chars
        df = df[(~df["SEQUENCE_A"].str.contains(r"B|\*|\.|U|O|X|Z|\(|\)"))]
        df = df[(~df["SEQUENCE_B"].str.contains(r"B|\*|\.|U|O|X|Z|\(|\)"))]
        df = df[df["PRECURSOR_CHARGE"] <= 6]
        logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}")

        return df
77 changes: 75 additions & 2 deletions spectrum_io/search_result/xisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import re
from pathlib import Path
from typing import Union

from math import ceil
import numpy as np
import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import xisearch_to_internal
import multiprocess as mp

from .search_results import SearchResults

Expand Down Expand Up @@ -43,13 +44,15 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame:
"linked_aa_p1",
"mods_p1",
"mod_pos_p1",
"protein_p1",
"decoy_p2",
"base_sequence_p2",
"aa_len_p2",
"link_pos_p2",
"linked_aa_p2",
"mods_p2",
"mod_pos_p2",
"protein_p2",
"linear",
"match_score",
]
Expand All @@ -62,6 +65,10 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame:
df = Xisearch.filter_xisearch_result(df)
df = Xisearch.update_columns_for_prosit(df)
df = Xisearch.filter_valid_prosit_sequences(df)
#df = Xisearch.filter_duplicates(df)
df = Xisearch.fdr_group(df, fdr_group_col=None, decoy_class=None)
df.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/df_filtered_2.csv")

return df

@staticmethod
Expand All @@ -72,6 +79,8 @@ def filter_xisearch_result(df: pd.DataFrame) -> pd.DataFrame:
:param df: df to filter
:return: filtered df as pd.DataFrame
"""
df['linear'] = df['linear'].fillna(True)
df['linear'] = df['linear'].astype(bool)
df = df[~df["linear"]]
df = df[df["linked_aa_p1"].notna() & df["linked_aa_p1"].str.contains("K")]
df = df[df["linked_aa_p2"].notna() & df["linked_aa_p2"].str.contains("K")]
Expand All @@ -80,6 +89,67 @@ def filter_xisearch_result(df: pd.DataFrame) -> pd.DataFrame:

return df

@staticmethod
def self_or_between(df):
df.loc[:,'protein_p1_arr'] = df.loc[:,'protein_p1'].astype(str).str.replace('REV_','').str.split(';').map(np.unique).map(list)
df.loc[:,'protein_p2_arr'] = df.loc[:,'protein_p2'].astype(str).str.replace('REV_','').str.split(';').map(np.unique).map(list)
df.loc[:,'proteins_arr'] = (df.loc[:,'protein_p1_arr'] + df.loc[:,'protein_p2_arr'])
df.loc[:,'proteins_arr_unique'] = df.loc[:,'proteins_arr'].map(np.unique)
df.loc[:,'is_between'] = ~(df.loc[:,'proteins_arr'].map(len) - df.loc[:,'proteins_arr_unique'].map(len)).astype(bool)
df.drop(
[
'protein_p1_arr',
'protein_p2_arr',
'proteins_arr',
'proteins_arr_unique'
],
axis=1,
inplace=True
)
return df.loc[:,'is_between'].map({True: 'between', False: 'self'})

    @staticmethod
    def self_or_between_mp(df):
        """Parallel wrapper around :meth:`self_or_between`.

        Splits the frame into up to 10 slices (capped by the CPU count) and
        classifies each slice in a separate worker process, concatenating the
        resulting 'self'/'between' Series in original row order.

        NOTE(review): uses the third-party ``multiprocess`` package (imported
        as ``mp``); the ``print`` calls look like leftover debug output —
        consider switching to the module logger.

        :param df: df with 'protein_p1' and 'protein_p2' columns
        :return: Series of 'self'/'between' labels aligned to df's index
        """
        # never spawn more workers than CPUs, and at most 10
        pool_size = min([10,os.cpu_count()])
        # ceil so that pool_size slices always cover every row
        slice_size = ceil(len(df)/pool_size)
        cols = ['protein_p1', 'protein_p2']
        # only the two protein columns are needed by the workers; keeping the
        # payload small reduces pickling overhead
        df = df[cols]
        df_slices = [
            df.iloc[i*slice_size:(i+1)*slice_size][cols]
            for i in range(pool_size)
        ]
        print('slicing done')
        print(f"Pool size: {pool_size}")
        with mp.Pool(processes=pool_size) as pool:
            map_res = pool.map(Xisearch.self_or_between, df_slices)
        # workers preserve slice-local indices, so concat restores alignment
        return pd.concat(map_res).copy()


@staticmethod
def fdr_group(df, fdr_group_col=None, decoy_class=None):
original_order = df.index
if fdr_group_col is None:
df.loc[:,'fdr_group'] = Xisearch.self_or_between_mp(df)
return df



@staticmethod
def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame:
"""
keep csm with higher score and remove duplicate (only top ranks) .
:param df: df to filter
:return: filtered df as pd.DataFrame
"""
repetitive_combinations = df[df.duplicated(subset=['scan_number', 'run_name'], keep=False)]
filtered_df = repetitive_combinations.groupby(['scan_number', 'run_name']).apply(lambda x: x.loc[x['match_score'].idxmax()])
filtered_df.reset_index(drop=True, inplace=True)
final_df = pd.concat([df.drop_duplicates(subset=['scan_number', 'run_name'], keep=False), filtered_df])
final_df.reset_index(drop=True, inplace=True)
df = final_df
return df

@staticmethod
def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame:
"""
Expand All @@ -88,7 +158,10 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame:
:param df: df to modify
:return: modified df as pd.DataFrame
"""
df['crosslinker_name'] = df['crosslinker_name'].replace(to_replace='*', value='DSSO')
df["decoy"] = df["decoy_p1"] | df["decoy_p2"]
df['run_name'] = df['run_name'].str.replace('-', '_')
df.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/df_filtered_1.csv")
df["RAW_FILE"] = df["run_name"]
df["MASS"] = df["precursor_mass"]
df["PRECURSOR_CHARGE"] = df["precursor_charge"]
Expand All @@ -106,7 +179,7 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame:
df["ModificationPositions2"] = df["mod_pos_p2"]
df["PEPTIDE_LENGTH_A"] = df["aa_len_p1"]
df["PEPTIDE_LENGTH_B"] = df["aa_len_p2"]
logger.info("Converting XIsearch peptide sequence to internal format...")
logger.info("Converting Xisearch peptide sequence to internal format...")

df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")

Expand Down
7 changes: 7 additions & 0 deletions tests/unit_tests/data/peptide_protein_map.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
TYDDATK Protein4
TFTVTE Protein4
TTADDYT REV__Protein4
ASPTQPIQLTK Protein5
EASPTQPIQLTK Protein5
ETLQIPQTPSAK REV__Protein5
ETLQIPQTPSA REV__Protein5
1 change: 1 addition & 0 deletions tests/unit_tests/data/peptide_protein_map.tsv.params.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/cmnfs/home/m.kalhor/miniconda3/envs/oktoberfest/bin/pytest
Loading

0 comments on commit f689768

Please sign in to comment.