diff --git a/spectrum_io/search_result/__init__.py b/spectrum_io/search_result/__init__.py index 1da55b4..0fd408e 100644 --- a/spectrum_io/search_result/__init__.py +++ b/spectrum_io/search_result/__init__.py @@ -5,3 +5,4 @@ from .msfragger import MSFragger from .sage import Sage from .xisearch import Xisearch +from .scout import Scout diff --git a/spectrum_io/search_result/scout.py b/spectrum_io/search_result/scout.py new file mode 100644 index 0000000..6abb71d --- /dev/null +++ b/spectrum_io/search_result/scout.py @@ -0,0 +1,236 @@ +import glob +import logging +import os +import re +from pathlib import Path +from typing import Union + +import numpy as np +import pandas as pd +import spectrum_fundamentals.constants as c +from spectrum_fundamentals.mod_string import xisearch_to_internal + +from .search_results import SearchResults + +logger = logging.getLogger(__name__) + + +class Scout(SearchResults): + """Handle search results from xisearch.""" + + def read_result(self, tmt_labeled: str = "") -> pd.DataFrame: + """ + Function to read a csv of CSMs and perform some basic formatting. + + :param tmt_labeled: tmt label as str + :raises NotImplementedError: if a tmt label is provided + :return: pd.DataFrame with the formatted data + """ + if tmt_labeled != "": + raise NotImplementedError("TMT is not supported for Scout") + + logger.info("Reading search results file...") + columns_to_read = [ + "ScanNumber", + "Charge", + "ExperimentalMZ", + "AlphaPeptide", + "BetaPeptide", + "AlphaPos", + "BetaPos", + "AlphaMappings", + "BetaMappings", + "ClassificationScore", + "FileName", + ] + + df = pd.read_csv(self.path, usecols=columns_to_read) + print(df.shape[0]) + logger.info("Finished reading search results file.") + # Standardize column names + df = Scout.update_columns_for_prosit(df) + df = Scout.filter_valid_prosit_sequences(df) + #df.to_csv("/cmnfs/data/proteomics/XL/Fan_lab/raw_files/raw_files_third_fourth_batch/last_version_scout_both_third_fourth_together/df_filtered_by_oktoberfest.csv") + #df = Scout.filter_duplicates(df) + return df + + @staticmethod + def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame: + """ + keep csm with higher score and remove duplicate (only top ranks) . + + :param df: df to filter + :return: filtered df as pd.DataFrame + """ + repetitive_combinations = df[df.duplicated(subset=['ScanNumber', 'RAW_FILE'], keep=False)] + filtered_df = repetitive_combinations.groupby(['ScanNumber', 'RAW_FILE']).apply(lambda x: x.loc[x['ClassificationScore'].idxmax()]) + filtered_df.reset_index(drop=True, inplace=True) + final_df = pd.concat([df.drop_duplicates(subset=['ScanNumber', 'RAW_FILE'], keep=False), filtered_df]) + final_df.reset_index(drop=True, inplace=True) + df = final_df + return df + + @staticmethod + def extract_modifications(peptide_seq: str): + modifications = [] + # Find all matches of modifications + matches = re.findall(r'([CM])\(\+([\d.]+)\)', peptide_seq) + for match in matches: + mod, _ = match + # Add modification to the list + if mod == 'C': + modifications.append('cm') + elif mod == 'M': + modifications.append('ox') + return ';'.join(modifications) + + + + @staticmethod + def extract_modification_positions(peptide_seq: str): + pattern = r'([A-Z])(\(\+\d+\.\d+\))?' + matches = re.findall(pattern, peptide_seq) + split_peptide = [] + for match in matches: + amino_acid = match[0] + modification = match[1] if match[1] else '' + split_peptide.append(amino_acid + modification) + positions = [str(i + 1) for i, component in enumerate(split_peptide) if '+' in component] + return ';'.join(positions) + + + def self_or_between_mp(df: pd.DataFrame) -> pd.DataFrame: + df['tmp_id'] = df.index + df_expl = df.copy() + df_expl.loc[:,'AlphaMappings'] = df_expl['AlphaMappings'].str.split(';') + df_expl.loc[:,'BetaMappings'] = df_expl['BetaMappings'].str.split(';') + df_expl = df_expl.explode('AlphaMappings') + df_expl = df_expl.explode('BetaMappings') + df_expl.loc[:, 'self'] = False + df_expl.loc[ + df_expl['AlphaMappings']==df_expl['BetaMappings'] + , 'self' + ] = True + id_to_self = df_expl.groupby('tmp_id', dropna=False).agg( + { + 'self': 'max' + } + ).reset_index() + df = df.drop(['self'], axis=1, errors='ignore').merge( + id_to_self, + on = ['tmp_id'], + validate = '1:1' + ) + df.loc[:,'fdr_group'] = df['self'].apply(lambda x: 'self' if x else 'between') + return df + + + + @staticmethod + def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame: + """ + Update columns of df to work with xl-prosit. + + :param df: df to modify + :return: modified df as pd.DataFrame + """ + #Filter csms that does not contain any "k" + df = df[(df["AlphaPeptide"].str.contains("K")) & (df["BetaPeptide"].str.contains("K"))] + print(df.shape[0]) + df['decoy_p1'] = df['AlphaMappings'].str.contains('Reverse').astype(bool) + df['decoy_p2'] = df['BetaMappings'].str.contains('Reverse').astype(bool) + df["protein_p1"] = df["AlphaMappings"] + df["protein_p2"] = df["BetaMappings"] + df["decoy"] = df["decoy_p1"] | df["decoy_p2"] + df["REVERSE"] = df["decoy"] + df['RAW_FILE'] = df['FileName'].apply(lambda x: x.split('\\')[-1]) + df["MASS"] = df["ExperimentalMZ"] + df["PRECURSOR_CHARGE"] = df["Charge"] + df["CROSSLINKER_TYPE"] = "DSSO" + df["crosslinker_name"] = "DSSO" + df["linked_aa_p1"] = "K" + df["linked_aa_p2"] = "K" + df["linear"] = "False" + df["match_score"] = "ClassificationScore" + df["SCORE"] = df["ClassificationScore"] + df["SCAN_NUMBER"] = df["ScanNumber"] + df['SEQUENCE_A'] = df['AlphaPeptide'].apply(lambda x: re.sub(r'\([^)]*\)', '', x)) + df['SEQUENCE_B'] = df['BetaPeptide'].apply(lambda x: re.sub(r'\([^)]*\)', '', x)) + df["base_sequence_p1"] = df["SEQUENCE_A"] + df["base_sequence_p2"] = df["SEQUENCE_B"] + df = df[df.apply(lambda row: row["SEQUENCE_A"][row["AlphaPos"]] == "K" , axis=1)] + df = df[df.apply(lambda row: row["SEQUENCE_B"][row["BetaPos"]] == "K" , axis=1)] + df['Modifications_A'] = df['AlphaPeptide'].apply(Scout.extract_modifications) + df['Modifications_B'] = df['BetaPeptide'].apply(Scout.extract_modifications) + df['mods_p1'] = df['AlphaPeptide'].apply(Scout.extract_modifications) + df['mods_p2'] = df['BetaPeptide'].apply(Scout.extract_modifications) + df['ModificationPositions1'] = df['AlphaPeptide'].apply(Scout.extract_modification_positions) + df['ModificationPositions2'] = df['BetaPeptide'].apply(Scout.extract_modification_positions) + df["CROSSLINKER_POSITION_A"] = df["AlphaPos"] + 1 + df["CROSSLINKER_POSITION_B"] = df["BetaPos"] + 1 + df["mod_pos_p1"] = df["AlphaPos"] + 1 + df["mod_pos_p2"] = df["BetaPos"] + 1 + df["link_pos_p1"] = df["AlphaPos"] + 1 + df["link_pos_p2"] = df["BetaPos"] + 1 + df['PEPTIDE_LENGTH_A'] = df['SEQUENCE_A'].apply(len) + df['PEPTIDE_LENGTH_B'] = df['SEQUENCE_B'].apply(len) + df['aa_len_p1'] = df['SEQUENCE_A'].apply(len) + df['aa_len_p2'] = df['SEQUENCE_B'].apply(len) + df = Scout.self_or_between_mp(df) + df["fdr_group"] = np.where(df["AlphaMappings"].str.replace("Reverse_", "") == df["BetaMappings"].str.replace("Reverse_", ""), "self", "between") + df.drop(columns=['self'], inplace=True) + df.drop(columns=['tmp_id'], inplace=True) + logger.info("Converting Scout peptide sequence to internal format...") + df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "") + df["MODIFIED_SEQUENCE_A"] = df.apply( + lambda row: xisearch_to_internal( + xl=row["CROSSLINKER_TYPE"], + seq=row["SEQUENCE_A"], + mod=row["Modifications_A"], + crosslinker_position=row["CROSSLINKER_POSITION_A"], + mod_positions=row["ModificationPositions1"], + ), + axis=1, + result_type="expand", + ) + df["MODIFIED_SEQUENCE_B"] = df.apply( + lambda row: xisearch_to_internal( + xl=row["CROSSLINKER_TYPE"], + seq=row["SEQUENCE_B"], + mod=row["Modifications_B"], + crosslinker_position=row["CROSSLINKER_POSITION_B"], + mod_positions=row["ModificationPositions2"], + ), + axis=1, + result_type="expand", + ) + new_column_names = { + "FileName": "run_name", + "ScanNumber": "scan_number", + "ExperimentalMZ": "precursor_mass", + "Charge": "precursor_charge", + "scan_number": "ScanNumber" + } + df = df.rename(columns=new_column_names) + return df + + @staticmethod + def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame: + """ + Filter valid Prosit sequences. + + :param df: df to filter + :return: df after filtration + """ + logger.info(f"#sequences before filtering for valid prosit sequences: {len(df.index)}") + + df = df[(df["PEPTIDE_LENGTH_A"] <= 30)] + df = df[df["PEPTIDE_LENGTH_A"] >= 6] + df = df[(df["PEPTIDE_LENGTH_B"] <= 30)] + df = df[df["PEPTIDE_LENGTH_B"] >= 6] + df = df[(~df["SEQUENCE_A"].str.contains(r"B|\*|\.|U|O|X|Z|\(|\)"))] + df = df[(~df["SEQUENCE_B"].str.contains(r"B|\*|\.|U|O|X|Z|\(|\)"))] + df = df[df["PRECURSOR_CHARGE"] <= 6] + logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}") + + return df diff --git a/spectrum_io/search_result/xisearch.py b/spectrum_io/search_result/xisearch.py index 5e1bbd9..21c1946 100644 --- a/spectrum_io/search_result/xisearch.py +++ b/spectrum_io/search_result/xisearch.py @@ -4,11 +4,12 @@ import re from pathlib import Path from typing import Union - +from math import ceil import numpy as np import pandas as pd import spectrum_fundamentals.constants as c from spectrum_fundamentals.mod_string import xisearch_to_internal +import multiprocess as mp from .search_results import SearchResults @@ -43,6 +44,7 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame: "linked_aa_p1", "mods_p1", "mod_pos_p1", + "protein_p1", "decoy_p2", "base_sequence_p2", "aa_len_p2", @@ -50,6 +52,7 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame: "linked_aa_p2", "mods_p2", "mod_pos_p2", + "protein_p2", "linear", "match_score", ] @@ -62,6 +65,10 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame: df = Xisearch.filter_xisearch_result(df) df = Xisearch.update_columns_for_prosit(df) df = Xisearch.filter_valid_prosit_sequences(df) + #df = Xisearch.filter_duplicates(df) + df = Xisearch.fdr_group(df, fdr_group_col=None, decoy_class=None) + df.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/df_filtered_2.csv") + return df @staticmethod @@ -72,6 +79,8 @@ def filter_xisearch_result(df: pd.DataFrame) -> pd.DataFrame: :param df: df to filter :return: filtered df as pd.DataFrame """ + df['linear'] = df['linear'].fillna(True) + df['linear'] = df['linear'].astype(bool) df = df[~df["linear"]] df = df[df["linked_aa_p1"].notna() & df["linked_aa_p1"].str.contains("K")] df = df[df["linked_aa_p2"].notna() & df["linked_aa_p2"].str.contains("K")] @@ -80,6 +89,67 @@ def filter_xisearch_result(df: pd.DataFrame) -> pd.DataFrame: return df + @staticmethod + def self_or_between(df): + df.loc[:,'protein_p1_arr'] = df.loc[:,'protein_p1'].astype(str).str.replace('REV_','').str.split(';').map(np.unique).map(list) + df.loc[:,'protein_p2_arr'] = df.loc[:,'protein_p2'].astype(str).str.replace('REV_','').str.split(';').map(np.unique).map(list) + df.loc[:,'proteins_arr'] = (df.loc[:,'protein_p1_arr'] + df.loc[:,'protein_p2_arr']) + df.loc[:,'proteins_arr_unique'] = df.loc[:,'proteins_arr'].map(np.unique) + df.loc[:,'is_between'] = ~(df.loc[:,'proteins_arr'].map(len) - df.loc[:,'proteins_arr_unique'].map(len)).astype(bool) + df.drop( + [ + 'protein_p1_arr', + 'protein_p2_arr', + 'proteins_arr', + 'proteins_arr_unique' + ], + axis=1, + inplace=True + ) + return df.loc[:,'is_between'].map({True: 'between', False: 'self'}) + + @staticmethod + def self_or_between_mp(df): + pool_size = min([10,os.cpu_count()]) + slice_size = ceil(len(df)/pool_size) + cols = ['protein_p1', 'protein_p2'] + df = df[cols] + df_slices = [ + df.iloc[i*slice_size:(i+1)*slice_size][cols] + for i in range(pool_size) + ] + print('slicing done') + print(f"Pool size: {pool_size}") + with mp.Pool(processes=pool_size) as pool: + map_res = pool.map(Xisearch.self_or_between, df_slices) + return pd.concat(map_res).copy() + + + @staticmethod + def fdr_group(df, fdr_group_col=None, decoy_class=None): + original_order = df.index + if fdr_group_col is None: + df.loc[:,'fdr_group'] = Xisearch.self_or_between_mp(df) + return df + + + + @staticmethod + def filter_duplicates(df: pd.DataFrame) -> pd.DataFrame: + """ + keep csm with higher score and remove duplicate (only top ranks) . + + :param df: df to filter + :return: filtered df as pd.DataFrame + """ + repetitive_combinations = df[df.duplicated(subset=['scan_number', 'run_name'], keep=False)] + filtered_df = repetitive_combinations.groupby(['scan_number', 'run_name']).apply(lambda x: x.loc[x['match_score'].idxmax()]) + filtered_df.reset_index(drop=True, inplace=True) + final_df = pd.concat([df.drop_duplicates(subset=['scan_number', 'run_name'], keep=False), filtered_df]) + final_df.reset_index(drop=True, inplace=True) + df = final_df + return df + @staticmethod def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame: """ @@ -88,7 +158,10 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame: :param df: df to modify :return: modified df as pd.DataFrame """ + df['crosslinker_name'] = df['crosslinker_name'].replace(to_replace='*', value='DSSO') df["decoy"] = df["decoy_p1"] | df["decoy_p2"] + df['run_name'] = df['run_name'].str.replace('-', '_') + df.to_csv("/cmnfs/data/proteomics/XL/juri_lab/MycoEcoliRaw/test/df_filtered_1.csv") df["RAW_FILE"] = df["run_name"] df["MASS"] = df["precursor_mass"] df["PRECURSOR_CHARGE"] = df["precursor_charge"] @@ -106,7 +179,7 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame: df["ModificationPositions2"] = df["mod_pos_p2"] df["PEPTIDE_LENGTH_A"] = df["aa_len_p1"] df["PEPTIDE_LENGTH_B"] = df["aa_len_p2"] - logger.info("Converting XIsearch peptide sequence to internal format...") + logger.info("Converting Xisearch peptide sequence to internal format...") df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "") diff --git a/tests/unit_tests/data/peptide_protein_map.tsv b/tests/unit_tests/data/peptide_protein_map.tsv new file mode 100644 index 0000000..6947cd5 --- /dev/null +++ b/tests/unit_tests/data/peptide_protein_map.tsv @@ -0,0 +1,7 @@ +TYDDATK Protein4 +TFTVTE Protein4 +TTADDYT REV__Protein4 +ASPTQPIQLTK Protein5 +EASPTQPIQLTK Protein5 +ETLQIPQTPSAK REV__Protein5 +ETLQIPQTPSA REV__Protein5 diff --git a/tests/unit_tests/data/peptide_protein_map.tsv.params.txt b/tests/unit_tests/data/peptide_protein_map.tsv.params.txt new file mode 100644 index 0000000..4424a1b --- /dev/null +++ b/tests/unit_tests/data/peptide_protein_map.tsv.params.txt @@ -0,0 +1 @@ +/cmnfs/home/m.kalhor/miniconda3/envs/oktoberfest/bin/pytest \ No newline at end of file diff --git a/tests/unit_tests/data/prosit_input_with_proteins.csv b/tests/unit_tests/data/prosit_input_with_proteins.csv new file mode 100644 index 0000000..78157d8 --- /dev/null +++ b/tests/unit_tests/data/prosit_input_with_proteins.csv @@ -0,0 +1,22 @@ +modified_sequence,collision_energy,precursor_charge,protein,fragmentation +TYDDATK,30,2,Protein4,CID +TYDDATK,30,3,Protein4,CID +TYDDATK,30,4,Protein4,CID +TFTVTE,30,2,Protein4,CID +TFTVTE,30,3,Protein4,CID +TFTVTE,30,4,Protein4,CID +TTADDYT,30,2,REV__Protein4,CID +TTADDYT,30,3,REV__Protein4,CID +TTADDYT,30,4,REV__Protein4,CID +ASPTQPIQLTK,30,2,Protein5,CID +ASPTQPIQLTK,30,3,Protein5,CID +ASPTQPIQLTK,30,4,Protein5,CID +EASPTQPIQLTK,30,2,Protein5,CID +EASPTQPIQLTK,30,3,Protein5,CID +EASPTQPIQLTK,30,4,Protein5,CID +ETLQIPQTPSAK,30,2,REV__Protein5,CID +ETLQIPQTPSAK,30,3,REV__Protein5,CID +ETLQIPQTPSAK,30,4,REV__Protein5,CID +ETLQIPQTPSA,30,2,REV__Protein5,CID +ETLQIPQTPSA,30,3,REV__Protein5,CID +ETLQIPQTPSA,30,4,REV__Protein5,CID