diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e2df489..cdbdd14 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0 +current_version = 0.4.1 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/alpharaw/__init__.py b/alpharaw/__init__.py index 87a572e..038ea27 100644 --- a/alpharaw/__init__.py +++ b/alpharaw/__init__.py @@ -15,7 +15,7 @@ def register_readers(): print("[WARN] pythonnet is not installed") __project__ = "alpharaw" -__version__ = "0.4.0" +__version__ = "0.4.1" __license__ = "Apache" __description__ = "An open-source Python package to unify raw MS data access and storage." __author__ = "Mann Labs" diff --git a/alpharaw/dia/normal_dia.py b/alpharaw/dia/normal_dia.py index f23126e..d93c2d0 100644 --- a/alpharaw/dia/normal_dia.py +++ b/alpharaw/dia/normal_dia.py @@ -1,7 +1,9 @@ import pandas as pd - +import numpy as np import typing +from collections import defaultdict + from alpharaw.ms_data_base import MSData_Base from alpharaw.utils.df_processing import remove_unused_peaks @@ -15,7 +17,17 @@ def __init__(self, ms_data: MSData_Base): "dia_group" ] = ms_data.spectrum_df.precursor_mz.astype(int) - def get_grouped_ms_data(self, + self.dia_group_dfs = self.ms_data.spectrum_df.groupby("dia_group") + self.dia_isolation_dict = {} + for dia_group, df in self.dia_group_dfs: + if dia_group == -1: continue + self.dia_isolation_dict[dia_group] = ( + df.isolation_lower_mz.values[0], + df.isolation_upper_mz.values[0] + ) + self.dia_groups = np.sort(list(self.dia_isolation_dict.keys())) + + def get_ms_data_for_a_group(self, dia_group:int=-1, return_alpharaw_data: bool=True, return_alphatims_data: bool=True, @@ -32,7 +44,7 @@ def get_grouped_ms_data(self, TimsTOF: Alphatims object for the window, if `return_alphatims_data==True` """ - spec_df = self.ms_data.spectrum_df.query(f"dia_group == {dia_group}") + spec_df = self.dia_group_dfs.get_group(dia_group) if return_alphatims_data: ms_data, ms_tims = convert_to_alphatims( @@ -52,4 +64,26 @@ def get_grouped_ms_data(self, ms_data.spectrum_df = spec_df ms_data.peak_df = peak_df return ms_data - + + def assign_dia_groups(self, + precursor_mzs + )->typing.DefaultDict[typing.List, typing.List]: + dia_precursor_groups = defaultdict(list) + for i, mz in enumerate(precursor_mzs): + i_group = np.searchsorted(self.dia_groups, int(mz)) + if i_group == 0: + i_group = 1 + elif i_group == len(self.dia_groups): + i_group -= 1 + dia_group = self.dia_groups[i_group] + if dia_group in self.dia_isolation_dict: + isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group] + if mz >= isolation_lower and mz <= isolation_upper: + dia_precursor_groups[dia_group].append(i) + continue + dia_group = self.dia_groups[i_group-1] + if dia_group in self.dia_isolation_dict: + isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group] + if mz >= isolation_lower and mz <= isolation_upper: + dia_precursor_groups[dia_group].append(i) + return dia_precursor_groups \ No newline at end of file diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py index a25e99a..bcdb794 100644 --- a/alpharaw/match/psm_match.py +++ b/alpharaw/match/psm_match.py @@ -3,18 +3,6 @@ import pandas as pd import tqdm from typing import Union, Tuple -import multiprocessing as mp -from alphabase.utils import process_bar -try: - from alpharaw.thermo import ThermoRawData - from alpharaw.sciex import SciexWiffData -except: - pass - -from alpharaw.mzml import MzMLReader -from 
alpharaw.wrappers.alphapept_wrapper import AlphaPept_HDF_MS2_Reader -from alpharaw.legacy_msdata.mgf import MGFReader - from alphabase.peptide.fragment import ( create_fragment_mz_dataframe, @@ -30,91 +18,10 @@ from alpharaw.match.match_utils import ( match_closest_peaks, match_highest_peaks, ) -from ..utils.ms_path_utils import parse_ms_files_to_dict - -@numba.njit -def match_one_raw_with_numba( - spec_idxes, frag_start_idxes, frag_stop_idxes, - all_frag_mzs, all_frag_mz_tols, - all_spec_mzs, all_spec_intensities, - peak_start_idxes, peak_stop_idxes, - matched_intensities, matched_mz_errs, - matched_closest=True, -): - """ - Internel function to match fragment mz values to spectrum mz values. - Matched_mz_errs[i] = np.inf if no peaks are matched. +from alpharaw.utils.ms_path_utils import parse_ms_files_to_dict - Results will saved in place of matched_intensities - and matched_mz_errs. - """ - for spec_idx, frag_start, frag_stop in zip( - spec_idxes, frag_start_idxes, frag_stop_idxes - ): - if spec_idx == -1: continue - peak_start = peak_start_idxes[spec_idx] - peak_stop = peak_stop_idxes[spec_idx] - if peak_stop == peak_start: continue - spec_mzs = all_spec_mzs[peak_start:peak_stop] - spec_intens = all_spec_intensities[peak_start:peak_stop] +from alpharaw.dia.normal_dia import NormalDIAGrouper - frag_mzs = all_frag_mzs[frag_start:frag_stop,:].copy() - frag_mz_tols = all_frag_mz_tols[frag_start:frag_stop,:].copy() - - if matched_closest: - matched_idxes = match_closest_peaks( - spec_mzs, spec_intens, - frag_mzs, frag_mz_tols - ).reshape(-1) - else: - matched_idxes = match_highest_peaks( - spec_mzs, spec_intens, - frag_mzs, frag_mz_tols - ).reshape(-1) - - matched_intens = spec_intens[matched_idxes] - matched_intens[matched_idxes==-1] = 0 - - matched_mass_errs = np.abs( - spec_mzs[ - matched_idxes.reshape(-1) - ]-frag_mzs.reshape(-1) - ) - matched_mass_errs[matched_idxes==-1] = np.inf - - matched_intensities[ - frag_start:frag_stop,: - ] = matched_intens.reshape(frag_mzs.shape) - - matched_mz_errs[ - frag_start:frag_stop,: - ] = matched_mass_errs.reshape(frag_mzs.shape) - -def load_ms_data( - ms_file:Union[str, MSData_Base], - ms_file_type:str='alpharaw_hdf', - process_count:int = 8, -)->MSData_Base: - """Load MS files - - Parameters - ---------- - ms_file : str | MSData_Base - ms2 file path - - ms_file_type : str, optional - ms2 file type, could be - ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"]. 
- Default to 'alpharaw_hdf' - """ - if isinstance(ms_file, MSData_Base): - return ms_file - else: - raw_data = ms_reader_provider.get_reader( - ms_file_type, process_count=process_count - ) - raw_data.import_raw(ms_file) - return raw_data class PepSpecMatch: """ @@ -336,7 +243,7 @@ def match_ms2_one_raw(self, self.psm_df = psm_df_one_raw psm_df_one_raw = self._add_missing_columns_to_psm_df( - psm_df_one_raw + psm_df_one_raw, self.raw_data ) ( @@ -373,9 +280,8 @@ def match_ms2_one_raw(self, ) def _match_ms2_one_raw_numba(self, - raw_name_psm_df_one_raw:tuple + raw_name:str, psm_df_one_raw:pd.DataFrame ): - raw_name, psm_df_one_raw = raw_name_psm_df_one_raw if raw_name in self._ms_file_dict: raw_data = load_ms_data( self._ms_file_dict[raw_name], self._ms_file_type, @@ -407,6 +313,7 @@ def _match_ms2_one_raw_numba(self, ) else: print(f"`{raw_name}` is not found in ms_file_dict.") + return psm_df_one_raw def match_ms2_multi_raw(self, psm_df: pd.DataFrame, @@ -465,10 +372,11 @@ def match_ms2_multi_raw(self, self.ms_loader_thread_num = process_num # if process_num <= 1 or len(self._ms_file_dict) <= 1: + psm_df_list = [] for raw_name, df_group in tqdm.tqdm( self.psm_df.groupby('raw_name') ): - self._match_ms2_one_raw_numba((raw_name, df_group)) + psm_df_list.append(self._match_ms2_one_raw_numba(raw_name, df_group)) # else: # with mp.get_context("spawn").Pool(processes=process_num) as p: # df_groupby = self.psm_df.groupby('raw_name') @@ -481,96 +389,21 @@ def match_ms2_multi_raw(self, # gen_group_df(df_groupby) # ), df_groupby.ngroups # ) - + + self.psm_df = pd.concat(psm_df_list, ignore_index=True) return ( self.psm_df, self.fragment_mz_df, self.matched_intensity_df, self.matched_mz_err_df ) - -@numba.njit -def get_best_matched_intens( - matched_intensity_values:np.ndarray, - frag_start_idxes:np.ndarray, - frag_stop_idxes:np.ndarray, -): - ret_intens = np.zeros( - shape = matched_intensity_values.shape[1:], - dtype=matched_intensity_values.dtype - ) - for i in range(len(frag_start_idxes)): - start = frag_start_idxes[i] - stop = frag_stop_idxes[i] - i = np.argmax(np.matmul( - matched_intensity_values[:,start:stop,:], - matched_intensity_values[:,start:stop,:].T - ).sum()) - ret_intens[start:stop,:] = matched_intensity_values[i,start:stop,:] - return ret_intens - -@numba.njit -def get_ion_count_scores( - frag_mz_values:np.ndarray, - matched_intens:np.ndarray, - frag_start_idxes:np.ndarray, - frag_stop_idxes:np.ndarray, - min_mz:float = 200, -): - scores = [] - for i in range(len(frag_start_idxes)): - scores.append(np.count_nonzero( - matched_intens[ - frag_start_idxes[i]:frag_stop_idxes[i],: - ].copy().reshape(-1)[ - frag_mz_values[ - frag_start_idxes[i]:frag_stop_idxes[i],: - ].copy().reshape(-1)>=min_mz - ] - )) - return np.array(scores,np.int32) - -@numba.njit -def get_dia_spec_idxes( - spec_rt_values:np.ndarray, - spec_isolation_lower_mz_values:np.ndarray, - spec_isolation_upper_mz_values:np.ndarray, - query_start_rt_values:np.ndarray, - query_stop_rt_values:np.ndarray, - query_mz_values:np.ndarray, - max_spec_per_query:int, -): - rt_start_idxes = np.searchsorted(spec_rt_values, query_start_rt_values) - rt_stop_idxes = np.searchsorted(spec_rt_values, query_stop_rt_values) - - spec_idxes = np.full( - (len(query_mz_values),max_spec_per_query), - -1, dtype=np.int32 - ) - for iquery in range(len(rt_start_idxes)): - idx_list = [] - for ispec in range(rt_start_idxes[iquery], rt_stop_idxes[iquery]): - if ( - query_mz_values[iquery]>=spec_isolation_lower_mz_values[ispec] and - 
query_mz_values[iquery]<=spec_isolation_upper_mz_values[ispec] - ): - idx_list.append(ispec) - if len(idx_list) > max_spec_per_query: - spec_idxes[iquery,:] = idx_list[ - len(idx_list)/2-max_spec_per_query//2: - len(idx_list)/2+max_spec_per_query//2+1 - ] - else: - spec_idxes[iquery,:len(idx_list)] = idx_list - return spec_idxes class PepSpecMatch_DIA(PepSpecMatch): max_spec_per_query: int = 3 min_frag_mz: float = 200.0 - rt_query_window: float = 20.0/60.0 # 20 seconds def _add_missing_columns_to_psm_df(self, psm_df:pd.DataFrame, raw_data=None ): - # DIA results do not have spec_idx in psm_df, nothing to merge + # DIA results do not have spec_idx/scan_num in psm_df, nothing to merge return psm_df def _prepare_matching_dfs(self): @@ -580,11 +413,13 @@ def _prepare_matching_dfs(self): [fragment_mz_df]*self.max_spec_per_query, ignore_index=True ) + if self.use_ppm: + self.all_frag_mz_tols = fragment_mz_df.values*self.tolerance*1e-6 + else: + self.all_frag_mz_tols = np.full_like(fragment_mz_df.values, self.tolerance) psm_df_list = [] len_frags = len(fragment_mz_df)//self.max_spec_per_query - self.psm_df["rt_start"] = self.psm_df.rt - self.rt_query_window/2 - self.psm_df["rt_stop"] = self.psm_df.rt + self.rt_query_window/2 for i in range(self.max_spec_per_query): psm_df = self.psm_df.copy() psm_df["frag_start_idx"] = psm_df.frag_start_idx+i*len_frags @@ -609,53 +444,61 @@ def _prepare_matching_dfs(self): matched_mz_err_df ) def _match_ms2_one_raw_numba(self, - raw_name_psm_df_one_raw:tuple + raw_name, psm_df_one_raw ): - raw_name, psm_df_one_raw = raw_name_psm_df_one_raw + psm_df_one_raw = psm_df_one_raw.reset_index(drop=True) + if raw_name in self._ms_file_dict: raw_data = load_ms_data( self._ms_file_dict[raw_name], self._ms_file_type, process_count=self.ms_loader_thread_num ) - psm_df_one_raw = self._add_missing_columns_to_psm_df( - psm_df_one_raw, raw_data - ) - - if self.use_ppm: - all_frag_mz_tols = self.fragment_mz_df.values*self.tolerance*1e-6 - else: - all_frag_mz_tols = np.full_like(self.fragment_mz_df.values, self.tolerance) + psm_origin_len = len(psm_df_one_raw)//self.max_spec_per_query + + grouper = NormalDIAGrouper(raw_data) - spec_idxes = get_dia_spec_idxes( - raw_data.spectrum_df.rt.values, - raw_data.spectrum_df.isolation_lower_mz.values, - raw_data.spectrum_df.isolation_upper_mz.values, - psm_df_one_raw.rt_start.values[:len(psm_df_one_raw)//self.max_spec_per_query], - psm_df_one_raw.rt_stop.values[:len(psm_df_one_raw)//self.max_spec_per_query], - psm_df_one_raw.precursor_mz.values[:len(psm_df_one_raw)//self.max_spec_per_query], - max_spec_per_query=self.max_spec_per_query + psm_groups = grouper.assign_dia_groups( + psm_df_one_raw.precursor_mz.values[ + :psm_origin_len + ] ) - - psm_df_one_raw["spec_idx"] = spec_idxes.T.reshape(-1) - - match_one_raw_with_numba( - psm_df_one_raw.spec_idx.values, - psm_df_one_raw.frag_start_idx.values, - psm_df_one_raw.frag_stop_idx.values, - self.fragment_mz_df.values, - all_frag_mz_tols, - raw_data.peak_df.mz.values, - raw_data.peak_df.intensity.values, - raw_data.spectrum_df.peak_start_idx.values, - raw_data.spectrum_df.peak_stop_idx.values, - self.matched_intensity_df.values, - self.matched_mz_err_df.values, - self.match_closest + + all_spec_idxes = np.full( + len(psm_df_one_raw), -1, dtype=np.int32 ) + + for dia_group, group_df in grouper.dia_group_dfs: + psm_idxes = psm_groups[dia_group] + if len(psm_idxes) == 0: continue + psm_idxes = np.array(psm_idxes, dtype=np.int32) + spec_idxes = get_dia_spec_idxes( + group_df.rt.values, + 
psm_df_one_raw.rt.values[psm_idxes], + max_spec_per_query=self.max_spec_per_query + ) + for i in range(spec_idxes.shape[-1]): + all_spec_idxes[ + psm_idxes+psm_origin_len*i + ] = spec_idxes[:,i] + + match_one_raw_with_numba( + all_spec_idxes, + psm_df_one_raw.frag_start_idx.values, + psm_df_one_raw.frag_stop_idx.values, + self.fragment_mz_df.values, + self.all_frag_mz_tols, + raw_data.peak_df.mz.values, + raw_data.peak_df.intensity.values, + group_df.peak_start_idx.values, + group_df.peak_stop_idx.values, + self.matched_intensity_df.values, + self.matched_mz_err_df.values, + self.match_closest + ) else: print(f"`{raw_name}` is not found in ms_file_dict.") - + return psm_df_one_raw def match_ms2_multi_raw(self, psm_df: pd.DataFrame, @@ -663,21 +506,169 @@ def match_ms2_multi_raw(self, ms_file_type: str = "alpharaw_hdf", process_num:int = 8, ): + if isinstance(ms_files, list): + ms_files = parse_ms_files_to_dict(ms_files) + psm_df = psm_df[ + psm_df.raw_name.isin(ms_files) + ].reset_index(drop=True) super().match_ms2_multi_raw( psm_df, ms_files, ms_file_type, process_num ) - self.psm_df["score"] = get_ion_count_scores( - self.fragment_mz_df.values, - self.matched_intensity_df.values, - self.psm_df.frag_start_idx.values, - self.psm_df.frag_stop_idx.values, - self.min_frag_mz, - ) - return ( self.psm_df, self.fragment_mz_df, self.matched_intensity_df, self.matched_mz_err_df ) + +@numba.njit +def match_one_raw_with_numba( + spec_idxes, frag_start_idxes, frag_stop_idxes, + all_frag_mzs, all_frag_mz_tols, + all_spec_mzs, all_spec_intensities, + peak_start_idxes, peak_stop_idxes, + matched_intensities, matched_mz_errs, + matched_closest=True, +): + """ + Internel function to match fragment mz values to spectrum mz values. + Matched_mz_errs[i] = np.inf if no peaks are matched. + + Results will saved in place of matched_intensities + and matched_mz_errs. + """ + for spec_idx, frag_start, frag_stop in zip( + spec_idxes, frag_start_idxes, frag_stop_idxes + ): + if spec_idx == -1: continue + peak_start = peak_start_idxes[spec_idx] + peak_stop = peak_stop_idxes[spec_idx] + if peak_stop == peak_start: continue + spec_mzs = all_spec_mzs[peak_start:peak_stop] + spec_intens = all_spec_intensities[peak_start:peak_stop] + + frag_mzs = all_frag_mzs[frag_start:frag_stop,:].copy() + frag_mz_tols = all_frag_mz_tols[frag_start:frag_stop,:].copy() + + if matched_closest: + matched_idxes = match_closest_peaks( + spec_mzs, spec_intens, + frag_mzs, frag_mz_tols + ).reshape(-1) + else: + matched_idxes = match_highest_peaks( + spec_mzs, spec_intens, + frag_mzs, frag_mz_tols + ).reshape(-1) + + matched_intens = spec_intens[matched_idxes] + matched_intens[matched_idxes==-1] = 0 + + matched_mass_errs = np.abs( + spec_mzs[ + matched_idxes.reshape(-1) + ]-frag_mzs.reshape(-1) + ) + matched_mass_errs[matched_idxes==-1] = np.inf + + matched_intensities[ + frag_start:frag_stop,: + ] = matched_intens.reshape(frag_mzs.shape) + + matched_mz_errs[ + frag_start:frag_stop,: + ] = matched_mass_errs.reshape(frag_mzs.shape) + +def load_ms_data( + ms_file:Union[str, MSData_Base], + ms_file_type:str='alpharaw_hdf', + process_count:int = 8, +)->MSData_Base: + """Load MS files + + Parameters + ---------- + ms_file : str | MSData_Base + ms2 file path + + ms_file_type : str, optional + ms2 file type, could be + ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"]. 
+ Default to 'alpharaw_hdf' + """ + if isinstance(ms_file, MSData_Base): + return ms_file + else: + raw_data = ms_reader_provider.get_reader( + ms_file_type, process_count=process_count + ) + raw_data.import_raw(ms_file) + return raw_data + + +@numba.njit +def get_best_matched_intens( + matched_intensity_values:np.ndarray, + frag_start_idxes:np.ndarray, + frag_stop_idxes:np.ndarray, +): + ret_intens = np.zeros( + shape = matched_intensity_values.shape[1:], + dtype=matched_intensity_values.dtype + ) + for i in range(len(frag_start_idxes)): + start = frag_start_idxes[i] + stop = frag_stop_idxes[i] + i = np.argmax(np.matmul( + matched_intensity_values[:,start:stop,:], + matched_intensity_values[:,start:stop,:].T + ).sum()) + ret_intens[start:stop,:] = matched_intensity_values[i,start:stop,:] + return ret_intens + +@numba.njit +def get_ion_count_scores( + frag_mz_values:np.ndarray, + matched_intens:np.ndarray, + frag_start_idxes:np.ndarray, + frag_stop_idxes:np.ndarray, + min_mz:float = 200, +): + scores = [] + for i in range(len(frag_start_idxes)): + scores.append(np.count_nonzero( + matched_intens[ + frag_start_idxes[i]:frag_stop_idxes[i],: + ].copy().reshape(-1)[ + frag_mz_values[ + frag_start_idxes[i]:frag_stop_idxes[i],: + ].copy().reshape(-1)>=min_mz + ] + )) + return np.array(scores,np.int32) + +@numba.njit +def get_dia_spec_idxes( + spec_rt_values:np.ndarray, + query_rt_values:np.ndarray, + max_spec_per_query:int, +): + rt_idxes = np.searchsorted(spec_rt_values, query_rt_values) + + spec_idxes = np.full( + (len(rt_idxes),max_spec_per_query), + -1, dtype=np.int32 + ) + n = max_spec_per_query // 2 + + for iquery in range(len(rt_idxes)): + if rt_idxes[iquery] < n: + spec_idxes[iquery,:] = np.arange(0, max_spec_per_query) + else: + spec_idxes[iquery,:] = np.arange( + rt_idxes[iquery]-n, + rt_idxes[iquery]-n+max_spec_per_query + ) + return spec_idxes + \ No newline at end of file diff --git a/alpharaw/viz/df_utils.py b/alpharaw/viz/df_utils.py index b799d70..e5b29ec 100644 --- a/alpharaw/viz/df_utils.py +++ b/alpharaw/viz/df_utils.py @@ -32,40 +32,19 @@ def make_psm_plot_df_for_peptide( ppm:float = 20.0, charged_frag_types: list = ["b_z1","b_z2","y_z1","y_z2"], include_fragments:bool=True, + fragment_intensity_df:pd.DataFrame=None, include_precursor_isotopes:bool=False, max_isotope:int = 6, min_frag_mz:float = 100.0, match_mode:typing.Literal["closest","highest"]="closest", )->pd.DataFrame: - """ - - Args: - spec_masses (np.ndarray): _description_ - spec_intensities (np.ndarray): _description_ - sequence (str): _description_ - mods (str): _description_ - mod_sites (str): _description_ - charge (int): _description_ - rt_sec (float, optional): _description_. Defaults to 0.0. - mobility (float, optional): _description_. Defaults to 0.0. - ppm (float, optional): _description_. Defaults to 20.0. - charged_frag_types (list, optional): _description_. Defaults to ["b_z1","b_z2","y_z1","y_z2"]. - include_fragments (bool, optional): _description_. Defaults to True. - include_precursor_isotopes (bool, optional): _description_. Defaults to False. - max_isotope (int, optional): _description_. Defaults to 6. - min_frag_mz (float, optional): _description_. Defaults to 100.0. - match_mode (typing.Literal["closest","highest"], optional): _description_. Defaults to "closest". - model_mgr (_type_, optional): _description_. Defaults to None. 
- - Returns: - pd.DataFrame: _description_ - """ plot_df = make_xic_plot_df_for_peptide( sequence, mods, mod_sites, charge, rt_sec=rt_sec, mobility=mobility, charged_frag_types=charged_frag_types, include_fragments=include_fragments, + fragment_intensity_df=fragment_intensity_df, include_precursor_isotopes=include_precursor_isotopes, max_isotope=max_isotope, min_frag_mz=min_frag_mz, @@ -95,6 +74,7 @@ def make_xic_plot_df_for_peptide( ms_level:int=2, charged_frag_types: list = ["b_z1","b_z2","y_z1","y_z2"], include_fragments:bool=True, + fragment_intensity_df:pd.DataFrame = None, include_precursor_isotopes:bool=False, max_isotope:int = 6, min_frag_mz:float = 100.0, @@ -108,8 +88,17 @@ def make_xic_plot_df_for_peptide( include_precursor_isotopes=include_precursor_isotopes, max_isotope=max_isotope, ) + if fragment_intensity_df is not None: + columns = np.intersect1d( + fragment_mz_df.columns.values, + fragment_intensity_df.columns.values, + ) + fragment_mz_df = fragment_mz_df[columns] + fragment_intensity_df = fragment_intensity_df[columns] + return translate_precursor_fragment_df_to_plot_df( precursor_df, fragment_mz_df, + fragment_intensity_df=fragment_intensity_df, rt_sec=rt_sec, mobility=mobility, ms_level=ms_level, @@ -143,7 +132,7 @@ def make_psm_plot_for_dfs( query_mass_tols = plot_df.mz.values*ppm*1e-6, query_frag_idxes = plot_df.fragment_site.values, modified_sequence = plot_df.modified_sequence.values[0], - mod_sites="", + mod_sites=precursor_df.mod_sites.values[0], query_intensities = plot_df.intensity.values if "intensity" in plot_df.columns else None, match_mode = match_mode, @@ -240,10 +229,21 @@ def make_psm_plot_df( ion_name=query_ion_names, )) + def PCC_sim(x, y): + x = x-np.mean(x) + y = y-np.mean(y) + return np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y)+1e-8) + PCC = PCC_sim( + -query_intensities[matched_bools], + matched_spec_intens + ) + df = pd.concat([ spec_df, matched_df, mirror_df ], ignore_index=True) df["mod_sites"] = mod_sites + if len(mirror_df) > 0: + df["pcc"] = PCC return df def get_modified_sequence( diff --git a/alpharaw/viz/psm_plot.py b/alpharaw/viz/psm_plot.py index 03c3a1a..9289f35 100644 --- a/alpharaw/viz/psm_plot.py +++ b/alpharaw/viz/psm_plot.py @@ -225,6 +225,9 @@ def __init__(self, def plot(self, plot_df, sequence, title, plot_unmatched_peaks=False ): + + if "pcc" in plot_df.columns and len(plot_df) > 0: + title = f"{title} (R={plot_df.pcc.values[0]:.3f})" self._init_plot(title) self.peak_plot.plot( diff --git a/alpharaw/wrappers/alphapept_wrapper.py b/alpharaw/wrappers/alphapept_wrapper.py index 240a772..b2ce70f 100644 --- a/alpharaw/wrappers/alphapept_wrapper.py +++ b/alpharaw/wrappers/alphapept_wrapper.py @@ -6,60 +6,6 @@ MSData_Base, ms_reader_provider ) -def get_peak_lists(starts, ends, peak_df): - mass_list = [peak_df.mz.values[start:end].tolist() for start,end in zip(starts,ends)] - inten_list = [peak_df.intensity.values[start:end].tolist() for start,end in zip(starts,ends)] - return mass_list, inten_list - -def extract_ms1(raw_data:MSData_Base, query_data:dict): - spec_df = raw_data.spectrum_df.query("ms_level==1") - scans = spec_df.spec_idx.values - rts = spec_df.rt.values - ms_levels = spec_df.ms_level.values - - mass_list_ms1, int_list_ms1 = get_peak_lists( - spec_df.peak_start_idx.values, - spec_df.peak_stop_idx.values, - raw_data.peak_df - ) - - query_data["scan_list_ms1"] = scans - query_data["rt_list_ms1"] = rts - query_data["mass_list_ms1"] = np.array(mass_list_ms1, dtype=object) - query_data["int_list_ms1"] = 
np.array(int_list_ms1, dtype=object) - query_data["ms_list_ms1"] = ms_levels - -def extract_ms2(raw_data:MSData_Base, query_data:dict): - spec_df = raw_data.spectrum_df.query("ms_level==2") - scans = spec_df.spec_idx.values - rts = spec_df.rt.values - ms_levels = spec_df.ms_level.values - precursor_mzs2 = spec_df.precursor_mz.values - charges = spec_df.charge.values - charges[charges<=0] = 2 - - mass_list_ms2, int_list_ms2 = get_peak_lists( - spec_df.peak_start_idx.values, - spec_df.peak_stop_idx.values, - raw_data.peak_df - ) - - query_data["scan_list_ms2"] = scans - query_data["rt_list_ms2"] = rts - query_data["mass_list_ms2"] = mass_list_ms2 - query_data["int_list_ms2"] = int_list_ms2 - query_data["ms_list_ms2"] = ms_levels - query_data["prec_mass_list2"] = precursor_mzs2 - query_data["mono_mzs2"] = precursor_mzs2 - query_data["charge2"] = charges - -def parse_msdata_to_alphapept(raw_data:MSData_Base): - query_data = {} - extract_ms1(raw_data, query_data) - extract_ms2(raw_data, query_data) - - return query_data, raw_data.creation_time - class AlphaPept_HDF_MS2_Reader(MSData_Base): """MS2 from AlphaPept HDF""" def _import(self, _path): @@ -123,3 +69,57 @@ def _set_dataframes(self, _path): ms_reader_provider.register_reader('alphapept', AlphaPept_HDF_MS2_Reader) ms_reader_provider.register_reader('alphapept_hdf', AlphaPept_HDF_MS2_Reader) + +def get_peak_lists(starts, ends, peak_df): + mass_list = [peak_df.mz.values[start:end].tolist() for start,end in zip(starts,ends)] + inten_list = [peak_df.intensity.values[start:end].tolist() for start,end in zip(starts,ends)] + return mass_list, inten_list + +def extract_ms1(raw_data:MSData_Base, query_data:dict): + spec_df = raw_data.spectrum_df.query("ms_level==1") + scans = spec_df.spec_idx.values + rts = spec_df.rt.values + ms_levels = spec_df.ms_level.values + + mass_list_ms1, int_list_ms1 = get_peak_lists( + spec_df.peak_start_idx.values, + spec_df.peak_stop_idx.values, + raw_data.peak_df + ) + + query_data["scan_list_ms1"] = scans + query_data["rt_list_ms1"] = rts + query_data["mass_list_ms1"] = np.array(mass_list_ms1, dtype=object) + query_data["int_list_ms1"] = np.array(int_list_ms1, dtype=object) + query_data["ms_list_ms1"] = ms_levels + +def extract_ms2(raw_data:MSData_Base, query_data:dict): + spec_df = raw_data.spectrum_df.query("ms_level==2") + scans = spec_df.spec_idx.values + rts = spec_df.rt.values + ms_levels = spec_df.ms_level.values + precursor_mzs2 = spec_df.precursor_mz.values + charges = spec_df.charge.values + charges[charges<=0] = 2 + + mass_list_ms2, int_list_ms2 = get_peak_lists( + spec_df.peak_start_idx.values, + spec_df.peak_stop_idx.values, + raw_data.peak_df + ) + + query_data["scan_list_ms2"] = scans + query_data["rt_list_ms2"] = rts + query_data["mass_list_ms2"] = mass_list_ms2 + query_data["int_list_ms2"] = int_list_ms2 + query_data["ms_list_ms2"] = ms_levels + query_data["prec_mass_list2"] = precursor_mzs2 + query_data["mono_mzs2"] = precursor_mzs2 + query_data["charge2"] = charges + +def parse_msdata_to_alphapept(raw_data:MSData_Base): + query_data = {} + extract_ms1(raw_data, query_data) + extract_ms2(raw_data, query_data) + + return query_data, raw_data.creation_time diff --git a/docs/conf.py b/docs/conf.py index 41b01ef..6e0861f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,7 +23,7 @@ copyright = '2022, Mann Labs, MPIB' author = 'Mann Labs, MPIB' -release = "0.4.0" +release = "0.4.1" # -- General configuration --------------------------------------------------- diff --git a/nbs/test_dia_matching.ipynb 
b/nbs/test_dia_matching.ipynb index 096be2e..2455365 100644 --- a/nbs/test_dia_matching.ipynb +++ b/nbs/test_dia_matching.ipynb @@ -14,7 +14,16 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:WARNING: Temp mmap arrays are written to /var/folders/fh/hf8t3l1x02d42ggk3b304_rh0000gn/T/temp_mmap_37vmr95k. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 624,114,790,400\n", + "WARNING:root:WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n" + ] + } + ], "source": [ "from alpharaw.match.psm_match import PepSpecMatch_DIA" ] @@ -413,7 +422,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:03<00:00, 3.39s/it]\n" + "100%|██████████| 1/1 [00:07<00:00, 7.98s/it]\n" ] } ], @@ -859,8 +868,8 @@ " \n", " 0\n", " 0.000000\n", + " 0.0\n", " 0.000000\n", - " 111593.757812\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -869,9 +878,9 @@ " \n", " \n", " 1\n", - " 131525.031250\n", - " 0.000000\n", - " 23976.539062\n", + " 190300.578125\n", + " 0.0\n", + " 37765.183594\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -880,8 +889,8 @@ " \n", " \n", " 2\n", - " 102311.109375\n", " 0.000000\n", + " 0.0\n", " 0.000000\n", " 0.0\n", " 0.0\n", @@ -891,9 +900,9 @@ " \n", " \n", " 3\n", - " 113438.023438\n", " 0.000000\n", - " 112715.445312\n", + " 0.0\n", + " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -902,9 +911,9 @@ " \n", " \n", " 4\n", - " 496574.343750\n", " 0.000000\n", - " 419370.625000\n", + " 0.0\n", + " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -925,7 +934,7 @@ " \n", " 534754\n", " 0.000000\n", - " 8577.349609\n", + " 0.0\n", " 0.000000\n", " 0.0\n", " 0.0\n", @@ -935,8 +944,8 @@ " \n", " \n", " 534755\n", - " 13675.758789\n", - " 29054.722656\n", + " 0.000000\n", + " 0.0\n", " 0.000000\n", " 0.0\n", " 0.0\n", @@ -947,8 +956,8 @@ " \n", " 534756\n", " 0.000000\n", - " 9776.382812\n", - " 8527.920898\n", + " 0.0\n", + " 0.000000\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -958,7 +967,7 @@ " \n", " 534757\n", " 0.000000\n", - " 10933.444336\n", + " 0.0\n", " 0.000000\n", " 0.0\n", " 0.0\n", @@ -969,7 +978,7 @@ " \n", " 534758\n", " 0.000000\n", - " 27407.386719\n", + " 0.0\n", " 0.000000\n", " 0.0\n", " 0.0\n", @@ -983,31 +992,31 @@ "" ], "text/plain": [ - " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 \\\n", - "0 0.000000 0.000000 111593.757812 0.0 0.0 \n", - "1 131525.031250 0.000000 23976.539062 0.0 0.0 \n", - "2 102311.109375 0.000000 0.000000 0.0 0.0 \n", - "3 113438.023438 0.000000 112715.445312 0.0 0.0 \n", - "4 496574.343750 0.000000 419370.625000 0.0 0.0 \n", - "... ... ... ... ... ... \n", - "534754 0.000000 8577.349609 0.000000 0.0 0.0 \n", - "534755 13675.758789 29054.722656 0.000000 0.0 0.0 \n", - "534756 0.000000 9776.382812 8527.920898 0.0 0.0 \n", - "534757 0.000000 10933.444336 0.000000 0.0 0.0 \n", - "534758 0.000000 27407.386719 0.000000 0.0 0.0 \n", + " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n", + "0 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "1 190300.578125 0.0 37765.183594 0.0 0.0 0.0 \n", + "2 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "3 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "4 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "... ... ... ... ... ... ... 
\n", + "534754 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534755 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534756 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534757 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", + "534758 0.000000 0.0 0.000000 0.0 0.0 0.0 \n", "\n", - " b_modloss_z2 y_modloss_z1 y_modloss_z2 \n", - "0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 \n", - "2 0.0 0.0 0.0 \n", - "3 0.0 0.0 0.0 \n", - "4 0.0 0.0 0.0 \n", - "... ... ... ... \n", - "534754 0.0 0.0 0.0 \n", - "534755 0.0 0.0 0.0 \n", - "534756 0.0 0.0 0.0 \n", - "534757 0.0 0.0 0.0 \n", - "534758 0.0 0.0 0.0 \n", + " y_modloss_z1 y_modloss_z2 \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "... ... ... \n", + "534754 0.0 0.0 \n", + "534755 0.0 0.0 \n", + "534756 0.0 0.0 \n", + "534757 0.0 0.0 \n", + "534758 0.0 0.0 \n", "\n", "[534759 rows x 8 columns]" ] @@ -1033,16 +1042,6 @@ "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/wenfengzeng/workspace/peptdeep/peptdeep/model/ms2.py:695: UserWarning: MPS: nonzero op is supported natively starting from macOS 13.0. Falling back on CPU. This may have performance implications. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/mps/operations/Indexing.mm:218.)\n", - " cos[cos>1] = 1\n", - "/Users/wenfengzeng/workspace/peptdeep/peptdeep/model/ms2.py:699: UserWarning: torch.sort is supported by MPS on MacOS 13+, please upgrade. Falling back to CPU (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/mps/operations/Sort.mm:30.)\n", - " sorted_idx = x.argsort(dim=1)\n" - ] - }, { "data": { "text/html": [ @@ -1080,66 +1079,66 @@ " \n", " \n", " mean\n", - " 0.938649\n", - " 0.943356\n", - " 0.846499\n", - " 0.823260\n", + " 0.793184\n", + " 0.801761\n", + " 0.662681\n", + " 0.786549\n", " \n", " \n", " std\n", - " 0.122191\n", - " 0.113922\n", - " 0.156032\n", - " 0.140910\n", + " 0.259575\n", + " 0.252218\n", + " 0.251722\n", + " 0.137454\n", " \n", " \n", " min\n", - " -0.057496\n", + " -0.050093\n", " 0.000000\n", " 0.000000\n", - " -0.062621\n", + " 0.026527\n", " \n", " \n", " 25%\n", - " 0.941413\n", - " 0.946230\n", - " 0.792737\n", - " 0.750613\n", + " 0.722388\n", + " 0.736639\n", + " 0.530222\n", + " 0.704215\n", " \n", " \n", " 50%\n", - " 0.986450\n", - " 0.987556\n", - " 0.900479\n", - " 0.851517\n", + " 0.900511\n", + " 0.905750\n", + " 0.724400\n", + " 0.805559\n", " \n", " \n", " 75%\n", - " 0.997344\n", - " 0.997548\n", - " 0.955660\n", - " 0.931510\n", + " 0.972494\n", + " 0.973826\n", + " 0.855859\n", + " 0.894600\n", " \n", " \n", " max\n", - " 0.999998\n", - " 0.999998\n", - " 0.998884\n", + " 1.000003\n", + " 1.000002\n", + " 1.000000\n", " 1.000000\n", " \n", " \n", " >0.90\n", - " 0.833929\n", - " 0.844719\n", - " 0.506305\n", - " 0.335203\n", + " 0.510807\n", + " 0.522722\n", + " 0.167494\n", + " 0.260136\n", " \n", " \n", " >0.75\n", - " 0.933075\n", - " 0.940357\n", - " 0.807384\n", - " 0.750356\n", + " 0.726856\n", + " 0.739632\n", + " 0.461060\n", + " 0.635836\n", " \n", " \n", "\n", @@ -1148,15 +1147,15 @@ "text/plain": [ " PCC COS SA SPC\n", "count 20142.000000 20142.000000 20142.000000 20142.000000\n", - "mean 0.938649 0.943356 0.846499 0.823260\n", - "std 0.122191 0.113922 0.156032 0.140910\n", - "min -0.057496 0.000000 0.000000 -0.062621\n", - "25% 0.941413 0.946230 0.792737 0.750613\n", - "50% 0.986450 0.987556 0.900479 0.851517\n", - "75% 0.997344 0.997548 0.955660 
0.931510\n", - "max 0.999998 0.999998 0.998884 1.000000\n", - ">0.90 0.833929 0.844719 0.506305 0.335203\n", - ">0.75 0.933075 0.940357 0.807384 0.750356" + "mean 0.793184 0.801761 0.662681 0.786549\n", + "std 0.259575 0.252218 0.251722 0.137454\n", + "min -0.050093 0.000000 0.000000 0.026527\n", + "25% 0.722388 0.736639 0.530222 0.704215\n", + "50% 0.900511 0.905750 0.724400 0.805559\n", + "75% 0.972494 0.973826 0.855859 0.894600\n", + "max 1.000003 1.000002 1.000000 1.000000\n", + ">0.90 0.510807 0.522722 0.167494 0.260136\n", + ">0.75 0.726856 0.739632 0.461060 0.635836" ] }, "execution_count": 8, @@ -1180,6 +1179,13 @@ "sum(metrics_list)/len(metrics_list)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null,
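
Editor's note on the DIA grouping introduced above: the snippet below is a minimal, self-contained sketch of the window-assignment idea implemented in NormalDIAGrouper.assign_dia_groups in this diff. The isolation windows and precursor m/z values are made-up toy data, and the candidate-window loop is a simplified form of the original two-step check (the original also guards against groups missing from the isolation dict and excludes the MS1 group -1); it is not a drop-in replacement for the alpharaw API.

    # Sketch of DIA isolation-window assignment, assuming toy windows keyed by
    # int(precursor_mz) of each window, as NormalDIAGrouper builds them.
    from collections import defaultdict
    import numpy as np

    # dia_group -> (isolation_lower_mz, isolation_upper_mz); toy values only.
    dia_isolation_dict = {
        400: (399.5, 420.5),
        420: (419.5, 440.5),
        440: (439.5, 460.5),
    }
    dia_groups = np.sort(list(dia_isolation_dict.keys()))

    def assign_dia_groups(precursor_mzs):
        """Map each precursor index to the DIA group whose isolation window contains its m/z."""
        dia_precursor_groups = defaultdict(list)
        for i, mz in enumerate(precursor_mzs):
            i_group = np.searchsorted(dia_groups, int(mz))
            if i_group == 0:
                i_group = 1
            elif i_group == len(dia_groups):
                i_group -= 1
            # Try the window at i_group first, then fall back to the one before it.
            for dia_group in (dia_groups[i_group], dia_groups[i_group - 1]):
                lower, upper = dia_isolation_dict[dia_group]
                if lower <= mz <= upper:
                    dia_precursor_groups[dia_group].append(i)
                    break
        return dia_precursor_groups

    groups = assign_dia_groups([401.2, 425.7, 455.0])
    # precursor 0 falls in window 400, precursor 1 in window 420, precursor 2 in window 440

The matching code in PepSpecMatch_DIA then iterates the grouper's per-window spectrum DataFrames and, for each assigned precursor, picks the spectra nearest in retention time via get_dia_spec_idxes.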