Skip to content

Commit

Permalink
Merge pull request #57 from MannLabs/53-match-i
Browse files Browse the repository at this point in the history
53 match i
  • Loading branch information
jalew188 authored Jun 18, 2024
2 parents e12ed10 + e221cb9 commit 9974ca7
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 7 deletions.
8 changes: 8 additions & 0 deletions alpharaw/match/mass_calibration.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import warnings

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor

# Emitted once at import time so users of this module learn that it is
# scheduled for removal. `stacklevel=2` attributes the warning to the code
# that imports this module instead of to this file; Python's default filter
# only shows a DeprecationWarning when it is attributed to `__main__`, so
# without it direct users would never see the message.
warnings.warn(
    "This module will be removed in the future as "
    "mass calibration has already been implemented in alphaDIA.",
    category=DeprecationWarning,
    stacklevel=2,
)


def get_fragment_median(start_end_idxes: tuple, frag_df: pd.DataFrame):
start_idx, end_idx = start_end_idxes
Expand Down
30 changes: 28 additions & 2 deletions alpharaw/match/match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,33 @@ def match_batch_spec(
peak_stop_idxes: np.ndarray,
query_mzs: np.ndarray,
query_mz_tols: np.ndarray,
):
) -> Tuple[np.ndarray, np.ndarray]:
"""
Extract matched mzs and intensities for query m/z values against the given batch spectra.
Parameters
----------
spec_idxes : np.ndarray
The batch spectra, given as spectrum indexes.
peak_mzs : np.ndarray
The peak m/z values in the whole raw data.
peak_intens : np.ndarray
The peak intensities in the whole raw data.
peak_start_idxes : np.ndarray
The batch spectra, given as the start indexes in peak m/z and intensities.
peak_stop_idxes : np.ndarray
The batch spectra, given as the stop indexes in peak m/z and intensities.
query_mzs : np.ndarray
The query m/z values, these can be from fragments of a precursor.
query_mz_tols : np.ndarray
The query tolerance values of query_mzs.
Returns
-------
Tuple[ndarray, ndarray]
ndarray with shape (spectrum num, query num): matched m/z values. 0.0 if not matched.
ndarray with shape (spectrum num, query num): matched intensity values. 0.0 if not matched.
"""
matched_mzs = np.zeros((len(spec_idxes), len(query_mzs)), dtype=peak_mzs.dtype)
matched_intens = np.zeros(
(len(spec_idxes), len(query_mzs)), dtype=peak_intens.dtype
Expand Down Expand Up @@ -63,7 +89,7 @@ def match_closest_peaks(
query_mzs: np.ndarray,
query_mz_tols: np.ndarray,
) -> np.ndarray:
"""Matching query mz values against sorted MS2/spec masses,
"""Matching query mz values against sorted MS2/spec m/z values,
only closest (minimal abs mass error) peaks are returned.
Parameters
Expand Down
139 changes: 134 additions & 5 deletions alpharaw/match/spec_finder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List

import numba
import numpy as np
import pandas as pd
Expand All @@ -9,8 +11,32 @@ def find_spec_idxes_by_rt(
query_stop_rt: float,
query_left_mz: float,
query_right_mz: float,
):
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from the `spectrum_df`
by given RT window and precursor m/z window.
Parameters
----------
spectrum_df : pd.DataFrame
Spectrum dataframe to find spectrum indices.
query_start_rt : float
Left RT of the query RT window.
query_stop_rt : float
Right RT of the query RT window.
query_left_mz : float
Left m/z of the query m/z window.
query_right_mz : float
Right m/z of the query m/z window.
Returns
-------
ndarray[int32]
Result spectrum indices. `int32` is used here because a raw file will
not contain more than about 2 billion spectra.
"""
if "multinotch" in spectrum_df.columns:
# if multinotch, there are multiple isolation windows of MS2 spectra.
return find_multinotch_spec_idxes(
spec_rts=spectrum_df.rt.values,
spec_multinotch_wins=spectrum_df.multinotch.values,
Expand All @@ -21,6 +47,7 @@ def find_spec_idxes_by_rt(
query_right_mz=query_right_mz,
)
else:
# normal isolation windows (one window to one MS2 spectrum)
return find_spec_idxes(
spec_rts=spectrum_df.rt.values,
spec_isolation_lower_mzs=spectrum_df.isolation_lower_mz.values,
Expand All @@ -34,13 +61,40 @@ def find_spec_idxes_by_rt(

def find_multinotch_spec_idxes(
spec_rts: np.ndarray,
spec_multinotch_wins: list,
spec_multinotch_wins: List[List],
spec_ms_levels: np.ndarray,
query_start_rt: float,
query_stop_rt: float,
query_left_mz: float,
query_right_mz: float,
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from the "multinotch" `spectrum_df`
by given RT window and precursor m/z window.
"multinotch" means there are multiple isolation windows of MS2 spectra.
Parameters
----------
spec_rts : np.ndarray
RT values of the spectra.
spec_multinotch_wins : List[List]
List (num of spectra) of list (multiple isolation windows).
spec_ms_levels : np.ndarray
MS levels of the spectra.
query_start_rt : float
Left RT of the query RT window.
query_stop_rt : float
Right RT of the query RT window.
query_left_mz : float
Left m/z of the query m/z window.
query_right_mz : float
Right m/z of the query m/z window.
Returns
-------
np.ndarray[int32]
Result spectrum indices.
"""
start_idx = np.searchsorted(spec_rts, query_start_rt)
stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1
spec_idxes = []
Expand All @@ -59,7 +113,26 @@ def find_dia_spec_idxes_same_window(
spec_rt_values: np.ndarray,
query_rt_values: np.ndarray,
max_spec_per_query: int,
):
) -> np.ndarray:
"""
For given array of query RT values, find spectrum indices
from the subset of spectra within the same normal DIA m/z window.
This function is numba accelerated.
Parameters
----------
spec_rt_values : np.ndarray
RT values of given DIA spectra.
query_rt_values : np.ndarray
Query RT values.
max_spec_per_query : int
Maximum number of spectrum indices (scan windows) to return for each query.
Returns
-------
ndarray[int32]
Result spectrum indices with shape (query num, max_spec_per_query).
"""
rt_idxes = np.searchsorted(spec_rt_values, query_rt_values)

spec_idxes = np.full((len(rt_idxes), max_spec_per_query), -1, dtype=np.int32)
Expand All @@ -84,7 +157,34 @@ def find_spec_idxes(
query_stop_rt: float,
query_left_mz: float,
query_right_mz: float,
):
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from all the spectra
by given RT window and precursor m/z window.
This function is numba accelerated.
Parameters
----------
spec_rts : np.ndarray
RT values of the spectra.
spec_isolation_lower_mzs : np.ndarray
Left m/z values of the isolation windows.
spec_isolation_upper_mzs : np.ndarray
Right m/z values of the isolation windows.
query_start_rt : float
Left RT of the query RT window.
query_stop_rt : float
Right RT of the query RT window.
query_left_mz : float
Left m/z of the query m/z window.
query_right_mz : float
Right m/z of the query m/z window.
Returns
-------
np.ndarray[int32]
Result spectrum indices.
"""
rt_start_idx = np.searchsorted(spec_rts, query_start_rt)
rt_stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1

Expand All @@ -108,7 +208,36 @@ def find_batch_spec_idxes(
query_left_mzs: np.ndarray,
query_right_mzs: np.ndarray,
max_spec_per_query: int,
):
) -> np.ndarray:
"""
Find MS2 spectrum indices (int32) from all the spectra
by the given batch of RT windows and precursor m/z windows.
This function is numba accelerated.
Parameters
----------
spec_rts : np.ndarray
RT values of the spectra.
spec_isolation_lower_mzs : np.ndarray
Left m/z values of the isolation windows.
spec_isolation_upper_mzs : np.ndarray
Right m/z values of the isolation windows.
query_start_rts : np.ndarray
Left RT values of the query RT windows.
query_stop_rts : np.ndarray
Right RT values of the query RT windows.
query_left_mzs : np.ndarray
Left m/z values of the query m/z windows.
query_right_mzs : np.ndarray
Right m/z values of the query m/z windows.
max_spec_per_query : int
Maximum number of spectrum indices (scan windows) to return for each query.
Returns
-------
ndarray[int32]
Result spectrum indices with shape (query num, max_spec_per_query).
"""
rt_start_idxes = np.searchsorted(spec_rts, query_start_rts)
rt_stop_idxes = np.searchsorted(spec_rts, query_stop_rts) + 1

Expand Down

0 comments on commit 9974ca7

Please sign in to comment.