Merge pull request #57 from MannLabs/53-match-i

53 match i
MannLabs · Jun 18, 2024 · 9974ca7 · 9974ca7
2 parents e12ed10 + e221cb9
commit 9974ca7
Show file tree

Hide file tree

Showing 3 changed files with 170 additions and 7 deletions.
diff --git a/alpharaw/match/mass_calibration.py b/alpharaw/match/mass_calibration.py
@@ -1,7 +1,15 @@
+import warnings
+
 import numpy as np
 import pandas as pd
 from sklearn.neighbors import KNeighborsRegressor
 
+warnings.warn(
+    "This module will be removed in the future as "
+    "mass calibration has already been implemented in alphaDIA.",
+    category=DeprecationWarning,
+)
+
 
 def get_fragment_median(start_end_idxes: tuple, frag_df: pd.DataFrame):
     start_idx, end_idx = start_end_idxes

diff --git a/alpharaw/match/match_utils.py b/alpharaw/match/match_utils.py
@@ -13,7 +13,33 @@ def match_batch_spec(
     peak_stop_idxes: np.ndarray,
     query_mzs: np.ndarray,
     query_mz_tols: np.ndarray,
-):
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Extract matched mzs and intensities for query m/z values against the given batch spectra.
+
+    Parameters
+    ----------
+    spec_idxes : np.ndarray
+        The batch spectra, given as spectrum indexes.
+    peak_mzs : np.ndarray
+        The peak m/z values in the whole raw data.
+    peak_intens : np.ndarray
+        The peak intensities in the whole raw data.
+    peak_start_idxes : np.ndarray
+        The batch spectra, given as the start indexes in peak m/z and intensities.
+    peak_stop_idxes : np.ndarray
+        The batch spectra, given as the stop indexes in peak m/z and intensities.
+    query_mzs : np.ndarray
+        The query m/z values, these can be from fragments of a precursor.
+    query_mz_tols : np.ndarray
+        The query tolerance values of query_mzs.
+
+    Returns
+    -------
+    Tuple[ndarray, ndarray]
+        ndarray with shape (spectrum num, query num): matched m/z values. 0.0 if not matched.
+        ndarray with shape (spectrum num, query num): matched intensity values. 0.0 if not matched.
+    """
     matched_mzs = np.zeros((len(spec_idxes), len(query_mzs)), dtype=peak_mzs.dtype)
     matched_intens = np.zeros(
         (len(spec_idxes), len(query_mzs)), dtype=peak_intens.dtype
@@ -63,7 +89,7 @@ def match_closest_peaks(
     query_mzs: np.ndarray,
     query_mz_tols: np.ndarray,
 ) -> np.ndarray:
-    """Matching query mz values against sorted MS2/spec masses,
+    """Matching query mz values against sorted MS2/spec m/z values,
     only closest (minimal abs mass error) peaks are returned.
 
     Parameters

diff --git a/alpharaw/match/spec_finder.py b/alpharaw/match/spec_finder.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import numba
 import numpy as np
 import pandas as pd
@@ -9,8 +11,32 @@ def find_spec_idxes_by_rt(
     query_stop_rt: float,
     query_left_mz: float,
     query_right_mz: float,
-):
+) -> np.ndarray:
+    """
+    Find MS2 spectrum indices (int32) from the `spectrum_df`
+    by given RT window and precursor m/z window.
+
+    Parameters
+    ----------
+    spectrum_df : pd.DataFrame
+        Spectrum dataframe to find spectrum indices.
+    query_start_rt : float
+        Left RT of the query RT window.
+    query_stop_rt : float
+        Right RT of the query RT window.
+    query_left_mz : float
+        Left m/z of the query m/z window.
+    query_right_mz : float
+        Right m/z of the query m/z window.
+
+    Returns
+    -------
+    ndarray[int32]
+        Result spectrum indices. `int32` is used here because a raw file
+        will not contain more than ~2 billion spectra.
+    """
     if "multinotch" in spectrum_df.columns:
+        # if multinotch, there are multiple isolation windows of MS2 spectra.
         return find_multinotch_spec_idxes(
             spec_rts=spectrum_df.rt.values,
             spec_multinotch_wins=spectrum_df.multinotch.values,
@@ -21,6 +47,7 @@ def find_spec_idxes_by_rt(
             query_right_mz=query_right_mz,
         )
     else:
+        # normal isolation windows (one window to one MS2 spectrum)
         return find_spec_idxes(
             spec_rts=spectrum_df.rt.values,
             spec_isolation_lower_mzs=spectrum_df.isolation_lower_mz.values,
@@ -34,13 +61,40 @@ def find_spec_idxes_by_rt(
 
 def find_multinotch_spec_idxes(
     spec_rts: np.ndarray,
-    spec_multinotch_wins: list,
+    spec_multinotch_wins: List[List],
     spec_ms_levels: np.ndarray,
     query_start_rt: float,
     query_stop_rt: float,
     query_left_mz: float,
     query_right_mz: float,
 ) -> np.ndarray:
+    """
+    Find MS2 spectrum indices (int32) among "multinotch" spectra
+    for the given RT window and precursor m/z window.
+    "multinotch" means that an MS2 spectrum has multiple isolation windows.
+
+    Parameters
+    ----------
+    spec_rts : np.ndarray
+        RT values of the spectra.
+    spec_multinotch_wins : List[List]
+        One list per spectrum, each holding that spectrum's isolation windows.
+    spec_ms_levels : np.ndarray
+        MS levels of the spectra.
+    query_start_rt : float
+        Left RT of the query RT window.
+    query_stop_rt : float
+        Right RT of the query RT window.
+    query_left_mz : float
+        Left m/z of the query m/z window.
+    query_right_mz : float
+        Right m/z of the query m/z window.
+
+    Returns
+    -------
+    np.ndarray[int32]
+        Result spectrum indices.
+    """
     start_idx = np.searchsorted(spec_rts, query_start_rt)
     stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1
     spec_idxes = []
@@ -59,7 +113,26 @@ def find_dia_spec_idxes_same_window(
     spec_rt_values: np.ndarray,
     query_rt_values: np.ndarray,
     max_spec_per_query: int,
-):
+) -> np.ndarray:
+    """
+    For a given array of query RT values, find spectrum indices
+    from the subset of spectra within the same normal DIA m/z window.
+    This function is numba accelerated.
+
+    Parameters
+    ----------
+    spec_rt_values : np.ndarray
+        RT values of given DIA spectra.
+    query_rt_values : np.ndarray
+        Query RT values.
+    max_spec_per_query : int
+        Maximum number of spectrum indices (scan windows) to return for each query.
+
+    Returns
+    -------
+    ndarray[int32]
+        Result spectrum indices with shape (query num, max_spec_per_query).
+    """
     rt_idxes = np.searchsorted(spec_rt_values, query_rt_values)
 
     spec_idxes = np.full((len(rt_idxes), max_spec_per_query), -1, dtype=np.int32)
@@ -84,7 +157,34 @@ def find_spec_idxes(
     query_stop_rt: float,
     query_left_mz: float,
     query_right_mz: float,
-):
+) -> np.ndarray:
+    """
+    Find MS2 spectrum indices (int32) from all the spectra
+    by given RT window and precursor m/z window.
+    This function is numba accelerated.
+
+    Parameters
+    ----------
+    spec_rts : np.ndarray
+        RT values of the spectra.
+    spec_isolation_lower_mzs : np.ndarray
+        Left m/z values of the isolation windows.
+    spec_isolation_upper_mzs : np.ndarray
+        Right m/z values of the isolation windows.
+    query_start_rt : float
+        Left RT of the query RT window.
+    query_stop_rt : float
+        Right RT of the query RT window.
+    query_left_mz : float
+        Left m/z of the query m/z window.
+    query_right_mz : float
+        Right m/z of the query m/z window.
+
+    Returns
+    -------
+    np.ndarray[int32]
+        Result spectrum indices.
+    """
     rt_start_idx = np.searchsorted(spec_rts, query_start_rt)
     rt_stop_idx = np.searchsorted(spec_rts, query_stop_rt) + 1
 
@@ -108,7 +208,36 @@ def find_batch_spec_idxes(
     query_left_mzs: np.ndarray,
     query_right_mzs: np.ndarray,
     max_spec_per_query: int,
-):
+) -> np.ndarray:
+    """
+    Find MS2 spectrum indices (int32) from all the spectra
+    by the given batch of RT windows and precursor m/z windows.
+    This function is numba accelerated.
+
+    Parameters
+    ----------
+    spec_rts : np.ndarray
+        RT values of the spectra.
+    spec_isolation_lower_mzs : np.ndarray
+        Left m/z values of the isolation windows.
+    spec_isolation_upper_mzs : np.ndarray
+        Right m/z values of the isolation windows.
+    query_start_rts : np.ndarray
+        Left RT values of the query RT windows.
+    query_stop_rts : np.ndarray
+        Right RT values of the query RT windows.
+    query_left_mzs : np.ndarray
+        Left m/z values of the query m/z windows.
+    query_right_mzs : np.ndarray
+        Right m/z values of the query m/z windows.
+    max_spec_per_query : int
+        Maximum number of spectrum indices (scan windows) to return for each query.
+
+    Returns
+    -------
+    ndarray[int32]
+        Result spectrum indices with shape (query num, max_spec_per_query).
+    """
     rt_start_idxes = np.searchsorted(spec_rts, query_start_rts)
     rt_stop_idxes = np.searchsorted(spec_rts, query_stop_rts) + 1