Merge pull request #33 from MannLabs/development

Development
MannLabs · Mar 16, 2024 · 31f8eee · 31f8eee
2 parents 3e682e5 + 34c47a0
commit 31f8eee
Show file tree

Hide file tree

Showing 8 changed files with 272 additions and 212 deletions.
diff --git a/alpharaw/cli.py b/alpharaw/cli.py
@@ -42,10 +42,10 @@ def run(ctx, **kwargs):
     if ctx.invoked_subcommand is None:
         click.echo(run.get_help(ctx))
 
-@run.command("parse", help="Convert raw files into alpharaw_hdf format.")
+@run.command("parse", help="Convert raw files into alpharaw hdf5 (.hdf) format.")
 @click.option(
     "--raw_type", type=str, default="thermo_raw",
-    show_default=True, help=f"Only `thermo_raw` is supported currently.",
+    show_default=True, help=f"Only `thermo_raw`, `sciex_wiff` is supported currently.",
 )
 @click.option(
     "--raw", multiple=True, default=[],

diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py
@@ -479,7 +479,7 @@ def _match_ms2_one_raw_numba(self,
                 psm_idxes = psm_groups[dia_group]
                 if len(psm_idxes) == 0: continue
                 psm_idxes = np.array(psm_idxes, dtype=np.int32)
-                spec_idxes = get_dia_spec_idxes(
+                spec_idxes = query_dia_spec_idxes_same_window(
                     group_df.rt.values,
                     psm_df_one_raw.rt.values[psm_idxes],
                     max_spec_per_query=self.max_spec_per_query
@@ -659,7 +659,7 @@ def get_ion_count_scores(
     return np.array(scores,np.int32)
 
 @numba.njit    
-def get_dia_spec_idxes(
+def query_dia_spec_idxes_same_window(
     spec_rt_values:np.ndarray, 
     query_rt_values:np.ndarray, 
     max_spec_per_query:int,
@@ -682,4 +682,37 @@ def get_dia_spec_idxes(
             )
     return spec_idxes
 
-
+@numba.njit
+def query_spec_idxes(
+    spec_rts:np.ndarray,
+    spec_isolation_lower_mzs:np.ndarray,
+    spec_isolation_upper_mzs:np.ndarray,
+    query_start_rts:np.ndarray,
+    query_stop_rts:np.ndarray,
+    query_mzs:np.ndarray,
+    max_spec_per_query:int,
+):
+    """
+    For each query, collect the indices of spectra that fall into the
+    query RT window and whose isolation window contains the query m/z.
+
+    Parameters
+    ----------
+    spec_rts : np.ndarray
+        RT of each spectrum. Must be sorted in ascending order because
+        it is searched with `np.searchsorted`.
+    spec_isolation_lower_mzs : np.ndarray
+        Lower isolation m/z bound of each spectrum (same length as `spec_rts`).
+    spec_isolation_upper_mzs : np.ndarray
+        Upper isolation m/z bound of each spectrum (same length as `spec_rts`).
+    query_start_rts : np.ndarray
+        Start of the RT window of each query.
+    query_stop_rts : np.ndarray
+        Stop of the RT window of each query.
+    query_mzs : np.ndarray
+        m/z of each query; matched inclusively against the isolation bounds.
+    max_spec_per_query : int
+        Maximal number of spectrum indices stored per query.
+
+    Returns
+    -------
+    np.ndarray
+        int32 array of shape (len(query_mzs), max_spec_per_query).
+        Row `i` holds the matched spectrum indices of query `i`, padded
+        with -1. If more than `max_spec_per_query` spectra match, the
+        `max_spec_per_query` ones around the middle of the matches are kept.
+    """
+    n_spec = len(spec_rts)
+    rt_start_idxes = np.searchsorted(spec_rts, query_start_rts)
+    # `+1` also takes the first spectrum at/after the stop RT; clamp to
+    # `n_spec` so the loop below can never index past the end of the
+    # isolation arrays (assumed: numba does not bounds-check by default).
+    rt_stop_idxes = np.minimum(
+        np.searchsorted(spec_rts, query_stop_rts)+1, n_spec
+    )
+
+    spec_idxes = np.full(
+        (len(query_mzs),max_spec_per_query),
+        -1, dtype=np.int32
+    )
+    for iquery in range(len(rt_start_idxes)):
+        idx_list = []
+        for ispec in range(rt_start_idxes[iquery], rt_stop_idxes[iquery]):
+            if (
+                query_mzs[iquery]>=spec_isolation_lower_mzs[ispec] and
+                query_mzs[iquery]<=spec_isolation_upper_mzs[ispec]
+            ):
+                idx_list.append(ispec)
+        n_matched = len(idx_list)
+        if n_matched > max_spec_per_query:
+            # Integer arithmetic only: a float is not a valid index, and the
+            # window must contain exactly `max_spec_per_query` items for both
+            # odd and even values to fit the output row.
+            offset = n_matched//2 - max_spec_per_query//2
+            n_keep = max_spec_per_query
+        else:
+            offset = 0
+            n_keep = n_matched
+        # Element-wise copy instead of assigning a list into an array slice.
+        for i in range(n_keep):
+            spec_idxes[iquery,i] = idx_list[offset+i]
+    return spec_idxes
+
diff --git a/alpharaw/match/psm_match_alphatims.py b/alpharaw/match/psm_match_alphatims.py
@@ -98,16 +98,16 @@ class PepSpecMatch_AlphaTims(PepSpecMatch):
 
     def get_peak_df(self,
         precursor_mz:float,
-        rt:float,
+        rt_sec:float,
         im:float=0.0,
     )->pd.DataFrame:
         """
         Parameters
         ----------
         precursor_mz : float
             Precursor m/z value
-        rt : float
-            RT value in minutes
+        rt_sec : float
+            RT value in seconds
         im : float, optional
             Ion mobility, by default 0.0
 
@@ -116,7 +116,6 @@ def get_peak_df(self,
         pd.DataFrame
             peak_df in alphatims DF format
         """
-        rt_sec = rt*60
         rt_slice = slice(
             rt_sec-self.rt_sec_tol_to_slice_ms2,
             rt_sec+self.rt_sec_tol_to_slice_ms2,
@@ -176,16 +175,16 @@ def find_k_nearest(array, val, k=3):
     def get_peaks(
         self,
         precursor_mz:float,
-        rt:float,
+        rt_sec:float,
         im:float=0.0,
     )->tuple:
         """
         Parameters
         ----------
         precursor_mz : float
             Precursor m/z value
-        rt : float
-            RT value in minutes
+        rt_sec : float
+            RT value in seconds
         im : float, optional
             Ion mobility, by default 0.0
 
@@ -195,7 +194,7 @@ def get_peaks(
             np.ndarray: peak m/z values
             np.ndarray: peak intensity values
         """
-        spec_df = self.get_peak_df(precursor_mz, rt, im)
+        spec_df = self.get_peak_df(precursor_mz, rt_sec, im)
         spec_df = spec_df.sort_values('mz_values').reset_index(drop=True)
         return (
             spec_df.mz_values.values, 

diff --git a/alpharaw/ms_data_base.py b/alpharaw/ms_data_base.py
@@ -71,7 +71,7 @@ def __init__(
         self.peak_df:pd.DataFrame = pd.DataFrame()
         self._raw_file_path = ''
         self.centroided = centroided
-        self.save_as_hdf = save_as_hdf
+        self._save_as_hdf = save_as_hdf
         self.creation_time = ''
         self.file_type = ''
         self.instrument = 'none'
@@ -99,7 +99,7 @@ def import_raw(self, _path:str):
         self._set_dataframes(raw_data)
         self._check_df()
 
-        if self.save_as_hdf:
+        if self._save_as_hdf:
             self.save_hdf(_path+'.hdf')
 
     def load_raw(self, _path:str):
@@ -167,9 +167,12 @@ def _set_dataframes(self, raw_data:dict):
 
         for col, val in raw_data.items():
             if col in self.column_dtypes:
-                self.spectrum_df[col] = np.array(
-                    val, dtype=self.column_dtypes[col]
-                )
+                if self.column_dtypes[col] == "O":
+                    self.spectrum_df[col] = list(val)
+                else:
+                    self.spectrum_df[col] = np.array(
+                        val, dtype=self.column_dtypes[col]
+                    )
 
     def _read_creation_time(self, raw_data):
         pass

diff --git a/alpharaw/raw_access/pythermorawfilereader.py b/alpharaw/raw_access/pythermorawfilereader.py
@@ -292,6 +292,11 @@ def GetStatusLogForRetentionTime(self, rt):
     def GetStatusLogForScanNum(self, scan):
         return self.GetStatusLogForRetentionTime(self.RTFromScanNum(scan))
 
+    def GetScanEventForScanNum(self, scanNumber):
+        """Return the scan event of scan `scanNumber`, passed through `IScanEventBase`."""
+        scan_event = self.source.GetScanEventForScanNumber(scanNumber)
+        return IScanEventBase(scan_event)
+
     def GetNumberOfMassRangesFromScanNum(self, scanNumber):
         """This function gets the number of MassRange data items in the scan."""
         return IScanEventBase(