From 52ed49f08895e0fa5e06cbd598b956b073a75863 Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Sun, 31 Dec 2023 16:10:37 +0100
Subject: [PATCH 01/10] psm_match remove ion count score

---
 alpharaw/match/psm_match.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py
index a25e99a..0a961bf 100644
--- a/alpharaw/match/psm_match.py
+++ b/alpharaw/match/psm_match.py
@@ -555,8 +555,8 @@ def get_dia_spec_idxes(
                 idx_list.append(ispec)
         if len(idx_list) > max_spec_per_query:
             spec_idxes[iquery,:] = idx_list[
-                len(idx_list)/2-max_spec_per_query//2:
-                len(idx_list)/2+max_spec_per_query//2+1
+                len(idx_list)//2-max_spec_per_query//2:
+                len(idx_list)//2+max_spec_per_query//2+1
             ]
         else:
             spec_idxes[iquery,:len(idx_list)] = idx_list
@@ -668,14 +668,6 @@ def match_ms2_multi_raw(self,
             process_num
         )
 
-        self.psm_df["score"] = get_ion_count_scores(
-            self.fragment_mz_df.values,
-            self.matched_intensity_df.values,
-            self.psm_df.frag_start_idx.values,
-            self.psm_df.frag_stop_idx.values,
-            self.min_frag_mz,
-        )
-
         return (
             self.psm_df, self.fragment_mz_df, 
             self.matched_intensity_df, self.matched_mz_err_df

From d9822562d890b2d526bfd7f466a557de5cfb74f0 Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Mon, 1 Jan 2024 21:11:20 +0100
Subject: [PATCH 02/10] dia grouper in psm_match_dia

---
 alpharaw/dia/normal_dia.py  |  41 +++-
 alpharaw/match/psm_match.py | 380 ++++++++++++++++++------------------
 2 files changed, 224 insertions(+), 197 deletions(-)

diff --git a/alpharaw/dia/normal_dia.py b/alpharaw/dia/normal_dia.py
index f23126e..b0dbbf4 100644
--- a/alpharaw/dia/normal_dia.py
+++ b/alpharaw/dia/normal_dia.py
@@ -1,7 +1,9 @@
 import pandas as pd
-
+import numpy as np
 import typing
 
+from collections import defaultdict
+
 from alpharaw.ms_data_base import MSData_Base
 
 from alpharaw.utils.df_processing import remove_unused_peaks
@@ -15,7 +17,17 @@ def __init__(self, ms_data: MSData_Base):
             "dia_group"
         ] = ms_data.spectrum_df.precursor_mz.astype(int)
 
-    def get_grouped_ms_data(self, 
+        self.dia_group_dfs = self.ms_data.spectrum_df.groupby("dia_group")
+        self.dia_isolation_dict = {}
+        for dia_group, df in self.dia_group_dfs:
+            if dia_group == -1: continue
+            self.dia_isolation_dict[dia_group] = (
+                df.isolation_lower_mz.values[0],
+                df.isolation_upper_mz.values[0]
+            )
+        self.dia_groups = np.sort(list(self.dia_isolation_dict.keys()))
+
+    def get_ms_data_for_a_group(self, 
         dia_group:int=-1, 
         return_alpharaw_data: bool=True,
         return_alphatims_data: bool=True,
@@ -32,7 +44,7 @@ def get_grouped_ms_data(self,
             TimsTOF: Alphatims object for the window, if `return_alphatims_data==True`
         """
 
-        spec_df = self.ms_data.spectrum_df.query(f"dia_group == {dia_group}")
+        spec_df = self.dia_group_dfs.get_group(dia_group)
 
         if return_alphatims_data:
             ms_data, ms_tims = convert_to_alphatims(
@@ -52,4 +64,25 @@ def get_grouped_ms_data(self,
             ms_data.spectrum_df = spec_df
             ms_data.peak_df = peak_df
             return ms_data
-        
+
+    def assign_dia_groups(self, 
+        precursor_mzs
+    )->typing.DefaultDict[typing.List]:
+        dia_precursor_groups = defaultdict(list)
+        for i, mz in enumerate(precursor_mzs):
+            i_group = np.searchsorted(self.dia_groups, int(mz))
+            if i_group == 0:
+                i_group = 1
+            elif i_group == len(self.dia_groups):
+                i_group -= 1
+            dia_group = self.dia_groups[i_group]
+            if dia_group in self.dia_isolation_dict:
+                isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group]
+                if mz >= isolation_lower and mz <= isolation_upper:
+                    dia_precursor_groups[dia_group].append(i)
+                    continue
+            dia_group = self.dia_groups[i_group-1]
+            if dia_group in self.dia_isolation_dict:
+                isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group]
+                if mz >= isolation_lower and mz <= isolation_upper:
+                    dia_precursor_groups[dia_group].append(i)
\ No newline at end of file
diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py
index 0a961bf..1534937 100644
--- a/alpharaw/match/psm_match.py
+++ b/alpharaw/match/psm_match.py
@@ -3,18 +3,6 @@
 import pandas as pd
 import tqdm
 from typing import Union, Tuple
-import multiprocessing as mp
-from alphabase.utils import process_bar
-try:
-    from alpharaw.thermo import ThermoRawData
-    from alpharaw.sciex import SciexWiffData
-except:
-    pass
-
-from alpharaw.mzml import MzMLReader
-from alpharaw.wrappers.alphapept_wrapper import AlphaPept_HDF_MS2_Reader
-from alpharaw.legacy_msdata.mgf import MGFReader
-
 
 from alphabase.peptide.fragment import (
     create_fragment_mz_dataframe, 
@@ -30,91 +18,10 @@
 from alpharaw.match.match_utils import (
     match_closest_peaks, match_highest_peaks, 
 )
-from ..utils.ms_path_utils import parse_ms_files_to_dict
+from alpharaw.utils.ms_path_utils import parse_ms_files_to_dict
 
-@numba.njit
-def match_one_raw_with_numba(
-    spec_idxes, frag_start_idxes, frag_stop_idxes,
-    all_frag_mzs, all_frag_mz_tols,
-    all_spec_mzs, all_spec_intensities, 
-    peak_start_idxes, peak_stop_idxes,
-    matched_intensities, matched_mz_errs, 
-    matched_closest=True,
-):
-    """ 
-    Internel function to match fragment mz values to spectrum mz values.
-    Matched_mz_errs[i] = np.inf if no peaks are matched.
+from alpharaw.dia.normal_dia import NormalDIAGrouper
 
-    Results will saved in place of matched_intensities 
-    and matched_mz_errs.
-    """
-    for spec_idx, frag_start, frag_stop in zip(
-        spec_idxes, frag_start_idxes, frag_stop_idxes
-    ):
-        if spec_idx == -1: continue
-        peak_start = peak_start_idxes[spec_idx]
-        peak_stop = peak_stop_idxes[spec_idx]
-        if peak_stop == peak_start: continue
-        spec_mzs = all_spec_mzs[peak_start:peak_stop]
-        spec_intens = all_spec_intensities[peak_start:peak_stop]
-
-        frag_mzs = all_frag_mzs[frag_start:frag_stop,:].copy()
-        frag_mz_tols = all_frag_mz_tols[frag_start:frag_stop,:].copy()
-        
-        if matched_closest:
-            matched_idxes = match_closest_peaks(
-                spec_mzs, spec_intens, 
-                frag_mzs, frag_mz_tols
-            ).reshape(-1)
-        else:
-            matched_idxes = match_highest_peaks(
-                spec_mzs, spec_intens, 
-                frag_mzs, frag_mz_tols
-            ).reshape(-1)
-
-        matched_intens = spec_intens[matched_idxes]
-        matched_intens[matched_idxes==-1] = 0
-
-        matched_mass_errs = np.abs(
-            spec_mzs[
-                matched_idxes.reshape(-1)
-            ]-frag_mzs.reshape(-1)
-        )
-        matched_mass_errs[matched_idxes==-1] = np.inf
-
-        matched_intensities[
-            frag_start:frag_stop,:
-        ] = matched_intens.reshape(frag_mzs.shape)
-
-        matched_mz_errs[
-            frag_start:frag_stop,:
-        ] = matched_mass_errs.reshape(frag_mzs.shape)
-
-def load_ms_data(
-    ms_file:Union[str, MSData_Base],
-    ms_file_type:str='alpharaw_hdf',
-    process_count:int = 8,
-)->MSData_Base:
-    """Load MS files
-
-    Parameters
-    ----------
-    ms_file : str | MSData_Base
-        ms2 file path
-
-    ms_file_type : str, optional
-        ms2 file type, could be 
-        ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"].
-        Default to 'alpharaw_hdf'
-    """
-    if isinstance(ms_file, MSData_Base):
-        return ms_file
-    else:
-        raw_data = ms_reader_provider.get_reader(
-            ms_file_type, process_count=process_count
-        )
-        raw_data.import_raw(ms_file)
-        return raw_data
 
 class PepSpecMatch:
     """
@@ -373,9 +280,8 @@ def match_ms2_one_raw(self,
         )
 
     def _match_ms2_one_raw_numba(self, 
-        raw_name_psm_df_one_raw:tuple
+        raw_name:str, psm_df_one_raw:pd.DataFrame
     ):
-        raw_name, psm_df_one_raw = raw_name_psm_df_one_raw
         if raw_name in self._ms_file_dict:
             raw_data = load_ms_data(
                 self._ms_file_dict[raw_name], self._ms_file_type,
@@ -468,7 +374,7 @@ def match_ms2_multi_raw(self,
         for raw_name, df_group in tqdm.tqdm(
             self.psm_df.groupby('raw_name')
         ):
-            self._match_ms2_one_raw_numba((raw_name, df_group))
+            self._match_ms2_one_raw_numba(raw_name, df_group)
         # else:
         #     with mp.get_context("spawn").Pool(processes=process_num) as p:
         #         df_groupby = self.psm_df.groupby('raw_name')
@@ -486,91 +392,15 @@ def match_ms2_multi_raw(self,
             self.psm_df, self.fragment_mz_df, 
             self.matched_intensity_df, self.matched_mz_err_df
         )
-    
-@numba.njit
-def get_best_matched_intens(
-    matched_intensity_values:np.ndarray,
-    frag_start_idxes:np.ndarray,
-    frag_stop_idxes:np.ndarray,
-):
-    ret_intens = np.zeros(
-        shape = matched_intensity_values.shape[1:],
-        dtype=matched_intensity_values.dtype
-    )
-    for i in range(len(frag_start_idxes)):
-        start = frag_start_idxes[i]
-        stop = frag_stop_idxes[i]
-        i = np.argmax(np.matmul(
-            matched_intensity_values[:,start:stop,:],
-            matched_intensity_values[:,start:stop,:].T
-        ).sum())
-        ret_intens[start:stop,:] = matched_intensity_values[i,start:stop,:]
-    return ret_intens
-
-@numba.njit
-def get_ion_count_scores(
-    frag_mz_values:np.ndarray,
-    matched_intens:np.ndarray,
-    frag_start_idxes:np.ndarray,
-    frag_stop_idxes:np.ndarray,
-    min_mz:float = 200,
-):
-    scores = []
-    for i in range(len(frag_start_idxes)):
-        scores.append(np.count_nonzero(
-            matched_intens[
-                frag_start_idxes[i]:frag_stop_idxes[i],:
-            ].copy().reshape(-1)[
-                frag_mz_values[
-                    frag_start_idxes[i]:frag_stop_idxes[i],:
-                ].copy().reshape(-1)>=min_mz
-            ]
-        ))
-    return np.array(scores,np.int32)
-
-@numba.njit    
-def get_dia_spec_idxes(
-    spec_rt_values:np.ndarray, 
-    spec_isolation_lower_mz_values:np.ndarray, 
-    spec_isolation_upper_mz_values:np.ndarray,
-    query_start_rt_values:np.ndarray, 
-    query_stop_rt_values:np.ndarray,
-    query_mz_values:np.ndarray,
-    max_spec_per_query:int,
-):
-    rt_start_idxes = np.searchsorted(spec_rt_values, query_start_rt_values)
-    rt_stop_idxes = np.searchsorted(spec_rt_values, query_stop_rt_values)
-    
-    spec_idxes = np.full(
-        (len(query_mz_values),max_spec_per_query),
-        -1, dtype=np.int32
-    )
-    for iquery in range(len(rt_start_idxes)):
-        idx_list = []
-        for ispec in range(rt_start_idxes[iquery], rt_stop_idxes[iquery]):
-            if (
-                query_mz_values[iquery]>=spec_isolation_lower_mz_values[ispec] and
-                query_mz_values[iquery]<=spec_isolation_upper_mz_values[ispec]
-            ):
-                idx_list.append(ispec)
-        if len(idx_list) > max_spec_per_query:
-            spec_idxes[iquery,:] = idx_list[
-                len(idx_list)//2-max_spec_per_query//2:
-                len(idx_list)//2+max_spec_per_query//2+1
-            ]
-        else:
-            spec_idxes[iquery,:len(idx_list)] = idx_list
-    return spec_idxes
 
 class PepSpecMatch_DIA(PepSpecMatch):
     max_spec_per_query: int = 3
     min_frag_mz: float = 200.0
-    rt_query_window: float = 20.0/60.0 # 20 seconds
 
     def _add_missing_columns_to_psm_df(self,
         psm_df:pd.DataFrame, raw_data=None
     ):
-        # DIA results do not have spec_idx in psm_df, nothing to merge
+        # DIA results do not have spec_idx/scan_num in psm_df, nothing to merge
         return psm_df
     
     def _prepare_matching_dfs(self):
@@ -583,8 +413,6 @@ def _prepare_matching_dfs(self):
 
         psm_df_list = []
         len_frags = len(fragment_mz_df)//self.max_spec_per_query
-        self.psm_df["rt_start"] = self.psm_df.rt - self.rt_query_window/2
-        self.psm_df["rt_stop"] = self.psm_df.rt + self.rt_query_window/2
         for i in range(self.max_spec_per_query):
             psm_df = self.psm_df.copy()
             psm_df["frag_start_idx"] = psm_df.frag_start_idx+i*len_frags
@@ -609,38 +437,49 @@ def _prepare_matching_dfs(self):
             matched_mz_err_df
         )
     def _match_ms2_one_raw_numba(self, 
-        raw_name_psm_df_one_raw:tuple
+        raw_name, psm_df_one_raw
     ):
-        raw_name, psm_df_one_raw = raw_name_psm_df_one_raw
+        psm_df_one_raw = psm_df_one_raw.reset_index(drop=True)
+        
         if raw_name in self._ms_file_dict:
             raw_data = load_ms_data(
                 self._ms_file_dict[raw_name], self._ms_file_type,
                 process_count=self.ms_loader_thread_num
             )
 
-            psm_df_one_raw = self._add_missing_columns_to_psm_df(
-                psm_df_one_raw, raw_data
-            )
-
             if self.use_ppm:
                 all_frag_mz_tols = self.fragment_mz_df.values*self.tolerance*1e-6
             else:
                 all_frag_mz_tols = np.full_like(self.fragment_mz_df.values, self.tolerance)
 
-            spec_idxes = get_dia_spec_idxes(
-                raw_data.spectrum_df.rt.values,
-                raw_data.spectrum_df.isolation_lower_mz.values,
-                raw_data.spectrum_df.isolation_upper_mz.values,
-                psm_df_one_raw.rt_start.values[:len(psm_df_one_raw)//self.max_spec_per_query],
-                psm_df_one_raw.rt_stop.values[:len(psm_df_one_raw)//self.max_spec_per_query],
-                psm_df_one_raw.precursor_mz.values[:len(psm_df_one_raw)//self.max_spec_per_query],
-                max_spec_per_query=self.max_spec_per_query
+            psm_origin_len = len(psm_df_one_raw)//self.max_spec_per_query
+            
+            grouper = NormalDIAGrouper(raw_data)
+
+            psm_groups = grouper.assign_dia_groups(
+                psm_df_one_raw.precursor_mz.values[
+                    :psm_origin_len
+                ]
+            )
+            
+            all_spec_idxes = np.full(
+                len(psm_df_one_raw), -1, dtype=int
             )
 
-            psm_df_one_raw["spec_idx"] = spec_idxes.T.reshape(-1)
+            for dia_group, group_df in grouper.dia_group_dfs:
+                psm_idxes = np.array(psm_groups[dia_group])
+                spec_idxes = get_dia_spec_idxes(
+                    group_df.rt.values,
+                    psm_df_one_raw.rt.values[psm_idxes],
+                    max_spec_per_query=self.max_spec_per_query
+                )
+                for i in range(spec_idxes.shape[-1]):
+                    all_spec_idxes[
+                        psm_idxes+psm_origin_len*i
+                    ] = spec_idxes[:,i]
 
             match_one_raw_with_numba(
-                psm_df_one_raw.spec_idx.values,
+                all_spec_idxes,
                 psm_df_one_raw.frag_start_idx.values,
                 psm_df_one_raw.frag_stop_idx.values,
                 self.fragment_mz_df.values,
@@ -663,6 +502,11 @@ def match_ms2_multi_raw(self,
         ms_file_type: str = "alpharaw_hdf",
         process_num:int = 8,
     ):
+        if isinstance(ms_files, list):
+            ms_files = parse_ms_files_to_dict(ms_files)
+        psm_df = psm_df[
+            psm_df.raw_name.isin(ms_files)
+        ].reset_index(drop=True)
         super().match_ms2_multi_raw(
             psm_df, ms_files, ms_file_type,
             process_num
@@ -672,4 +516,154 @@ def match_ms2_multi_raw(self,
             self.psm_df, self.fragment_mz_df, 
             self.matched_intensity_df, self.matched_mz_err_df
         )
+
+@numba.njit
+def match_one_raw_with_numba(
+    spec_idxes, frag_start_idxes, frag_stop_idxes,
+    all_frag_mzs, all_frag_mz_tols,
+    all_spec_mzs, all_spec_intensities, 
+    peak_start_idxes, peak_stop_idxes,
+    matched_intensities, matched_mz_errs, 
+    matched_closest=True,
+):
+    """ 
+    Internel function to match fragment mz values to spectrum mz values.
+    Matched_mz_errs[i] = np.inf if no peaks are matched.
+
+    Results will saved in place of matched_intensities 
+    and matched_mz_errs.
+    """
+    for spec_idx, frag_start, frag_stop in zip(
+        spec_idxes, frag_start_idxes, frag_stop_idxes
+    ):
+        if spec_idx == -1: continue
+        peak_start = peak_start_idxes[spec_idx]
+        peak_stop = peak_stop_idxes[spec_idx]
+        if peak_stop == peak_start: continue
+        spec_mzs = all_spec_mzs[peak_start:peak_stop]
+        spec_intens = all_spec_intensities[peak_start:peak_stop]
+
+        frag_mzs = all_frag_mzs[frag_start:frag_stop,:].copy()
+        frag_mz_tols = all_frag_mz_tols[frag_start:frag_stop,:].copy()
+        
+        if matched_closest:
+            matched_idxes = match_closest_peaks(
+                spec_mzs, spec_intens, 
+                frag_mzs, frag_mz_tols
+            ).reshape(-1)
+        else:
+            matched_idxes = match_highest_peaks(
+                spec_mzs, spec_intens, 
+                frag_mzs, frag_mz_tols
+            ).reshape(-1)
+
+        matched_intens = spec_intens[matched_idxes]
+        matched_intens[matched_idxes==-1] = 0
+
+        matched_mass_errs = np.abs(
+            spec_mzs[
+                matched_idxes.reshape(-1)
+            ]-frag_mzs.reshape(-1)
+        )
+        matched_mass_errs[matched_idxes==-1] = np.inf
+
+        matched_intensities[
+            frag_start:frag_stop,:
+        ] = matched_intens.reshape(frag_mzs.shape)
+
+        matched_mz_errs[
+            frag_start:frag_stop,:
+        ] = matched_mass_errs.reshape(frag_mzs.shape)
+
+def load_ms_data(
+    ms_file:Union[str, MSData_Base],
+    ms_file_type:str='alpharaw_hdf',
+    process_count:int = 8,
+)->MSData_Base:
+    """Load MS files
+
+    Parameters
+    ----------
+    ms_file : str | MSData_Base
+        ms2 file path
+
+    ms_file_type : str, optional
+        ms2 file type, could be 
+        ["alpharaw_hdf","thermo","sciex","alphapept_hdf","mgf"].
+        Default to 'alpharaw_hdf'
+    """
+    if isinstance(ms_file, MSData_Base):
+        return ms_file
+    else:
+        raw_data = ms_reader_provider.get_reader(
+            ms_file_type, process_count=process_count
+        )
+        raw_data.import_raw(ms_file)
+        return raw_data
+
+    
+@numba.njit
+def get_best_matched_intens(
+    matched_intensity_values:np.ndarray,
+    frag_start_idxes:np.ndarray,
+    frag_stop_idxes:np.ndarray,
+):
+    ret_intens = np.zeros(
+        shape = matched_intensity_values.shape[1:],
+        dtype=matched_intensity_values.dtype
+    )
+    for i in range(len(frag_start_idxes)):
+        start = frag_start_idxes[i]
+        stop = frag_stop_idxes[i]
+        i = np.argmax(np.matmul(
+            matched_intensity_values[:,start:stop,:],
+            matched_intensity_values[:,start:stop,:].T
+        ).sum())
+        ret_intens[start:stop,:] = matched_intensity_values[i,start:stop,:]
+    return ret_intens
+
+@numba.njit
+def get_ion_count_scores(
+    frag_mz_values:np.ndarray,
+    matched_intens:np.ndarray,
+    frag_start_idxes:np.ndarray,
+    frag_stop_idxes:np.ndarray,
+    min_mz:float = 200,
+):
+    scores = []
+    for i in range(len(frag_start_idxes)):
+        scores.append(np.count_nonzero(
+            matched_intens[
+                frag_start_idxes[i]:frag_stop_idxes[i],:
+            ].copy().reshape(-1)[
+                frag_mz_values[
+                    frag_start_idxes[i]:frag_stop_idxes[i],:
+                ].copy().reshape(-1)>=min_mz
+            ]
+        ))
+    return np.array(scores,np.int32)
+
+@numba.njit    
+def get_dia_spec_idxes(
+    spec_rt_values:np.ndarray, 
+    query_rt_values:np.ndarray, 
+    max_spec_per_query:int,
+):
+    rt_idxes = np.searchsorted(spec_rt_values, query_rt_values)
+    
+    spec_idxes = np.full(
+        (len(rt_idxes),max_spec_per_query),
+        -1, dtype=np.int32
+    )
+    n = max_spec_per_query // 2
+
+    for iquery in range(len(rt_idxes)):
+        if rt_idxes[iquery] < n:
+            spec_idxes[iquery,:] = np.arange(0, max_spec_per_query)
+        else:
+            spec_idxes[iquery,:] = np.arange(
+                rt_idxes[iquery]-n, rt_idxes[iquery]+n+1
+            )
+    return spec_idxes
+
             
\ No newline at end of file

From 90f97d7ced283ffbabaae03e10855007642442cc Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Mon, 1 Jan 2024 21:32:50 +0100
Subject: [PATCH 03/10] FIX defaultdict typing error

---
 alpharaw/dia/normal_dia.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alpharaw/dia/normal_dia.py b/alpharaw/dia/normal_dia.py
index b0dbbf4..40eeb7f 100644
--- a/alpharaw/dia/normal_dia.py
+++ b/alpharaw/dia/normal_dia.py
@@ -67,7 +67,7 @@ def get_ms_data_for_a_group(self,
 
     def assign_dia_groups(self, 
         precursor_mzs
-    )->typing.DefaultDict[typing.List]:
+    )->typing.DefaultDict[typing.List, typing.List]:
         dia_precursor_groups = defaultdict(list)
         for i, mz in enumerate(precursor_mzs):
             i_group = np.searchsorted(self.dia_groups, int(mz))

From 790281401135b08c43b5c7b34b157c394474bc67 Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Mon, 1 Jan 2024 21:33:31 +0100
Subject: [PATCH 04/10] FIX defaultdict typing error

---
 alpharaw/dia/normal_dia.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/alpharaw/dia/normal_dia.py b/alpharaw/dia/normal_dia.py
index 40eeb7f..d93c2d0 100644
--- a/alpharaw/dia/normal_dia.py
+++ b/alpharaw/dia/normal_dia.py
@@ -85,4 +85,5 @@ def assign_dia_groups(self,
             if dia_group in self.dia_isolation_dict:
                 isolation_lower, isolation_upper = self.dia_isolation_dict[dia_group]
                 if mz >= isolation_lower and mz <= isolation_upper:
-                    dia_precursor_groups[dia_group].append(i)
\ No newline at end of file
+                    dia_precursor_groups[dia_group].append(i)
+        return dia_precursor_groups
\ No newline at end of file

From 7d1cd5576a8821475a67dd7aeeabc704b8a79fb7 Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Mon, 1 Jan 2024 21:42:41 +0100
Subject: [PATCH 05/10] FIX defaultdict typing error

---
 alpharaw/match/psm_match.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py
index 1534937..3f98d88 100644
--- a/alpharaw/match/psm_match.py
+++ b/alpharaw/match/psm_match.py
@@ -467,7 +467,7 @@ def _match_ms2_one_raw_numba(self,
             )
 
             for dia_group, group_df in grouper.dia_group_dfs:
-                psm_idxes = np.array(psm_groups[dia_group])
+                psm_idxes = np.array(psm_groups[dia_group], dtype=np.int32)
                 spec_idxes = get_dia_spec_idxes(
                     group_df.rt.values,
                     psm_df_one_raw.rt.values[psm_idxes],

From 95c67cbb8b39631e0a219ae1545ed8753704d247 Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Mon, 1 Jan 2024 22:53:06 +0100
Subject: [PATCH 06/10] FIX: group_df.frag idx for matching

---
 alpharaw/match/psm_match.py |  46 ++++-----
 nbs/test_dia_matching.ipynb | 187 ++++++++++++++++++------------------
 2 files changed, 117 insertions(+), 116 deletions(-)

diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py
index 3f98d88..0f7b01d 100644
--- a/alpharaw/match/psm_match.py
+++ b/alpharaw/match/psm_match.py
@@ -410,6 +410,10 @@ def _prepare_matching_dfs(self):
             [fragment_mz_df]*self.max_spec_per_query,
             ignore_index=True
         )
+        if self.use_ppm:
+            self.all_frag_mz_tols = fragment_mz_df.values*self.tolerance*1e-6
+        else:
+            self.all_frag_mz_tols = np.full_like(fragment_mz_df.values, self.tolerance)
 
         psm_df_list = []
         len_frags = len(fragment_mz_df)//self.max_spec_per_query
@@ -447,11 +451,6 @@ def _match_ms2_one_raw_numba(self,
                 process_count=self.ms_loader_thread_num
             )
 
-            if self.use_ppm:
-                all_frag_mz_tols = self.fragment_mz_df.values*self.tolerance*1e-6
-            else:
-                all_frag_mz_tols = np.full_like(self.fragment_mz_df.values, self.tolerance)
-
             psm_origin_len = len(psm_df_one_raw)//self.max_spec_per_query
             
             grouper = NormalDIAGrouper(raw_data)
@@ -463,11 +462,13 @@ def _match_ms2_one_raw_numba(self,
             )
             
             all_spec_idxes = np.full(
-                len(psm_df_one_raw), -1, dtype=int
+                len(psm_df_one_raw), -1, dtype=np.int32
             )
 
             for dia_group, group_df in grouper.dia_group_dfs:
-                psm_idxes = np.array(psm_groups[dia_group], dtype=np.int32)
+                psm_idxes = psm_groups[dia_group]
+                if len(psm_idxes) == 0: continue
+                psm_idxes = np.array(psm_idxes, dtype=np.int32)
                 spec_idxes = get_dia_spec_idxes(
                     group_df.rt.values,
                     psm_df_one_raw.rt.values[psm_idxes],
@@ -478,20 +479,20 @@ def _match_ms2_one_raw_numba(self,
                         psm_idxes+psm_origin_len*i
                     ] = spec_idxes[:,i]
 
-            match_one_raw_with_numba(
-                all_spec_idxes,
-                psm_df_one_raw.frag_start_idx.values,
-                psm_df_one_raw.frag_stop_idx.values,
-                self.fragment_mz_df.values,
-                all_frag_mz_tols, 
-                raw_data.peak_df.mz.values, 
-                raw_data.peak_df.intensity.values,
-                raw_data.spectrum_df.peak_start_idx.values,
-                raw_data.spectrum_df.peak_stop_idx.values,
-                self.matched_intensity_df.values,
-                self.matched_mz_err_df.values,
-                self.match_closest
-            )
+                match_one_raw_with_numba(
+                    all_spec_idxes,
+                    psm_df_one_raw.frag_start_idx.values,
+                    psm_df_one_raw.frag_stop_idx.values,
+                    self.fragment_mz_df.values,
+                    self.all_frag_mz_tols, 
+                    raw_data.peak_df.mz.values, 
+                    raw_data.peak_df.intensity.values,
+                    group_df.peak_start_idx.values,
+                    group_df.peak_stop_idx.values,
+                    self.matched_intensity_df.values,
+                    self.matched_mz_err_df.values,
+                    self.match_closest
+                )
         else:
             print(f"`{raw_name}` is not found in ms_file_dict.")
             
@@ -662,7 +663,8 @@ def get_dia_spec_idxes(
             spec_idxes[iquery,:] = np.arange(0, max_spec_per_query)
         else:
             spec_idxes[iquery,:] = np.arange(
-                rt_idxes[iquery]-n, rt_idxes[iquery]+n+1
+                rt_idxes[iquery]-n, 
+                rt_idxes[iquery]-n+max_spec_per_query
             )
     return spec_idxes
 
diff --git a/nbs/test_dia_matching.ipynb b/nbs/test_dia_matching.ipynb
index 096be2e..e1ca41c 100644
--- a/nbs/test_dia_matching.ipynb
+++ b/nbs/test_dia_matching.ipynb
@@ -14,7 +14,16 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:root:WARNING: Temp mmap arrays are written to /var/folders/fh/hf8t3l1x02d42ggk3b304_rh0000gn/T/temp_mmap_37vmr95k. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 624,114,790,400\n",
+      "WARNING:root:WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n"
+     ]
+    }
+   ],
    "source": [
     "from alpharaw.match.psm_match import PepSpecMatch_DIA"
    ]
@@ -413,7 +422,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1/1 [00:03<00:00,  3.39s/it]\n"
+      "100%|██████████| 1/1 [00:07<00:00,  7.98s/it]\n"
      ]
     }
    ],
@@ -859,8 +868,8 @@
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>111593.757812</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -869,9 +878,9 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>131525.031250</td>\n",
-       "      <td>0.000000</td>\n",
-       "      <td>23976.539062</td>\n",
+       "      <td>190300.578125</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>37765.183594</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -880,8 +889,8 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>102311.109375</td>\n",
        "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -891,9 +900,9 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>113438.023438</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>112715.445312</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -902,9 +911,9 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>496574.343750</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>419370.625000</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -925,7 +934,7 @@
        "    <tr>\n",
        "      <th>534754</th>\n",
        "      <td>0.000000</td>\n",
-       "      <td>8577.349609</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -935,8 +944,8 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>534755</th>\n",
-       "      <td>13675.758789</td>\n",
-       "      <td>29054.722656</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -947,8 +956,8 @@
        "    <tr>\n",
        "      <th>534756</th>\n",
        "      <td>0.000000</td>\n",
-       "      <td>9776.382812</td>\n",
-       "      <td>8527.920898</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -958,7 +967,7 @@
        "    <tr>\n",
        "      <th>534757</th>\n",
        "      <td>0.000000</td>\n",
-       "      <td>10933.444336</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -969,7 +978,7 @@
        "    <tr>\n",
        "      <th>534758</th>\n",
        "      <td>0.000000</td>\n",
-       "      <td>27407.386719</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
@@ -983,31 +992,31 @@
        "</div>"
       ],
       "text/plain": [
-       "                 b_z1          b_z2           y_z1  y_z2  b_modloss_z1  \\\n",
-       "0            0.000000      0.000000  111593.757812   0.0           0.0   \n",
-       "1       131525.031250      0.000000   23976.539062   0.0           0.0   \n",
-       "2       102311.109375      0.000000       0.000000   0.0           0.0   \n",
-       "3       113438.023438      0.000000  112715.445312   0.0           0.0   \n",
-       "4       496574.343750      0.000000  419370.625000   0.0           0.0   \n",
-       "...               ...           ...            ...   ...           ...   \n",
-       "534754       0.000000   8577.349609       0.000000   0.0           0.0   \n",
-       "534755   13675.758789  29054.722656       0.000000   0.0           0.0   \n",
-       "534756       0.000000   9776.382812    8527.920898   0.0           0.0   \n",
-       "534757       0.000000  10933.444336       0.000000   0.0           0.0   \n",
-       "534758       0.000000  27407.386719       0.000000   0.0           0.0   \n",
+       "                 b_z1  b_z2          y_z1  y_z2  b_modloss_z1  b_modloss_z2  \\\n",
+       "0            0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "1       190300.578125   0.0  37765.183594   0.0           0.0           0.0   \n",
+       "2            0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "3            0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "4            0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "...               ...   ...           ...   ...           ...           ...   \n",
+       "534754       0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "534755       0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "534756       0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "534757       0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
+       "534758       0.000000   0.0      0.000000   0.0           0.0           0.0   \n",
        "\n",
-       "        b_modloss_z2  y_modloss_z1  y_modloss_z2  \n",
-       "0                0.0           0.0           0.0  \n",
-       "1                0.0           0.0           0.0  \n",
-       "2                0.0           0.0           0.0  \n",
-       "3                0.0           0.0           0.0  \n",
-       "4                0.0           0.0           0.0  \n",
-       "...              ...           ...           ...  \n",
-       "534754           0.0           0.0           0.0  \n",
-       "534755           0.0           0.0           0.0  \n",
-       "534756           0.0           0.0           0.0  \n",
-       "534757           0.0           0.0           0.0  \n",
-       "534758           0.0           0.0           0.0  \n",
+       "        y_modloss_z1  y_modloss_z2  \n",
+       "0                0.0           0.0  \n",
+       "1                0.0           0.0  \n",
+       "2                0.0           0.0  \n",
+       "3                0.0           0.0  \n",
+       "4                0.0           0.0  \n",
+       "...              ...           ...  \n",
+       "534754           0.0           0.0  \n",
+       "534755           0.0           0.0  \n",
+       "534756           0.0           0.0  \n",
+       "534757           0.0           0.0  \n",
+       "534758           0.0           0.0  \n",
        "\n",
        "[534759 rows x 8 columns]"
       ]
@@ -1033,16 +1042,6 @@
    "execution_count": 8,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/wenfengzeng/workspace/peptdeep/peptdeep/model/ms2.py:695: UserWarning: MPS: nonzero op is supported natively starting from macOS 13.0. Falling back on CPU. This may have performance implications. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/mps/operations/Indexing.mm:218.)\n",
-      "  cos[cos>1] = 1\n",
-      "/Users/wenfengzeng/workspace/peptdeep/peptdeep/model/ms2.py:699: UserWarning: torch.sort is supported by MPS on MacOS 13+, please upgrade. Falling back to CPU (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/mps/operations/Sort.mm:30.)\n",
-      "  sorted_idx = x.argsort(dim=1)\n"
-     ]
-    },
     {
      "data": {
       "text/html": [
@@ -1080,66 +1079,66 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>mean</th>\n",
-       "      <td>0.938649</td>\n",
-       "      <td>0.943356</td>\n",
-       "      <td>0.846499</td>\n",
-       "      <td>0.823260</td>\n",
+       "      <td>0.793184</td>\n",
+       "      <td>0.801761</td>\n",
+       "      <td>0.662681</td>\n",
+       "      <td>0.786549</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>std</th>\n",
-       "      <td>0.122191</td>\n",
-       "      <td>0.113922</td>\n",
-       "      <td>0.156032</td>\n",
-       "      <td>0.140910</td>\n",
+       "      <td>0.259575</td>\n",
+       "      <td>0.252218</td>\n",
+       "      <td>0.251722</td>\n",
+       "      <td>0.137454</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>min</th>\n",
-       "      <td>-0.057496</td>\n",
+       "      <td>-0.050093</td>\n",
        "      <td>0.000000</td>\n",
        "      <td>0.000000</td>\n",
-       "      <td>-0.062621</td>\n",
+       "      <td>0.026527</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>25%</th>\n",
-       "      <td>0.941413</td>\n",
-       "      <td>0.946230</td>\n",
-       "      <td>0.792737</td>\n",
-       "      <td>0.750613</td>\n",
+       "      <td>0.722388</td>\n",
+       "      <td>0.736639</td>\n",
+       "      <td>0.530222</td>\n",
+       "      <td>0.704215</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>50%</th>\n",
-       "      <td>0.986450</td>\n",
-       "      <td>0.987556</td>\n",
-       "      <td>0.900479</td>\n",
-       "      <td>0.851517</td>\n",
+       "      <td>0.900511</td>\n",
+       "      <td>0.905750</td>\n",
+       "      <td>0.724400</td>\n",
+       "      <td>0.805559</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>75%</th>\n",
-       "      <td>0.997344</td>\n",
-       "      <td>0.997548</td>\n",
-       "      <td>0.955660</td>\n",
-       "      <td>0.931510</td>\n",
+       "      <td>0.972494</td>\n",
+       "      <td>0.973826</td>\n",
+       "      <td>0.855859</td>\n",
+       "      <td>0.894600</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>max</th>\n",
-       "      <td>0.999998</td>\n",
-       "      <td>0.999998</td>\n",
-       "      <td>0.998884</td>\n",
+       "      <td>1.000003</td>\n",
+       "      <td>1.000002</td>\n",
+       "      <td>1.000000</td>\n",
        "      <td>1.000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>&gt;0.90</th>\n",
-       "      <td>0.833929</td>\n",
-       "      <td>0.844719</td>\n",
-       "      <td>0.506305</td>\n",
-       "      <td>0.335203</td>\n",
+       "      <td>0.510807</td>\n",
+       "      <td>0.522722</td>\n",
+       "      <td>0.167494</td>\n",
+       "      <td>0.260136</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>&gt;0.75</th>\n",
-       "      <td>0.933075</td>\n",
-       "      <td>0.940357</td>\n",
-       "      <td>0.807384</td>\n",
-       "      <td>0.750356</td>\n",
+       "      <td>0.726856</td>\n",
+       "      <td>0.739632</td>\n",
+       "      <td>0.461060</td>\n",
+       "      <td>0.635836</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1148,15 +1147,15 @@
       "text/plain": [
        "                PCC           COS            SA           SPC\n",
        "count  20142.000000  20142.000000  20142.000000  20142.000000\n",
-       "mean       0.938649      0.943356      0.846499      0.823260\n",
-       "std        0.122191      0.113922      0.156032      0.140910\n",
-       "min       -0.057496      0.000000      0.000000     -0.062621\n",
-       "25%        0.941413      0.946230      0.792737      0.750613\n",
-       "50%        0.986450      0.987556      0.900479      0.851517\n",
-       "75%        0.997344      0.997548      0.955660      0.931510\n",
-       "max        0.999998      0.999998      0.998884      1.000000\n",
-       ">0.90      0.833929      0.844719      0.506305      0.335203\n",
-       ">0.75      0.933075      0.940357      0.807384      0.750356"
+       "mean       0.793184      0.801761      0.662681      0.786549\n",
+       "std        0.259575      0.252218      0.251722      0.137454\n",
+       "min       -0.050093      0.000000      0.000000      0.026527\n",
+       "25%        0.722388      0.736639      0.530222      0.704215\n",
+       "50%        0.900511      0.905750      0.724400      0.805559\n",
+       "75%        0.972494      0.973826      0.855859      0.894600\n",
+       "max        1.000003      1.000002      1.000000      1.000000\n",
+       ">0.90      0.510807      0.522722      0.167494      0.260136\n",
+       ">0.75      0.726856      0.739632      0.461060      0.635836"
       ]
      },
      "execution_count": 8,

From 651e525123c1f60cd28f40371600d5af02dc2b3b Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Tue, 2 Jan 2024 15:25:56 +0100
Subject: [PATCH 07/10] FIX psm_match add rt infor if missing

---
 alpharaw/match/psm_match.py | 11 +++++++----
 nbs/test_dia_matching.ipynb |  7 +++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/alpharaw/match/psm_match.py b/alpharaw/match/psm_match.py
index 0f7b01d..bcdb794 100644
--- a/alpharaw/match/psm_match.py
+++ b/alpharaw/match/psm_match.py
@@ -243,7 +243,7 @@ def match_ms2_one_raw(self,
         self.psm_df = psm_df_one_raw
 
         psm_df_one_raw = self._add_missing_columns_to_psm_df(
-            psm_df_one_raw
+            psm_df_one_raw, self.raw_data
         )
 
         (
@@ -313,6 +313,7 @@ def _match_ms2_one_raw_numba(self,
             )
         else:
             print(f"`{raw_name}` is not found in ms_file_dict.")
+        return psm_df_one_raw
     
     def match_ms2_multi_raw(self,
         psm_df: pd.DataFrame,
@@ -371,10 +372,11 @@ def match_ms2_multi_raw(self,
 
         self.ms_loader_thread_num = process_num
         # if process_num <= 1 or len(self._ms_file_dict) <= 1:
+        psm_df_list = []
         for raw_name, df_group in tqdm.tqdm(
             self.psm_df.groupby('raw_name')
         ):
-            self._match_ms2_one_raw_numba(raw_name, df_group)
+            psm_df_list.append(self._match_ms2_one_raw_numba(raw_name, df_group))
         # else:
         #     with mp.get_context("spawn").Pool(processes=process_num) as p:
         #         df_groupby = self.psm_df.groupby('raw_name')
@@ -387,7 +389,8 @@ def match_ms2_multi_raw(self,
         #                 gen_group_df(df_groupby)
         #             ), df_groupby.ngroups
         #         )
-                    
+
+        self.psm_df = pd.concat(psm_df_list, ignore_index=True)     
         return (
             self.psm_df, self.fragment_mz_df, 
             self.matched_intensity_df, self.matched_mz_err_df
@@ -495,7 +498,7 @@ def _match_ms2_one_raw_numba(self,
                 )
         else:
             print(f"`{raw_name}` is not found in ms_file_dict.")
-            
+        return psm_df_one_raw
 
     def match_ms2_multi_raw(self, 
         psm_df: pd.DataFrame, 
diff --git a/nbs/test_dia_matching.ipynb b/nbs/test_dia_matching.ipynb
index e1ca41c..2455365 100644
--- a/nbs/test_dia_matching.ipynb
+++ b/nbs/test_dia_matching.ipynb
@@ -1179,6 +1179,13 @@
     "sum(metrics_list)/len(metrics_list)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From ce221160710ef98e91f4d1d7c7c8f6c0b8fa7f97 Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Thu, 4 Jan 2024 22:06:25 +0100
Subject: [PATCH 08/10] FEAT add PCC in psm_plot

---
 alpharaw/viz/df_utils.py               |  11 +++
 alpharaw/viz/psm_plot.py               |   3 +
 alpharaw/wrappers/alphapept_wrapper.py | 108 ++++++++++++-------------
 3 files changed, 68 insertions(+), 54 deletions(-)

diff --git a/alpharaw/viz/df_utils.py b/alpharaw/viz/df_utils.py
index b799d70..3ea4020 100644
--- a/alpharaw/viz/df_utils.py
+++ b/alpharaw/viz/df_utils.py
@@ -240,10 +240,21 @@ def make_psm_plot_df(
             ion_name=query_ion_names,
         ))
 
+        def PCC_sim(x, y):
+            x = x-np.mean(x)
+            y = y-np.mean(y)
+            return np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y)+1e-8)
+        PCC = PCC_sim(
+            -query_intensities[matched_bools],
+            matched_spec_intens
+        )
+
     df = pd.concat([
         spec_df, matched_df, mirror_df
     ], ignore_index=True)
     df["mod_sites"] = mod_sites
+    if len(mirror_df) > 0:
+        df["pcc"] = PCC
     return df
 
 def get_modified_sequence(
diff --git a/alpharaw/viz/psm_plot.py b/alpharaw/viz/psm_plot.py
index 03c3a1a..9289f35 100644
--- a/alpharaw/viz/psm_plot.py
+++ b/alpharaw/viz/psm_plot.py
@@ -225,6 +225,9 @@ def __init__(self,
     def plot(self, plot_df, sequence, title, 
         plot_unmatched_peaks=False
     ):
+        
+        if "pcc" in plot_df.columns and len(plot_df) > 0:
+            title = f"{title} (R={plot_df.pcc.values[0]:.3f})"
         self._init_plot(title)
 
         self.peak_plot.plot(
diff --git a/alpharaw/wrappers/alphapept_wrapper.py b/alpharaw/wrappers/alphapept_wrapper.py
index 240a772..b2ce70f 100644
--- a/alpharaw/wrappers/alphapept_wrapper.py
+++ b/alpharaw/wrappers/alphapept_wrapper.py
@@ -6,60 +6,6 @@
     MSData_Base, ms_reader_provider
 )
 
-def get_peak_lists(starts, ends, peak_df):
-    mass_list = [peak_df.mz.values[start:end].tolist() for start,end in zip(starts,ends)]
-    inten_list = [peak_df.intensity.values[start:end].tolist() for start,end in zip(starts,ends)]
-    return mass_list, inten_list
-
-def extract_ms1(raw_data:MSData_Base, query_data:dict):
-    spec_df = raw_data.spectrum_df.query("ms_level==1")
-    scans = spec_df.spec_idx.values
-    rts = spec_df.rt.values
-    ms_levels = spec_df.ms_level.values
-
-    mass_list_ms1, int_list_ms1 = get_peak_lists(
-        spec_df.peak_start_idx.values, 
-        spec_df.peak_stop_idx.values,
-        raw_data.peak_df
-    )
-
-    query_data["scan_list_ms1"] = scans
-    query_data["rt_list_ms1"] = rts
-    query_data["mass_list_ms1"] = np.array(mass_list_ms1, dtype=object)
-    query_data["int_list_ms1"] = np.array(int_list_ms1, dtype=object)
-    query_data["ms_list_ms1"] = ms_levels
-
-def extract_ms2(raw_data:MSData_Base, query_data:dict):
-    spec_df = raw_data.spectrum_df.query("ms_level==2")
-    scans = spec_df.spec_idx.values
-    rts = spec_df.rt.values
-    ms_levels = spec_df.ms_level.values
-    precursor_mzs2 = spec_df.precursor_mz.values
-    charges = spec_df.charge.values
-    charges[charges<=0] = 2
-
-    mass_list_ms2, int_list_ms2 = get_peak_lists(
-        spec_df.peak_start_idx.values, 
-        spec_df.peak_stop_idx.values,
-        raw_data.peak_df
-    )
-
-    query_data["scan_list_ms2"] = scans
-    query_data["rt_list_ms2"] = rts
-    query_data["mass_list_ms2"] = mass_list_ms2
-    query_data["int_list_ms2"] = int_list_ms2
-    query_data["ms_list_ms2"] = ms_levels
-    query_data["prec_mass_list2"] = precursor_mzs2
-    query_data["mono_mzs2"] = precursor_mzs2
-    query_data["charge2"] = charges
-
-def parse_msdata_to_alphapept(raw_data:MSData_Base):
-    query_data = {}
-    extract_ms1(raw_data, query_data)
-    extract_ms2(raw_data, query_data)
-
-    return query_data, raw_data.creation_time
-
 class AlphaPept_HDF_MS2_Reader(MSData_Base):
     """MS2 from AlphaPept HDF"""
     def _import(self, _path):
@@ -123,3 +69,57 @@ def _set_dataframes(self, _path):
 
 ms_reader_provider.register_reader('alphapept', AlphaPept_HDF_MS2_Reader)
 ms_reader_provider.register_reader('alphapept_hdf', AlphaPept_HDF_MS2_Reader)
+
+def get_peak_lists(starts, ends, peak_df):
+    mass_list = [peak_df.mz.values[start:end].tolist() for start,end in zip(starts,ends)]
+    inten_list = [peak_df.intensity.values[start:end].tolist() for start,end in zip(starts,ends)]
+    return mass_list, inten_list
+
+def extract_ms1(raw_data:MSData_Base, query_data:dict):
+    spec_df = raw_data.spectrum_df.query("ms_level==1")
+    scans = spec_df.spec_idx.values
+    rts = spec_df.rt.values
+    ms_levels = spec_df.ms_level.values
+
+    mass_list_ms1, int_list_ms1 = get_peak_lists(
+        spec_df.peak_start_idx.values, 
+        spec_df.peak_stop_idx.values,
+        raw_data.peak_df
+    )
+
+    query_data["scan_list_ms1"] = scans
+    query_data["rt_list_ms1"] = rts
+    query_data["mass_list_ms1"] = np.array(mass_list_ms1, dtype=object)
+    query_data["int_list_ms1"] = np.array(int_list_ms1, dtype=object)
+    query_data["ms_list_ms1"] = ms_levels
+
+def extract_ms2(raw_data:MSData_Base, query_data:dict):
+    spec_df = raw_data.spectrum_df.query("ms_level==2")
+    scans = spec_df.spec_idx.values
+    rts = spec_df.rt.values
+    ms_levels = spec_df.ms_level.values
+    precursor_mzs2 = spec_df.precursor_mz.values
+    charges = spec_df.charge.values
+    charges[charges<=0] = 2
+
+    mass_list_ms2, int_list_ms2 = get_peak_lists(
+        spec_df.peak_start_idx.values, 
+        spec_df.peak_stop_idx.values,
+        raw_data.peak_df
+    )
+
+    query_data["scan_list_ms2"] = scans
+    query_data["rt_list_ms2"] = rts
+    query_data["mass_list_ms2"] = mass_list_ms2
+    query_data["int_list_ms2"] = int_list_ms2
+    query_data["ms_list_ms2"] = ms_levels
+    query_data["prec_mass_list2"] = precursor_mzs2
+    query_data["mono_mzs2"] = precursor_mzs2
+    query_data["charge2"] = charges
+
+def parse_msdata_to_alphapept(raw_data:MSData_Base):
+    query_data = {}
+    extract_ms1(raw_data, query_data)
+    extract_ms2(raw_data, query_data)
+
+    return query_data, raw_data.creation_time

From 36313655e868b419860c846737cbe288d9d6d9ae Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Fri, 5 Jan 2024 14:08:28 +0100
Subject: [PATCH 09/10] FIX mod_sites to make plot_df

---
 alpharaw/viz/df_utils.py | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/alpharaw/viz/df_utils.py b/alpharaw/viz/df_utils.py
index 3ea4020..e5b29ec 100644
--- a/alpharaw/viz/df_utils.py
+++ b/alpharaw/viz/df_utils.py
@@ -32,40 +32,19 @@ def make_psm_plot_df_for_peptide(
     ppm:float = 20.0,
     charged_frag_types: list = ["b_z1","b_z2","y_z1","y_z2"],
     include_fragments:bool=True,
+    fragment_intensity_df:pd.DataFrame=None,
     include_precursor_isotopes:bool=False,
     max_isotope:int = 6,
     min_frag_mz:float = 100.0,
     match_mode:typing.Literal["closest","highest"]="closest",
 )->pd.DataFrame:
-    """
-
-    Args:
-        spec_masses (np.ndarray): _description_
-        spec_intensities (np.ndarray): _description_
-        sequence (str): _description_
-        mods (str): _description_
-        mod_sites (str): _description_
-        charge (int): _description_
-        rt_sec (float, optional): _description_. Defaults to 0.0.
-        mobility (float, optional): _description_. Defaults to 0.0.
-        ppm (float, optional): _description_. Defaults to 20.0.
-        charged_frag_types (list, optional): _description_. Defaults to ["b_z1","b_z2","y_z1","y_z2"].
-        include_fragments (bool, optional): _description_. Defaults to True.
-        include_precursor_isotopes (bool, optional): _description_. Defaults to False.
-        max_isotope (int, optional): _description_. Defaults to 6.
-        min_frag_mz (float, optional): _description_. Defaults to 100.0.
-        match_mode (typing.Literal[&quot;closest&quot;,&quot;highest&quot;], optional): _description_. Defaults to "closest".
-        model_mgr (_type_, optional): _description_. Defaults to None.
-
-    Returns:
-        pd.DataFrame: _description_
-    """
     
     plot_df = make_xic_plot_df_for_peptide(
         sequence, mods, mod_sites, charge,
         rt_sec=rt_sec, mobility=mobility,
         charged_frag_types=charged_frag_types,
         include_fragments=include_fragments,
+        fragment_intensity_df=fragment_intensity_df,
         include_precursor_isotopes=include_precursor_isotopes,
         max_isotope=max_isotope,
         min_frag_mz=min_frag_mz,
@@ -95,6 +74,7 @@ def make_xic_plot_df_for_peptide(
     ms_level:int=2,
     charged_frag_types: list = ["b_z1","b_z2","y_z1","y_z2"],
     include_fragments:bool=True,
+    fragment_intensity_df:pd.DataFrame = None,
     include_precursor_isotopes:bool=False,
     max_isotope:int = 6,
     min_frag_mz:float = 100.0,
@@ -108,8 +88,17 @@ def make_xic_plot_df_for_peptide(
         include_precursor_isotopes=include_precursor_isotopes,
         max_isotope=max_isotope,
     )
+    if fragment_intensity_df is not None:
+        columns = np.intersect1d(
+            fragment_mz_df.columns.values,
+            fragment_intensity_df.columns.values,
+        )
+        fragment_mz_df = fragment_mz_df[columns]
+        fragment_intensity_df = fragment_intensity_df[columns]
+        
     return translate_precursor_fragment_df_to_plot_df(
         precursor_df, fragment_mz_df,
+        fragment_intensity_df=fragment_intensity_df,
         rt_sec=rt_sec,
         mobility=mobility,
         ms_level=ms_level,
@@ -143,7 +132,7 @@ def make_psm_plot_for_dfs(
         query_mass_tols = plot_df.mz.values*ppm*1e-6,
         query_frag_idxes = plot_df.fragment_site.values,
         modified_sequence = plot_df.modified_sequence.values[0],
-        mod_sites="",
+        mod_sites=precursor_df.mod_sites.values[0],
         query_intensities = plot_df.intensity.values 
             if "intensity" in plot_df.columns else None,
         match_mode = match_mode,    

From 5f86048c01a7d96db45a93af2da92e8ad3a94c62 Mon Sep 17 00:00:00 2001
From: jalew188 <jalew188@gmail.com>
Date: Fri, 5 Jan 2024 22:17:20 +0100
Subject: [PATCH 10/10] =?UTF-8?q?Bump=20version:=200.4.0=20=E2=86=92=200.4?=
 =?UTF-8?q?.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.cfg     | 2 +-
 alpharaw/__init__.py | 2 +-
 docs/conf.py         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index e2df489..cdbdd14 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.0
+current_version = 0.4.1
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
diff --git a/alpharaw/__init__.py b/alpharaw/__init__.py
index 87a572e..038ea27 100644
--- a/alpharaw/__init__.py
+++ b/alpharaw/__init__.py
@@ -15,7 +15,7 @@ def register_readers():
         print("[WARN] pythonnet is not installed")
 
 __project__ = "alpharaw"
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 __license__ = "Apache"
 __description__ = "An open-source Python package to unify raw MS data access and storage."
 __author__ = "Mann Labs"
diff --git a/docs/conf.py b/docs/conf.py
index 41b01ef..6e0861f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -23,7 +23,7 @@
 copyright = '2022, Mann Labs, MPIB'
 author = 'Mann Labs, MPIB'
 
-release = "0.4.0"
+release = "0.4.1"
 
 # -- General configuration ---------------------------------------------------