From 73f331a4b808785dddcfb8a2dc746225afe12a25 Mon Sep 17 00:00:00 2001 From: "Zeng, Wen-Feng" Date: Thu, 19 Jan 2023 11:56:14 +0100 Subject: [PATCH 1/9] update some raw access --- .github/workflows/publish_and_release.yml | 4 ++-- HISTORY.md | 0 alpharaw/raw_access/pysciexwifffilereader.py | 2 ++ alpharaw/thermo.py | 16 ++++++++++++---- alpharaw/wrappers/alphatims_wrapper.py | 13 +++++++------ 5 files changed, 23 insertions(+), 12 deletions(-) delete mode 100644 HISTORY.md diff --git a/.github/workflows/publish_and_release.yml b/.github/workflows/publish_and_release.yml index dbe7c66..357e846 100644 --- a/.github/workflows/publish_and_release.yml +++ b/.github/workflows/publish_and_release.yml @@ -44,7 +44,7 @@ jobs: cd release/pypi . ./prepare_pypi_wheel.sh - name: Publish distribution to Test PyPI - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.TEST_PYPI_ALPHARAW_TOKEN }} repository_url: https://test.pypi.org/legacy/ @@ -54,7 +54,7 @@ jobs: cd release/pypi . ./install_test_pypi_wheel.sh - name: Publish distribution to PyPI - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_ALPHARAW_TOKEN }} Test_PyPi_Release: diff --git a/HISTORY.md b/HISTORY.md deleted file mode 100644 index e69de29..0000000 diff --git a/alpharaw/raw_access/pysciexwifffilereader.py b/alpharaw/raw_access/pysciexwifffilereader.py index 62c8108..71dd759 100644 --- a/alpharaw/raw_access/pysciexwifffilereader.py +++ b/alpharaw/raw_access/pysciexwifffilereader.py @@ -119,6 +119,8 @@ def load_sample(self, sample_id:int, if details.IsSwath and details.MassRangeInfo.Length > 0: center_mz = DotNetWiffOps.get_center_mz(details) isolation_window = DotNetWiffOps.get_isolation_window(details) + if isolation_window <= 0: + isolation_window = 2.0 precursor_mz_list.append(massSpectrumInfo.ParentMZ) precursor_charge_list.append(massSpectrumInfo.ParentChargeState) ce_list.append(float(massSpectrumInfo.CollisionEnergy)) diff --git a/alpharaw/thermo.py b/alpharaw/thermo.py index 08614a5..31d6783 100644 --- a/alpharaw/thermo.py +++ b/alpharaw/thermo.py @@ -61,10 +61,18 @@ def _import(self, mono_mz, charge = rawfile.GetMS2MonoMzAndChargeFromScanNum(i) - precursor_mz_values.append(mono_mz) - precursor_charges.append(charge) - isolation_mz_lowers.append(isolation_center - isolation_width / 2) - isolation_mz_uppers.append(isolation_center + isolation_width / 2) + # In thermo_tof, ms1 = ms_order==2&NCE==0? + if isolation_center <= 0 and mono_mz <= 0: + precursor_mz_values.append(-1.0) + isolation_mz_lowers.append(-1.0) + isolation_mz_uppers.append(-1.0) + precursor_charges.append(0) + ms_order_list[-1] = 1 + else: + precursor_mz_values.append(isolation_center) + isolation_mz_lowers.append(isolation_center - isolation_width / 2) + isolation_mz_uppers.append(isolation_center + isolation_width / 2) + precursor_charges.append(charge) rawfile.Close() peak_indices = np.empty(rawfile.LastSpectrumNumber + 1, np.int64) peak_indices[0] = 0 diff --git a/alpharaw/wrappers/alphatims_wrapper.py b/alpharaw/wrappers/alphatims_wrapper.py index 950f173..eadcf69 100644 --- a/alpharaw/wrappers/alphatims_wrapper.py +++ b/alpharaw/wrappers/alphatims_wrapper.py @@ -102,12 +102,13 @@ def _import_alpharaw_object( self._intensity_min_value = float(np.min(self._intensity_values)) self._intensity_max_value = float(np.max(self._intensity_values)) self._intensity_corrections = np.ones(self._frame_max_index) - self._quad_min_mz_value = float( - np.min( - self._quad_mz_values[self._quad_mz_values != -1] - ) - ) - self._quad_max_mz_value = float(np.max(self._quad_mz_values)) + _q_mzs = self._quad_mz_values[self._quad_mz_values != -1] + if len(_q_mzs) > 0: + self._quad_min_mz_value = float(np.min(_q_mzs)) + self._quad_max_mz_value = float(np.max(_q_mzs)) + else: + self._quad_min_mz_value = 0 + self._quad_max_mz_value = 0 self._precursor_max_index = int(np.max(self._precursor_indices)) + 1 self._acquisition_mode = msdata.file_type + ' ' + ( "DDA" if dda else "DIA" From 56acb646078ea45a025677b71f709ba00918cb41 Mon Sep 17 00:00:00 2001 From: "Zeng, Wen-Feng" Date: Thu, 19 Jan 2023 21:15:19 +0100 Subject: [PATCH 2/9] update precursor mz assignment --- alpharaw/raw_access/pysciexwifffilereader.py | 7 +++++-- alpharaw/thermo.py | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/alpharaw/raw_access/pysciexwifffilereader.py b/alpharaw/raw_access/pysciexwifffilereader.py index 71dd759..10586b0 100644 --- a/alpharaw/raw_access/pysciexwifffilereader.py +++ b/alpharaw/raw_access/pysciexwifffilereader.py @@ -120,8 +120,10 @@ def load_sample(self, sample_id:int, center_mz = DotNetWiffOps.get_center_mz(details) isolation_window = DotNetWiffOps.get_isolation_window(details) if isolation_window <= 0: - isolation_window = 2.0 - precursor_mz_list.append(massSpectrumInfo.ParentMZ) + isolation_window = 3.0 + if center_mz <= 0: + center_mz = massSpectrumInfo.ParentMZ + precursor_mz_list.append(center_mz) precursor_charge_list.append(massSpectrumInfo.ParentChargeState) ce_list.append(float(massSpectrumInfo.CollisionEnergy)) isolation_lower_mz_list.append(center_mz-isolation_window/2) @@ -147,4 +149,5 @@ def load_sample(self, sample_id:int, "precursor_charge": np.array(precursor_charge_list, dtype=np.int8), 'isolation_mz_lower': np.array(isolation_lower_mz_list), 'isolation_mz_upper': np.array(isolation_upper_mz_list), + 'nce': np.array(ce_list, dtype=np.float32), } \ No newline at end of file diff --git a/alpharaw/thermo.py b/alpharaw/thermo.py index 31d6783..242404e 100644 --- a/alpharaw/thermo.py +++ b/alpharaw/thermo.py @@ -35,6 +35,7 @@ def _import(self, isolation_mz_uppers = [] precursor_charges = [] ms_order_list = [] + ce_list = [] for i in range( rawfile.FirstSpectrumNumber, rawfile.LastSpectrumNumber + 1 @@ -55,13 +56,15 @@ def _import(self, isolation_mz_lowers.append(-1.0) isolation_mz_uppers.append(-1.0) precursor_charges.append(0) + ce_list.append(0) else: isolation_center = rawfile.GetPrecursorMassForScanNum(i) isolation_width = rawfile.GetIsolationWidthForScanNum(i) mono_mz, charge = rawfile.GetMS2MonoMzAndChargeFromScanNum(i) + ce_list.append(rawfile.GetCollisionEnergyForScanNum(i)) - # In thermo_tof, ms1 = ms_order==2&NCE==0? + # In case that: ms1 = ms_order==2&NCE==0? if isolation_center <= 0 and mono_mz <= 0: precursor_mz_values.append(-1.0) isolation_mz_lowers.append(-1.0) @@ -87,6 +90,7 @@ def _import(self, 'isolation_mz_lower': np.array(isolation_mz_lowers), 'isolation_mz_upper': np.array(isolation_mz_uppers), 'ms_level': np.array(ms_order_list, dtype=np.int8), + 'nce': np.array(ce_list, dtype=np.float32), } def _set_dataframes(self, raw_data:dict): From 7a7f5355e7e2e097bf4998503c5886a60d9aa308 Mon Sep 17 00:00:00 2001 From: "Zeng, Wen-Feng" Date: Mon, 23 Jan 2023 00:04:15 +0100 Subject: [PATCH 3/9] mono mz determine --- alpharaw/thermo.py | 11 +++++++---- nbdev_nbs/match/match_utils.ipynb | 14 +++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/alpharaw/thermo.py b/alpharaw/thermo.py index 242404e..e37945e 100644 --- a/alpharaw/thermo.py +++ b/alpharaw/thermo.py @@ -52,27 +52,30 @@ def _import(self, ms_order = rawfile.GetMSOrderForScanNum(i) ms_order_list.append(ms_order) if ms_order == 1: + ce_list.append(0) precursor_mz_values.append(-1.0) isolation_mz_lowers.append(-1.0) isolation_mz_uppers.append(-1.0) precursor_charges.append(0) - ce_list.append(0) else: + ce_list.append(rawfile.GetCollisionEnergyForScanNum(i)) + isolation_center = rawfile.GetPrecursorMassForScanNum(i) isolation_width = rawfile.GetIsolationWidthForScanNum(i) mono_mz, charge = rawfile.GetMS2MonoMzAndChargeFromScanNum(i) - ce_list.append(rawfile.GetCollisionEnergyForScanNum(i)) + if mono_mz <= 0: + mono_mz = isolation_center # In case that: ms1 = ms_order==2&NCE==0? - if isolation_center <= 0 and mono_mz <= 0: + if mono_mz <= 0: precursor_mz_values.append(-1.0) isolation_mz_lowers.append(-1.0) isolation_mz_uppers.append(-1.0) precursor_charges.append(0) ms_order_list[-1] = 1 else: - precursor_mz_values.append(isolation_center) + precursor_mz_values.append(mono_mz) isolation_mz_lowers.append(isolation_center - isolation_width / 2) isolation_mz_uppers.append(isolation_center + isolation_width / 2) precursor_charges.append(charge) diff --git a/nbdev_nbs/match/match_utils.ipynb b/nbdev_nbs/match/match_utils.ipynb index 4c6dfea..05ad149 100644 --- a/nbdev_nbs/match/match_utils.ipynb +++ b/nbdev_nbs/match/match_utils.ipynb @@ -79,12 +79,12 @@ "#| hide\n", "#unittests\n", "spec_masses = np.arange(10, dtype=float)*100\n", - "query_masses = spec_masses\n", + "query_masses = spec_masses[:5]\n", "Da_tols = query_masses*20*1e-6\n", "spec_intens = np.ones_like(spec_masses)\n", "idxes = match_highest_peaks(spec_masses, spec_intens, query_masses, Da_tols)\n", "assert np.all(\n", - " np.arange(10, dtype=np.int32)==\n", + " np.arange(5, dtype=np.int32)==\n", " idxes\n", ")" ] @@ -98,7 +98,7 @@ "#| hide\n", "#unittests\n", "spec_masses = np.arange(10, dtype=float)*100\n", - "query_masses = spec_masses.copy()\n", + "query_masses = spec_masses.copy()[:5]\n", "query_masses[1]+=0.01 # -1\n", "query_masses[3]+=0.003 # spec_masses[3]\n", "query_masses[5]-=0.0099 # spec_masses[5]\n", @@ -108,7 +108,7 @@ "Da_tols = query_masses*20*1e-6\n", "assert np.allclose(\n", " match_closest_peaks(spec_masses, 0, query_masses, Da_tols),\n", - " [0, -1, 2, 3, 4, 5, -1, -1, 7, 9]\n", + " [0, -1, 2, 3, 4]\n", ")" ] }, @@ -121,12 +121,12 @@ "#| hide\n", "#unittests\n", "spec_masses = np.arange(10, dtype=float)*100\n", - "query_masses = np.arange(20).reshape((10,2))*100\n", + "query_masses = np.arange(20).reshape((10,2))[:5]*100\n", "Da_tols = query_masses*20*1e-6\n", - "target = np.arange(20, dtype=np.int32)\n", + "target = np.arange(20, dtype=np.int32).reshape((10,2))[:5]\n", "target[10:] = -1\n", "assert np.allclose(\n", - " target.reshape((10,2)),\n", + " target,\n", " match_closest_peaks(spec_masses, 0, query_masses, Da_tols)\n", ")" ] From 6a4ba818b787a6bea26a407a7eeee87907a13bbf Mon Sep 17 00:00:00 2001 From: jalew188 Date: Sun, 29 Jan 2023 21:42:39 +0100 Subject: [PATCH 4/9] FIX a few bugs --- alpharaw/__init__.py | 13 ++-- alpharaw/legacy_msdata/mzml.py | 1 - alpharaw/ms_data_base.py | 27 ++++++- alpharaw/mzml.py | 97 ++++++++++++++++++++++++++ alpharaw/sciex.py | 2 +- alpharaw/thermo.py | 2 +- alpharaw/wrappers/alphatims_wrapper.py | 34 +++++++++ 7 files changed, 166 insertions(+), 10 deletions(-) delete mode 100644 alpharaw/legacy_msdata/mzml.py create mode 100644 alpharaw/mzml.py diff --git a/alpharaw/__init__.py b/alpharaw/__init__.py index 544e92d..dc3c315 100644 --- a/alpharaw/__init__.py +++ b/alpharaw/__init__.py @@ -1,14 +1,15 @@ #!python -try: +def register_all_readers(): from .ms_data_base import ms_reader_provider - from .sciex import SciexWiffData - from .thermo import ThermoRawData from .legacy_msdata import mgf - from .legacy_msdata import mzml + from . import mzml from .wrappers import alphapept_wrapper -except ImportError: - pass + try: + from .sciex import SciexWiffData + from .thermo import ThermoRawData + except (RuntimeError, ImportError): + return "[WARN] pythonnet is not installed" __project__ = "alpharaw" __version__ = "0.1.0" diff --git a/alpharaw/legacy_msdata/mzml.py b/alpharaw/legacy_msdata/mzml.py deleted file mode 100644 index 56f74e6..0000000 --- a/alpharaw/legacy_msdata/mzml.py +++ /dev/null @@ -1 +0,0 @@ -from ..ms_data_base import MSData_Base diff --git a/alpharaw/ms_data_base.py b/alpharaw/ms_data_base.py index d175734..66cd204 100644 --- a/alpharaw/ms_data_base.py +++ b/alpharaw/ms_data_base.py @@ -6,7 +6,32 @@ class MSData_Base: """ The base data structure for MS Data, other MSData loader inherit """ - def __init__(self, centroided:bool=True): + + spectrum_df: pd.DataFrame + """ + Spectrum dataframe containing the following columns: + + - `rt` (float64): in minutes + - `precursor_mz` (float64): mono_mz (DDA) or isolation center mz + - `isolation_lower_mz` (float64): left of the isolation window + - `isolation_upper_mz` (float64): right of the isolation window + - `spec_idx` (int64): spectrum index. For thermo, it is `scan_num - 1` + - `peak_start_idx` (int64): peak start position pointing to `self.peak_df` + - `peak_stop_idx` (int64): peak stop position pointing to `self._peak_df` + - `ms_level` (int8): =1 for MS1, =2 for MS2, ... + - [`scan_num`] (int64): Thermo scan number + - [`mobility`] (float64): Bruker timsTOF mobility + + Other columns depends on different vendors and file formats + """ + + peak_df: pd.DataFrame + """ + Peak list dataframe containing the follow columns: + - `mz` (float64): m/z values of the peak + - `intensity` (float32): intensity values of the peak + """ + def __init__(self, centroided:bool=True, **kwargs): """ Parameters ---------- diff --git a/alpharaw/mzml.py b/alpharaw/mzml.py new file mode 100644 index 0000000..bbea474 --- /dev/null +++ b/alpharaw/mzml.py @@ -0,0 +1,97 @@ +import numpy as np +import re +import datetime +import pathlib + +from pyteomics import mzml + +from .ms_data_base import MSData_Base + +# def parse_mzml_item(item_dict: dict) -> tuple: +# rt = float(item_dict.get('scanList').get('scan')[0].get('scan start time')) +# masses = item_dict.get('m/z array') +# intensities = item_dict.get('intensity array') +# ms_level = item_dict.get('ms level') +# prec_mz = -1.0 +# isolation_lower_mz = -1.0 +# isolation_upper_mz = -1.0 +# charge = 0 +# if ms_level == 2: +# try: +# charge = int(item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( +# 'charge state')) +# except TypeError: +# charge = 0 +# try: +# charge = int(item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( +# 'charge state')) +# except TypeError: +# charge = 0 + +# prec_mz = item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( +# 'selected ion m/z') +# try: +# iso_lower = int(item_dict.get('precursorList').get('precursor')[0].get('isolation window lower offset').get( +# 'charge state')) +# iso_upper = int(item_dict.get('precursorList').get('precursor')[0].get('isolation window upper offset').get( +# 'charge state')) +# isolation_upper_mz = prec_mz + iso_upper +# isolation_lower_mz = prec_mz - iso_lower +# except TypeError: +# isolation_upper_mz = prec_mz + 1.5 +# isolation_lower_mz = prec_mz - 1.5 + +# return ( +# rt, prec_mz, isolation_lower_mz, isolation_upper_mz, +# ms_level, charge, masses, intensities +# ) + +# class MSFraggerMzML_Data(MSData_Base): +# def _import(self, +# filename: str, +# ): +# reader = mzml.read(filename, use_index=True) +# spec_indices = np.array(range(1, len(reader) + 1)) + +# scan_list = [] +# rt_list = [] +# mzs_list = [] +# intens_list = [] +# ms_level_list = [] +# prec_mz_list = [] +# charge_list = [] + +# self.file_type = 'unknown' + +# for idx, i in enumerate(spec_indices): +# spec = next(reader) +# if idx == 0: +# ext = re.findall(r"File:\".+\.(\w+)\"", spec['spectrum title'])[0] +# if ext.lower() == 'raw': +# self.file_type = "thermo" + +# scan_list.append(i) +# rt, prec_mz, ms_level, charge, masses, intensities = parse_mzml_item(spec) + +# sortindex = np.argsort(masses) + +# masses = masses[sortindex] +# intensities = intensities[sortindex] + +# rt_list.append(rt) + +# #Remove zero intensities +# to_keep = intensities>0 +# masses = masses[to_keep] +# intensities = intensities[to_keep] + +# mzs_list.append(masses) +# intens_list.append(intensities) +# ms_level_list.append(ms_level) +# prec_mz_list.append(prec_mz) +# charge_list.append(charge) + +# fname = pathlib.Path(filename) +# acquisition_date_time = datetime.datetime.fromtimestamp(fname.stat().st_mtime).strftime('%Y-%m-%dT%H:%M:%S') + +# return acquisition_date_time \ No newline at end of file diff --git a/alpharaw/sciex.py b/alpharaw/sciex.py index 6df4916..a68f006 100644 --- a/alpharaw/sciex.py +++ b/alpharaw/sciex.py @@ -10,7 +10,7 @@ class SciexWiffData(MSData_Base): """ Loading Sciex Wiff data as MSData_Base data structure. """ - def __init__(self, centroided:bool=True): + def __init__(self, centroided:bool=True, **kwargs): """ Parameters ---------- diff --git a/alpharaw/thermo.py b/alpharaw/thermo.py index e37945e..87d4f13 100644 --- a/alpharaw/thermo.py +++ b/alpharaw/thermo.py @@ -9,7 +9,7 @@ class ThermoRawData(MSData_Base): """ Loading Thermo Raw data as MSData_Base data structure. """ - def __init__(self, centroided:bool=True): + def __init__(self, centroided:bool=True, **kwargs): """ Parameters ---------- diff --git a/alpharaw/wrappers/alphatims_wrapper.py b/alpharaw/wrappers/alphatims_wrapper.py index eadcf69..8ade13e 100644 --- a/alpharaw/wrappers/alphatims_wrapper.py +++ b/alpharaw/wrappers/alphatims_wrapper.py @@ -6,6 +6,40 @@ from ..ms_data_base import MSData_Base +class AlphaTimsReader(MSData_Base): + """ + > TimsTOF data are too large, do not use this class + """ + def import_raw(self, burker_d_folder:str): + tims = TimsTOF( + burker_d_folder, + ) + + self.raw_file_path = burker_d_folder + self.file_type = 'bruker' + + self.spectrum_df + + self.spectrum_df['precursor_mz'] = tims.fragment_frames.IsolationMz.values + isolations = tims.fragment_frames.IsolationWidth.values/2 + self.spectrum_df['isolation_lower_mz'] = self.spectrum_df.precursor_mz - isolations + self.spectrum_df['isolation_upper_mz'] = self.spectrum_df.precursor_mz + isolations + self.spectrum_df['peak_start_idx'] = tims._push_indptr[:-1] + self.spectrum_df['peak_stop_idx'] = tims._push_indptr[1:] + self.spectrum_df['rt'] = tims.rt_values/60 + self.spectrum_df['mobility'] = tims.mobility_values + self.spectrum_df['ms_level'] = ( + tims.precursor_indices>0 + ).astype(np.int8)+1 + self.spectrum_df['spec_idx'] = self.spectrum_df.index.values + self.spectrum_df['tims_frame'] = self.spectrum_df.spec_idx // tims.scan_max_index + self.spectrum_df['tims_scan'] = self.spectrum_df.spec_idx % tims.scan_max_index + + self.peak_df['mz'] = tims.mz_values[tims.tof_indices] + self.peak_df['intensity'] = tims.intensity_values.astype(np.float32) + + + class AlphaTimsWrapper(TimsTOF): """Create a AlphaTims object that contains all data in-memory (or memory mapping). From 095bbda5792e262252a29a814b0a50d326c48028 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Sun, 29 Jan 2023 22:21:10 +0100 Subject: [PATCH 5/9] add AlphaPept bruker FF (not done) --- alpharaw/bruker/ap_ff.py | 165 ++++++++++++++++++++++++++++++++++ alpharaw/mzml.py | 12 +-- requirements/requirements.txt | 1 + 3 files changed, 172 insertions(+), 6 deletions(-) create mode 100644 alpharaw/bruker/ap_ff.py diff --git a/alpharaw/bruker/ap_ff.py b/alpharaw/bruker/ap_ff.py new file mode 100644 index 0000000..a2e0379 --- /dev/null +++ b/alpharaw/bruker/ap_ff.py @@ -0,0 +1,165 @@ +import pandas as pd +import numpy as np +import sqlalchemy as db + +import subprocess +import os +import platform + +from tqdm import tqdm + + +def extract_bruker(file:str, base_dir:str = "ext/bruker/FF", config:str = "proteomics_4d.config"): + """Call Bruker Feautre Finder via subprocess. + + Args: + file (str): Filename for feature finding. + base_dir (str, optional): Base dir where the feature finder is stored.. Defaults to "ext/bruker/FF". + config (str, optional): Config file for feature finder. Defaults to "proteomics_4d.config". + + Raises: + NotImplementedError: Unsupported operating system. + FileNotFoundError: Feature finder not found. + FileNotFoundError: Config file not found. + FileNotFoundError: Feature file not found. + """ + feature_path = file + '/'+ os.path.split(file)[-1] + '.features' + + base_dir = os.path.join(os.path.dirname(__file__), base_dir) + + operating_system = platform.system() + + if operating_system == 'Linux': + ff_dir = os.path.join(base_dir, 'linux64','uff-cmdline2') + print('Using Linux FF') + elif operating_system == 'Windows': + ff_dir = os.path.join(base_dir, 'win64','uff-cmdline2.exe') + print('Using Windows FF') + else: + raise NotImplementedError(f"System {operating_system} not supported.") + + if os.path.exists(feature_path): + return feature_path + else: + if not os.path.isfile(ff_dir): + raise FileNotFoundError(f'Bruker feature finder cmd not found here {ff_dir}.') + + config_path = base_dir + '/'+ config + + if not os.path.isfile(config_path): + raise FileNotFoundError(f'Config file not found here {config_path}.') + + if operating_system == 'Windows': + FF_parameters = [ff_dir,'--ff 4d',f'--readconfig "{config_path}"', f'--analysisDirectory "{file}"'] + + process = subprocess.Popen(' '.join(FF_parameters), stdout=subprocess.PIPE) + for line in iter(process.stdout.readline, b''): + logtxt = line.decode('utf8') + print(logtxt[48:].rstrip()) #Remove logging info from FF + elif operating_system == 'Linux': + FF_parameters = [ + ff_dir, + '--ff', + '4d', + '--readconfig', + config_path, + '--analysisDirectory', + file + ] + process = subprocess.run(FF_parameters, stdout=subprocess.PIPE) + + if os.path.exists(feature_path): + return feature_path + else: + raise FileNotFoundError(f"Feature file {feature_path} does not exist.") + +def convert_bruker(feature_path:str)->pd.DataFrame: + """Reads feature table and converts to feature table to be used with AlphaPept. + + Args: + feature_path (str): Path to the feature file from Bruker FF (.features-file). + + Returns: + pd.DataFrame: DataFrame containing features information. + """ + engine_featurefile = db.create_engine('sqlite:///{}'.format(feature_path)) + feature_table = pd.read_sql_table('LcTimsMsFeature', engine_featurefile) + feature_cluster_mapping = pd.read_sql_table('FeatureClusterMapping', engine_featurefile) + + # feature_table['Mass'] = feature_table['MZ'].values * feature_table['Charge'].values - feature_table['Charge'].values*M_PROTON + feature_table = feature_table.rename(columns={ + "MZ": "mz","Mass": "mass", "RT": "rt_apex", + "RT_lower":"rt_start", "RT_upper":"rt_end", + "Mobility": "mobility", "Mobility_lower": "mobility_lower", + "Mobility_upper": "mobility_upper", "Charge":"charge", + "Intensity":'ms1_int_sum_apex',"ClusterCount":'n_isotopes' + }) + feature_table['rt_apex'] = feature_table['rt_apex']/60 + feature_table['rt_start'] = feature_table['rt_start']/60 + feature_table['rt_end'] = feature_table['rt_end']/60 + + feature_cluster_mapping = feature_cluster_mapping.rename(columns={ + "FeatureId": "feature_id", "ClusterId": "cluster_id", + "Monoisotopic": "monoisotopic", "Intensity": "ms1_int_sum_apex" + }) + + return feature_table, feature_cluster_mapping + + +def map_bruker(feature_path:str, feature_table:pd.DataFrame, query_data:dict)->pd.DataFrame: + """Map Ms1 to Ms2 via Table FeaturePrecursorMapping from Bruker FF. + + Args: + feature_path (str): Path to the feature file from Bruker FF (.features-file). + feature_table (pd.DataFrame): Pandas DataFrame containing the features. + query_data (dict): Data structure containing the query data. + + Returns: + pd.DataFrame: DataFrame containing features information. + """ + engine_featurefile = db.create_engine('sqlite:///{}'.format(feature_path)) + + mapping = pd.read_sql_table('FeaturePrecursorMapping', engine_featurefile) + mapping = mapping.set_index('PrecursorId') + feature_table= feature_table.set_index('Id') + + + query_prec_id = query_data['prec_id'] + + #Now look up the feature for each precursor + + mass_matched = [] + mz_matched = [] + rt_matched = [] + query_idx = [] + f_idx = [] + + for idx, prec_id in tqdm(enumerate(query_prec_id)): + try: + f_id = mapping.loc[prec_id]['FeatureId'] + all_matches = feature_table.loc[f_id] + if type(f_id) == np.int64: + match = all_matches + mz_matched.append(match['mz']) + rt_matched.append(match['rt_apex']) + mass_matched.append(match['mass']) + query_idx.append(idx) + f_idx.append(match['FeatureId']) + + else: + for k in range(len(all_matches)): + match = all_matches.iloc[k] + mz_matched.append(match['mz']) + rt_matched.append(match['rt_apex']) + mass_matched.append(match['mass']) + query_idx.append(idx) + f_idx.append(match['FeatureId']) + + except KeyError: + pass + + features = pd.DataFrame(np.array([mass_matched, mz_matched, rt_matched, query_idx, f_idx]).T, columns = ['mass_matched', 'mz_matched', 'rt_matched', 'query_idx', 'feature_idx']) + + features['query_idx'] = features['query_idx'].astype('int') + + return features \ No newline at end of file diff --git a/alpharaw/mzml.py b/alpharaw/mzml.py index bbea474..05ee253 100644 --- a/alpharaw/mzml.py +++ b/alpharaw/mzml.py @@ -1,11 +1,11 @@ -import numpy as np -import re -import datetime -import pathlib +# import numpy as np +# import re +# import datetime +# import pathlib -from pyteomics import mzml +# from pyteomics import mzml -from .ms_data_base import MSData_Base +# from .ms_data_base import MSData_Base # def parse_mzml_item(item_dict: dict) -> tuple: # rt = float(item_dict.get('scanList').get('scan')[0].get('scan start time')) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 942f33f..bfa3c4e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -5,5 +5,6 @@ h5py numba pandas numpy +sqlalchemy alphabase \ No newline at end of file From 7588c533cab5363870c556f723089581cec95814 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Thu, 13 Apr 2023 14:13:58 +0200 Subject: [PATCH 6/9] CHROE --- alpharaw/__init__.py | 4 +- alpharaw/utils/centroiding.py | 39 ++++--------------- .../development.txt | 0 .../requirements.txt => requirements.txt | 0 setup.py | 6 +-- 5 files changed, 10 insertions(+), 39 deletions(-) rename requirements/requirements_development.txt => extra_requirements/development.txt (100%) rename requirements/requirements.txt => requirements.txt (100%) diff --git a/alpharaw/__init__.py b/alpharaw/__init__.py index dc3c315..8ba963d 100644 --- a/alpharaw/__init__.py +++ b/alpharaw/__init__.py @@ -1,6 +1,6 @@ #!python -def register_all_readers(): +def register_readers(): from .ms_data_base import ms_reader_provider from .legacy_msdata import mgf from . import mzml @@ -50,5 +50,5 @@ def register_all_readers(): # "Scientific paper": None, } __extra_requirements__ = { - "development": "requirements_development.txt", + "development": "extra_requirements/development.txt", } diff --git a/alpharaw/utils/centroiding.py b/alpharaw/utils/centroiding.py index 039bcf6..f075dad 100644 --- a/alpharaw/utils/centroiding.py +++ b/alpharaw/utils/centroiding.py @@ -3,57 +3,32 @@ import numpy as np from numba import njit from numba.typed import List -import numba @njit -def get_a_centroid_group_all_adjacent( - start_list, end_list, +def get_neighbors_start_stop( + peak_mzs:np.ndarray, + start_list, stop_list, start_idx:int, mz_tol:float, mz_diffs:np.ndarray, - int_grads:np.ndarray, ): for i in range(start_idx, len(mz_diffs)): if mz_diffs[i] > mz_tol: start_list.append(start_idx) - end_list.append(i) + stop_list.append(i+1) return i + 1 start_list.append(start_idx) - end_list.append(len(mz_diffs)) + stop_list.append(len(mz_diffs)) return len(mz_diffs)+1 -@njit -def get_a_centroid_group( - start_list, end_list, - start_idx:int, - mz_tol:float, - mz_diffs:np.ndarray, - int_grads:np.ndarray, -): - go_up = True - for i in range(start_idx, len(int_grads)): - if mz_diffs[i] > mz_tol: - start_list.append(start_idx) - end_list.append(i) - return i + 1 - elif int_grads[i] > 0 and not go_up: - start_list.append(start_idx) - end_list.append(i) - return i - elif go_up: - go_up = False - start_list.append(start_idx) - end_list.append(len(int_grads)) - return len(int_grads)+1 - @njit def get_peaks( mz_array:np.ndarray, int_array: np.ndarray, mz_tol:float, ) -> list: - start_list = List.empty_list(numba.int64) - end_list = List.empty_list(numba.int64) + start_list = List() + end_list = List() gradient = np.diff(int_array) mz_diffs = np.diff(mz_array) start = 0 diff --git a/requirements/requirements_development.txt b/extra_requirements/development.txt similarity index 100% rename from requirements/requirements_development.txt rename to extra_requirements/development.txt diff --git a/requirements/requirements.txt b/requirements.txt similarity index 100% rename from requirements/requirements.txt rename to requirements.txt diff --git a/setup.py b/setup.py index 1558820..81d1871 100644 --- a/setup.py +++ b/setup.py @@ -24,11 +24,7 @@ def get_requirements(): requirement_file_names = package2install.__extra_requirements__ requirement_file_names[""] = "requirements.txt" for extra, requirement_file_name in requirement_file_names.items(): - full_requirement_file_name = os.path.join( - "requirements", - requirement_file_name, - ) - with open(full_requirement_file_name) as requirements_file: + with open(requirement_file_name) as requirements_file: if extra != "": extra_stable = f"{extra}-stable" else: From 1f3573bee372197f5514a08f16b7dcc33fa4935a Mon Sep 17 00:00:00 2001 From: jalew188 Date: Fri, 16 Jun 2023 11:22:03 +0200 Subject: [PATCH 7/9] many updates --- alpharaw/__init__.py | 2 +- alpharaw/legacy_msdata/mgf.py | 11 +- alpharaw/match/match_utils.py | 45 +- alpharaw/match/psm_match.py | 40 +- alpharaw/match/psm_match_alphatims.py | 60 +- alpharaw/ms_data_base.py | 67 +- alpharaw/mzml.py | 235 ++++--- alpharaw/sciex.py | 4 +- alpharaw/thermo.py | 16 +- alpharaw/wrappers/alphapept_wrapper.py | 2 +- nbdev_nbs/match/match_utils.ipynb | 16 +- nbdev_nbs/match/psm_match.ipynb | 30 +- nbdev_nbs/test_mzml.ipynb | 894 +++++++++++++++++++++++++ nbs/test_tims_match.ipynb | 2 +- 14 files changed, 1233 insertions(+), 191 deletions(-) create mode 100644 nbdev_nbs/test_mzml.ipynb diff --git a/alpharaw/__init__.py b/alpharaw/__init__.py index 8ba963d..3257380 100644 --- a/alpharaw/__init__.py +++ b/alpharaw/__init__.py @@ -3,7 +3,7 @@ def register_readers(): from .ms_data_base import ms_reader_provider from .legacy_msdata import mgf - from . import mzml + from .mzml import MzMLReader from .wrappers import alphapept_wrapper try: from .sciex import SciexWiffData diff --git a/alpharaw/legacy_msdata/mgf.py b/alpharaw/legacy_msdata/mgf.py index e2e770e..732a5b5 100644 --- a/alpharaw/legacy_msdata/mgf.py +++ b/alpharaw/legacy_msdata/mgf.py @@ -2,7 +2,8 @@ from alpharaw.ms_data_base import ( index_ragged_list, MSData_Base, - ms_reader_provider + ms_reader_provider, + PEAK_MZ_DTYPE, PEAK_INTENSITY_DTYPE ) def read_until(file, until): @@ -73,8 +74,8 @@ def _import(self, _path:str): rt_list.append(RT) precursor_mz_list.append(precursor_mz) charge_list.append(charge) - masses_list.append(np.array(masses)) - intens_list.append(np.array(intens)) + masses_list.append(np.array(masses, dtype=PEAK_MZ_DTYPE)) + intens_list.append(np.array(intens, dtype=PEAK_INTENSITY_DTYPE)) if isinstance(_path, str): f.close() @@ -111,7 +112,7 @@ def _set_dataframes(self, raw_data:dict): charges = np.zeros(spec_num, np.int8) charges[spec_idxes] = raw_data['precursor_charge'] - self.set_peaks_by_cat_array( + self.set_peak_df_by_indexed_array( raw_data['peak_mz'], raw_data['peak_intensity'], start_idxes,end_idxes @@ -122,7 +123,7 @@ def _set_dataframes(self, raw_data:dict): self.set_precursor_mz( precursor_mzs ) - self.set_precursor_mz_windows( + self.set_isolation_mz_windows( mz_lowers,mz_uppers ) diff --git a/alpharaw/match/match_utils.py b/alpharaw/match/match_utils.py index 22a6217..c828483 100644 --- a/alpharaw/match/match_utils.py +++ b/alpharaw/match/match_utils.py @@ -36,11 +36,13 @@ def match_closest_peaks( Matched indices of spec_mzs, -1 means no peaks were matched. Same shape as query_mzs """ - query_left_mzs = query_mzs-query_mz_tols - query_right_mzs = query_mzs+query_mz_tols + mzs = query_mzs.reshape(-1) + query_mz_tols = query_mz_tols.reshape(-1) + query_left_mzs = mzs-query_mz_tols + query_right_mzs = mzs+query_mz_tols idxes = np.searchsorted(spec_mzs, query_left_mzs) - ret_indices = np.empty_like(query_mzs, dtype=np.int32) - for i,idx in np.ndenumerate(idxes): + ret_indices = np.empty_like(mzs, dtype=np.int32) + for i,idx in enumerate(idxes): min_merr = 1000000 closest_idx = -1 for _idx in range(idx, len(spec_mzs)): @@ -48,11 +50,11 @@ def match_closest_peaks( break elif spec_mzs[_idx] abs(spec_mzs[_idx]-query_mzs[i]): - min_merr = abs(spec_mzs[_idx]-query_mzs[i]) + elif min_merr > abs(spec_mzs[_idx]-mzs[i]): + min_merr = abs(spec_mzs[_idx]-mzs[i]) closest_idx = _idx ret_indices[i] = closest_idx - return ret_indices + return ret_indices.reshape(query_mzs.shape) @numba.njit @@ -85,11 +87,13 @@ def match_highest_peaks( Matched indices of spec_mzs, -1 means no peaks were matched. Same shape as query_mzs """ - query_left_mzs = query_mzs-query_mz_tols - query_right_mzs = query_mzs+query_mz_tols + mzs = query_mzs.reshape(-1) + query_mz_tols = query_mz_tols.reshape(-1) + query_left_mzs = mzs-query_mz_tols + query_right_mzs = mzs+query_mz_tols idxes = np.searchsorted(spec_mzs, query_left_mzs) - ret_indices = np.empty_like(query_mzs, dtype=np.int32) - for i,idx in np.ndenumerate(idxes): + ret_indices = np.empty_like(mzs, dtype=np.int32) + for i,idx in enumerate(idxes): highest = 0 highest_idx = -1 for _idx in range(idx, len(spec_mzs)): @@ -101,7 +105,7 @@ def match_highest_peaks( highest = spec_intens[_idx] highest_idx = _idx ret_indices[i] = highest_idx - return ret_indices + return ret_indices.reshape(query_mzs.shape) @numba.njit def match_profile_peaks( @@ -137,16 +141,18 @@ def match_profile_peaks( np.ndarray: matched last (right-most) indices, the shape is the same as query_mzs. -1 means no peaks are matched for the query mz. """ - query_left_mzs = query_mzs-query_mz_tols - query_right_mzs = query_mzs+query_mz_tols + mzs = query_mzs.reshape(-1) + query_mz_tols = query_mz_tols.reshape(-1) + query_left_mzs = mzs-query_mz_tols + query_right_mzs = mzs+query_mz_tols idxes = np.searchsorted(spec_mzs, query_left_mzs) first_indices = np.full_like( - query_mzs, -1, dtype=np.int32 + mzs, -1, dtype=np.int32 ) last_indices = np.full_like( - query_mzs, -1, dtype=np.int32 + mzs, -1, dtype=np.int32 ) - for i,idx in np.ndenumerate(idxes): + for i,idx in enumerate(idxes): for first_idx in range(idx, len(spec_mzs)): if spec_mzs[first_idx] 1: + elif self.find_k_nearest_ms2_by_im and self.tims_data.scan_max_index > 1: # AlphaTims without AlphaRaw for .d files im_slice = self.tims_data.scan_max_index-np.searchsorted( self.tims_data.mobility_values[::-1], im ) else: im_slice = slice( - im-self.im_win_to_slice_ms2/2, - im+self.im_win_to_slice_ms2/2 + im-self.im_tol_to_slice_ms2, + im+self.im_tol_to_slice_ms2 ) spec_df = self.tims_data[ rt_slice, im_slice, precursor_mz:precursor_mz ] - def find_nearest(array, val): - return np.argmin(np.abs(array-val)) + def find_k_nearest(array, val, k=3): + nearest = np.argmin(np.abs(array-val)) + if nearest <= k//2: + return slice(k) + elif nearest >= len(array)-k//2-1: + return slice(-k, None) + else: + return slice(nearest-k//2, nearest+k//2+1) if ( - self.find_closest_ms2_by_im and - im>0 and self.tims_data.scan_max_index==1 + self.find_k_nearest_ms2_by_im and + im>0 and self.tims_data.scan_max_index>1 ): # RAW from AlphaRaw, mobility===0 in AlphaTims wrapper obj - spec_idxes = spec_df.frame_indices.unique() - if len(spec_idxes) > 1: # im from psm - spec_idx = spec_idxes[ - find_nearest( - self.raw_data.spectrum_df.mobility.values[spec_idxes], im + scan_idxes = np.sort(spec_df.scan_indices.unique()) + if len(scan_idxes) > 1: # im from psm + scan_idxes = scan_idxes[ + find_k_nearest( + self.raw_data.spectrum_df.mobility.values[scan_idxes], + im, self.k_im_nearest ) ] - spec_df = spec_df[spec_df.frame_indices==spec_idx] + spec_df = spec_df[spec_df.scan_indices.isin(scan_idxes)] - if self.find_closest_ms2_by_rt: - rt_values = spec_df.rt_values.unique() + if self.find_k_nearest_ms2_by_rt: + rt_values = np.sort(spec_df.rt_values.unique()) if len(rt_values) > 1: - closest_rt = rt_values[find_nearest(rt_values, rt_sec)] - spec_df = spec_df[spec_df.rt_values==closest_rt] + closest_rts = rt_values[ + find_k_nearest(rt_values, rt_sec, self.k_rt_nearest) + ] + spec_df = spec_df[spec_df.rt_values.isin(closest_rts)] - return spec_df.sort_values('mz_values').reset_index(drop=True) + return spec_df def get_peaks( self, @@ -185,6 +196,7 @@ def get_peaks( np.ndarray: peak intensity values """ spec_df = self.get_peak_df(precursor_mz, rt, im) + spec_df = spec_df.sort_values('mz_values').reset_index(drop=True) return ( spec_df.mz_values.values, spec_df.intensity_values.values diff --git a/alpharaw/ms_data_base.py b/alpharaw/ms_data_base.py index 66cd204..132270b 100644 --- a/alpharaw/ms_data_base.py +++ b/alpharaw/ms_data_base.py @@ -1,6 +1,9 @@ import pandas as pd import numpy as np from alphabase.io.hdf import HDF_File +from alphabase.constants._const import ( + PEAK_MZ_DTYPE, PEAK_INTENSITY_DTYPE +) class MSData_Base: """ @@ -28,9 +31,15 @@ class MSData_Base: peak_df: pd.DataFrame """ Peak list dataframe containing the follow columns: - - `mz` (float64): m/z values of the peak - - `intensity` (float32): intensity values of the peak + - `mz` (PEAK_MZ_DTYPE in alphabase, float32 by default): m/z values of the peak + - `intensity` (PEAK_INTENSITY_DTYPE in alphabase, float32 by default): intensity values of the peak """ + + vocab: list = [ + "CID", "HCD", "ETD", "ECD", "EAD", "EXD", "UVPD", + "ETHCD", "ETCID", "EXCID", "NETD", + "IT", "FT", "TOF", + ] def __init__(self, centroided:bool=True, **kwargs): """ Parameters @@ -47,6 +56,16 @@ def __init__(self, centroided:bool=True, **kwargs): self.centroided = centroided self.creation_time = '' self.file_type = '' + self.instrument = 'none' + + def _get_term_id(self, terminology:str): + """ + Get terminology id from :data:`self.vocab`, -1 if not exist. + """ + try: + return self.vocab.index(terminology) + except ValueError: + return -1 @property def raw_file_path(self)->str: @@ -66,16 +85,20 @@ def load_raw(self, _path:str): self.import_raw(_path) def _save_meta_to_hdf(self, hdf:HDF_File): - hdf.ms_data.creation_time = self.creation_time - hdf.ms_data.raw_file_path = self.raw_file_path - hdf.ms_data.file_type = self.file_type - hdf.ms_data.centroided = self.centroided + hdf.ms_data.meta = { + "creation_time": self.creation_time, + "raw_file_path": self.raw_file_path, + "file_type": self.file_type, + "centroided": self.centroided, + "instrument": self.instrument, + } def _load_meta_from_hdf(self, hdf:HDF_File): - self.creation_time = hdf.ms_data.creation_time - self.raw_file_path = hdf.ms_data.raw_file_path - self.file_type = hdf.ms_data.file_type - self.centroided = hdf.ms_data.centroided + self.creation_time = hdf.ms_data.meta.creation_time + self.raw_file_path = hdf.ms_data.meta.raw_file_path + self.file_type = hdf.ms_data.meta.file_type + self.centroided = hdf.ms_data.meta.centroided + self.instrument = hdf.ms_data.meta.instrument def save_hdf(self, _path:str): hdf = HDF_File( @@ -123,6 +146,13 @@ def _check_df(self): self._check_rt() # self._check_mobility() self._check_precursor_mz() + self._check_peak_dtypes() + + def _check_peak_dtypes(self): + if self.peak_df.mz.dtype != PEAK_MZ_DTYPE: + self.peak_df.mz = self.peak_df.mz.astype(PEAK_MZ_DTYPE) + if self.peak_df.intensity.dtype != PEAK_INTENSITY_DTYPE: + self.peak_df.intensity = self.peak_df.intensity.astype(PEAK_INTENSITY_DTYPE) def _check_rt(self): assert 'rt' in self.spectrum_df.columns @@ -147,25 +177,25 @@ def create_spectrum_df(self, ) self.spectrum_df['spec_idx'] = self.spectrum_df.index.values - def set_peaks_by_cat_array(self, + def set_peak_df_by_indexed_array(self, mz_array:np.ndarray, intensity_array:np.ndarray, peak_start_indices:np.ndarray, peak_stop_indices:np.ndarray, ): self.peak_df = pd.DataFrame({ - 'mz': mz_array, - 'intensity': intensity_array + 'mz': mz_array.astype(PEAK_MZ_DTYPE), + 'intensity': intensity_array.astype(PEAK_INTENSITY_DTYPE) }) self.spectrum_df['peak_start_idx'] = peak_start_indices self.spectrum_df['peak_stop_idx'] = peak_stop_indices - def set_peaks_by_array_list(self, + def set_peak_df_by_array_list(self, mz_array_list:list, intensity_array_list:list, ): indices = index_ragged_list(mz_array_list) - self.set_peaks_by_cat_array( + self.set_peak_df_by_indexed_array( np.concatenate(mz_array_list), np.concatenate(intensity_array_list), indices[:-1], @@ -193,13 +223,16 @@ def add_column_in_spec_df(self, column_name ] = self.spectrum_df[column_name].astype(dtype) - def add_column_in_df_by_thermo_scan_num(self, + def add_column_in_df_by_scan_num(self, column_name:str, values:np.ndarray, scan_nums:np.ndarray, dtype:np.dtype=np.float64, na_value=np.nan, ): + """ + scan num starts from 1 not 0 + """ self.add_column_in_spec_df( column_name, values, scan_nums-1, @@ -225,7 +258,7 @@ def set_precursor_mz(self, spec_idxes, np.float64, -1.0 ) - def set_precursor_mz_windows(self, + def set_isolation_mz_windows(self, precursor_lower_mz_values:np.ndarray, precursor_upper_mz_values:np.ndarray, spec_idxes:np.ndarray=None, diff --git a/alpharaw/mzml.py b/alpharaw/mzml.py index 05ee253..97e2d49 100644 --- a/alpharaw/mzml.py +++ b/alpharaw/mzml.py @@ -1,97 +1,166 @@ -# import numpy as np -# import re -# import datetime -# import pathlib +import numpy as np -# from pyteomics import mzml +from pyteomics import mzml -# from .ms_data_base import MSData_Base +from .ms_data_base import ( + MSData_Base, PEAK_MZ_DTYPE, PEAK_INTENSITY_DTYPE +) -# def parse_mzml_item(item_dict: dict) -> tuple: -# rt = float(item_dict.get('scanList').get('scan')[0].get('scan start time')) -# masses = item_dict.get('m/z array') -# intensities = item_dict.get('intensity array') -# ms_level = item_dict.get('ms level') -# prec_mz = -1.0 -# isolation_lower_mz = -1.0 -# isolation_upper_mz = -1.0 -# charge = 0 -# if ms_level == 2: -# try: -# charge = int(item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( -# 'charge state')) -# except TypeError: -# charge = 0 -# try: -# charge = int(item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( -# 'charge state')) -# except TypeError: -# charge = 0 +class MzMLReader(MSData_Base): + def _import(self, + filename: str, + ): + self.file_type = 'mzml' + if isinstance(filename, str): + reader = mzml.read(filename, use_index=True) + else: + reader = filename + spec_indices = np.arange(len(reader),dtype=int) -# prec_mz = item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( -# 'selected ion m/z') -# try: -# iso_lower = int(item_dict.get('precursorList').get('precursor')[0].get('isolation window lower offset').get( -# 'charge state')) -# iso_upper = int(item_dict.get('precursorList').get('precursor')[0].get('isolation window upper offset').get( -# 'charge state')) -# isolation_upper_mz = prec_mz + iso_upper -# isolation_lower_mz = prec_mz - iso_lower -# except TypeError: -# isolation_upper_mz = prec_mz + 1.5 -# isolation_lower_mz = prec_mz - 1.5 + rt_list = [] + mzs_list = [] + intens_list = [] + ms_level_list = [] + prec_mz_list = [] + charge_list = [] + _peak_indices = [] + isolation_lower_mz_list = [] + isolation_upper_mz_list = [] + nce_list = [] -# return ( -# rt, prec_mz, isolation_lower_mz, isolation_upper_mz, -# ms_level, charge, masses, intensities -# ) + for i in spec_indices: + spec = next(reader) -# class MSFraggerMzML_Data(MSData_Base): -# def _import(self, -# filename: str, -# ): -# reader = mzml.read(filename, use_index=True) -# spec_indices = np.array(range(1, len(reader) + 1)) + ( + rt, prec_mz, isolation_lower_mz, isolation_upper_mz, + ms_level, nce, charge, masses, intensities + ) = parse_mzml_entry(spec) -# scan_list = [] -# rt_list = [] -# mzs_list = [] -# intens_list = [] -# ms_level_list = [] -# prec_mz_list = [] -# charge_list = [] - -# self.file_type = 'unknown' - -# for idx, i in enumerate(spec_indices): -# spec = next(reader) -# if idx == 0: -# ext = re.findall(r"File:\".+\.(\w+)\"", spec['spectrum title'])[0] -# if ext.lower() == 'raw': -# self.file_type = "thermo" - -# scan_list.append(i) -# rt, prec_mz, ms_level, charge, masses, intensities = parse_mzml_item(spec) + nce_list.append(nce) -# sortindex = np.argsort(masses) + sortindex = np.argsort(masses) -# masses = masses[sortindex] -# intensities = intensities[sortindex] + masses = masses[sortindex] + intensities = intensities[sortindex] -# rt_list.append(rt) + rt_list.append(rt) -# #Remove zero intensities -# to_keep = intensities>0 -# masses = masses[to_keep] -# intensities = intensities[to_keep] + #Remove zero intensities + to_keep = intensities>0 + masses = masses[to_keep] + intensities = intensities[to_keep] + + _peak_indices.append(len(masses)) -# mzs_list.append(masses) -# intens_list.append(intensities) -# ms_level_list.append(ms_level) -# prec_mz_list.append(prec_mz) -# charge_list.append(charge) + mzs_list.append(masses.astype(PEAK_MZ_DTYPE)) + intens_list.append(intensities.astype(PEAK_INTENSITY_DTYPE)) + ms_level_list.append(ms_level) + prec_mz_list.append(prec_mz) + charge_list.append(charge) + isolation_lower_mz_list.append(isolation_lower_mz) + isolation_upper_mz_list.append(isolation_upper_mz) -# fname = pathlib.Path(filename) -# acquisition_date_time = datetime.datetime.fromtimestamp(fname.stat().st_mtime).strftime('%Y-%m-%dT%H:%M:%S') + if isinstance(filename, str): + reader.close() -# return acquisition_date_time \ No newline at end of file + peak_indices = np.empty(len(spec_indices)+1, np.int64) + peak_indices[0] = 0 + peak_indices[1:] = np.cumsum(_peak_indices) + ret_dict = { + 'peak_indices': peak_indices, + 'peak_mz': np.concatenate(mzs_list), + 'peak_intensity': np.concatenate(intens_list), + 'rt': np.array(rt_list), + 'precursor_mz': np.array(prec_mz_list), + 'precursor_charge': np.array(charge_list, dtype=np.int8), + 'isolation_mz_lower': np.array(isolation_lower_mz_list), + 'isolation_mz_upper': np.array(isolation_upper_mz_list), + 'ms_level': np.array(ms_level_list, dtype=np.int8), + } + nce_list = np.array(nce_list, dtype=np.float32) + if np.any(np.isnan(nce_list)): + return ret_dict + ret_dict["nce"] = nce_list + return ret_dict + + def _set_dataframes(self, raw_data:dict): + self.create_spectrum_df(len(raw_data['rt'])) + self.set_peak_df_by_indexed_array( + raw_data['peak_mz'], + raw_data['peak_intensity'], + raw_data['peak_indices'][:-1], + raw_data['peak_indices'][1:], + ) + self.add_column_in_spec_df( + 'rt', raw_data['rt'] + ) + self.add_column_in_spec_df( + 'ms_level', raw_data['ms_level'], + dtype=np.int8 + ) + self.set_precursor_mz( + raw_data['precursor_mz'] + ) + self.add_column_in_spec_df( + 'charge', raw_data['precursor_charge'], + dtype=np.int8 + ) + self.set_isolation_mz_windows( + raw_data['isolation_mz_lower'], + raw_data['isolation_mz_upper'], + ) + if "nce" in raw_data: + self.add_column_in_spec_df( + "nce", raw_data["nce"], + dtype=np.float32, + ) + +def parse_mzml_entry(item_dict: dict) -> tuple: + rt = float(item_dict.get('scanList').get('scan')[0].get('scan start time')) + masses = item_dict.get('m/z array') + intensities = item_dict.get('intensity array') + ms_level = item_dict.get('ms level') + prec_mz = -1.0 + isolation_lower_mz = -1.0 + isolation_upper_mz = -1.0 + charge = 0 + nce = 0.0 + if ms_level == 2: + try: + charge = int(item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( + 'charge state')) + except TypeError: + charge = 0 + try: + charge = int(item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( + 'charge state')) + except TypeError: + charge = 0 + + prec_mz = item_dict.get('precursorList').get('precursor')[0].get('selectedIonList').get('selectedIon')[0].get( + 'selected ion m/z') + try: + iso_window = item_dict.get('precursorList').get('precursor')[0].get('isolationWindow') + iso_lower = float(iso_window.get('isolation window lower offset')) + iso_upper = float(iso_window.get('isolation window upper offset')) + isolation_upper_mz = prec_mz + iso_upper + isolation_lower_mz = prec_mz - iso_lower + except TypeError: + isolation_upper_mz = prec_mz + 1.5 + isolation_lower_mz = prec_mz - 1.5 + + nce = np.nan + try: + filter_string = item_dict.get("scanList").get("scan")[0].get["filter string"] + if "@hcd" in filter_string: + nce = float(filter_string.split("@hcd")[1].split(" ")[0]) + elif "@cid" in filter_string: + nce = float(filter_string.split("@cid")[1].split(" ")[0]) + else: + nce = np.nan + except: + nce = np.nan + return ( + rt, prec_mz, isolation_lower_mz, isolation_upper_mz, + ms_level, nce, charge, masses, intensities + ) \ No newline at end of file diff --git a/alpharaw/sciex.py b/alpharaw/sciex.py index a68f006..bdc80a8 100644 --- a/alpharaw/sciex.py +++ b/alpharaw/sciex.py @@ -46,7 +46,7 @@ def _import(self, def _set_dataframes(self, raw_data:dict): self.create_spectrum_df(len(raw_data['rt'])) - self.set_peaks_by_cat_array( + self.set_peak_df_by_indexed_array( raw_data['peak_mz'], raw_data['peak_intensity'], raw_data['peak_indices'][:-1], @@ -66,7 +66,7 @@ def _set_dataframes(self, raw_data:dict): 'charge', raw_data['precursor_charge'], dtype=np.int8 ) - self.set_precursor_mz_windows( + self.set_isolation_mz_windows( raw_data['isolation_mz_lower'], raw_data['isolation_mz_upper'], ) diff --git a/alpharaw/thermo.py b/alpharaw/thermo.py index 87d4f13..6206dcf 100644 --- a/alpharaw/thermo.py +++ b/alpharaw/thermo.py @@ -2,7 +2,9 @@ import pandas as pd import os import alpharaw.raw_access.pythermorawfilereader as pyrawfilereader -from .ms_data_base import MSData_Base +from .ms_data_base import ( + MSData_Base, PEAK_MZ_DTYPE, PEAK_INTENSITY_DTYPE +) from .ms_data_base import ms_reader_provider class ThermoRawData(MSData_Base): @@ -44,8 +46,8 @@ def _import(self, masses, intensities = rawfile.GetProfileMassListFromScanNum(i) else: masses, intensities = rawfile.GetCentroidMassListFromScanNum(i) - mz_values.append(masses) - intensity_values.append(intensities.astype(np.float32)) + mz_values.append(masses.astype(PEAK_MZ_DTYPE)) + intensity_values.append(intensities.astype(PEAK_INTENSITY_DTYPE)) _peak_indices.append(len(masses)) rt = rawfile.RTFromScanNum(i) rt_values.append(rt) @@ -98,7 +100,7 @@ def _import(self, def _set_dataframes(self, raw_data:dict): self.create_spectrum_df(len(raw_data['rt'])) - self.set_peaks_by_cat_array( + self.set_peak_df_by_indexed_array( raw_data['peak_mz'], raw_data['peak_intensity'], raw_data['peak_indices'][:-1], @@ -118,10 +120,14 @@ def _set_dataframes(self, raw_data:dict): 'charge', raw_data['precursor_charge'], dtype=np.int8 ) - self.set_precursor_mz_windows( + self.set_isolation_mz_windows( raw_data['isolation_mz_lower'], raw_data['isolation_mz_upper'], ) + self.add_column_in_spec_df( + "nce", raw_data["nce"], + dtype=np.float32, + ) ms_reader_provider.register_reader('thermo', ThermoRawData) ms_reader_provider.register_reader('thermo_raw', ThermoRawData) diff --git a/alpharaw/wrappers/alphapept_wrapper.py b/alpharaw/wrappers/alphapept_wrapper.py index a792601..76f6254 100644 --- a/alpharaw/wrappers/alphapept_wrapper.py +++ b/alpharaw/wrappers/alphapept_wrapper.py @@ -88,7 +88,7 @@ def _set_dataframes(self, _path): rt_values = np.zeros(spec_num) rt_values[spec_idxes] = hdf.Raw.MS2_scans.rt_list_ms2.values - self.set_peaks_by_cat_array( + self.set_peak_df_by_indexed_array( hdf.Raw.MS2_scans.mass_list_ms2.values, hdf.Raw.MS2_scans.int_list_ms2.values, start_idxes, end_idxes diff --git a/nbdev_nbs/match/match_utils.ipynb b/nbdev_nbs/match/match_utils.ipynb index 05ad149..35b4b8b 100644 --- a/nbdev_nbs/match/match_utils.ipynb +++ b/nbdev_nbs/match/match_utils.ipynb @@ -41,8 +41,8 @@ ], "source": [ "#| hide\n", - "spec_masses = np.array([100, 100.1, 199.9, 200.1, 300])\n", - "query_masses = np.array([10,100.0, 200, 300, 400])\n", + "spec_masses = np.array([100, 100.1, 199.9, 200.1, 300], dtype=np.float32)\n", + "query_masses = np.array([10,100.0, 200, 300, 400], dtype=np.float32)\n", "Da_tols = np.ones_like(query_masses)*0.2\n", "first_indices, last_indices = match_profile_peaks(\n", " spec_masses, 0, query_masses, Da_tols,\n", @@ -60,7 +60,7 @@ "source": [ "#| hide\n", "#unittests\n", - "spec_masses = np.arange(10, dtype=float)*100\n", + "spec_masses = np.arange(10, dtype=np.float32)*100\n", "query_masses = spec_masses\n", "Da_tols = query_masses*20*1e-6\n", "idxes = match_closest_peaks(spec_masses, 0, query_masses, Da_tols)\n", @@ -78,7 +78,7 @@ "source": [ "#| hide\n", "#unittests\n", - "spec_masses = np.arange(10, dtype=float)*100\n", + "spec_masses = np.arange(10, dtype=np.float32)*100\n", "query_masses = spec_masses[:5]\n", "Da_tols = query_masses*20*1e-6\n", "spec_intens = np.ones_like(spec_masses)\n", @@ -97,12 +97,10 @@ "source": [ "#| hide\n", "#unittests\n", - "spec_masses = np.arange(10, dtype=float)*100\n", + "spec_masses = np.arange(10, dtype=np.float32)*100\n", "query_masses = spec_masses.copy()[:5]\n", "query_masses[1]+=0.01 # -1\n", "query_masses[3]+=0.003 # spec_masses[3]\n", - "query_masses[5]-=0.0099 # spec_masses[5]\n", - "query_masses[6]+=0.02 #-1\n", "spec_masses[7] = spec_masses[8]-0.007 # matched[7] = -1\n", "spec_masses[8] += 0.01 # matched[8] = 7 as 7 is closer\n", "Da_tols = query_masses*20*1e-6\n", @@ -120,8 +118,8 @@ "source": [ "#| hide\n", "#unittests\n", - "spec_masses = np.arange(10, dtype=float)*100\n", - "query_masses = np.arange(20).reshape((10,2))[:5]*100\n", + "spec_masses = np.arange(10, dtype=np.float32)*100\n", + "query_masses = np.arange(20, dtype=np.float32).reshape((10,2))[:5]*100\n", "Da_tols = query_masses*20*1e-6\n", "target = np.arange(20, dtype=np.int32).reshape((10,2))[:5]\n", "target[10:] = -1\n", diff --git a/nbdev_nbs/match/psm_match.ipynb b/nbdev_nbs/match/psm_match.ipynb index 67229f1..6288708 100644 --- a/nbdev_nbs/match/psm_match.ipynb +++ b/nbdev_nbs/match/psm_match.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -50,14 +50,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 2/2 [00:03<00:00, 1.66s/it]\n" + "100%|██████████| 2/2 [00:01<00:00, 1.41it/s]\n" ] } ], @@ -251,14 +251,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 2/2 [00:00<00:00, 101.41it/s]\n" + "100%|██████████| 2/2 [00:00<00:00, 174.90it/s]\n" ] } ], @@ -278,14 +278,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 71.69it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 111.73it/s]\n" ] } ], @@ -322,6 +322,18 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" } }, "nbformat": 4, diff --git a/nbdev_nbs/test_mzml.ipynb b/nbdev_nbs/test_mzml.ipynb new file mode 100644 index 0000000..2b76529 --- /dev/null +++ b/nbdev_nbs/test_mzml.ipynb @@ -0,0 +1,894 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "mzml = \"/Users/wenfengzeng/Downloads/small.pwiz.1.1.mzML\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spec_idxpeak_start_idxpeak_stop_idxrtms_levelprecursor_mzchargeisolation_lower_mzisolation_upper_mz
000107390.0049351-1.000-1.00-1.00
1110739255540.0078971-1.000-1.00-1.00
2225554260390.0112182810.790810.29811.29
3326039270450.0228382837.340836.84837.84
4427045278820.0349252725.360724.86725.86
5527882285320.0486202558.870558.37559.37
6628532292940.0619232812.330811.83812.83
7729294373740.0750151-1.000-1.00-1.00
8837374542850.0777881-1.000-1.00-1.00
9954285548370.0812032810.750810.25811.25
101054837557780.0929032837.960837.46838.46
111155778564130.1048032644.060643.56644.56
121256413572050.1172152725.230724.73725.73
131357205578740.1300222559.190558.69559.69
141457874669940.1434521-1.000-1.00-1.00
151566994819220.1464081-1.000-1.00-1.00
161681922825010.1497552811.410810.91811.91
171782501834170.1614422837.360836.86837.86
181883417840870.1733702643.800643.30644.30
191984087847610.1866582558.940558.44559.44
202084761856520.2006952725.140724.64725.64
212185652946650.2136731-1.000-1.00-1.00
2222946651052420.2167471-1.000-1.00-1.00
23231052421058210.2200732810.840810.34811.34
24241058211067590.2329232837.420836.92837.92
25251067591075480.2447452674.640674.14675.14
26261075481082350.2591722643.740643.24644.24
27271082351091000.2726632725.360724.86725.86
28281091001197640.2854831-1.000-1.00-1.00
29291197641336630.2888981-1.000-1.00-1.00
30301336631346010.3037032837.390836.89837.89
31311346011352540.3156502643.800643.30644.30
32321352541359560.3285272558.750558.25559.25
33331359561365430.3429152882.450881.95882.95
34341365431453120.3585581-1.000-1.00-1.00
35351453121566120.3614281-1.000-1.00-1.00
36361566121571840.3647552810.730810.23811.23
37371571841582480.3765782837.350836.85837.85
38381582481589010.3886732643.730643.23644.23
39391589011597760.4019622725.680725.18726.18
40401597761605300.4151322674.700674.20675.20
41411605301782640.4284831-1.000-1.00-1.00
42421782641936140.4332221-1.000-1.00-1.00
43431936141942250.4365672810.820810.32811.32
44441942251952350.4483202837.780837.28838.28
45451952351959480.4605652674.840674.34675.34
46461959481966070.4731032558.900558.40559.40
47471966071972430.4872372882.540882.04883.04
\n", + "
" + ], + "text/plain": [ + " spec_idx peak_start_idx peak_stop_idx rt ms_level precursor_mz \\\n", + "0 0 0 10739 0.004935 1 -1.00 \n", + "1 1 10739 25554 0.007897 1 -1.00 \n", + "2 2 25554 26039 0.011218 2 810.79 \n", + "3 3 26039 27045 0.022838 2 837.34 \n", + "4 4 27045 27882 0.034925 2 725.36 \n", + "5 5 27882 28532 0.048620 2 558.87 \n", + "6 6 28532 29294 0.061923 2 812.33 \n", + "7 7 29294 37374 0.075015 1 -1.00 \n", + "8 8 37374 54285 0.077788 1 -1.00 \n", + "9 9 54285 54837 0.081203 2 810.75 \n", + "10 10 54837 55778 0.092903 2 837.96 \n", + "11 11 55778 56413 0.104803 2 644.06 \n", + "12 12 56413 57205 0.117215 2 725.23 \n", + "13 13 57205 57874 0.130022 2 559.19 \n", + "14 14 57874 66994 0.143452 1 -1.00 \n", + "15 15 66994 81922 0.146408 1 -1.00 \n", + "16 16 81922 82501 0.149755 2 811.41 \n", + "17 17 82501 83417 0.161442 2 837.36 \n", + "18 18 83417 84087 0.173370 2 643.80 \n", + "19 19 84087 84761 0.186658 2 558.94 \n", + "20 20 84761 85652 0.200695 2 725.14 \n", + "21 21 85652 94665 0.213673 1 -1.00 \n", + "22 22 94665 105242 0.216747 1 -1.00 \n", + "23 23 105242 105821 0.220073 2 810.84 \n", + "24 24 105821 106759 0.232923 2 837.42 \n", + "25 25 106759 107548 0.244745 2 674.64 \n", + "26 26 107548 108235 0.259172 2 643.74 \n", + "27 27 108235 109100 0.272663 2 725.36 \n", + "28 28 109100 119764 0.285483 1 -1.00 \n", + "29 29 119764 133663 0.288898 1 -1.00 \n", + "30 30 133663 134601 0.303703 2 837.39 \n", + "31 31 134601 135254 0.315650 2 643.80 \n", + "32 32 135254 135956 0.328527 2 558.75 \n", + "33 33 135956 136543 0.342915 2 882.45 \n", + "34 34 136543 145312 0.358558 1 -1.00 \n", + "35 35 145312 156612 0.361428 1 -1.00 \n", + "36 36 156612 157184 0.364755 2 810.73 \n", + "37 37 157184 158248 0.376578 2 837.35 \n", + "38 38 158248 158901 0.388673 2 643.73 \n", + "39 39 158901 159776 0.401962 2 725.68 \n", + "40 40 159776 160530 0.415132 2 674.70 \n", + "41 41 160530 178264 0.428483 1 -1.00 \n", + "42 42 178264 193614 0.433222 1 -1.00 \n", + "43 43 193614 194225 0.436567 2 810.82 \n", + "44 44 194225 195235 0.448320 2 837.78 \n", + "45 45 195235 195948 0.460565 2 674.84 \n", + "46 46 195948 196607 0.473103 2 558.90 \n", + "47 47 196607 197243 0.487237 2 882.54 \n", + "\n", + " charge isolation_lower_mz isolation_upper_mz \n", + "0 0 -1.00 -1.00 \n", + "1 0 -1.00 -1.00 \n", + "2 0 810.29 811.29 \n", + "3 0 836.84 837.84 \n", + "4 0 724.86 725.86 \n", + "5 0 558.37 559.37 \n", + "6 0 811.83 812.83 \n", + "7 0 -1.00 -1.00 \n", + "8 0 -1.00 -1.00 \n", + "9 0 810.25 811.25 \n", + "10 0 837.46 838.46 \n", + "11 0 643.56 644.56 \n", + "12 0 724.73 725.73 \n", + "13 0 558.69 559.69 \n", + "14 0 -1.00 -1.00 \n", + "15 0 -1.00 -1.00 \n", + "16 0 810.91 811.91 \n", + "17 0 836.86 837.86 \n", + "18 0 643.30 644.30 \n", + "19 0 558.44 559.44 \n", + "20 0 724.64 725.64 \n", + "21 0 -1.00 -1.00 \n", + "22 0 -1.00 -1.00 \n", + "23 0 810.34 811.34 \n", + "24 0 836.92 837.92 \n", + "25 0 674.14 675.14 \n", + "26 0 643.24 644.24 \n", + "27 0 724.86 725.86 \n", + "28 0 -1.00 -1.00 \n", + "29 0 -1.00 -1.00 \n", + "30 0 836.89 837.89 \n", + "31 0 643.30 644.30 \n", + "32 0 558.25 559.25 \n", + "33 0 881.95 882.95 \n", + "34 0 -1.00 -1.00 \n", + "35 0 -1.00 -1.00 \n", + "36 0 810.23 811.23 \n", + "37 0 836.85 837.85 \n", + "38 0 643.23 644.23 \n", + "39 0 725.18 726.18 \n", + "40 0 674.20 675.20 \n", + "41 0 -1.00 -1.00 \n", + "42 0 -1.00 -1.00 \n", + "43 0 810.32 811.32 \n", + "44 0 837.28 838.28 \n", + "45 0 674.34 675.34 \n", + "46 0 558.40 559.40 \n", + "47 0 882.04 883.04 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from alpharaw.mzml import MzMLReader\n", + "\n", + "reader = MzMLReader()\n", + "reader.import_raw(mzml)\n", + "reader.spectrum_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mzintensity
0204.7595831422.173584
1204.7598423215.493164
2204.7601013887.355957
3204.7603452843.165527
4204.760605582.906738
.........
1972381547.7767331.261027
1972391723.5192871.640921
1972401724.3239751.251971
1972411724.9913335.156138
1972421743.3612063.192361
\n", + "

197243 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " mz intensity\n", + "0 204.759583 1422.173584\n", + "1 204.759842 3215.493164\n", + "2 204.760101 3887.355957\n", + "3 204.760345 2843.165527\n", + "4 204.760605 582.906738\n", + "... ... ...\n", + "197238 1547.776733 1.261027\n", + "197239 1723.519287 1.640921\n", + "197240 1724.323975 1.251971\n", + "197241 1724.991333 5.156138\n", + "197242 1743.361206 3.192361\n", + "\n", + "[197243 rows x 2 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reader.peak_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/test_tims_match.ipynb b/nbs/test_tims_match.ipynb index ee03bf0..1a3ccac 100644 --- a/nbs/test_tims_match.ipynb +++ b/nbs/test_tims_match.ipynb @@ -952,7 +952,7 @@ ], "source": [ "tims_match.tolerance = 50\n", - "tims_match.rt_sec_win_to_slice_ms2 = 120\n", + "tims_match.rt_sec_tol_to_slice_ms2 = 60\n", "tims_psm_df, frag_mz_df, frag_int_df, frag_tol_df = tims_match.match_ms2_one_raw(tims_psm_df, True)\n", "tims_psm_df['tims_match'] = 0\n", "for i, (start,stop) in enumerate(tims_psm_df[['frag_start_idx','frag_stop_idx']].values):\n", From 527ebafc147515313a318e01eeefa11e4861f048 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Fri, 18 Aug 2023 13:14:23 +0200 Subject: [PATCH 8/9] updates from multiple OSs --- alpharaw/legacy_msdata/mgf.py | 60 ++++++---- alpharaw/ms_data_base.py | 87 +++++++++----- alpharaw/mzml.py | 32 ----- alpharaw/sciex.py | 27 ----- alpharaw/thermo.py | 31 ----- alpharaw/wrappers/alphapept_wrapper.py | 26 ++--- nbdev_nbs/legacy_msdata/mgf.ipynb | 136 +++++++++++----------- nbdev_nbs/match/psm_match.ipynb | 24 +--- nbdev_nbs/match/psm_match_alphatims.ipynb | 16 ++- nbdev_nbs/test_mzml.ipynb | 25 +--- 10 files changed, 201 insertions(+), 263 deletions(-) diff --git a/alpharaw/legacy_msdata/mgf.py b/alpharaw/legacy_msdata/mgf.py index 732a5b5..ef4dd48 100644 --- a/alpharaw/legacy_msdata/mgf.py +++ b/alpharaw/legacy_msdata/mgf.py @@ -34,13 +34,16 @@ def _import(self, _path:str): f = open(_path) else: f = _path - scanset = set() + scan_mz_dict = {} + scan_charge_dict = {} masses_list = [] intens_list = [] spec_idx_list = [] + scan_list = [] rt_list = [] precursor_mz_list = [] charge_list = [] + self._has_chimeras = False while True: line = f.readline() if not line: break @@ -64,12 +67,18 @@ def _import(self, _path:str): elif line.startswith('PEPMASS='): precursor_mz = float(line.split('=')[1]) elif line.startswith('CHARGE='): - charge = float(line.split('=')[1].strip()[:-1]) + charge = int(line.split('=')[1].strip()[:-1]) if not scan: title = find_line(lines, 'TITLE=') scan = parse_scan_from_TITLE(title) - if scan in scanset: continue - scanset.add(scan) + if scan in scan_mz_dict: + scan_mz_dict[scan].append(precursor_mz) + scan_charge_dict[scan].append(charge) + self._has_chimeras = True + continue + scan_mz_dict[scan] = [precursor_mz] + scan_charge_dict[scan] = [charge] + scan_list.append(scan) spec_idx_list.append(scan-1) rt_list.append(RT) precursor_mz_list.append(precursor_mz) @@ -79,7 +88,9 @@ def _import(self, _path:str): if isinstance(_path, str): f.close() - precursor_mz_list = np.array(precursor_mz_list) + if self._has_chimeras: + precursor_mz_list = [scan_mz_dict[scan] for scan in scan_list] + charge_list = [scan_charge_dict[scan] for scan in scan_list] return { 'peak_indices': index_ragged_list(masses_list), @@ -87,10 +98,9 @@ def _import(self, _path:str): 'peak_intensity': np.concatenate(intens_list), 'rt': np.array(rt_list), 'precursor_mz': precursor_mz_list, - 'isolation_mz_lower': precursor_mz_list-2, - 'isolation_mz_upper': precursor_mz_list+2, 'spec_idx': np.array(spec_idx_list, dtype=np.int64), - 'precursor_charge': np.array(charge_list, dtype=np.int8), + 'scan': np.array(scan_list, dtype=np.int64), + 'precursor_charge': charge_list, } def _set_dataframes(self, raw_data:dict): @@ -103,28 +113,28 @@ def _set_dataframes(self, raw_data:dict): end_idxes[spec_idxes] = raw_data['peak_indices'][1:] rt_values = np.zeros(spec_num) rt_values[spec_idxes] = raw_data['rt'] - precursor_mzs = np.zeros(spec_num) - precursor_mzs[spec_idxes] = raw_data['precursor_mz'] - mz_lowers = np.zeros(spec_num) - mz_lowers[spec_idxes] = raw_data['isolation_mz_lower'] - mz_uppers = np.zeros(spec_num) - mz_uppers[spec_idxes] = raw_data['isolation_mz_upper'] - charges = np.zeros(spec_num, np.int8) - charges[spec_idxes] = raw_data['precursor_charge'] + if self._has_chimeras: + precursor_mzs = [[]]*spec_num + charges = [[]]*spec_num + mz_vals = raw_data['precursor_mz'] + ch_vals = raw_data["precursor_charge"] + for i,idx in enumerate(spec_idxes): + precursor_mzs[idx] = mz_vals[i] + charges[idx] = ch_vals[i] + else: + precursor_mzs = np.zeros(spec_num) + precursor_mzs[spec_idxes] = raw_data['precursor_mz'] + charges = np.zeros(spec_num, np.int8) + charges[spec_idxes] = raw_data['precursor_charge'] + + self.spectrum_df["charge"] = charges + self.spectrum_df["precursor_mz"] = precursor_mzs self.set_peak_df_by_indexed_array( raw_data['peak_mz'], raw_data['peak_intensity'], start_idxes,end_idxes ) - self.add_column_in_spec_df('rt', rt_values) - self.add_column_in_spec_df('charge', charges) + self.add_column_in_spec_df_by_spec_idxes('rt', raw_data['rt'], spec_idxes, na_value=0) self.spectrum_df['ms_level'] = 2 - self.set_precursor_mz( - precursor_mzs - ) - self.set_isolation_mz_windows( - mz_lowers,mz_uppers - ) - ms_reader_provider.register_reader('mgf', MGFReader) diff --git a/alpharaw/ms_data_base.py b/alpharaw/ms_data_base.py index 132270b..50c98de 100644 --- a/alpharaw/ms_data_base.py +++ b/alpharaw/ms_data_base.py @@ -134,10 +134,45 @@ def _import(self, _path): f"{self.__class__} must implement `_import()`" ) - def _set_dataframes(self, raw_data): - raise NotImplementedError( - f"{self.__class__} must implement `_set_dataframes()`" + def _set_dataframes(self, raw_data:dict): + self.create_spectrum_df(len(raw_data['rt'])) + self.set_peak_df_by_indexed_array( + raw_data['peak_mz'], + raw_data['peak_intensity'], + raw_data['peak_indices'][:-1], + raw_data['peak_indices'][1:], + ) + self.spectrum_df["rt"] = raw_data['rt'] + self.spectrum_df["ms_level"] = np.array( + raw_data['ms_level'], dtype=np.int8 + ) + self.spectrum_df["precursor_mz"] = np.array( + raw_data['precursor_mz'], dtype=np.float64 + ) + + self.spectrum_df["charge"] = np.array( + raw_data['precursor_charge'], + dtype=np.int8 + ) + self.spectrum_df["isolation_lower_mz"] = np.array( + raw_data['isolation_mz_lower'], dtype=np.float64 + ) + self.spectrum_df["isolation_mz_upper"] = np.array( + raw_data['isolation_mz_upper'], dtype=np.float64 ) + if "nce" in raw_data: + self.spectrum_df["nce"] = np.array( + raw_data["nce"], + dtype=np.float32, + ) + if "fragmentation" in raw_data: + self.spectrum_df["fragmentation"] = np.array( + raw_data["fragmentation"] + ) + if "detector" in raw_data: + self.spectrum_df["detector"] = np.array( + raw_data["detector"] + ) def _read_creation_time(self, raw_data): pass @@ -202,26 +237,22 @@ def set_peak_df_by_array_list(self, indices[1:] ) - def add_column_in_spec_df(self, + def add_column_in_spec_df_by_spec_idxes(self, column_name:str, values:np.ndarray, - spec_idxes:np.ndarray = None, - dtype:np.dtype=np.float64, na_value=np.nan, + spec_idxes:np.ndarray, + dtype:np.dtype=np.float64, + na_value=np.nan, ): - if spec_idxes is None: - self.spectrum_df[ - column_name - ] = np.array(values, dtype=dtype) - else: - self.spectrum_df.loc[ - spec_idxes, column_name - ] = values - self.spectrum_df[column_name].fillna( - na_value, inplace=True - ) - self.spectrum_df[ - column_name - ] = self.spectrum_df[column_name].astype(dtype) + self.spectrum_df.loc[ + spec_idxes, column_name + ] = values + self.spectrum_df[column_name].fillna( + na_value, inplace=True + ) + self.spectrum_df[ + column_name + ] = self.spectrum_df[column_name].astype(dtype) def add_column_in_df_by_scan_num(self, column_name:str, @@ -233,7 +264,7 @@ def add_column_in_df_by_scan_num(self, """ scan num starts from 1 not 0 """ - self.add_column_in_spec_df( + self.add_column_in_spec_df_by_spec_idxes( column_name, values, scan_nums-1, dtype, na_value @@ -248,27 +279,27 @@ def get_peaks(self, spec_idx): self.peak_df.intensity.values[start:end], ) - def set_precursor_mz(self, + def set_precursor_mz_by_spec_idxes(self, precursor_mz_values:np.ndarray, - spec_idxes:np.ndarray=None, + spec_idxes:np.ndarray, ): - self.add_column_in_spec_df( + self.add_column_in_spec_df_by_spec_idxes( 'precursor_mz', precursor_mz_values, spec_idxes, np.float64, -1.0 ) - def set_isolation_mz_windows(self, + def set_isolation_mz_windows_by_spec_idxes(self, precursor_lower_mz_values:np.ndarray, precursor_upper_mz_values:np.ndarray, - spec_idxes:np.ndarray=None, + spec_idxes:np.ndarray, ): - self.add_column_in_spec_df( + self.add_column_in_spec_df_by_spec_idxes( 'isolation_lower_mz', precursor_lower_mz_values, spec_idxes, np.float64, -1.0 ) - self.add_column_in_spec_df( + self.add_column_in_spec_df_by_spec_idxes( 'isolation_upper_mz', precursor_upper_mz_values, spec_idxes, np.float64, -1.0 diff --git a/alpharaw/mzml.py b/alpharaw/mzml.py index 97e2d49..d258ca5 100644 --- a/alpharaw/mzml.py +++ b/alpharaw/mzml.py @@ -83,38 +83,6 @@ def _import(self, ret_dict["nce"] = nce_list return ret_dict - def _set_dataframes(self, raw_data:dict): - self.create_spectrum_df(len(raw_data['rt'])) - self.set_peak_df_by_indexed_array( - raw_data['peak_mz'], - raw_data['peak_intensity'], - raw_data['peak_indices'][:-1], - raw_data['peak_indices'][1:], - ) - self.add_column_in_spec_df( - 'rt', raw_data['rt'] - ) - self.add_column_in_spec_df( - 'ms_level', raw_data['ms_level'], - dtype=np.int8 - ) - self.set_precursor_mz( - raw_data['precursor_mz'] - ) - self.add_column_in_spec_df( - 'charge', raw_data['precursor_charge'], - dtype=np.int8 - ) - self.set_isolation_mz_windows( - raw_data['isolation_mz_lower'], - raw_data['isolation_mz_upper'], - ) - if "nce" in raw_data: - self.add_column_in_spec_df( - "nce", raw_data["nce"], - dtype=np.float32, - ) - def parse_mzml_entry(item_dict: dict) -> tuple: rt = float(item_dict.get('scanList').get('scan')[0].get('scan start time')) masses = item_dict.get('m/z array') diff --git a/alpharaw/sciex.py b/alpharaw/sciex.py index bdc80a8..096a199 100644 --- a/alpharaw/sciex.py +++ b/alpharaw/sciex.py @@ -43,33 +43,6 @@ def _import(self, self.creation_time = wiff_reader.wiffSample.Details.AcquisitionDateTime.ToString("O") wiff_reader.close() return data_dict - - def _set_dataframes(self, raw_data:dict): - self.create_spectrum_df(len(raw_data['rt'])) - self.set_peak_df_by_indexed_array( - raw_data['peak_mz'], - raw_data['peak_intensity'], - raw_data['peak_indices'][:-1], - raw_data['peak_indices'][1:], - ) - self.add_column_in_spec_df( - 'rt', raw_data['rt'] - ) - self.add_column_in_spec_df( - 'ms_level', raw_data['ms_level'], - dtype=np.int8 - ) - self.set_precursor_mz( - raw_data['precursor_mz'] - ) - self.add_column_in_spec_df( - 'charge', raw_data['precursor_charge'], - dtype=np.int8 - ) - self.set_isolation_mz_windows( - raw_data['isolation_mz_lower'], - raw_data['isolation_mz_upper'], - ) ms_reader_provider.register_reader('sciex', SciexWiffData) ms_reader_provider.register_reader('sciex_wiff', SciexWiffData) diff --git a/alpharaw/thermo.py b/alpharaw/thermo.py index 6206dcf..f8f774d 100644 --- a/alpharaw/thermo.py +++ b/alpharaw/thermo.py @@ -98,36 +98,5 @@ def _import(self, 'nce': np.array(ce_list, dtype=np.float32), } - def _set_dataframes(self, raw_data:dict): - self.create_spectrum_df(len(raw_data['rt'])) - self.set_peak_df_by_indexed_array( - raw_data['peak_mz'], - raw_data['peak_intensity'], - raw_data['peak_indices'][:-1], - raw_data['peak_indices'][1:], - ) - self.add_column_in_spec_df( - 'rt', raw_data['rt'] - ) - self.add_column_in_spec_df( - 'ms_level', raw_data['ms_level'], - dtype=np.int8 - ) - self.set_precursor_mz( - raw_data['precursor_mz'] - ) - self.add_column_in_spec_df( - 'charge', raw_data['precursor_charge'], - dtype=np.int8 - ) - self.set_isolation_mz_windows( - raw_data['isolation_mz_lower'], - raw_data['isolation_mz_upper'], - ) - self.add_column_in_spec_df( - "nce", raw_data["nce"], - dtype=np.float32, - ) - ms_reader_provider.register_reader('thermo', ThermoRawData) ms_reader_provider.register_reader('thermo_raw', ThermoRawData) diff --git a/alpharaw/wrappers/alphapept_wrapper.py b/alpharaw/wrappers/alphapept_wrapper.py index 76f6254..80c18d2 100644 --- a/alpharaw/wrappers/alphapept_wrapper.py +++ b/alpharaw/wrappers/alphapept_wrapper.py @@ -85,8 +85,6 @@ def _set_dataframes(self, _path): start_idxes[spec_idxes] = peak_indices[:-1] end_idxes = np.full(spec_num, -1, dtype=np.int64) end_idxes[spec_idxes] = peak_indices[1:] - rt_values = np.zeros(spec_num) - rt_values[spec_idxes] = hdf.Raw.MS2_scans.rt_list_ms2.values self.set_peak_df_by_indexed_array( hdf.Raw.MS2_scans.mass_list_ms2.values, @@ -94,25 +92,27 @@ def _set_dataframes(self, _path): start_idxes, end_idxes ) - self.add_column_in_spec_df( - 'rt', rt_values + self.add_column_in_spec_df_by_spec_idxes( + "rt", hdf.Raw.MS2_scans.rt_list_ms2.values, + spec_idxes, dtype=np.float64, na_value=0 ) self.spectrum_df['ms_level'] = 2 if hasattr(hdf.Raw.MS2_scans, 'mobility2'): - self.add_column_in_spec_df( - 'mobility', hdf.Raw.MS2_scans.mobility2.values + self.add_column_in_spec_df_by_spec_idxes( + 'mobility', + hdf.Raw.MS2_scans.mobility2.values, + spec_idxes, ) if hasattr(hdf.Raw.MS2_scans, 'mono_mzs2'): - precursor_mzs = np.zeros(spec_num) - precursor_mzs[spec_idxes] = hdf.Raw.MS2_scans.mono_mzs2.values - self.set_precursor_mz( - precursor_mzs - ) - self.set_precursor_mz_windows( - precursor_mzs-1.5, precursor_mzs+1.5 + self.add_column_in_spec_df_by_spec_idxes( + "precursor_mz", hdf.Raw.MS2_scans.mono_mzs2.values, + spec_idxes ) + # self.set_isolation_mz_windows( + # precursor_mzs-1.5, precursor_mzs+1.5 + # ) ms_reader_provider.register_reader('alphapept', AlphaPept_HDF_MS2_Reader) ms_reader_provider.register_reader('alphapept_hdf', AlphaPept_HDF_MS2_Reader) diff --git a/nbdev_nbs/legacy_msdata/mgf.ipynb b/nbdev_nbs/legacy_msdata/mgf.ipynb index ff40da5..adf706d 100644 --- a/nbdev_nbs/legacy_msdata/mgf.ipynb +++ b/nbdev_nbs/legacy_msdata/mgf.ipynb @@ -62,161 +62,151 @@ " \n", " \n", " spec_idx\n", + " charge\n", + " precursor_mz\n", " peak_start_idx\n", " peak_stop_idx\n", " rt\n", - " charge\n", " ms_level\n", - " precursor_mz\n", " isolation_lower_mz\n", " isolation_upper_mz\n", - " rt_sec\n", " \n", " \n", " \n", " \n", " 0\n", " 0\n", + " [3]\n", + " [272.276336]\n", " 0\n", " 62\n", " 0.016667\n", - " 3.0\n", " 2\n", - " 272.276336\n", - " 270.276336\n", - " 274.276336\n", - " 1.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 1\n", " 1\n", + " []\n", + " []\n", " -1\n", " -1\n", " 0.000000\n", - " 0.0\n", " 2\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 2\n", " 2\n", + " []\n", + " []\n", " -1\n", " -1\n", " 0.000000\n", - " 0.0\n", " 2\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 3\n", " 3\n", + " [2]\n", + " [287.427959]\n", " 62\n", " 119\n", " 0.033333\n", - " 2.0\n", " 2\n", - " 287.427959\n", - " 285.427959\n", - " 289.427959\n", - " 2.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 4\n", " 4\n", + " [2]\n", + " [287.427959]\n", " 119\n", " 121\n", " 0.050000\n", - " 2.0\n", " 2\n", - " 287.427959\n", - " 285.427959\n", - " 289.427959\n", - " 3.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 5\n", " 5\n", + " []\n", + " []\n", " -1\n", " -1\n", " 0.000000\n", - " 0.0\n", " 2\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 6\n", " 6\n", + " [2, 2]\n", + " [287.427959, 288.427959]\n", " 121\n", " 124\n", " 0.066667\n", - " 2.0\n", " 2\n", - " 287.427959\n", - " 285.427959\n", - " 289.427959\n", - " 4.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 7\n", " 7\n", + " []\n", + " []\n", " -1\n", " -1\n", " 0.000000\n", - " 0.0\n", " 2\n", - " 0.000000\n", - " 0.000000\n", - " 0.000000\n", - " 0.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", " 8\n", " 8\n", + " [2]\n", + " [287.427959]\n", " 124\n", " 125\n", " 0.083333\n", - " 2.0\n", " 2\n", - " 287.427959\n", - " 285.427959\n", - " 289.427959\n", - " 5.0\n", + " -1.0\n", + " -1.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " spec_idx peak_start_idx peak_stop_idx rt charge ms_level \\\n", - "0 0 0 62 0.016667 3.0 2 \n", - "1 1 -1 -1 0.000000 0.0 2 \n", - "2 2 -1 -1 0.000000 0.0 2 \n", - "3 3 62 119 0.033333 2.0 2 \n", - "4 4 119 121 0.050000 2.0 2 \n", - "5 5 -1 -1 0.000000 0.0 2 \n", - "6 6 121 124 0.066667 2.0 2 \n", - "7 7 -1 -1 0.000000 0.0 2 \n", - "8 8 124 125 0.083333 2.0 2 \n", + " spec_idx charge precursor_mz peak_start_idx peak_stop_idx \\\n", + "0 0 [3] [272.276336] 0 62 \n", + "1 1 [] [] -1 -1 \n", + "2 2 [] [] -1 -1 \n", + "3 3 [2] [287.427959] 62 119 \n", + "4 4 [2] [287.427959] 119 121 \n", + "5 5 [] [] -1 -1 \n", + "6 6 [2, 2] [287.427959, 288.427959] 121 124 \n", + "7 7 [] [] -1 -1 \n", + "8 8 [2] [287.427959] 124 125 \n", "\n", - " precursor_mz isolation_lower_mz isolation_upper_mz rt_sec \n", - "0 272.276336 270.276336 274.276336 1.0 \n", - "1 0.000000 0.000000 0.000000 0.0 \n", - "2 0.000000 0.000000 0.000000 0.0 \n", - "3 287.427959 285.427959 289.427959 2.0 \n", - "4 287.427959 285.427959 289.427959 3.0 \n", - "5 0.000000 0.000000 0.000000 0.0 \n", - "6 287.427959 285.427959 289.427959 4.0 \n", - "7 0.000000 0.000000 0.000000 0.0 \n", - "8 287.427959 285.427959 289.427959 5.0 " + " rt ms_level isolation_lower_mz isolation_upper_mz \n", + "0 0.016667 2 -1.0 -1.0 \n", + "1 0.000000 2 -1.0 -1.0 \n", + "2 0.000000 2 -1.0 -1.0 \n", + "3 0.033333 2 -1.0 -1.0 \n", + "4 0.050000 2 -1.0 -1.0 \n", + "5 0.000000 2 -1.0 -1.0 \n", + "6 0.066667 2 -1.0 -1.0 \n", + "7 0.000000 2 -1.0 -1.0 \n", + "8 0.083333 2 -1.0 -1.0 " ] }, "execution_count": null, @@ -380,6 +370,16 @@ "402.705571 402.705571\n", "END IONS\n", "BEGIN IONS\n", + "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.32733.32733.2.1.dta\n", + "CHARGE=2+\n", + "RTINSECONDS=4.0\n", + "SCAN=7\n", + "PEPMASS=288.427959\n", + "103.34669 5304.0\n", + "104.66884 5639.7\n", + "402.705571 402.705571\n", + "END IONS\n", + "BEGIN IONS\n", "TITLE=02445a_BA7-TUM_HLA_7_01_01-DDA-1h-R1.23669.23669.2.0.dta\n", "CHARGE=2+\n", "SCAN=9\n", diff --git a/nbdev_nbs/match/psm_match.ipynb b/nbdev_nbs/match/psm_match.ipynb index 6288708..fcd8c03 100644 --- a/nbdev_nbs/match/psm_match.ipynb +++ b/nbdev_nbs/match/psm_match.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -322,18 +322,6 @@ "display_name": "Python 3.8.3 ('base')", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" } }, "nbformat": 4, diff --git a/nbdev_nbs/match/psm_match_alphatims.ipynb b/nbdev_nbs/match/psm_match_alphatims.ipynb index 563d8fb..c271d2b 100644 --- a/nbdev_nbs/match/psm_match_alphatims.ipynb +++ b/nbdev_nbs/match/psm_match_alphatims.ipynb @@ -18,7 +18,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING:root:WARNING: Temp mmap arrays are written to /var/folders/hn/0f5pbk7j55sd4r5bxjp25_rm0000gn/T/temp_mmap_gb9j_x96. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 586,177,458,176\n", + "WARNING:root:WARNING: Temp mmap arrays are written to /var/folders/fh/hf8t3l1x02d42ggk3b304_rh0000gn/T/temp_mmap_v85nc51s. Cleanup of this folder is OS dependant, and might need to be triggered manually! Current space: 706,796,924,928\n", "WARNING:root:WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n" ] } @@ -47,7 +47,19 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 185\u001b[0m\n\u001b[1;32m 183\u001b[0m psm_df, mz_df, inten_df, merr_df \u001b[39m=\u001b[39m matching\u001b[39m.\u001b[39mmatch_ms2_one_raw(psm_df)\n\u001b[1;32m 184\u001b[0m \u001b[39m#np.sum(matching.matched_intensity_df.values!=0,axis=1)\u001b[39;00m\n\u001b[0;32m--> 185\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mlen\u001b[39m(merr_df\u001b[39m.\u001b[39mvalues[\u001b[39m~\u001b[39mnp\u001b[39m.\u001b[39misinf(merr_df\u001b[39m.\u001b[39mvalues)])\u001b[39m==\u001b[39m\u001b[39m3\u001b[39m\n\u001b[1;32m 186\u001b[0m \u001b[39massert\u001b[39;00m np\u001b[39m.\u001b[39mcount_nonzero(inten_df\u001b[39m.\u001b[39mvalues)\u001b[39m==\u001b[39m\u001b[39m3\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], "source": [ "#| hide\n", "#unittest\n", diff --git a/nbdev_nbs/test_mzml.ipynb b/nbdev_nbs/test_mzml.ipynb index 2b76529..e6881b1 100644 --- a/nbdev_nbs/test_mzml.ipynb +++ b/nbdev_nbs/test_mzml.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -731,7 +731,7 @@ "47 0 882.04 883.04 " ] }, - "execution_count": 2, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -746,7 +746,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -852,7 +852,7 @@ "[197243 rows x 2 columns]" ] }, - "execution_count": 3, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -874,20 +874,7 @@ "display_name": "base", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "orig_nbformat": 4 + } }, "nbformat": 4, "nbformat_minor": 2 From c1a2ee27dec1ff550088a5178d92092a46a93fe8 Mon Sep 17 00:00:00 2001 From: jalew188 Date: Fri, 18 Aug 2023 13:14:43 +0200 Subject: [PATCH 9/9] =?UTF-8?q?Bump=20version:=200.1.0=20=E2=86=92=200.1.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- alpharaw/__init__.py | 2 +- docs/conf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index b791d6e..1498aba 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0 +current_version = 0.1.1 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/alpharaw/__init__.py b/alpharaw/__init__.py index 3257380..ad9f9ef 100644 --- a/alpharaw/__init__.py +++ b/alpharaw/__init__.py @@ -12,7 +12,7 @@ def register_readers(): return "[WARN] pythonnet is not installed" __project__ = "alpharaw" -__version__ = "0.1.0" +__version__ = "0.1.1" __license__ = "Apache" __description__ = "An open-source Python package to unify raw MS data accession and storage." __author__ = "Mann Labs" diff --git a/docs/conf.py b/docs/conf.py index 55b9f56..1298339 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,7 +23,7 @@ copyright = '2022, Mann Labs, MPIB' author = 'Mann Labs, MPIB' -release = "0.1.0" +release = "0.1.1" # -- General configuration ---------------------------------------------------