53 docs base and thermo #55

Merged
merged 7 commits on Jun 14, 2024
190 changes: 159 additions & 31 deletions alpharaw/ms_data_base.py
@@ -6,7 +6,14 @@

class MSData_Base:
"""
The base data structure for MS Data, other MSData loader inherit
The base data structure for MS RAW Data, other MSData loaders inherit this class.

Parameters
----------
centroided : bool, optional
Whether to centroid the peak data after loading, by default True
save_as_hdf : bool, optional
Whether to automatically save the loaded data in HDF5 format, by default False
"""

column_dtypes = {
@@ -25,7 +32,7 @@ class MSData_Base:
"""
Spectrum dataframe containing the following columns:

- `rt` (float64): in minutes
- `rt` (float64): in minutes. An `rt_sec` column (RT in seconds) can be added but is not included by default.
- `precursor_mz` (float64): mono_mz (DDA) or isolation center mz
- `isolation_lower_mz` (float64): left of the isolation window
- `isolation_upper_mz` (float64): right of the isolation window
@@ -62,15 +69,13 @@
"FT",
"TOF",
]
"""
These spectrum information items in str format can be one-to-one mapped into
unique token IDs (indices), for example "CID"=0, "HCD"=1, ...
Token IDs are better suited for storage in HDF5 format.
"""

def __init__(self, centroided: bool = True, save_as_hdf: bool = False, **kwargs):
"""
Parameters
----------
centroided : bool, optional
if peaks will be centroided after loading,
by default True
"""
# A spectrum contains peaks
self.spectrum_df: pd.DataFrame = pd.DataFrame()
# A peak contains mz, intensity, and ...
@@ -82,9 +87,19 @@ def __init__(self, centroided: bool = True, save_as_hdf: bool = False, **kwargs)
self.file_type = ""
self.instrument = "none"

def _get_term_id(self, terminology: str):
def _get_term_id(self, terminology: str) -> int:
"""
Get terminology id from :data:`self.vocab`, -1 if not exist.
Get the terminology ID from :attr:`.MSData_Base.vocab`; return -1 if it does not exist.

Parameters
----------
terminology : str
The terminology name from :attr:`.MSData_Base.vocab`, such as "CID", "HCD", ...

Returns
-------
int
Terminology ID, which is the index in :attr:`.MSData_Base.vocab`.
"""
try:
return self.vocab.index(terminology)
@@ -96,20 +111,36 @@ def raw_file_path(self) -> str:
return self._raw_file_path

@raw_file_path.setter
def raw_file_path(self, _path: str):
self._raw_file_path = _path
def raw_file_path(self, raw_file_path: str):
self._raw_file_path = raw_file_path

def import_raw(self, _path: str):
self.raw_file_path = _path
raw_data = self._import(_path)
self._set_dataframes(raw_data)
def import_raw(self, raw_file_path: str):
"""
Import a raw file. It involves three steps:
```
raw_data_dict = self._import(raw_file_path)
self._set_dataframes(raw_data_dict)
self._check_df()
```

Parameters
----------
raw_file_path : str
Absolute or relative path of the raw file.
"""
self.raw_file_path = raw_file_path
raw_data_dict = self._import(raw_file_path)
self._set_dataframes(raw_data_dict)
self._check_df()

if self._save_as_hdf:
self.save_hdf(_path + ".hdf")
self.save_hdf(raw_file_path + ".hdf")

def load_raw(self, _path: str):
self.import_raw(_path)
def load_raw(self, raw_file_path: str):
"""
Wrapper of :func:`.MSData_Base.import_raw`.
"""
self.import_raw(raw_file_path)

def _save_meta_to_hdf(self, hdf: HDF_File):
hdf.ms_data.meta = {
@@ -127,15 +158,35 @@ def _load_meta_from_hdf(self, hdf: HDF_File):
self.centroided = hdf.ms_data.meta.centroided
self.instrument = hdf.ms_data.meta.instrument

def save_hdf(self, _path: str):
hdf = HDF_File(_path, read_only=False, truncate=True, delete_existing=True)
def save_hdf(self, hdf_file_path: str):
"""
Save data to an HDF5 file.

Parameters
----------
hdf_file_path : str
Absolute or relative path of HDF5 file.
"""
hdf = HDF_File(
hdf_file_path, read_only=False, truncate=True, delete_existing=True
)

hdf.ms_data = {"spectrum_df": self.spectrum_df, "peak_df": self.peak_df}

self._save_meta_to_hdf(hdf)

def load_hdf(self, _path: str):
hdf = HDF_File(_path, read_only=True, truncate=False, delete_existing=False)
def load_hdf(self, hdf_file_path: str):
"""
Load data from an HDF5 file.

Parameters
----------
hdf_file_path : str
Absolute or relative path of HDF5 file.
"""
hdf = HDF_File(
hdf_file_path, read_only=True, truncate=False, delete_existing=False
)

self.spectrum_df = hdf.ms_data.spectrum_df.values
self.peak_df = hdf.ms_data.peak_df.values
@@ -144,10 +195,43 @@ def load_hdf(self, _path: str):
self._load_meta_from_hdf(hdf)
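As a quick sketch of the save/load round trip (hedged: the file path is hypothetical, `pyrawfilereader` and a real Thermo .raw file are needed, and importing `alpharaw.thermo` is assumed to register the reader, as its docstring states):

```
from alpharaw.ms_data_base import MSData_HDF, ms_reader_provider
import alpharaw.thermo  # noqa: F401, importing is assumed to register the Thermo reader

# Hypothetical path, replace with a real Thermo raw file.
raw_path = "example.raw"

ms_data = ms_reader_provider.get_reader("thermo")
ms_data.import_raw(raw_path)          # fills spectrum_df and peak_df

ms_data.save_hdf(raw_path + ".hdf")   # writes spectrum_df, peak_df and meta data

hdf_data = MSData_HDF()
hdf_data.load_hdf(raw_path + ".hdf")  # restores the same dataframes
print(hdf_data.spectrum_df.head())
```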

def reset_spec_idxes(self):
"""
Reset spectrum indexes to ensure that `spec_idx` values are continuous, running from 0 to N-1.
"""
self.spectrum_df.reset_index(drop=True, inplace=True)
self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

def _import(self, _path):
def _import(self, _path: str) -> dict:
"""
Parameters
----------
_path : str
Path of raw file.

Returns
-------
dict
Example:
```
spec_dict = {
"_peak_indices": _peak_indices,
"peak_mz": np.concatenate(mz_values).copy(),
"peak_intensity": np.concatenate(intensity_values).copy(),
"rt": np.array(rt_values).copy(),
"precursor_mz": np.array(precursor_mz_values).copy(),
"precursor_charge": np.array(precursor_charges, dtype=np.int8).copy(),
"isolation_lower_mz": np.array(isolation_mz_lowers).copy(),
"isolation_upper_mz": np.array(isolation_mz_uppers).copy(),
"ms_level": np.array(ms_order_list, dtype=np.int8).copy(),
"nce": np.array(ce_list, dtype=np.float32).copy(),
}
```

Raises
------
NotImplementedError
Subclasses of `MSData_Base` must implement this method.
"""
raise NotImplementedError(f"{self.__class__} must implement `_import()`")
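A minimal sketch of what a subclass could return from `_import` (the reader class and hand-made peak arrays are hypothetical; whether every key of the example dict is required depends on `_set_dataframes`, which is collapsed in this diff):

```
import numpy as np

from alpharaw.ms_data_base import MSData_Base


class MyTextData(MSData_Base):  # hypothetical reader, for illustration only
    def _import(self, _path: str) -> dict:
        # Pretend two spectra were parsed, with 3 and 2 peaks respectively.
        mz_values = [np.array([100.0, 200.0, 300.0]), np.array([150.0, 250.0])]
        intensity_values = [np.array([10.0, 20.0, 30.0]), np.array([40.0, 50.0])]
        # Peak start/stop offsets per spectrum; presumably what index_ragged_list builds.
        _peak_indices = np.cumsum([0] + [len(mz) for mz in mz_values])
        return {
            "_peak_indices": _peak_indices,
            "peak_mz": np.concatenate(mz_values),
            "peak_intensity": np.concatenate(intensity_values),
            "rt": np.array([0.5, 1.0]),                  # minutes
            "ms_level": np.array([1, 2], dtype=np.int8),
        }
```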

def _set_dataframes(self, raw_data: dict):
@@ -200,6 +284,14 @@ def create_spectrum_df(
self,
spectrum_num: int,
):
"""
Create an empty spectrum dataframe with the given number of spectra.

Parameters
----------
spectrum_num : int
The number of spectra.
"""
self.spectrum_df = pd.DataFrame(index=np.arange(spectrum_num, dtype=np.int64))
self.spectrum_df["spec_idx"] = self.spectrum_df.index.values

@@ -345,6 +437,12 @@ def index_ragged_list(ragged_list: list) -> np.ndarray:


class MSData_HDF(MSData_Base):
"""
Reader for alpharaw's own HDF5 spectrum files.
This class is registered as "alpharaw", "raw.hdf", "alpharaw_hdf", "hdf" and "hdf5"
in the :data:`ms_reader_provider` instance.
"""

def import_raw(self, _path: str):
self.raw_file_path = _path
self.load_hdf(_path)
@@ -356,20 +454,50 @@ class MSReaderProvider:
def __init__(self):
self.ms_reader_dict = {}

def register_reader(self, ms2_type: str, reader_class):
self.ms_reader_dict[ms2_type.lower()] = reader_class
def register_reader(self, ms_file_type: str, reader_class: type):
"""
Register a new reader for `ms_file_type` format with `reader_class`.

Parameters
----------
ms_file_type : str
An MS file type supported by AlphaRaw.
reader_class : type
The reader class (e.g. a subclass of :class:`MSData_Base`) to register for this file type.
"""
self.ms_reader_dict[ms_file_type.lower()] = reader_class

def get_reader(
self, file_type: str, *, centroided: bool = True, **kwargs
self, ms_file_type: str, *, centroided: bool = True, **kwargs
) -> MSData_Base:
file_type = file_type.lower()
if file_type not in self.ms_reader_dict:
"""
Get the MS reader for the given `ms_file_type`.

Parameters
----------
ms_file_type : str
An MS file type supported by AlphaRaw.
centroided : bool, optional
Whether to centroid the data, by default True.

Returns
-------
MSData_Base
An instance of the corresponding subclass of `MSData_Base`, or None if `ms_file_type` is not registered.
"""
ms_file_type = ms_file_type.lower()
if ms_file_type not in self.ms_reader_dict:
return None
else:
return self.ms_reader_dict[file_type](centroided=centroided, **kwargs)
return self.ms_reader_dict[ms_file_type](centroided=centroided, **kwargs)


ms_reader_provider = MSReaderProvider()
"""
MS reader registry (:class:`.MSReaderProvider`) that acts as a factory,
producing different readers for different file formats.
"""

ms_reader_provider.register_reader("alpharaw", MSData_HDF)
ms_reader_provider.register_reader("raw.hdf", MSData_HDF)
ms_reader_provider.register_reader("alpharaw_hdf", MSData_HDF)
69 changes: 34 additions & 35 deletions alpharaw/thermo.py
@@ -14,6 +14,7 @@
ms_reader_provider,
)

#: These Thermo spectrum items can only be accessed via the trailer dict using the RawFileReader APIs.
__trailer_extra_list__ = [
"injection_time",
"cv",
@@ -24,6 +25,8 @@
"funnel_rf_level",
"faims_cv",
]

#: The auxiliary items and their dtypes that can be accessed from the Thermo RawFileReader.
__auxiliary_item_dtypes__ = {
"injection_time": np.float32,
"cv": np.float32,
@@ -47,6 +50,23 @@
class ThermoRawData(MSData_Base):
"""
Loading Thermo Raw data as MSData_Base data structure.
This class is registered as "thermo" and "thermo_raw" in :data:`ms_reader_provider`.

Parameters
----------
centroided : bool, optional
Whether to centroid peaks after loading, by default True.
process_count : int, optional
Number of processes to use for loading, by default 10.
mp_batch_size : int, optional
Number of spectra to load in each batch, by default 5000.
save_as_hdf : bool, optional
Whether to automatically save the data into HDF5 format after loading, by default False.
dda : bool, optional
Whether the data is DDA (data-dependent acquisition), by default False.
auxiliary_items : list, optional
Additional spectrum items; candidates are listed in :data:`__auxiliary_item_dtypes__`.
By default [].
"""

def __init__(
@@ -59,34 +79,6 @@ def __init__(
auxiliary_items: list = [],
**kwargs,
):
"""
Parameters
----------
centroided : bool, default = True
if peaks will be centroided after loading,
by default True

process_count : int, default = 8
number of processes to use for loading

mp_batch_size : int, default = 10000
number of spectra to load in each batch

save_as_hdf : bool, default = False
automatically save hdf after load raw data.

dda : bool, default = False
is DDA data

auxiliary_items : list, default = []
Candidates are:
"injection_time", "cv",
"max_ion_time", "agc_target", "energy_ev",
"injection_optics_settling_time",
"funnel_rf_level", "faims_cv",
"detector", "activation", "analyzer",
"detector_id", "activation_id", "analyzer_id",
"""
super().__init__(centroided, save_as_hdf=save_as_hdf, **kwargs)
self.file_type = "thermo"
self.process_count = process_count
@@ -99,6 +91,19 @@ def _import(
self,
raw_file_path: str,
) -> dict:
"""
Implementation of :func:`MSData_Base._import`, enabling :func:`.MSData_Base.import_raw` for Thermo raw files.

Parameters
----------
raw_file_path : str
File path of the raw data.

Returns
-------
dict
Spectrum information in a temporary dict format.
"""
rawfile = pyrawfilereader.RawFileReader(raw_file_path)
self.creation_time = rawfile.GetCreationDate()

@@ -185,13 +190,7 @@ def _import_batch(
is dda data.

auxiliary_items : list
Candidates:
"injection_time", "cv",
"max_ion_time", "agc_target", "energy_ev",
"injection_optics_settling_time",
"funnel_rf_level", "faims_cv",
"activation", "analyzer",
"activation_id", "analyzer_id",
Candidates are in :data:`__auxiliary_item_dtypes__`.

Returns
-------
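To tie the Thermo pieces together, a hedged sketch of loading a raw file with a couple of auxiliary items (the path is hypothetical, `pyrawfilereader` and a real .raw file are required, and the requested auxiliary items are assumed to end up as columns of `spectrum_df`):

```
from alpharaw.thermo import ThermoRawData

reader = ThermoRawData(
    centroided=True,
    dda=True,
    auxiliary_items=["injection_time", "cv"],  # candidates are in __auxiliary_item_dtypes__
)
reader.import_raw("20240614_hela_qc.raw")  # hypothetical file path
print(reader.spectrum_df[["rt", "precursor_mz", "injection_time", "cv"]].head())
```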