diff --git a/alpharaw/__init__.py b/alpharaw/__init__.py
index 29b5952..8b6b502 100644
--- a/alpharaw/__init__.py
+++ b/alpharaw/__init__.py
@@ -1,5 +1,8 @@
 #!python
 
+import warnings
+warnings.filterwarnings("ignore")
+
 def register_readers():
     from .ms_data_base import ms_reader_provider
     from .legacy_msdata import mgf
diff --git a/alpharaw/ms_data_base.py b/alpharaw/ms_data_base.py
index f10f40d..3d1778a 100644
--- a/alpharaw/ms_data_base.py
+++ b/alpharaw/ms_data_base.py
@@ -40,7 +40,12 @@ class MSData_Base:
         "ETHCD", "ETCID", "EXCID", "NETD",
         "IT", "FT", "TOF",
     ]
-    def __init__(self, centroided:bool=True, **kwargs):
+    def __init__(
+        self,
+        centroided:bool=True,
+        save_as_hdf:bool=False,
+        **kwargs
+    ):
         """
         Parameters
         ----------
@@ -54,6 +59,7 @@ def __init__(self, centroided:bool=True, **kwargs):
         self.peak_df:pd.DataFrame = pd.DataFrame()
         self._raw_file_path = ''
         self.centroided = centroided
+        self.save_as_hdf = save_as_hdf
         self.creation_time = ''
         self.file_type = ''
         self.instrument = 'none'
@@ -81,6 +87,9 @@ def import_raw(self, _path:str):
         self._set_dataframes(raw_data)
         self._check_df()
 
+        if self.save_as_hdf:
+            self.save_hdf(_path+'.hdf')
+
     def load_raw(self, _path:str):
         self.import_raw(_path)
 
@@ -123,7 +132,8 @@ def load_hdf(self, _path:str):
         self.spectrum_df = hdf.ms_data.spectrum_df.values
         self.peak_df = hdf.ms_data.peak_df.values
 
-        self._load_meta_from_hdf(hdf)
+        if hasattr(hdf.ms_data, "meta"):
+            self._load_meta_from_hdf(hdf)
 
     def reset_spec_idxes(self):
         self.spectrum_df.reset_index(drop=True, inplace=True)
@@ -173,6 +183,10 @@ def _set_dataframes(self, raw_data:dict):
             self.spectrum_df["detector"] = np.array(
                 raw_data["detector"]
             )
+        if "injection_time" in raw_data:
+            self.spectrum_df["injection_time"] = np.array(
+                raw_data["injection_time"]
+            )
 
     def _read_creation_time(self, raw_data):
         pass
diff --git a/alpharaw/raw_access/pythermorawfilereader.py b/alpharaw/raw_access/pythermorawfilereader.py
index 690e236..c0390af 100644
--- a/alpharaw/raw_access/pythermorawfilereader.py
+++ b/alpharaw/raw_access/pythermorawfilereader.py
@@ -522,3 +522,13 @@ def GetCentroidMassListFromScanNum(self, scanNumber):
             DotNetArrayToNPArray(segmentedScan.Positions),
             DotNetArrayToNPArray(segmentedScan.Intensities)
         )
+
+    def GetInjectionTimeForScanNum(self, scanNumber):
+        """Return the ion injection time (ms) from the trailer of scan `scanNumber` as a float.
+
+        This function is only valid for MS controllers. Raises KeyError when the
+        scan trailer has no 'Ion Injection Time (ms):' entry.
+        """
+        trailer = self.source.GetTrailerExtraInformation(scanNumber)
+        trailer_dict = {trailer.Labels[i]: trailer.Values[i] for i in range(trailer.Length)}
+        return float(trailer_dict['Ion Injection Time (ms):'])
diff --git a/alpharaw/sciex.py b/alpharaw/sciex.py
index 39f10f1..096a199 100644
--- a/alpharaw/sciex.py
+++ b/alpharaw/sciex.py
@@ -43,10 +43,6 @@ def _import(self,
         self.creation_time = wiff_reader.wiffSample.Details.AcquisitionDateTime.ToString("O")
         wiff_reader.close()
         return data_dict
-
-    def import_raw(self, _path: str):
-        super().import_raw(_path)
-        self.save_hdf(_path+".hdf")
 
 ms_reader_provider.register_reader('sciex', SciexWiffData)
 ms_reader_provider.register_reader('sciex_wiff', SciexWiffData)
diff --git a/alpharaw/thermo.py b/alpharaw/thermo.py
index 1659b0e..5751d73 100644
--- a/alpharaw/thermo.py
+++ b/alpharaw/thermo.py
@@ -50,8 +50,10 @@ def _import_batch(
     precursor_charges = []
     ms_order_list = []
     ce_list = []
+    injection_time_list = []
     cv_list = []
 
+
     for i in range(
         start,
         stop
@@ -67,6 +69,7 @@ def _import_batch(
         rt_values.append(rt)
         ms_order = rawfile.GetMSOrderForScanNum(i)
         ms_order_list.append(ms_order)
+        injection_time_list.append(rawfile.GetInjectionTimeForScanNum(i))
 
         if ms_order == 1:
             ce_list.append(0)
@@ -104,17 +107,20 @@ def _import_batch(
             precursor_charges.append(charge)
 
     rawfile.Close()
+    # copies of the numpy arrays are needed to move them explicitly to the cpython heap,
+    # otherwise mono might interfere later
     return {
         '_peak_indices': _peak_indices,
-        'peak_mz': np.concatenate(mz_values),
-        'peak_intensity': np.concatenate(intensity_values),
-        'rt': np.array(rt_values),
-        'precursor_mz': np.array(precursor_mz_values),
-        'precursor_charge': np.array(precursor_charges, dtype=np.int8),
-        'isolation_lower_mz': np.array(isolation_mz_lowers),
-        'isolation_upper_mz': np.array(isolation_mz_uppers),
-        'ms_level': np.array(ms_order_list, dtype=np.int8),
-        'nce': np.array(ce_list, dtype=np.float32),
+        'peak_mz': np.concatenate(mz_values).copy(),
+        'peak_intensity': np.concatenate(intensity_values).copy(),
+        'rt': np.array(rt_values).copy(),
+        'precursor_mz': np.array(precursor_mz_values).copy(),
+        'precursor_charge': np.array(precursor_charges, dtype=np.int8).copy(),
+        'isolation_lower_mz': np.array(isolation_mz_lowers).copy(),
+        'isolation_upper_mz': np.array(isolation_mz_uppers).copy(),
+        'ms_level': np.array(ms_order_list, dtype=np.int8).copy(),
+        'nce': np.array(ce_list, dtype=np.float32).copy(),
+        'injection_time': np.array(injection_time_list, dtype=np.float32).copy(),
         'cv': np.array(cv_list, dtype=np.float32),
     }
 class ThermoRawData(MSData_Base):
@@ -124,7 +130,7 @@ class ThermoRawData(MSData_Base):
     def __init__(self,
         centroided : bool = True,
         process_count : int = 10,
-        mp_batch_size : int = 10000,
+        mp_batch_size : int = 5000,
         **kwargs):
         """
         Parameters
@@ -139,7 +145,7 @@ def __init__(self,
-        mp_batch_size : int, default = 10000
+        mp_batch_size : int, default = 5000
             number of spectra to load in each batch
         """
-        super().__init__(centroided)
+        super().__init__(centroided, **kwargs)
         self.file_type = 'thermo'
         self.process_count = process_count
         self.mp_batch_size = mp_batch_size
@@ -154,17 +160,15 @@ def _import(self,
         first_spectrum_number = rawfile.FirstSpectrumNumber
         last_spectrum_number = rawfile.LastSpectrumNumber
 
-        if platform.system() != 'Linux':
-            batches = np.arange(first_spectrum_number, last_spectrum_number+1, self.mp_batch_size)
-            batches = np.append(batches, last_spectrum_number+1)
-
-            # use multiprocessing to load batches
-            _import_batch_partial = partial(_import_batch, raw_file_path, self.centroided)
-            with mp.get_context("spawn").Pool(processes = self.process_count) as pool:
-                batches = list(tqdm(pool.imap(_import_batch_partial, zip(batches[:-1], batches[1:]))))
+        mode = 'spawn' if platform.system() != 'Linux' else 'forkserver'
+
+        batches = np.arange(first_spectrum_number, last_spectrum_number+1, self.mp_batch_size)
+        batches = np.append(batches, last_spectrum_number+1)
 
-        else:
-            batches = [_import_batch(raw_file_path, self.centroided, (first_spectrum_number, last_spectrum_number+1))]
+        # use multiprocessing to load batches
+        _import_batch_partial = partial(_import_batch, raw_file_path, self.centroided)
+        with mp.get_context(mode).Pool(processes = self.process_count) as pool:
+            batches = list(tqdm(pool.imap(_import_batch_partial, zip(batches[:-1], batches[1:]))))
 
         # collect peak indices
         _peak_indices = np.concatenate([batch['_peak_indices'] for batch in batches])
@@ -184,45 +188,6 @@ def _import(self,
 
         return output_dict
 
-    def _set_dataframes(self, raw_data:dict):
-        self.create_spectrum_df(len(raw_data['rt']))
-        self.set_peak_df_by_indexed_array(
-            raw_data['peak_mz'],
-            raw_data['peak_intensity'],
-            raw_data['peak_indices'][:-1],
-            raw_data['peak_indices'][1:],
-        )
-        self.add_column_in_spec_df(
-            'rt', raw_data['rt']
-        )
-        self.add_column_in_spec_df(
-            'ms_level', raw_data['ms_level'],
-            dtype=np.int8
-        )
-        self.set_precursor_mz(
-            raw_data['precursor_mz']
-        )
-        self.add_column_in_spec_df(
-            'charge', raw_data['precursor_charge'],
-            dtype=np.int8
-        )
-        self.set_isolation_mz_windows(
-            raw_data['isolation_mz_lower'],
-            raw_data['isolation_mz_upper'],
-        )
-        self.add_column_in_spec_df(
-            "nce", raw_data["nce"],
-            dtype=np.float32,
-        )
-        self.add_column_in_spec_df(
-            "cv", raw_data["cv"],
-            dtype=np.float32,
-        )
-
-
-    def import_raw(self, _path: str):
-        super().import_raw(_path)
-        self.save_hdf(_path+".hdf")
 
 ms_reader_provider.register_reader('thermo', ThermoRawData)
 ms_reader_provider.register_reader('thermo_raw', ThermoRawData)