Updated handling on linux #20

Merged: 11 commits, Dec 5, 2023
3 changes: 3 additions & 0 deletions alpharaw/__init__.py
@@ -1,5 +1,8 @@
#!python

import warnings
warnings.filterwarnings("ignore")

def register_readers():
from .ms_data_base import ms_reader_provider
from .legacy_msdata import mgf
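Note on the change above: calling warnings.filterwarnings("ignore") at package import time silences every Python warning for the whole process, not just warnings raised by alpharaw. A minimal sketch (not part of this PR) of how a filter can instead be scoped to one category and one block:

import warnings

with warnings.catch_warnings():
    # suppress only DeprecationWarning, and only inside this block
    warnings.simplefilter("ignore", category=DeprecationWarning)
    warnings.warn("suppressed inside the block", DeprecationWarning)

warnings.warn("visible again outside the block", UserWarning)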
18 changes: 16 additions & 2 deletions alpharaw/ms_data_base.py
@@ -40,7 +40,12 @@ class MSData_Base:
"ETHCD", "ETCID", "EXCID", "NETD",
"IT", "FT", "TOF",
]
def __init__(self, centroided:bool=True, **kwargs):
def __init__(
self,
centroided:bool=True,
save_as_hdf:bool=False,
**kwargs
):
"""
Parameters
----------
@@ -54,6 +59,7 @@ def __init__(self, centroided:bool=True, **kwargs):
self.peak_df:pd.DataFrame = pd.DataFrame()
self._raw_file_path = ''
self.centroided = centroided
self.save_as_hdf = save_as_hdf
self.creation_time = ''
self.file_type = ''
self.instrument = 'none'
@@ -81,6 +87,9 @@ def import_raw(self, _path:str):
self._set_dataframes(raw_data)
self._check_df()

if self.save_as_hdf:
self.save_hdf(_path+'.hdf')

def load_raw(self, _path:str):
self.import_raw(_path)

@@ -123,7 +132,8 @@ def load_hdf(self, _path:str):
self.spectrum_df = hdf.ms_data.spectrum_df.values
self.peak_df = hdf.ms_data.peak_df.values

self._load_meta_from_hdf(hdf)
if hasattr(hdf.ms_data, "meta"):
self._load_meta_from_hdf(hdf)

def reset_spec_idxes(self):
self.spectrum_df.reset_index(drop=True, inplace=True)
@@ -173,6 +183,10 @@ def _set_dataframes(self, raw_data:dict):
self.spectrum_df["detector"] = np.array(
raw_data["detector"]
)
if "injection_time" in raw_data:
self.spectrum_df["injection_time"] = np.array(
raw_data["injection_time"]
)

def _read_creation_time(self, raw_data):
pass
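Taken together, the ms_data_base.py changes make the HDF side effect opt-in: import_raw() writes "<raw path>.hdf" only when save_as_hdf=True, and load_hdf() touches the meta group only when it exists, so older HDF files without it still load. A minimal usage sketch, assuming the Thermo reader and hypothetical file paths:

from alpharaw.thermo import ThermoRawData

raw_path = "/data/run01.raw"                    # hypothetical path
reader = ThermoRawData(centroided=True, save_as_hdf=True)
reader.import_raw(raw_path)                     # also writes /data/run01.raw.hdf

reader2 = ThermoRawData()
reader2.load_hdf(raw_path + ".hdf")             # meta is loaded only if the group exists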
7 changes: 7 additions & 0 deletions alpharaw/raw_access/pythermorawfilereader.py
@@ -522,3 +522,10 @@ def GetCentroidMassListFromScanNum(self, scanNumber):
DotNetArrayToNPArray(segmentedScan.Positions),
DotNetArrayToNPArray(segmentedScan.Intensities)
)

def GetInjectionTimeForScanNum(self, scanNumber):
"""Returns the recorded injection time for the current controller. This function is only valid for
MS controllers."""
trailer = self.source.GetTrailerExtraInformation(scanNumber)
trailer_dict = {trailer.Labels[i]: trailer.Values[i] for i in range(trailer.Length)}
return float(trailer_dict['Ion Injection Time (ms):'])
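GetInjectionTimeForScanNum() builds a label-to-value dict from the scan's trailer extra information and parses the 'Ion Injection Time (ms):' entry. As written, the lookup raises KeyError for files whose trailer lacks that field; a defensive variant (not part of this PR) could fall back to NaN:

import math

def injection_time_from_trailer(trailer_dict: dict) -> float:
    """trailer_dict maps trailer labels to their string values for one scan."""
    value = trailer_dict.get('Ion Injection Time (ms):')
    return float(value) if value is not None else math.nan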
4 changes: 0 additions & 4 deletions alpharaw/sciex.py
@@ -43,10 +43,6 @@ def _import(self,
self.creation_time = wiff_reader.wiffSample.Details.AcquisitionDateTime.ToString("O")
wiff_reader.close()
return data_dict

def import_raw(self, _path: str):
super().import_raw(_path)
self.save_hdf(_path+".hdf")

ms_reader_provider.register_reader('sciex', SciexWiffData)
ms_reader_provider.register_reader('sciex_wiff', SciexWiffData)
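The removed SciexWiffData.import_raw() override wrote an HDF copy unconditionally; with the new base-class flag the same behavior becomes opt-in. A rough equivalent of the old behavior (hypothetical path), setting the attribute directly so the sketch does not depend on the reader's constructor signature:

from alpharaw.sciex import SciexWiffData

reader = SciexWiffData()
reader.save_as_hdf = True                       # opt in to the HDF side effect
reader.import_raw("/data/sample.wiff")          # writes /data/sample.wiff.hdf alongside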
85 changes: 25 additions & 60 deletions alpharaw/thermo.py
@@ -50,8 +50,10 @@ def _import_batch(
precursor_charges = []
ms_order_list = []
ce_list = []
injection_time_list = []
cv_list = []


for i in range(
start,
stop
@@ -67,6 +69,7 @@
rt_values.append(rt)
ms_order = rawfile.GetMSOrderForScanNum(i)
ms_order_list.append(ms_order)
injection_time_list.append(rawfile.GetInjectionTimeForScanNum(i))

if ms_order == 1:
ce_list.append(0)
@@ -104,17 +107,20 @@ def _import_batch(
precursor_charges.append(charge)
rawfile.Close()

# copies of the numpy arrays are needed to move them explicitly to the CPython heap,
# otherwise Mono might interfere with them later
return {
'_peak_indices': _peak_indices,
'peak_mz': np.concatenate(mz_values),
'peak_intensity': np.concatenate(intensity_values),
'rt': np.array(rt_values),
'precursor_mz': np.array(precursor_mz_values),
'precursor_charge': np.array(precursor_charges, dtype=np.int8),
'isolation_lower_mz': np.array(isolation_mz_lowers),
'isolation_upper_mz': np.array(isolation_mz_uppers),
'ms_level': np.array(ms_order_list, dtype=np.int8),
'nce': np.array(ce_list, dtype=np.float32),
'peak_mz': np.concatenate(mz_values).copy(),
'peak_intensity': np.concatenate(intensity_values).copy(),
'rt': np.array(rt_values).copy(),
'precursor_mz': np.array(precursor_mz_values).copy(),
'precursor_charge': np.array(precursor_charges, dtype=np.int8).copy(),
'isolation_lower_mz': np.array(isolation_mz_lowers).copy(),
'isolation_upper_mz': np.array(isolation_mz_uppers).copy(),
'ms_level': np.array(ms_order_list, dtype=np.int8).copy(),
'nce': np.array(ce_list, dtype=np.float32).copy(),
'injection_time': np.array(injection_time_list, dtype=np.float32).copy(),
'cv': np.array(cv_list, dtype=np.float32),
}
class ThermoRawData(MSData_Base):
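On the explicit copies above: np.array(...).copy() returns an array that owns its own buffer, so nothing handed back from the worker keeps a reference into memory managed by Mono/.NET. A minimal illustration, with a plain NumPy array standing in for the .NET-backed data:

import numpy as np

backing = np.arange(5, dtype=np.float32)        # stands in for a .NET-backed buffer
view = backing[1:4]                             # shares memory with `backing`
detached = view.copy()                          # owns a fresh buffer on the CPython heap

assert np.shares_memory(view, backing)
assert not np.shares_memory(detached, backing)
assert detached.base is None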
@@ -124,7 +130,7 @@ class ThermoRawData(MSData_Base):
def __init__(self,
centroided : bool = True,
process_count : int = 10,
mp_batch_size : int = 10000,
mp_batch_size : int = 5000,
**kwargs):
"""
Parameters
@@ -139,7 +145,7 @@ def __init__(self,
mp_batch_size : int, default = 10000
number of spectra to load in each batch
"""
super().__init__(centroided)
super().__init__(centroided, **kwargs)
self.file_type = 'thermo'
self.process_count = process_count
self.mp_batch_size = mp_batch_size
@@ -154,17 +160,15 @@ def _import(self,
first_spectrum_number = rawfile.FirstSpectrumNumber
last_spectrum_number = rawfile.LastSpectrumNumber

if platform.system() != 'Linux':
batches = np.arange(first_spectrum_number, last_spectrum_number+1, self.mp_batch_size)
batches = np.append(batches, last_spectrum_number+1)

# use multiprocessing to load batches
_import_batch_partial = partial(_import_batch, raw_file_path, self.centroided)
with mp.get_context("spawn").Pool(processes = self.process_count) as pool:
batches = list(tqdm(pool.imap(_import_batch_partial, zip(batches[:-1], batches[1:]))))
mode = 'spawn' if platform.system() != 'Linux' else 'forkserver'

batches = np.arange(first_spectrum_number, last_spectrum_number+1, self.mp_batch_size)
batches = np.append(batches, last_spectrum_number+1)

else:
batches = [_import_batch(raw_file_path, self.centroided, (first_spectrum_number, last_spectrum_number+1))]
# use multiprocessing to load batches
_import_batch_partial = partial(_import_batch, raw_file_path, self.centroided)
with mp.get_context(mode).Pool(processes = self.process_count) as pool:
batches = list(tqdm(pool.imap(_import_batch_partial, zip(batches[:-1], batches[1:]))))

# collect peak indices
_peak_indices = np.concatenate([batch['_peak_indices'] for batch in batches])
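The change above removes the old Linux special case: instead of skipping multiprocessing on Linux, the start method is chosen per platform ('forkserver' on Linux, 'spawn' elsewhere) and batches are always loaded through a pool. A self-contained sketch of the same pattern, with a trivial worker standing in for _import_batch and made-up scan numbers:

import multiprocessing as mp
import platform
from functools import partial

def load_batch(raw_path: str, centroided: bool, start_stop: tuple) -> dict:
    start, stop = start_stop
    return {"n_spectra": stop - start}          # stand-in for the real batch dict

if __name__ == "__main__":
    mode = 'spawn' if platform.system() != 'Linux' else 'forkserver'
    edges = list(range(1, 23, 5)) + [23]        # e.g. scans 1..22 in batches of 5
    worker = partial(load_batch, "/data/run01.raw", True)
    with mp.get_context(mode).Pool(processes=4) as pool:
        batches = list(pool.imap(worker, zip(edges[:-1], edges[1:])))
    print(sum(b["n_spectra"] for b in batches)) # 22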
@@ -184,45 +188,6 @@

return output_dict

def _set_dataframes(self, raw_data:dict):
self.create_spectrum_df(len(raw_data['rt']))
self.set_peak_df_by_indexed_array(
raw_data['peak_mz'],
raw_data['peak_intensity'],
raw_data['peak_indices'][:-1],
raw_data['peak_indices'][1:],
)
self.add_column_in_spec_df(
'rt', raw_data['rt']
)
self.add_column_in_spec_df(
'ms_level', raw_data['ms_level'],
dtype=np.int8
)
self.set_precursor_mz(
raw_data['precursor_mz']
)
self.add_column_in_spec_df(
'charge', raw_data['precursor_charge'],
dtype=np.int8
)
self.set_isolation_mz_windows(
raw_data['isolation_mz_lower'],
raw_data['isolation_mz_upper'],
)
self.add_column_in_spec_df(
"nce", raw_data["nce"],
dtype=np.float32,
)
self.add_column_in_spec_df(
"cv", raw_data["cv"],
dtype=np.float32,
)


def import_raw(self, _path: str):
super().import_raw(_path)
self.save_hdf(_path+".hdf")

ms_reader_provider.register_reader('thermo', ThermoRawData)
ms_reader_provider.register_reader('thermo_raw', ThermoRawData)