diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 297672f..9abeb7f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -12,9 +12,7 @@ env: jobs: build: - runs-on: ubuntu-latest - steps: - uses: actions/checkout@v3 - name: Build diff --git a/Cargo.lock b/Cargo.lock index c6b75c8..1795706 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1227,7 +1227,7 @@ dependencies = [ [[package]] name = "timsrust" -version = "0.2.4" +version = "0.3.0" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index dc11952..932a754 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "timsrust" -version = "0.2.4" +version = "0.3.0" edition = "2021" description = "A crate to read Bruker timsTOF data" license = "Apache-2.0" diff --git a/benches/speed_performance.rs b/benches/speed_performance.rs index 40ef114..579b3e1 100644 --- a/benches/speed_performance.rs +++ b/benches/speed_performance.rs @@ -1,6 +1,13 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; use timsrust::FileReader; +const DDA_TEST: &str = + "/mnt/c/Users/Sander.Willems/Documents/data/tims05_300SPD/20230505_TIMS05_PaSk_MA_HeLa_6min_ddaP_S1-C10_1_2323.d/"; +const DIA_TEST: &str = + "/mnt/c/Users/Sander.Willems/Documents/data/20230505_TIMS05_PaSk_SA_HeLa_6min_diaP_8scans_S1-D3_1_2329.d/"; +const SYP_TEST: &str = + "/mnt/c/Users/Sander.Willems/Documents/data/20230505_TIMS05_PaSk_SA_HeLa_6min_syP_5scans_30Da_S1-D4_1_2330.d/"; + fn read_all_frames(file_reader: &FileReader) { file_reader.read_all_frames(); } @@ -17,27 +24,70 @@ fn read_all_spectra(file_reader: &FileReader) { file_reader.read_all_spectra(); } -fn criterion_benchmark(c: &mut Criterion) { +fn criterion_benchmark_dda(c: &mut Criterion) { // c.bench_function("fib 20", |b| b.iter(|| fibonacci(black_box(20)))); let mut group = c.benchmark_group("sample-size-example"); group.significance_level(0.001).sample_size(10); - let d_folder_name: &str = "/home/sander/data/20230505_TIMS05_PaSk_MA_HeLa_6min_ddaP_S1-C10_1_2323.d/"; + let d_folder_name: &str = DDA_TEST; let file_reader: FileReader = FileReader::new(d_folder_name.to_string()).unwrap(); - group.bench_function("read_all_frames 6m dda", |b| { + group.bench_function("DDA read_all_frames 6m", |b| { b.iter(|| read_all_frames(black_box(&file_reader))) }); - group.bench_function("read_all_ms1_frames 6m dda", |b| { + group.bench_function("DDA read_all_ms1_frames 6m", |b| { b.iter(|| read_all_ms1_frames(black_box(&file_reader))) }); - group.bench_function("read_all_ms2_frames 6m dda", |b| { + group.bench_function("DDA read_all_ms2_frames 6m", |b| { b.iter(|| read_all_ms2_frames(black_box(&file_reader))) }); - group.bench_function("read_all_spectra 6m dda", |b| { + group.bench_function("DDA read_all_spectra 6m", |b| { b.iter(|| read_all_spectra(black_box(&file_reader))) }); group.finish(); } -criterion_group!(benches, criterion_benchmark); +fn criterion_benchmark_dia(c: &mut Criterion) { + // c.bench_function("fib 20", |b| b.iter(|| fibonacci(black_box(20)))); + let mut group = c.benchmark_group("sample-size-example"); + group.significance_level(0.001).sample_size(10); + let d_folder_name: &str = DIA_TEST; + let file_reader: FileReader = + FileReader::new(d_folder_name.to_string()).unwrap(); + group.bench_function("DIA read_all_frames 6m", |b| { + b.iter(|| read_all_frames(black_box(&file_reader))) + }); + group.bench_function("DIA read_all_ms1_frames 6m", |b| { + b.iter(|| read_all_ms1_frames(black_box(&file_reader))) + }); + group.bench_function("DIA read_all_ms2_frames 6m", |b| { + b.iter(|| read_all_ms2_frames(black_box(&file_reader))) + }); + group.finish(); +} + +fn criterion_benchmark_syp(c: &mut Criterion) { + // c.bench_function("fib 20", |b| b.iter(|| fibonacci(black_box(20)))); + let mut group = c.benchmark_group("sample-size-example"); + group.significance_level(0.001).sample_size(10); + let d_folder_name: &str = SYP_TEST; + let file_reader: FileReader = + FileReader::new(d_folder_name.to_string()).unwrap(); + group.bench_function("SYP read_all_frames 6m", |b| { + b.iter(|| read_all_frames(black_box(&file_reader))) + }); + group.bench_function("SYP read_all_ms1_frames 6m", |b| { + b.iter(|| read_all_ms1_frames(black_box(&file_reader))) + }); + group.bench_function("SYP read_all_ms2_frames 6m", |b| { + b.iter(|| read_all_ms2_frames(black_box(&file_reader))) + }); + group.finish(); +} + +criterion_group!( + benches, + criterion_benchmark_dda, + // criterion_benchmark_dia, + // criterion_benchmark_syp +); criterion_main!(benches); diff --git a/src/acquisition.rs b/src/acquisition.rs deleted file mode 100644 index 18f0500..0000000 --- a/src/acquisition.rs +++ /dev/null @@ -1,7 +0,0 @@ -/// The kind of acquisition that was used. -#[derive(Debug, PartialEq, Clone, Copy)] -pub enum AcquisitionType { - DDAPASEF, - DIAPASEF, - Unknown, -} diff --git a/src/calibration.rs b/src/calibration.rs deleted file mode 100644 index 7b89913..0000000 --- a/src/calibration.rs +++ /dev/null @@ -1,29 +0,0 @@ -use crate::{ - converters::{ConvertableIndex, Tof2MzConverter}, - spectra::RawSpectrum, - Precursor, -}; - -pub struct Tof2MzCalibrator; - -impl Tof2MzCalibrator { - pub fn find_unfragmented_precursors( - spectra: &Vec, - mz_reader: &Tof2MzConverter, - precursors: &Vec, - tolerance: f64, - ) -> Vec<(f64, u32)> { - let mut hits: Vec<(f64, u32)> = vec![]; - for (index, spectrum) in spectra.iter().enumerate() { - let precursor_mz: f64 = precursors[index].mz; - for &tof_index in spectrum.tof_indices.iter() { - let mz = mz_reader.convert(tof_index); - if (mz - precursor_mz).abs() < tolerance { - let hit = (precursor_mz, tof_index); - hits.push(hit); - } - } - } - hits - } -} diff --git a/src/converters.rs b/src/converters.rs deleted file mode 100644 index fab7f40..0000000 --- a/src/converters.rs +++ /dev/null @@ -1,89 +0,0 @@ -use linreg::linear_regression; - -/// Converting from an index domain (e.g. Time of Flight) to a continuous domain (m/z). -pub trait ConvertableIndex { - /// Convert any index (even fractional) to a continuous value. - fn convert + Copy>(&self, index: T) -> f64; -} - -/// A converter from TOF -> m/z. -#[derive(Debug, Copy, Clone)] -pub struct Tof2MzConverter { - tof_intercept: f64, - tof_slope: f64, -} - -impl Tof2MzConverter { - pub fn new(mz_min: f64, mz_max: f64, tof_max_index: u32) -> Self { - let tof_intercept: f64 = mz_min.sqrt(); - let tof_slope: f64 = - (mz_max.sqrt() - tof_intercept) / tof_max_index as f64; - Self { - tof_intercept, - tof_slope, - } - } - - pub fn from_unfragmented_precursors(data: &Vec<(f64, u32)>) -> Self { - let x: Vec = data.iter().map(|(_, x_val)| *x_val).collect(); - let y: Vec = - data.iter().map(|(y_val, _)| (*y_val).sqrt()).collect(); - let (tof_slope, tof_intercept) = linear_regression(&x, &y).unwrap(); - Self { - tof_intercept, - tof_slope, - } - } -} - -impl ConvertableIndex for Tof2MzConverter { - fn convert + Copy>(&self, index: T) -> f64 { - let tof_index_f64: f64 = index.into(); - (self.tof_intercept + self.tof_slope * tof_index_f64).powi(2) - } -} - -/// A converter from Scan -> ion mobility. -#[derive(Debug, Copy, Clone)] -pub struct Scan2ImConverter { - scan_intercept: f64, - scan_slope: f64, -} - -impl Scan2ImConverter { - pub fn new(im_min: f64, im_max: f64, scan_max_index: u32) -> Self { - let scan_intercept: f64 = im_max; - let scan_slope: f64 = (im_min - scan_intercept) / scan_max_index as f64; - Self { - scan_intercept, - scan_slope, - } - } -} - -impl ConvertableIndex for Scan2ImConverter { - fn convert + Copy>(&self, index: T) -> f64 { - let scan_index_f64: f64 = index.into(); - self.scan_intercept + self.scan_slope * scan_index_f64 - } -} - -/// A converter from Frame -> retention time. -#[derive(Debug, Clone)] -pub struct Frame2RtConverter { - rt_values: Vec, -} - -impl Frame2RtConverter { - pub fn new(rt_values: Vec) -> Self { - Self { rt_values } - } -} - -impl ConvertableIndex for Frame2RtConverter { - fn convert + Copy>(&self, index: T) -> f64 { - let lower_value: f64 = self.rt_values[index.into().floor() as usize]; - let upper_value: f64 = self.rt_values[index.into().ceil() as usize]; - (lower_value + upper_value) / 2. - } -} diff --git a/src/domain_converters.rs b/src/domain_converters.rs new file mode 100644 index 0000000..11c1387 --- /dev/null +++ b/src/domain_converters.rs @@ -0,0 +1,13 @@ +//! Allows conversions between domains (e.g. Time of Flight and m/z) +mod frame_to_rt; +mod scan_to_im; +mod tof_to_mz; + +pub use frame_to_rt::Frame2RtConverter; +pub use scan_to_im::Scan2ImConverter; +pub use tof_to_mz::Tof2MzConverter; + +/// Convert from one domain (e.g. Time of Flight) to another (m/z). +pub trait ConvertableDomain { + fn convert + Copy>(&self, value: T) -> f64; +} diff --git a/src/domain_converters/frame_to_rt.rs b/src/domain_converters/frame_to_rt.rs new file mode 100644 index 0000000..5c83388 --- /dev/null +++ b/src/domain_converters/frame_to_rt.rs @@ -0,0 +1,19 @@ +/// A converter from Frame -> retention time. +#[derive(Debug, Clone)] +pub struct Frame2RtConverter { + rt_values: Vec, +} + +impl Frame2RtConverter { + pub fn from_values(rt_values: Vec) -> Self { + Self { rt_values } + } +} + +impl super::ConvertableDomain for Frame2RtConverter { + fn convert + Copy>(&self, value: T) -> f64 { + let lower_value: f64 = self.rt_values[value.into().floor() as usize]; + let upper_value: f64 = self.rt_values[value.into().ceil() as usize]; + (lower_value + upper_value) / 2. + } +} diff --git a/src/domain_converters/scan_to_im.rs b/src/domain_converters/scan_to_im.rs new file mode 100644 index 0000000..68339f4 --- /dev/null +++ b/src/domain_converters/scan_to_im.rs @@ -0,0 +1,28 @@ +/// A converter from Scan -> (inversed) ion mobility. +#[derive(Debug, Clone)] +pub struct Scan2ImConverter { + scan_intercept: f64, + scan_slope: f64, +} + +impl Scan2ImConverter { + pub fn from_boundaries( + im_min: f64, + im_max: f64, + scan_max_index: u32, + ) -> Self { + let scan_intercept: f64 = im_max; + let scan_slope: f64 = (im_min - scan_intercept) / scan_max_index as f64; + Self { + scan_intercept, + scan_slope, + } + } +} + +impl super::ConvertableDomain for Scan2ImConverter { + fn convert + Copy>(&self, value: T) -> f64 { + let scan_index_f64: f64 = value.into(); + self.scan_intercept + self.scan_slope * scan_index_f64 + } +} diff --git a/src/domain_converters/tof_to_mz.rs b/src/domain_converters/tof_to_mz.rs new file mode 100644 index 0000000..e23ac18 --- /dev/null +++ b/src/domain_converters/tof_to_mz.rs @@ -0,0 +1,42 @@ +use linreg::linear_regression; + +/// A converter from TOF -> m/z. +#[derive(Debug, Clone)] +pub struct Tof2MzConverter { + tof_intercept: f64, + tof_slope: f64, +} + +impl Tof2MzConverter { + pub fn from_boundaries( + mz_min: f64, + mz_max: f64, + tof_max_index: u32, + ) -> Self { + let tof_intercept: f64 = mz_min.sqrt(); + let tof_slope: f64 = + (mz_max.sqrt() - tof_intercept) / tof_max_index as f64; + Self { + tof_intercept, + tof_slope, + } + } + + pub fn from_pairs(data: &Vec<(f64, u32)>) -> Self { + let x: Vec = data.iter().map(|(_, x_val)| *x_val).collect(); + let y: Vec = + data.iter().map(|(y_val, _)| (*y_val).sqrt()).collect(); + let (tof_slope, tof_intercept) = linear_regression(&x, &y).unwrap(); + Self { + tof_intercept, + tof_slope, + } + } +} + +impl super::ConvertableDomain for Tof2MzConverter { + fn convert + Copy>(&self, value: T) -> f64 { + let tof_index_f64: f64 = value.into(); + (self.tof_intercept + self.tof_slope * tof_index_f64).powi(2) + } +} diff --git a/src/errors.rs b/src/errors.rs index e429674..7af743c 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,4 +1,7 @@ -use crate::file_readers; +use crate::{ + file_readers, + // io::readers::common::{sql_reader::SqlError, tdf_blobs::TdfBlobError}, +}; /// An error that is produced by timsrust (uses [thiserror]). #[derive(thiserror::Error, Debug)] @@ -6,4 +9,8 @@ pub enum Error { /// An error to indicate a path is not a Bruker File Format. #[error("FileFormatError: {0}")] FileFormatError(#[from] file_readers::FileFormatError), + // #[error("SqlError: {0}")] + // SqlError(#[from] SqlError), + // #[error("BinError: {0}")] + // BinError(#[from] TdfBlobError), } diff --git a/src/file_readers.rs b/src/file_readers.rs index 56eb5cd..b14d0fd 100644 --- a/src/file_readers.rs +++ b/src/file_readers.rs @@ -1,104 +1,159 @@ -use crate::{ - converters::{Frame2RtConverter, Scan2ImConverter, Tof2MzConverter}, - Error, -}; +use std::{fs, path::PathBuf}; -mod common; -mod file_formats; -mod frame_readers; -mod spectrum_readers; +use crate::io::readers::file_readers::sql_reader::frames::SqlFrame; +use crate::io::readers::SpectrumReader; +use crate::{io::readers::FrameReader, Error}; -use { - self::{ - file_formats::FileFormat, frame_readers::ReadableFrames, - spectrum_readers::ReadableSpectra, - }, - crate::{Frame, Spectrum}, -}; +use crate::ms_data::{Frame, Spectrum}; +use rayon::iter::ParallelIterator; -pub use file_formats::FileFormatError; - -use self::frame_readers::tdf_reader::TDFReader; - -/// A reader to read [frames](crate::Frame) and [spectra](crate::Spectrum). +/// A reader to read [frames](crate::ms_data::Frame) and [spectra](crate::ms_data::Spectrum). pub struct FileReader { - format: FileFormat, + frame_reader: Option, + spectrum_reader: Option, } -///NOTE: The functions to read a single frame or spectrum are not optimized. -/// In case many frames or spectra are required, it is best to use -/// any of the functions that directly return a `Vec`. impl FileReader { + // TODO refactor out + // TODO proper error handling + // TODO update docs pub fn new>(path_name: T) -> Result { let format: FileFormat = FileFormat::parse(path_name)?; - Ok(Self { format }) + let frame_reader = match &format { + FileFormat::DFolder(path) => Some(FrameReader::new(&path)), + FileFormat::MS2Folder(_) => None, + }; + let spectrum_reader = match &format { + FileFormat::DFolder(path) => { + let reader = SpectrumReader::new(path); + // reader.calibrate(); + Some(reader) + }, + FileFormat::MS2Folder(path) => Some(SpectrumReader::new(path)), + }; + Ok(Self { + frame_reader, + spectrum_reader, + }) } pub fn read_single_frame(&self, index: usize) -> Frame { - self.format.read_single_frame(index) + self.frame_reader.as_ref().unwrap().get(index) + } + + fn read_multiple_frames<'a, F: Fn(&SqlFrame) -> bool + Sync + Send + 'a>( + &self, + predicate: F, + ) -> Vec { + self.frame_reader + .as_ref() + .unwrap() + .parallel_filter(|x| predicate(x)) + .collect() } pub fn read_all_frames(&self) -> Vec { - self.format.read_all_frames() + self.read_multiple_frames(|_| true) } - /// NOTE: The returned vec contains all frames to not disrupt indexing. - /// MS2 frames are set to unknown and not read. pub fn read_all_ms1_frames(&self) -> Vec { - self.format.read_all_ms1_frames() + self.read_multiple_frames(|x| x.msms_type == 0) } - /// NOTE: The returned vec contains all frames to not disrupt indexing. - /// MS1 frames are set to unknown and not read. pub fn read_all_ms2_frames(&self) -> Vec { - self.format.read_all_ms2_frames() + self.read_multiple_frames(|x| x.msms_type != 0) } pub fn read_single_spectrum(&self, index: usize) -> Spectrum { - self.format.read_single_spectrum(index) + self.spectrum_reader.as_ref().unwrap().get(index) } - ///NOTE: ddaPASEF MS2 spectra are automatically calibrated with - /// all unfragmented precursor signals. - /// Hence, reading spectra individually through `read_single_spectrum` - /// might yield slightly different mz values. pub fn read_all_spectra(&self) -> Vec { - self.format.read_all_spectra() + self.spectrum_reader.as_ref().unwrap().get_all() } +} - pub fn get_frame_converter(&self) -> Result { - match &self.format { - FileFormat::DFolder(path) => Ok(TDFReader::new( - &path.to_str().unwrap_or_default().to_string(), - ) - .rt_converter), - _ => Err(Error::FileFormatError( - FileFormatError::MetadataFilesAreMissing, - )), +pub enum FileFormat { + DFolder(PathBuf), + MS2Folder(PathBuf), +} + +impl FileFormat { + // TODO make into proper struct + pub fn parse( + input: impl AsRef, + ) -> Result { + let path: PathBuf = input.as_ref().to_path_buf(); + if !path.exists() { + return Err(FileFormatError::DirectoryDoesNotExist); } + let extension: &str = path + .extension() + .unwrap_or_default() + .to_str() + .unwrap_or_default(); + let format = match extension { + "d" => Self::DFolder(path), + _ => Self::MS2Folder(path), + }; + format.is_valid()?; + Ok(format) } - pub fn get_scan_converter(&self) -> Result { - match &self.format { - FileFormat::DFolder(path) => Ok(TDFReader::new( - &path.to_str().unwrap_or_default().to_string(), - ) - .im_converter), - _ => Err(Error::FileFormatError( - FileFormatError::MetadataFilesAreMissing, - )), + /// FileFormat is guaranteed to be `valid` if it is constructed + fn is_valid(&self) -> Result<(), FileFormatError> { + match &self { + Self::DFolder(path) => { + if !folder_contains_extension(path, "tdf_bin") { + return Err(FileFormatError::BinaryFilesAreMissing); + } + if !folder_contains_extension(path, "tdf") { + return Err(FileFormatError::MetadataFilesAreMissing); + } + }, + Self::MS2Folder(path) => { + if !folder_contains_extension(path, "bin") { + return Err(FileFormatError::BinaryFilesAreMissing); + } + if !folder_contains_extension(path, "parquet") { + return Err(FileFormatError::MetadataFilesAreMissing); + } + }, } + Ok(()) } +} - pub fn get_tof_converter(&self) -> Result { - match &self.format { - FileFormat::DFolder(path) => Ok(TDFReader::new( - &path.to_str().unwrap_or_default().to_string(), - ) - .mz_converter), - _ => Err(Error::FileFormatError( - FileFormatError::MetadataFilesAreMissing, - )), +fn folder_contains_extension( + input: impl AsRef, + extension: &str, +) -> bool { + let folder_path: PathBuf = input.as_ref().to_path_buf(); + if !folder_path.is_dir() { + return false; + } + if let Ok(entries) = fs::read_dir(folder_path) { + for entry in entries { + if let Ok(entry) = entry { + if let Some(ext) = entry.path().extension() { + if ext == extension { + return true; + } + } + } } } + false +} + +#[derive(thiserror::Error, Debug)] +pub enum FileFormatError { + #[error("DirectoryDoesNotExist")] + DirectoryDoesNotExist, + #[error("NoParentWithBrukerExtension")] + NoParentWithBrukerExtension, + #[error("BinaryFilesAreMissing")] + BinaryFilesAreMissing, + #[error("MetadataFilesAreMissing")] + MetadataFilesAreMissing, } diff --git a/src/file_readers/common/ms_data_blobs.rs b/src/file_readers/common/ms_data_blobs.rs deleted file mode 100644 index c261b9b..0000000 --- a/src/file_readers/common/ms_data_blobs.rs +++ /dev/null @@ -1,104 +0,0 @@ -mod parsers; - -use std::fs::File; - -use memmap2::Mmap; -use zstd::decode_all; - -use crate::{Frame, Spectrum}; - -use self::parsers::parse_frame; - -#[derive(Debug, Default)] -pub struct BinFileReader { - file_offsets: Vec, - mmap: Option, -} - -impl BinFileReader { - pub fn new(file_name: String, file_offsets: Vec) -> Self { - let tdf_bin_file: File = File::open(&file_name) - .expect("File cannot be opened. Is the path correct?"); - let mmap: Option = - Some(unsafe { Mmap::map(&tdf_bin_file).unwrap() }); - Self { file_offsets, mmap } - } - - fn read_blob(&self, index: usize) -> Vec { - let offset: u64 = self.file_offsets[index as usize]; - if let Some(mmap) = self.mmap.as_ref() { - let raw_byte_count: &[u8] = - &mmap[offset as usize..(offset + 4) as usize]; - let byte_count: u32 = - u32::from_le_bytes(raw_byte_count.try_into().unwrap()); - if byte_count > 8 { - let compressed_blob: &[u8] = &mmap[(offset + 8) as usize - ..offset as usize + byte_count as usize]; - let blob: Vec = decode_all(compressed_blob).unwrap(); - return blob; - } - }; - return vec![]; - } - - pub fn size(&self) -> usize { - self.file_offsets.len() - } -} - -pub trait ReadableFromBinFile { - fn parse_from_ms_data_blob(buffer: Vec, index: usize) -> Self; - - fn read_from_file(bin_file: &BinFileReader, index: usize) -> Self - where - Self: Sized, - { - let blob: Vec = bin_file.read_blob(index); - Self::parse_from_ms_data_blob(blob, index) - } -} - -impl ReadableFromBinFile for Spectrum { - fn parse_from_ms_data_blob(blob: Vec, index: usize) -> Self { - let mut spectrum: Spectrum = Spectrum::default(); - spectrum.index = index; - if blob.len() == 0 { - return spectrum; - }; - let size: usize = blob.len() / std::mem::size_of::(); - let first: &[u8] = &blob[0 * size..1 * size]; - let second: &[u8] = &blob[1 * size..2 * size]; - let third: &[u8] = &blob[2 * size..3 * size]; - let fourth: &[u8] = &blob[3 * size..4 * size]; - let mut spectrum_data: Vec = vec![0; size]; - for i in 0..size { - spectrum_data[i] = first[i] as u32; - spectrum_data[i] |= (second[i] as u32) << 8; - spectrum_data[i] |= (third[i] as u32) << 16; - spectrum_data[i] |= (fourth[i] as u32) << 24; - } - let scan_count: usize = blob.len() / 3 / std::mem::size_of::(); - let tof_indices_bytes: &[u32] = - &spectrum_data[..scan_count as usize * 2]; - let intensities_bytes: &[u32] = - &spectrum_data[scan_count as usize * 2..]; - let mz_values: &[f64] = - bytemuck::cast_slice::(tof_indices_bytes); - let intensity_values: &[f32] = - bytemuck::cast_slice::(intensities_bytes); - spectrum.intensities = - intensity_values.iter().map(|&x| x as f64).collect(); - spectrum.mz_values = mz_values.to_vec(); - spectrum - } -} - -impl ReadableFromBinFile for Frame { - fn parse_from_ms_data_blob(blob: Vec, index: usize) -> Self { - let mut frame = Frame::default(); - (frame.scan_offsets, frame.tof_indices, frame.intensities) = - parse_frame(blob); - frame.index = index; - frame - } -} diff --git a/src/file_readers/common/ms_data_blobs/parsers.rs b/src/file_readers/common/ms_data_blobs/parsers.rs deleted file mode 100644 index bdb2918..0000000 --- a/src/file_readers/common/ms_data_blobs/parsers.rs +++ /dev/null @@ -1,82 +0,0 @@ -const U32_SIZE: usize = std::mem::size_of::(); - -#[inline(always)] -fn get_u32_from_blob(blob: &Vec, index: usize) -> u32 { - let size: usize = blob.len() / U32_SIZE; - return concatenate_four_bytes_into_u32( - blob[index], - blob[size + index], - blob[2 * size + index], - blob[3 * size + index], - ); -} - -#[inline(always)] -fn concatenate_four_bytes_into_u32(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 { - (b1 as u32) | ((b2 as u32) << 8) | ((b3 as u32) << 16) | ((b4 as u32) << 24) -} - -pub fn parse_frame(blob: Vec) -> (Vec, Vec, Vec) { - let mut tof_indices: Vec = vec![]; - let mut intensities: Vec = vec![]; - let mut scan_offsets: Vec = vec![]; - if blob.len() != 0 { - let scan_count: usize = get_u32_from_blob(&blob, 0) as usize; - let peak_count: usize = (blob.len() / U32_SIZE - scan_count) / 2; - scan_offsets = read_scan_offsets(scan_count, peak_count, &blob); - intensities = read_intensities(scan_count, peak_count, &blob); - tof_indices = - read_tof_indices(scan_count, peak_count, &blob, &scan_offsets); - } - (scan_offsets, tof_indices, intensities) -} - -fn read_scan_offsets( - scan_count: usize, - peak_count: usize, - blob: &Vec, -) -> Vec { - let mut scan_offsets: Vec = Vec::with_capacity(scan_count + 1); - scan_offsets.push(0); - for scan_index in 0..scan_count - 1 { - let index = scan_index + 1; - let scan_size: usize = (get_u32_from_blob(blob, index) / 2) as usize; - scan_offsets.push(scan_offsets[scan_index] + scan_size); - } - scan_offsets.push(peak_count); - scan_offsets -} - -fn read_intensities( - scan_count: usize, - peak_count: usize, - blob: &Vec, -) -> Vec { - let mut intensities: Vec = Vec::with_capacity(peak_count); - for peak_index in 0..peak_count { - let index: usize = scan_count + 1 + 2 * peak_index; - intensities.push(get_u32_from_blob(blob, index)); - } - intensities -} - -fn read_tof_indices( - scan_count: usize, - peak_count: usize, - blob: &Vec, - scan_offsets: &Vec, -) -> Vec { - let mut tof_indices: Vec = Vec::with_capacity(peak_count); - for scan_index in 0..scan_count { - let start_offset: usize = scan_offsets[scan_index]; - let end_offset: usize = scan_offsets[scan_index + 1]; - let mut current_sum: u32 = 0; - for peak_index in start_offset..end_offset { - let index = scan_count + 2 * peak_index; - let tof_index: u32 = get_u32_from_blob(blob, index); - current_sum += tof_index; - tof_indices.push(current_sum - 1); - } - } - tof_indices -} diff --git a/src/file_readers/common/parquet_reader.rs b/src/file_readers/common/parquet_reader.rs deleted file mode 100644 index 531ad02..0000000 --- a/src/file_readers/common/parquet_reader.rs +++ /dev/null @@ -1,46 +0,0 @@ -use parquet::file::reader::{FileReader, SerializedFileReader}; -use std::fs::File; - -use crate::Precursor; - -pub fn read_parquet_precursors( - parquet_file_name: &String, -) -> (Vec, Vec) { - let file: File = File::open(parquet_file_name).unwrap(); - let reader: SerializedFileReader = - SerializedFileReader::new(file).unwrap(); - let mut precursors: Vec = vec![]; - let mut offsets: Vec = vec![]; - for record in reader.get_row_iter(None).unwrap() { - let mut precursor: Precursor = Precursor::default(); - for (name, field) in record.get_column_iter() { - match name.to_string().as_str() { - "Id" => precursor.index = field.to_string().parse().unwrap(), - "RetentionTime" => { - precursor.rt = field.to_string().parse().unwrap() - }, - "MonoisotopicMz" => { - precursor.mz = field.to_string().parse().unwrap_or(0.0) - }, - "Charge" => { - precursor.charge = - field.to_string().parse().unwrap_or(0.0) as usize - }, - "Intensity" => { - precursor.intensity = field.to_string().parse().unwrap() - }, - "ooK0" => precursor.im = field.to_string().parse().unwrap(), - "MS1ParentFrameId" => { - precursor.frame_index = - field.to_string().parse::().unwrap() as usize - }, - "BinaryOffset" => { - offsets.push(field.to_string().parse().unwrap()) - }, - _ => {}, - } - } - precursors.push(precursor); - } - (precursors, offsets) -} diff --git a/src/file_readers/common/sql_reader.rs b/src/file_readers/common/sql_reader.rs deleted file mode 100644 index a5e10f0..0000000 --- a/src/file_readers/common/sql_reader.rs +++ /dev/null @@ -1,67 +0,0 @@ -mod metadata; -mod tables; - -pub use tables::*; - -use rusqlite::{Connection, Result, Statement}; -use std::path::Path; - -#[derive(Debug)] -pub struct SqlReader { - pub path: String, -} - -impl SqlReader { - fn read_column_from_table( - &self, - column_name: &str, - table_name: &str, - ) -> Vec { - let column_names: Vec = - self.get_table_columns(table_name).unwrap(); - let order_by: String = column_names.join(", "); - let query: String = format!( - "SELECT {} FROM {} ORDER BY {}", - column_name, table_name, order_by - ); - - self.get_data_from_sql(&query) - } - - pub fn get_data_from_sql( - &self, - query: &String, - ) -> Vec { - let connection: Connection = get_sql_connection(&self.path); - let mut stmt: Statement = connection.prepare(&query).unwrap(); - let rows = stmt - .query_map( - [], - // |row| row.get::(0) - |row| match row.get::(0) { - Ok(value) => Ok(value), - _ => Ok(T::default()), - }, - ) - .unwrap(); - rows.collect::>>().unwrap() - } - - fn get_table_columns(&self, table_name: &str) -> Result> { - let connection: Connection = get_sql_connection(&self.path); - let query = format!("PRAGMA table_info({})", table_name); - let mut stmt: Statement = connection.prepare(&query)?; - let rows = stmt.query_map([], |row| row.get::(1))?; - rows.collect() - } -} - -fn get_sql_connection(path: &String) -> Connection { - let db_file_path: std::path::PathBuf = Path::new(path).join("analysis.tdf"); - let connection: Connection = Connection::open(&db_file_path).unwrap(); - connection -} - -pub trait ReadableFromSql { - fn from_sql(sql_reader: &SqlReader) -> Self; -} diff --git a/src/file_readers/common/sql_reader/metadata.rs b/src/file_readers/common/sql_reader/metadata.rs deleted file mode 100644 index 6e1e29b..0000000 --- a/src/file_readers/common/sql_reader/metadata.rs +++ /dev/null @@ -1,89 +0,0 @@ -use rusqlite::{Connection, Statement}; - -use crate::converters::{Scan2ImConverter, Tof2MzConverter}; - -use super::{get_sql_connection, ReadableFromSql, SqlReader}; - -fn read_tof_max_index(connection: &Connection) -> u32 { - let tof_max_index_string: String = connection - .query_row( - "SELECT Value FROM GlobalMetadata WHERE Key = 'DigitizerNumSamples'", - [], - |row| row.get(0), - ) - .unwrap(); - let tof_max_index: u32 = tof_max_index_string.parse().unwrap(); - tof_max_index -} - -fn read_mz_max_value(connection: &Connection) -> f64 { - let mz_max_value_string: String = connection - .query_row( - "SELECT Value FROM GlobalMetadata WHERE Key = 'MzAcqRangeUpper'", - [], - |row| row.get(0), - ) - .unwrap(); - let mz_max_value: f64 = mz_max_value_string.parse().unwrap(); - mz_max_value -} - -fn read_mz_min_value(connection: &Connection) -> f64 { - let mz_min_value_string: String = connection - .query_row( - "SELECT Value FROM GlobalMetadata WHERE Key = 'MzAcqRangeLower'", - [], - |row| row.get(0), - ) - .unwrap(); - let mz_min_value: f64 = mz_min_value_string.parse().unwrap(); - mz_min_value -} - -impl SqlReader { - fn read_metadata(&self, value_name: &str) -> String { - let connection: Connection = get_sql_connection(&self.path); - let query: String = format!( - "SELECT Value FROM GlobalMetadata WHERE Key = '{}'", - value_name - ); - let mut stmt: Statement = connection.prepare(&query).unwrap(); - let value_str: String = stmt.query_row([], |row| row.get(0)).unwrap(); - value_str - } - - pub fn read_im_information(&self) -> (u32, f64, f64) { - let lower_im_value: f64 = self - .read_metadata("OneOverK0AcqRangeLower") - .parse() - .unwrap(); - let upper_im_value: f64 = self - .read_metadata("OneOverK0AcqRangeUpper") - .parse() - .unwrap(); - let scan_max_index: u32 = 927; - (scan_max_index, lower_im_value, upper_im_value) - } - - pub fn read_mz_information(&self) -> (u32, f64, f64) { - let connection: Connection = get_sql_connection(&self.path); - let tof_max_index: u32 = read_tof_max_index(&connection); - let lower_mz_value: f64 = read_mz_min_value(&connection); - let upper_mz_value: f64 = read_mz_max_value(&connection); - (tof_max_index, lower_mz_value, upper_mz_value) - } -} - -impl ReadableFromSql for Tof2MzConverter { - fn from_sql(sql_reader: &SqlReader) -> Self { - let (tof_max_index, mz_min, mz_max) = sql_reader.read_mz_information(); - Tof2MzConverter::new(mz_min, mz_max, tof_max_index) - } -} - -impl ReadableFromSql for Scan2ImConverter { - fn from_sql(sql_reader: &SqlReader) -> Self { - let (scan_max_index, im_min, im_max) = sql_reader.read_im_information(); - Scan2ImConverter::new(im_min, im_max, scan_max_index) - } -} diff --git a/src/file_readers/common/sql_reader/tables.rs b/src/file_readers/common/sql_reader/tables.rs deleted file mode 100644 index 0a10b58..0000000 --- a/src/file_readers/common/sql_reader/tables.rs +++ /dev/null @@ -1,11 +0,0 @@ -mod dia_frames_info; -mod dia_frames_msms; -mod frames; -mod pasef_frame_msms; -mod precursors; - -pub use dia_frames_info::DiaFramesInfoTable; -pub use dia_frames_msms::DiaFramesMsMsTable; -pub use frames::FrameTable; -pub use pasef_frame_msms::PasefFrameMsMsTable; -pub use precursors::PrecursorTable; diff --git a/src/file_readers/common/sql_reader/tables/dia_frames_info.rs b/src/file_readers/common/sql_reader/tables/dia_frames_info.rs deleted file mode 100644 index 1511c2d..0000000 --- a/src/file_readers/common/sql_reader/tables/dia_frames_info.rs +++ /dev/null @@ -1,17 +0,0 @@ -use crate::file_readers::common::sql_reader::{ReadableFromSql, SqlReader}; - -#[derive(Debug)] -pub struct DiaFramesInfoTable { - pub frame: Vec, - pub group: Vec, -} - -impl ReadableFromSql for DiaFramesInfoTable { - fn from_sql(sql_reader: &SqlReader) -> Self { - let table_name: &str = "DiaFrameMsMsInfo"; - DiaFramesInfoTable { - frame: sql_reader.read_column_from_table("Frame", table_name), - group: sql_reader.read_column_from_table("WindowGroup", table_name), - } - } -} diff --git a/src/file_readers/common/sql_reader/tables/dia_frames_msms.rs b/src/file_readers/common/sql_reader/tables/dia_frames_msms.rs deleted file mode 100644 index 807eb29..0000000 --- a/src/file_readers/common/sql_reader/tables/dia_frames_msms.rs +++ /dev/null @@ -1,27 +0,0 @@ -use crate::file_readers::common::sql_reader::{ReadableFromSql, SqlReader}; - -#[derive(Debug)] -pub struct DiaFramesMsMsTable { - pub group: Vec, - pub scan_start: Vec, - pub scan_end: Vec, - pub mz_center: Vec, - pub mz_width: Vec, -} - -impl ReadableFromSql for DiaFramesMsMsTable { - fn from_sql(sql_reader: &SqlReader) -> Self { - let table_name: &str = "DiaFrameMsMsWindows"; - DiaFramesMsMsTable { - group: sql_reader.read_column_from_table("WindowGroup", table_name), - scan_start: sql_reader - .read_column_from_table("ScanNumBegin", table_name), - scan_end: sql_reader - .read_column_from_table("ScanNumEnd", table_name), - mz_center: sql_reader - .read_column_from_table("IsolationMz", table_name), - mz_width: sql_reader - .read_column_from_table("IsolationWidth", table_name), - } - } -} diff --git a/src/file_readers/common/sql_reader/tables/frames.rs b/src/file_readers/common/sql_reader/tables/frames.rs deleted file mode 100644 index 56508e2..0000000 --- a/src/file_readers/common/sql_reader/tables/frames.rs +++ /dev/null @@ -1,31 +0,0 @@ -use crate::file_readers::common::sql_reader::{ReadableFromSql, SqlReader}; - -#[derive(Debug)] -pub struct FrameTable { - pub id: Vec, - pub scan_mode: Vec, - pub msms_type: Vec, - pub peak_count: Vec, - pub rt: Vec, - pub scan_count: Vec, - pub offsets: Vec, -} - -impl ReadableFromSql for FrameTable { - fn from_sql(sql_reader: &SqlReader) -> Self { - let table_name: &str = "Frames"; - FrameTable { - id: sql_reader.read_column_from_table("Id", table_name), - scan_mode: sql_reader - .read_column_from_table("ScanMode", table_name), - msms_type: sql_reader - .read_column_from_table("MsMsType", table_name), - peak_count: sql_reader - .read_column_from_table("NumPeaks", table_name), - rt: sql_reader.read_column_from_table("Time", table_name), - scan_count: sql_reader - .read_column_from_table("NumScans", table_name), - offsets: sql_reader.read_column_from_table("TimsId", "Frames"), - } - } -} diff --git a/src/file_readers/common/sql_reader/tables/pasef_frame_msms.rs b/src/file_readers/common/sql_reader/tables/pasef_frame_msms.rs deleted file mode 100644 index 49332e8..0000000 --- a/src/file_readers/common/sql_reader/tables/pasef_frame_msms.rs +++ /dev/null @@ -1,33 +0,0 @@ -use crate::file_readers::common::sql_reader::{ReadableFromSql, SqlReader}; - -#[derive(Debug)] -pub struct PasefFrameMsMsTable { - pub frame: Vec, - pub scan_start: Vec, - pub scan_end: Vec, - pub mz_center: Vec, - pub mz_width: Vec, - pub collision_energy: Vec, - pub precursor: Vec, -} - -impl ReadableFromSql for PasefFrameMsMsTable { - fn from_sql(sql_reader: &SqlReader) -> Self { - let table_name: &str = "PasefFrameMsMsInfo"; - PasefFrameMsMsTable { - frame: sql_reader.read_column_from_table("Frame", table_name), - scan_start: sql_reader - .read_column_from_table("ScanNumBegin", table_name), - scan_end: sql_reader - .read_column_from_table("ScanNumEnd", table_name), - mz_center: sql_reader - .read_column_from_table("IsolationMz", table_name), - mz_width: sql_reader - .read_column_from_table("IsolationWidth", table_name), - collision_energy: sql_reader - .read_column_from_table("CollisionEnergy", table_name), - precursor: sql_reader - .read_column_from_table("Precursor", table_name), - } - } -} diff --git a/src/file_readers/common/sql_reader/tables/precursors.rs b/src/file_readers/common/sql_reader/tables/precursors.rs deleted file mode 100644 index 5187ec1..0000000 --- a/src/file_readers/common/sql_reader/tables/precursors.rs +++ /dev/null @@ -1,28 +0,0 @@ -use crate::file_readers::common::sql_reader::{ReadableFromSql, SqlReader}; - -#[derive(Debug)] -pub struct PrecursorTable { - pub id: Vec, - pub mz: Vec, - pub charge: Vec, - pub scan_average: Vec, - pub intensity: Vec, - pub precursor_frame: Vec, -} - -impl ReadableFromSql for PrecursorTable { - fn from_sql(sql_reader: &SqlReader) -> Self { - let table_name: &str = "Precursors"; - PrecursorTable { - id: sql_reader.read_column_from_table("Id", table_name), - mz: sql_reader.read_column_from_table("MonoisotopicMz", table_name), - charge: sql_reader.read_column_from_table("Charge", table_name), - scan_average: sql_reader - .read_column_from_table("ScanNumber", table_name), - intensity: sql_reader - .read_column_from_table("Intensity", table_name), - precursor_frame: sql_reader - .read_column_from_table("Parent", table_name), - } - } -} diff --git a/src/file_readers/file_formats.rs b/src/file_readers/file_formats.rs deleted file mode 100644 index ec83680..0000000 --- a/src/file_readers/file_formats.rs +++ /dev/null @@ -1,85 +0,0 @@ -use std::{fs, path::PathBuf}; - -pub enum FileFormat { - DFolder(PathBuf), - MS2Folder(PathBuf), -} - -impl FileFormat { - pub fn parse( - input: impl AsRef, - ) -> Result { - let path: PathBuf = input.as_ref().to_path_buf(); - if !path.exists() { - return Err(FileFormatError::DirectoryDoesNotExist); - } - let extension: &str = path - .extension() - .unwrap_or_default() - .to_str() - .unwrap_or_default(); - let format = match extension { - "d" => Self::DFolder(path), - _ => Self::MS2Folder(path), - }; - format.is_valid()?; - Ok(format) - } - - /// FileFormat is guaranteed to be `valid` if it is constructed - fn is_valid(&self) -> Result<(), FileFormatError> { - match &self { - Self::DFolder(path) => { - if !folder_contains_extension(path, "tdf_bin") { - return Err(FileFormatError::BinaryFilesAreMissing); - } - if !folder_contains_extension(path, "tdf") { - return Err(FileFormatError::MetadataFilesAreMissing); - } - }, - Self::MS2Folder(path) => { - if !folder_contains_extension(path, "bin") { - return Err(FileFormatError::BinaryFilesAreMissing); - } - if !folder_contains_extension(path, "parquet") { - return Err(FileFormatError::MetadataFilesAreMissing); - } - }, - } - Ok(()) - } -} - -fn folder_contains_extension( - input: impl AsRef, - extension: &str, -) -> bool { - let folder_path: PathBuf = input.as_ref().to_path_buf(); - if !folder_path.is_dir() { - return false; - } - if let Ok(entries) = fs::read_dir(folder_path) { - for entry in entries { - if let Ok(entry) = entry { - if let Some(ext) = entry.path().extension() { - if ext == extension { - return true; - } - } - } - } - } - false -} - -#[derive(thiserror::Error, Debug)] -pub enum FileFormatError { - #[error("DirectoryDoesNotExist")] - DirectoryDoesNotExist, - #[error("NoParentWithBrukerExtension")] - NoParentWithBrukerExtension, - #[error("BinaryFilesAreMissing")] - BinaryFilesAreMissing, - #[error("MetadataFilesAreMissing")] - MetadataFilesAreMissing, -} diff --git a/src/file_readers/frame_readers.rs b/src/file_readers/frame_readers.rs deleted file mode 100644 index 939f2e0..0000000 --- a/src/file_readers/frame_readers.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::Frame; - -use self::tdf_reader::TDFReader; - -use super::file_formats::FileFormat; - -pub mod tdf_reader; - -pub trait ReadableFrames { - fn read_single_frame(&self, index: usize) -> Frame; - - fn read_all_frames(&self) -> Vec; - - fn read_all_ms1_frames(&self) -> Vec; - - fn read_all_ms2_frames(&self) -> Vec; -} - -impl FileFormat { - fn unwrap_frame_reader(&self) -> Box { - let result = match &self { - Self::DFolder(path) => Box::new(TDFReader::new( - &path.to_str().unwrap_or_default().to_string(), - )) as Box, - Self::MS2Folder(path) => panic!( - "Folder {:} is not frame readable", - path.to_str().unwrap_or_default().to_string() - ), - }; - result - } -} - -impl ReadableFrames for FileFormat { - fn read_single_frame(&self, index: usize) -> Frame { - self.unwrap_frame_reader().read_single_frame(index) - } - - fn read_all_frames(&self) -> Vec { - self.unwrap_frame_reader().read_all_frames() - } - - fn read_all_ms1_frames(&self) -> Vec { - self.unwrap_frame_reader().read_all_ms1_frames() - } - - fn read_all_ms2_frames(&self) -> Vec { - self.unwrap_frame_reader().read_all_ms2_frames() - } -} diff --git a/src/file_readers/frame_readers/tdf_reader.rs b/src/file_readers/frame_readers/tdf_reader.rs deleted file mode 100644 index 1c7a264..0000000 --- a/src/file_readers/frame_readers/tdf_reader.rs +++ /dev/null @@ -1,111 +0,0 @@ -use { - crate::{ - acquisition::AcquisitionType, - converters::{ - ConvertableIndex, Frame2RtConverter, Scan2ImConverter, - Tof2MzConverter, - }, - file_readers::{ - common::{ - ms_data_blobs::{BinFileReader, ReadableFromBinFile}, - sql_reader::{FrameTable, ReadableFromSql, SqlReader}, - }, - ReadableFrames, - }, - Frame, FrameType, - }, - rayon::prelude::*, - std::path::Path, -}; - -#[derive(Debug)] -pub struct TDFReader { - pub path: String, - pub tdf_sql_reader: SqlReader, - tdf_bin_reader: BinFileReader, - pub rt_converter: Frame2RtConverter, - pub im_converter: Scan2ImConverter, - pub mz_converter: Tof2MzConverter, - pub frame_table: FrameTable, - frame_types: Vec, -} - -impl TDFReader { - pub fn new(path: &String) -> Self { - let tdf_sql_reader: SqlReader = SqlReader { - path: String::from(path), - }; - let frame_table: FrameTable = FrameTable::from_sql(&tdf_sql_reader); - let file_name: String = Path::new(&path) - .join("analysis.tdf_bin") - .to_string_lossy() - .to_string(); - let tdf_bin_reader: BinFileReader = BinFileReader::new( - String::from(&file_name), - frame_table.offsets.clone(), - ); - let frame_types: Vec = frame_table - .msms_type - .iter() - .map(|msms_type| match msms_type { - 0 => FrameType::MS1, - 8 => FrameType::MS2(AcquisitionType::DDAPASEF), - 9 => FrameType::MS2(AcquisitionType::DIAPASEF), - _ => FrameType::Unknown, - }) - .collect(); - Self { - path: path.to_string(), - tdf_bin_reader: tdf_bin_reader, - rt_converter: Self::get_rt_converter(&frame_table), - im_converter: Scan2ImConverter::from_sql(&tdf_sql_reader), - mz_converter: Tof2MzConverter::from_sql(&tdf_sql_reader), - frame_table: frame_table, - tdf_sql_reader: tdf_sql_reader, - frame_types: frame_types, - } - } - - fn get_rt_converter(frame_table: &FrameTable) -> Frame2RtConverter { - let retention_times: Vec = frame_table.rt.clone(); - Frame2RtConverter::new(retention_times) - } -} - -impl ReadableFrames for TDFReader { - fn read_single_frame(&self, index: usize) -> Frame { - let mut frame: Frame = - Frame::read_from_file(&self.tdf_bin_reader, index); - frame.rt = self.rt_converter.convert(index as u32); - frame.index = self.frame_table.id[index]; - frame.frame_type = self.frame_types[index]; - frame - } - - fn read_all_frames(&self) -> Vec { - (0..self.tdf_bin_reader.size()) - .into_par_iter() - .map(|index| self.read_single_frame(index)) - .collect() - } - - fn read_all_ms1_frames(&self) -> Vec { - (0..self.tdf_bin_reader.size()) - .into_par_iter() - .map(|index| match self.frame_types[index] { - FrameType::MS1 => self.read_single_frame(index), - _ => Frame::default(), - }) - .collect() - } - - fn read_all_ms2_frames(&self) -> Vec { - (0..self.tdf_bin_reader.size()) - .into_par_iter() - .map(|index| match self.frame_types[index] { - FrameType::MS2(_) => self.read_single_frame(index), - _ => Frame::default(), - }) - .collect() - } -} diff --git a/src/file_readers/spectrum_readers.rs b/src/file_readers/spectrum_readers.rs deleted file mode 100644 index 24d6ff5..0000000 --- a/src/file_readers/spectrum_readers.rs +++ /dev/null @@ -1,38 +0,0 @@ -use crate::Spectrum; - -use self::{dda_reader::DDASpectrumReader, mini_tdf_reader::MiniTDFReader}; - -use super::file_formats::FileFormat; - -pub mod dda_reader; -pub mod mini_tdf_reader; - -pub trait ReadableSpectra { - fn read_single_spectrum(&self, index: usize) -> Spectrum; - - fn read_all_spectra(&self) -> Vec; -} - -impl FileFormat { - fn unwrap_spectrum_reader(&self) -> Box { - let result = match &self { - Self::DFolder(path) => Box::new(DDASpectrumReader::new( - path.to_str().unwrap_or_default().to_string(), - )) as Box, - Self::MS2Folder(path) => Box::new(MiniTDFReader::new( - path.to_str().unwrap_or_default().to_string(), - )) as Box, - }; - result - } -} - -impl ReadableSpectra for FileFormat { - fn read_single_spectrum(&self, index: usize) -> Spectrum { - self.unwrap_spectrum_reader().read_single_spectrum(index) - } - - fn read_all_spectra(&self) -> Vec { - self.unwrap_spectrum_reader().read_all_spectra() - } -} diff --git a/src/file_readers/spectrum_readers/dda_reader.rs b/src/file_readers/spectrum_readers/dda_reader.rs deleted file mode 100644 index e0d545f..0000000 --- a/src/file_readers/spectrum_readers/dda_reader.rs +++ /dev/null @@ -1,135 +0,0 @@ -mod precursors; - -use crate::{ - calibration::Tof2MzCalibrator, - converters::Tof2MzConverter, - file_readers::{ - frame_readers::{tdf_reader::TDFReader, ReadableFrames}, - ReadableSpectra, - }, - frames::Frame, - spectra::RawSpectrum, - spectra::{self, RawSpectrumProcessor}, - vec_utils::group_and_sum, - Spectrum, -}; - -use rayon::prelude::*; - -use self::precursors::PrecursorReader; - -const SMOOTHING_WINDOW: u32 = 1; -const CENTROIDING_WINDOW: u32 = 1; - -#[derive(Debug)] -pub struct DDASpectrumReader { - pub path_name: String, - precursor_reader: PrecursorReader, - mz_reader: Tof2MzConverter, - ms2_frames: Vec, -} - -impl DDASpectrumReader { - pub fn new(path_name: String) -> Self { - let tdf_reader: TDFReader = TDFReader::new(&path_name.to_string()); - let mz_reader: Tof2MzConverter = tdf_reader.mz_converter; - let ms2_frames: Vec = tdf_reader.read_all_ms2_frames(); - let precursor_reader: PrecursorReader = - PrecursorReader::new(&tdf_reader); - Self { - path_name, - precursor_reader, - mz_reader, - ms2_frames, - } - } - - pub fn read_single_raw_spectrum(&self, index: usize) -> RawSpectrum { - let start: usize = self.precursor_reader.offsets[index]; - let end: usize = self.precursor_reader.offsets[index + 1]; - let selection: &[usize] = &self.precursor_reader.order[start..end]; - let mut tof_indices: Vec = vec![]; - let mut intensities: Vec = vec![]; - for &index in selection.iter() { - let frame: usize = - self.precursor_reader.pasef_frames.frame[index] - 1; - if self.ms2_frames[frame].intensities.len() == 0 { - continue; - } - let scan_start: usize = - self.precursor_reader.pasef_frames.scan_start[index]; - let scan_end: usize = - self.precursor_reader.pasef_frames.scan_end[index]; - let offset_start: usize = - self.ms2_frames[frame].scan_offsets[scan_start] as usize; - let offset_end: usize = - self.ms2_frames[frame].scan_offsets[scan_end] as usize; - let tof_selection: &[u32] = - &self.ms2_frames[frame].tof_indices[offset_start..offset_end]; - let intensity_selection: &[u32] = - &self.ms2_frames[frame].intensities[offset_start..offset_end]; - tof_indices.extend(tof_selection); - intensities.extend(intensity_selection); - } - let (raw_tof_indices, raw_intensities) = group_and_sum( - tof_indices, - intensities.iter().map(|x| *x as u64).collect(), - ); - let raw_spectrum = RawSpectrum { - tof_indices: raw_tof_indices, - intensities: raw_intensities, - processed_state: spectra::RawProcessedSpectrumState::Profile, - index: index, - }; - let spectrum_processer = RawSpectrumProcessor { raw_spectrum }; - spectrum_processer - .smooth(SMOOTHING_WINDOW) - .centroid(CENTROIDING_WINDOW) - .raw_spectrum - } - - pub fn process_single_raw_spectrum( - &self, - raw_spectrum: RawSpectrum, - mz_reader: &Tof2MzConverter, - ) -> Spectrum { - let index: usize = raw_spectrum.index as usize; - let spectrum_processer = RawSpectrumProcessor { raw_spectrum }; - let spectrum = spectrum_processer - .finalize(self.precursor_reader.precursors[index], mz_reader); - spectrum - } -} - -impl ReadableSpectra for DDASpectrumReader { - fn read_single_spectrum(&self, index: usize) -> Spectrum { - let raw_spectrum = self.read_single_raw_spectrum(index); - self.process_single_raw_spectrum(raw_spectrum, &self.mz_reader) - } - - fn read_all_spectra(&self) -> Vec { - let raw_spectra: Vec = (0..self.precursor_reader.count) - .into_par_iter() - .map(|index| self.read_single_raw_spectrum(index)) - .collect(); - let hits = Tof2MzCalibrator::find_unfragmented_precursors( - &raw_spectra, - &self.mz_reader, - &self.precursor_reader.precursors, - 0.1, - ); - let mz_reader: Tof2MzConverter; - if hits.len() >= 2 { - mz_reader = Tof2MzConverter::from_unfragmented_precursors(&hits); - } else { - mz_reader = self.mz_reader - } - let spectra: Vec = raw_spectra - .into_par_iter() - .map(|spectrum| { - self.process_single_raw_spectrum(spectrum, &mz_reader) - }) - .collect(); - spectra - } -} diff --git a/src/file_readers/spectrum_readers/dda_reader/precursors.rs b/src/file_readers/spectrum_readers/dda_reader/precursors.rs deleted file mode 100644 index 482ede4..0000000 --- a/src/file_readers/spectrum_readers/dda_reader/precursors.rs +++ /dev/null @@ -1,76 +0,0 @@ -use rayon::prelude::*; - -use crate::{ - converters::{ConvertableIndex, Scan2ImConverter}, - file_readers::{ - common::sql_reader::{ - PasefFrameMsMsTable, PrecursorTable, ReadableFromSql, - }, - frame_readers::tdf_reader::TDFReader, - }, - vec_utils::argsort, - Precursor, -}; - -#[derive(Debug)] -pub struct PrecursorReader { - pub precursors: Vec, - pub pasef_frames: PasefFrameMsMsTable, - pub order: Vec, - pub offsets: Vec, - pub count: usize, -} - -impl PrecursorReader { - pub fn new(tdf_reader: &TDFReader) -> Self { - let select_collision_energy_sql = String::from( - "SELECT CollisionEnergy FROM PasefFrameMsMsInfo GROUP BY Precursor", - ); - let pasef_frames: PasefFrameMsMsTable = - PasefFrameMsMsTable::from_sql(&tdf_reader.tdf_sql_reader); - let im_reader: Scan2ImConverter = tdf_reader.im_converter; - let precursor_table: PrecursorTable = - PrecursorTable::from_sql(&tdf_reader.tdf_sql_reader); - let retention_times: Vec = tdf_reader.frame_table.rt.clone(); - let collision_energies = tdf_reader - .tdf_sql_reader - .get_data_from_sql(&select_collision_energy_sql); - let precursors: Vec = (0..precursor_table.mz.len()) - .into_par_iter() - .map(|index| { - let frame_id: usize = precursor_table.precursor_frame[index]; - let scan_id: f64 = precursor_table.scan_average[index]; - Precursor { - mz: precursor_table.mz[index], - rt: retention_times[frame_id], - im: im_reader.convert(scan_id), - charge: precursor_table.charge[index], - intensity: precursor_table.intensity[index], - index: index + 1, //TODO? - frame_index: frame_id, - collision_energy: collision_energies[index], - } - }) - .collect(); - let order: Vec = argsort(&pasef_frames.precursor); - let count: usize = *pasef_frames.precursor.iter().max().unwrap(); - let mut offsets: Vec = Vec::with_capacity(count + 1); - offsets.push(0); - for (offset, &index) in order.iter().enumerate().take(order.len() - 1) { - let second_index: usize = order[offset + 1]; - if pasef_frames.precursor[index] - != pasef_frames.precursor[second_index] - { - offsets.push(offset + 1) - } - } - offsets.push(order.len()); - Self { - precursors, - pasef_frames, - order, - offsets, - count, - } - } -} diff --git a/src/file_readers/spectrum_readers/mini_tdf_reader.rs b/src/file_readers/spectrum_readers/mini_tdf_reader.rs deleted file mode 100644 index 8ba343d..0000000 --- a/src/file_readers/spectrum_readers/mini_tdf_reader.rs +++ /dev/null @@ -1,128 +0,0 @@ -use crate::file_readers::FileFormatError; -use std::fs; -use { - crate::{ - file_readers::{ - common::{ - ms_data_blobs::{BinFileReader, ReadableFromBinFile}, - parquet_reader::read_parquet_precursors, - }, - ReadableSpectra, - }, - precursors::QuadrupoleEvent, - Precursor, Spectrum, - }, - rayon::prelude::*, - std::path::PathBuf, -}; - -#[derive(Debug)] -pub struct MiniTDFReader { - pub path_name: String, - parquet_file_name: String, - precursors: Vec, - offsets: Vec, - frame_reader: BinFileReader, -} - -fn find_ms2spectrum_file( - ms2_dir_path: &str, - extension: String, -) -> Result { - let files = fs::read_dir(ms2_dir_path).unwrap(); - for file in files { - let filename = file - .unwrap() - .path() - .file_name() - .unwrap() - .to_str() - .unwrap() - .to_owned(); - if filename - .ends_with(std::format!("ms2spectrum.{}", extension).as_str()) - { - return Ok(filename); - } - } - let err = match extension.as_str() { - "parquet" => FileFormatError::MetadataFilesAreMissing, - "bin" => FileFormatError::BinaryFilesAreMissing, - _ => FileFormatError::BinaryFilesAreMissing, - }; - println!( - "{}", - format!( - "No '*.ms2spectrum.{}' file found in '{}'", - extension, ms2_dir_path - ) - ); - return Err(err); -} - -impl MiniTDFReader { - pub fn new(path_name: String) -> Self { - let parquet_file_name: String = String::default(); - let precursors: Vec = Vec::default(); - let offsets: Vec = Vec::default(); - let frame_reader: BinFileReader = BinFileReader::default(); - let mut reader: MiniTDFReader = MiniTDFReader { - path_name, - parquet_file_name, - precursors, - offsets, - frame_reader, - }; - reader.read_parquet_file_name(); - reader.read_precursors(); - reader.set_spectrum_reader(); - reader - } - - fn read_parquet_file_name(&mut self) { - let mut path: PathBuf = PathBuf::from(&self.path_name); - let ms2_parquet_file = - find_ms2spectrum_file(&self.path_name, "parquet".to_owned()) - .unwrap(); - path.push(ms2_parquet_file); - self.parquet_file_name = path.to_string_lossy().into_owned(); - } - - fn read_precursors(&mut self) { - (self.precursors, self.offsets) = - read_parquet_precursors(&self.parquet_file_name); - } - fn set_spectrum_reader(&mut self) { - let mut path: PathBuf = PathBuf::from(&self.path_name); - let ms2_bin_file = - find_ms2spectrum_file(&self.path_name, "bin".to_owned()).unwrap(); - path.push(ms2_bin_file); - let file_name: String = path.to_string_lossy().into_owned(); - self.frame_reader = - BinFileReader::new(String::from(&file_name), self.offsets.clone()); - } -} - -impl ReadableSpectra for MiniTDFReader { - fn read_single_spectrum(&self, index: usize) -> Spectrum { - let mut spectrum: Spectrum = - Spectrum::read_from_file(&self.frame_reader, index); - spectrum.precursor = QuadrupoleEvent::Precursor(self.precursors[index]); - spectrum.index = self.precursors[index].index; - spectrum - } - - fn read_all_spectra(&self) -> Vec { - let size: usize = self.offsets.len(); - let mut spectra: Vec = (0..size) - .into_par_iter() - .map(|index| self.read_single_spectrum(index)) - .collect(); - spectra.sort_by(|a, b| { - let x = b.precursor.unwrap_as_precursor().index as f64; - let y = a.precursor.unwrap_as_precursor().index as f64; - y.total_cmp(&x) - }); - spectra - } -} diff --git a/src/frames.rs b/src/frames.rs deleted file mode 100644 index ad7df8d..0000000 --- a/src/frames.rs +++ /dev/null @@ -1,26 +0,0 @@ -use crate::acquisition::AcquisitionType; - -/// A frame with all unprocessed data as it was acquired. -#[derive(Debug, PartialEq, Default)] -pub struct Frame { - pub scan_offsets: Vec, - pub tof_indices: Vec, - pub intensities: Vec, - pub index: usize, - pub rt: f64, - pub frame_type: FrameType, -} - -/// The kind of frame, determined by acquisition. -#[derive(Debug, PartialEq, Clone, Copy)] -pub enum FrameType { - MS1, - MS2(AcquisitionType), - Unknown, -} - -impl Default for FrameType { - fn default() -> Self { - Self::Unknown - } -} diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 0000000..74b4093 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,4 @@ +//! Handles all input and output + +pub mod readers; +pub mod writers; diff --git a/src/io/readers.rs b/src/io/readers.rs new file mode 100644 index 0000000..03d5248 --- /dev/null +++ b/src/io/readers.rs @@ -0,0 +1,12 @@ +pub(crate) mod file_readers; +mod frame_reader; +mod metadata_reader; +mod precursor_reader; +mod quad_settings_reader; +mod spectrum_reader; + +pub use frame_reader::*; +pub use metadata_reader::*; +pub use precursor_reader::*; +pub use quad_settings_reader::*; +pub use spectrum_reader::*; diff --git a/src/file_readers/common.rs b/src/io/readers/file_readers.rs similarity index 63% rename from src/file_readers/common.rs rename to src/io/readers/file_readers.rs index 4d34103..38aa955 100644 --- a/src/file_readers/common.rs +++ b/src/io/readers/file_readers.rs @@ -1,3 +1,3 @@ -pub mod ms_data_blobs; pub mod parquet_reader; pub mod sql_reader; +pub mod tdf_blob_reader; diff --git a/src/io/readers/file_readers/parquet_reader.rs b/src/io/readers/file_readers/parquet_reader.rs new file mode 100644 index 0000000..c881761 --- /dev/null +++ b/src/io/readers/file_readers/parquet_reader.rs @@ -0,0 +1,45 @@ +pub mod precursors; + +use parquet::file::reader::{FileReader, SerializedFileReader}; +use std::{fs::File, io, path::Path, str::FromStr}; + +pub trait ReadableParquetTable { + fn update_from_parquet_file(&mut self, key: &str, value: String); + + fn parse_default_field(field: String) -> T { + field.parse().unwrap_or_default() + } + + fn from_parquet_file( + file_name: impl AsRef, + ) -> Result, ParquetError> + where + Self: Sized + Default, + { + let file: File = File::open(file_name)?; + let reader: SerializedFileReader = + SerializedFileReader::new(file)?; + let results: Vec = reader + .get_row_iter(None)? + .map(|record| { + let mut result = Self::default(); + for (name, field) in record.get_column_iter() { + result.update_from_parquet_file( + name.to_string().as_str(), + field.to_string(), + ); + } + result + }) + .collect(); + Ok(results) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum ParquetError { + #[error("Cannot read file {0}")] + IO(#[from] io::Error), + #[error("Cannot iterate over row {0}")] + ParquetIO(#[from] parquet::errors::ParquetError), +} diff --git a/src/io/readers/file_readers/parquet_reader/precursors.rs b/src/io/readers/file_readers/parquet_reader/precursors.rs new file mode 100644 index 0000000..3ce5302 --- /dev/null +++ b/src/io/readers/file_readers/parquet_reader/precursors.rs @@ -0,0 +1,35 @@ +use super::ReadableParquetTable; + +#[derive(Default, Debug, PartialEq)] +pub struct ParquetPrecursor { + pub mz: f64, + pub rt: f64, + pub im: f64, + pub charge: usize, + pub intensity: f64, + pub index: usize, + pub frame_index: usize, + pub offset: u64, + pub collision_energy: f64, +} + +impl ReadableParquetTable for ParquetPrecursor { + fn update_from_parquet_file(&mut self, key: &str, value: String) { + match key { + "Id" => self.index = Self::parse_default_field(value), + "RetentionTime" => self.rt = Self::parse_default_field(value), + "MonoisotopicMz" => self.mz = Self::parse_default_field(value), + "Charge" => self.charge = Self::parse_default_field(value), + "Intensity" => self.intensity = Self::parse_default_field(value), + "ooK0" => self.im = Self::parse_default_field(value), + "MS1ParentFrameId" => { + self.frame_index = Self::parse_default_field(value) + }, + "BinaryOffset" => self.offset = Self::parse_default_field(value), + "CollisionEnergy" => { + self.collision_energy = Self::parse_default_field(value) + }, + _ => {}, + } + } +} diff --git a/src/io/readers/file_readers/sql_reader.rs b/src/io/readers/file_readers/sql_reader.rs new file mode 100644 index 0000000..6704532 --- /dev/null +++ b/src/io/readers/file_readers/sql_reader.rs @@ -0,0 +1,90 @@ +pub mod frame_groups; +pub mod frames; +pub mod metadata; +pub mod pasef_frame_msms; +pub mod precursors; +pub mod quad_settings; + +use std::{ + collections::HashMap, + path::{Path, PathBuf}, +}; + +use rusqlite::Connection; + +#[derive(Debug)] +pub struct SqlReader { + connection: Connection, + path: PathBuf, +} + +impl SqlReader { + pub fn open(file_name: impl AsRef) -> Result { + let path = file_name.as_ref().to_path_buf(); + let connection = Connection::open(&path)?; + Ok(Self { connection, path }) + } + + pub fn read_column_from_table( + &self, + column_name: &str, + table_name: &str, + ) -> Result, SqlError> { + let query = format!("SELECT {} FROM {}", column_name, table_name); + let mut stmt = self.connection.prepare(&query)?; + let rows = stmt.query_map([], |row| match row.get::(0) { + Ok(value) => Ok(value), + _ => Ok(T::default()), + })?; + let result = rows.collect::, _>>()?; + Ok(result) + } + + pub fn get_path(&self) -> PathBuf { + self.path.clone() + } +} + +pub trait ReadableSqlTable { + fn get_sql_query() -> String; + + fn from_sql_row(row: &rusqlite::Row) -> Self; + + fn from_sql_reader(reader: &SqlReader) -> Result, SqlError> + where + Self: Sized, + { + let query = Self::get_sql_query(); + let mut stmt = reader.connection.prepare(&query)?; + let rows = stmt.query_map([], |row| Ok(Self::from_sql_row(row)))?; + let result = rows.collect::, _>>()?; + Ok(result) + } +} + +pub trait ReadableSqlHashMap { + fn get_sql_query() -> String; + + fn from_sql_reader( + reader: &SqlReader, + ) -> Result, SqlError> + where + Self: Sized, + { + let query = Self::get_sql_query(); + let mut stmt = reader.connection.prepare(&query)?; + let mut result = HashMap::new(); + let rows = stmt.query_map([], |row| { + let key: String = row.get(0)?; + let value: String = row.get(1)?; + result.insert(key, value); + Ok(()) + })?; + rows.collect::, _>>()?; + Ok(result) + } +} + +#[derive(thiserror::Error, Debug)] +#[error("SqlError: {0}")] +pub struct SqlError(#[from] rusqlite::Error); diff --git a/src/io/readers/file_readers/sql_reader/frame_groups.rs b/src/io/readers/file_readers/sql_reader/frame_groups.rs new file mode 100644 index 0000000..a46e72e --- /dev/null +++ b/src/io/readers/file_readers/sql_reader/frame_groups.rs @@ -0,0 +1,20 @@ +use super::ReadableSqlTable; + +#[derive(Debug, PartialEq)] +pub struct SqlWindowGroup { + pub frame: usize, + pub window_group: u8, +} + +impl ReadableSqlTable for SqlWindowGroup { + fn get_sql_query() -> String { + "SELECT Frame, WindowGroup FROM DiaFrameMsMsInfo".to_string() + } + + fn from_sql_row(row: &rusqlite::Row) -> Self { + Self { + frame: row.get(0).unwrap_or_default(), + window_group: row.get(1).unwrap_or_default(), + } + } +} diff --git a/src/io/readers/file_readers/sql_reader/frames.rs b/src/io/readers/file_readers/sql_reader/frames.rs new file mode 100644 index 0000000..e1d7337 --- /dev/null +++ b/src/io/readers/file_readers/sql_reader/frames.rs @@ -0,0 +1,32 @@ +use super::ReadableSqlTable; + +#[derive(Debug, PartialEq)] +pub struct SqlFrame { + pub id: usize, + pub scan_mode: u8, + pub msms_type: u8, + pub peak_count: u64, + pub rt: f64, + pub scan_count: u64, + pub binary_offset: usize, + pub accumulation_time: f64, +} + +impl ReadableSqlTable for SqlFrame { + fn get_sql_query() -> String { + "SELECT Id, ScanMode, MsMsType, NumPeaks, Time, NumScans, TimsId, AccumulationTime FROM Frames".to_string() + } + + fn from_sql_row(row: &rusqlite::Row) -> Self { + Self { + id: row.get(0).unwrap_or_default(), + scan_mode: row.get(1).unwrap_or_default(), + msms_type: row.get(2).unwrap_or_default(), + peak_count: row.get(3).unwrap_or_default(), + rt: row.get(4).unwrap_or_default(), + scan_count: row.get(5).unwrap_or_default(), + binary_offset: row.get(6).unwrap_or_default(), + accumulation_time: row.get(7).unwrap_or_default(), + } + } +} diff --git a/src/io/readers/file_readers/sql_reader/metadata.rs b/src/io/readers/file_readers/sql_reader/metadata.rs new file mode 100644 index 0000000..fb045ba --- /dev/null +++ b/src/io/readers/file_readers/sql_reader/metadata.rs @@ -0,0 +1,9 @@ +use super::ReadableSqlHashMap; + +pub struct SqlMetadata; + +impl ReadableSqlHashMap for SqlMetadata { + fn get_sql_query() -> String { + "SELECT Key, Value FROM GlobalMetadata".to_string() + } +} diff --git a/src/io/readers/file_readers/sql_reader/pasef_frame_msms.rs b/src/io/readers/file_readers/sql_reader/pasef_frame_msms.rs new file mode 100644 index 0000000..51e09cd --- /dev/null +++ b/src/io/readers/file_readers/sql_reader/pasef_frame_msms.rs @@ -0,0 +1,30 @@ +use super::ReadableSqlTable; + +#[derive(Debug, PartialEq)] +pub struct SqlPasefFrameMsMs { + pub frame: usize, + pub scan_start: usize, + pub scan_end: usize, + pub isolation_mz: f64, + pub isolation_width: f64, + pub collision_energy: f64, + pub precursor: usize, +} + +impl ReadableSqlTable for SqlPasefFrameMsMs { + fn get_sql_query() -> String { + "SELECT Frame, ScanNumBegin, ScanNumEnd, IsolationMz, IsolationWidth, CollisionEnergy, Precursor FROM PasefFrameMsMsInfo".to_string() + } + + fn from_sql_row(row: &rusqlite::Row) -> Self { + Self { + frame: row.get(0).unwrap_or_default(), + scan_start: row.get(1).unwrap_or_default(), + scan_end: row.get(2).unwrap_or_default(), + isolation_mz: row.get(3).unwrap_or_default(), + isolation_width: row.get(4).unwrap_or_default(), + collision_energy: row.get(5).unwrap_or_default(), + precursor: row.get(6).unwrap_or_default(), + } + } +} diff --git a/src/io/readers/file_readers/sql_reader/precursors.rs b/src/io/readers/file_readers/sql_reader/precursors.rs new file mode 100644 index 0000000..c2b00aa --- /dev/null +++ b/src/io/readers/file_readers/sql_reader/precursors.rs @@ -0,0 +1,28 @@ +use super::ReadableSqlTable; + +#[derive(Debug, PartialEq)] +pub struct SqlPrecursor { + pub id: usize, + pub mz: f64, + pub charge: usize, + pub scan_average: f64, + pub intensity: f64, + pub precursor_frame: usize, +} + +impl ReadableSqlTable for SqlPrecursor { + fn get_sql_query() -> String { + "SELECT Id, MonoisotopicMz, Charge, ScanNumber, Intensity, Parent FROM Precursors".to_string() + } + + fn from_sql_row(row: &rusqlite::Row) -> Self { + Self { + id: row.get(0).unwrap_or_default(), + mz: row.get(1).unwrap_or_default(), + charge: row.get(2).unwrap_or_default(), + scan_average: row.get(3).unwrap_or_default(), + intensity: row.get(4).unwrap_or_default(), + precursor_frame: row.get(5).unwrap_or_default(), + } + } +} diff --git a/src/io/readers/file_readers/sql_reader/quad_settings.rs b/src/io/readers/file_readers/sql_reader/quad_settings.rs new file mode 100644 index 0000000..d7d69b4 --- /dev/null +++ b/src/io/readers/file_readers/sql_reader/quad_settings.rs @@ -0,0 +1,28 @@ +use super::ReadableSqlTable; + +#[derive(Debug, PartialEq)] +pub struct SqlQuadSettings { + pub window_group: usize, + pub scan_start: usize, + pub scan_end: usize, + pub mz_center: f64, + pub mz_width: f64, + pub collision_energy: f64, +} + +impl ReadableSqlTable for SqlQuadSettings { + fn get_sql_query() -> String { + "SELECT WindowGroup, ScanNumBegin, ScanNumEnd, IsolationMz, IsolationWidth, CollisionEnergy FROM DiaFrameMsMsWindows".to_string() + } + + fn from_sql_row(row: &rusqlite::Row) -> Self { + Self { + window_group: row.get(0).unwrap_or_default(), + scan_start: row.get(1).unwrap_or_default(), + scan_end: row.get(2).unwrap_or_default(), + mz_center: row.get(3).unwrap_or_default(), + mz_width: row.get(4).unwrap_or_default(), + collision_energy: row.get(5).unwrap_or_default(), + } + } +} diff --git a/src/io/readers/file_readers/tdf_blob_reader.rs b/src/io/readers/file_readers/tdf_blob_reader.rs new file mode 100644 index 0000000..fad363c --- /dev/null +++ b/src/io/readers/file_readers/tdf_blob_reader.rs @@ -0,0 +1,140 @@ +mod tdf_blobs; + +use memmap2::Mmap; +use std::fs::File; +use std::io; +use std::path::{Path, PathBuf}; +pub use tdf_blobs::*; +use zstd::decode_all; + +const U32_SIZE: usize = std::mem::size_of::(); +const HEADER_SIZE: usize = 2; + +#[derive(Debug)] +pub struct TdfBlobReader { + path: PathBuf, + mmap: Mmap, + global_file_offset: usize, +} + +impl TdfBlobReader { + // TODO parse compression1 + pub fn new(file_name: impl AsRef) -> Result { + let path: PathBuf = file_name.as_ref().to_path_buf(); + let file: File = File::open(&path)?; + let mmap: Mmap = unsafe { Mmap::map(&file)? }; + Ok(Self { + path, + mmap, + global_file_offset: 0, + }) + } + + pub fn get_blob(&self, offset: usize) -> Result { + let offset: usize = self.get_offset(offset)?; + let byte_count: usize = self.get_byte_count(offset)?; + let compressed_bytes: &[u8] = + self.get_compressed_bytes(offset, byte_count); + match decode_all(compressed_bytes) { + Ok(bytes) => Ok(TdfBlob::new(bytes)), + Err(_) => Err(TdfBlobError::Decompression(self.path.clone())), + } + } + + fn get_offset(&self, offset: usize) -> Result { + let offset = self.global_file_offset + offset; + self.check_valid_offset(offset) + } + + fn check_valid_offset(&self, offset: usize) -> Result { + if (offset + U32_SIZE) >= self.mmap.len() { + return Err(TdfBlobError::Offset(offset, self.path.clone())); + } + Ok(offset) + } + + fn get_byte_count(&self, offset: usize) -> Result { + let raw_byte_count: &[u8] = + &self.mmap[offset as usize..(offset + U32_SIZE) as usize]; + let byte_count = + u32::from_le_bytes(raw_byte_count.try_into().unwrap()) as usize; + self.check_valid_byte_count(byte_count, offset) + } + + fn check_valid_byte_count( + &self, + byte_count: usize, + offset: usize, + ) -> Result { + if (byte_count < (HEADER_SIZE * U32_SIZE)) + || ((offset + byte_count) > self.len()) + { + return Err(TdfBlobError::ByteCount( + byte_count, + offset, + self.path.clone(), + )); + } + Ok(byte_count) + } + + fn get_compressed_bytes(&self, offset: usize, byte_count: usize) -> &[u8] { + &self.mmap[(offset + HEADER_SIZE * U32_SIZE)..offset + byte_count] + } + + pub fn len(&self) -> usize { + self.mmap.len() + } +} + +#[derive(Debug)] +pub struct IndexedTdfBlobReader { + blob_reader: TdfBlobReader, + binary_offsets: Vec, +} + +impl IndexedTdfBlobReader { + pub fn new( + file_name: impl AsRef, + binary_offsets: Vec, + ) -> Result { + Ok(Self { + binary_offsets, + blob_reader: TdfBlobReader::new(file_name)?, + }) + } + + pub fn get_blob(&self, index: usize) -> Result { + self.check_valid_index(index)?; + let offset = self.binary_offsets[index]; + self.blob_reader.get_blob(offset) + } + + fn check_valid_index(&self, index: usize) -> Result { + if index >= self.len() { + return Err(TdfBlobError::Index( + index, + self.blob_reader.path.clone(), + )); + } + Ok(index) + } + + pub fn len(&self) -> usize { + self.binary_offsets.len() + } +} + +#[derive(Debug, thiserror::Error)] +pub enum TdfBlobError { + #[error("Cannot read or mmap file {0}")] + IO(#[from] io::Error), + #[error("Index {0} is invalid for file {1}")] + Index(usize, PathBuf), + #[error("Offset {0} is invalid for file {1}")] + Offset(usize, PathBuf), + #[error("Byte count {0} from offset {1} is invalid for file {2}")] + ByteCount(usize, usize, PathBuf), + #[error("Zstd decompression failed for file {0}")] + Decompression(PathBuf), +} diff --git a/src/io/readers/file_readers/tdf_blob_reader/tdf_blobs.rs b/src/io/readers/file_readers/tdf_blob_reader/tdf_blobs.rs new file mode 100644 index 0000000..b75d494 --- /dev/null +++ b/src/io/readers/file_readers/tdf_blob_reader/tdf_blobs.rs @@ -0,0 +1,37 @@ +const U32_SIZE: usize = std::mem::size_of::(); + +#[derive(Debug, Default)] +pub struct TdfBlob { + bytes: Vec, +} + +impl TdfBlob { + pub fn new(bytes: Vec) -> Self { + Self { bytes } + } + + pub fn get(&self, index: usize) -> u32 { + debug_assert!(index < self.len()); + Self::concatenate_bytes( + self.bytes[index], + self.bytes[index + self.len()], + self.bytes[index + 2 * self.len()], + self.bytes[index + 3 * self.len()], + ) + } + + fn concatenate_bytes(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 { + b1 as u32 + | ((b2 as u32) << 8) + | ((b3 as u32) << 16) + | ((b4 as u32) << 24) + } + + pub fn len(&self) -> usize { + self.bytes.len() / U32_SIZE + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} diff --git a/src/io/readers/frame_reader.rs b/src/io/readers/frame_reader.rs new file mode 100644 index 0000000..e13b6be --- /dev/null +++ b/src/io/readers/frame_reader.rs @@ -0,0 +1,181 @@ +use std::{ + path::{Path, PathBuf}, + sync::Arc, + vec, +}; + +use rayon::iter::{IntoParallelIterator, ParallelIterator}; + +use crate::{ + ms_data::{AcquisitionType, Frame, MSLevel, QuadrupoleSettings}, + utils::find_extension, +}; + +use super::{ + file_readers::{ + sql_reader::{ + frame_groups::SqlWindowGroup, frames::SqlFrame, ReadableSqlTable, + SqlReader, + }, + tdf_blob_reader::{TdfBlob, TdfBlobReader}, + }, + QuadrupoleSettingsReader, +}; + +#[derive(Debug)] +pub struct FrameReader { + path: PathBuf, + tdf_bin_reader: TdfBlobReader, + sql_frames: Vec, + acquisition: AcquisitionType, + window_groups: Vec, + quadrupole_settings: Vec>, +} + +impl FrameReader { + pub fn new(path: impl AsRef) -> Self { + let sql_path = find_extension(&path, "analysis.tdf").unwrap(); + let tdf_sql_reader = SqlReader::open(sql_path).unwrap(); + let sql_frames = SqlFrame::from_sql_reader(&tdf_sql_reader).unwrap(); + let bin_path = find_extension(&path, "analysis.tdf_bin").unwrap(); + let tdf_bin_reader = TdfBlobReader::new(bin_path).unwrap(); + let acquisition = if sql_frames.iter().any(|x| x.msms_type == 8) { + AcquisitionType::DDAPASEF + } else if sql_frames.iter().any(|x| x.msms_type == 9) { + AcquisitionType::DIAPASEF + // TODO: can also be diagonalpasef + } else { + AcquisitionType::Unknown + }; + let mut window_groups = vec![0; sql_frames.len()]; + let quadrupole_settings; + if acquisition == AcquisitionType::DIAPASEF { + for window_group in + SqlWindowGroup::from_sql_reader(&tdf_sql_reader).unwrap() + { + window_groups[window_group.frame - 1] = + window_group.window_group; + } + quadrupole_settings = + QuadrupoleSettingsReader::new(tdf_sql_reader.get_path()); + } else { + quadrupole_settings = vec![]; + } + Self { + path: path.as_ref().to_path_buf(), + tdf_bin_reader, + sql_frames, + acquisition, + window_groups, + quadrupole_settings: quadrupole_settings + .into_iter() + .map(|x| Arc::new(x)) + .collect(), + } + } + + pub fn parallel_filter<'a, F: Fn(&SqlFrame) -> bool + Sync + Send + 'a>( + &'a self, + predicate: F, + ) -> impl ParallelIterator + 'a { + (0..self.len()) + .into_par_iter() + .filter(move |x| predicate(&self.sql_frames[*x])) + .map(move |x| self.get(x)) + } + + pub fn get(&self, index: usize) -> Frame { + let mut frame: Frame = Frame::default(); + let sql_frame = &self.sql_frames[index]; + frame.index = sql_frame.id; + let blob = match self.tdf_bin_reader.get_blob(sql_frame.binary_offset) { + Ok(blob) => blob, + Err(_) => return frame, + }; + let scan_count: usize = blob.get(0) as usize; + let peak_count: usize = (blob.len() - scan_count) / 2; + frame.scan_offsets = read_scan_offsets(scan_count, peak_count, &blob); + frame.intensities = read_intensities(scan_count, peak_count, &blob); + frame.tof_indices = read_tof_indices( + scan_count, + peak_count, + &blob, + &frame.scan_offsets, + ); + frame.ms_level = MSLevel::read_from_msms_type(sql_frame.msms_type); + frame.rt = sql_frame.rt; + frame.acquisition_type = self.acquisition; + frame.intensity_correction_factor = 1.0 / sql_frame.accumulation_time; + if (self.acquisition == AcquisitionType::DIAPASEF) + & (frame.ms_level == MSLevel::MS2) + { + let window_group = self.window_groups[index]; + frame.window_group = window_group; + frame.quadrupole_settings = + self.quadrupole_settings[window_group as usize - 1].clone(); + } + frame + } + + pub fn get_acquisition(&self) -> AcquisitionType { + self.acquisition + } + + pub fn len(&self) -> usize { + self.sql_frames.len() + } + + pub fn get_path(&self) -> PathBuf { + self.path.clone() + } +} + +fn read_scan_offsets( + scan_count: usize, + peak_count: usize, + blob: &TdfBlob, +) -> Vec { + let mut scan_offsets: Vec = Vec::with_capacity(scan_count + 1); + scan_offsets.push(0); + for scan_index in 0..scan_count - 1 { + let index = scan_index + 1; + let scan_size: usize = (blob.get(index) / 2) as usize; + scan_offsets.push(scan_offsets[scan_index] + scan_size); + } + scan_offsets.push(peak_count); + scan_offsets +} + +fn read_intensities( + scan_count: usize, + peak_count: usize, + blob: &TdfBlob, +) -> Vec { + let mut intensities: Vec = Vec::with_capacity(peak_count); + for peak_index in 0..peak_count { + let index: usize = scan_count + 1 + 2 * peak_index; + intensities.push(blob.get(index)); + } + intensities +} + +fn read_tof_indices( + scan_count: usize, + peak_count: usize, + blob: &TdfBlob, + scan_offsets: &Vec, +) -> Vec { + let mut tof_indices: Vec = Vec::with_capacity(peak_count); + for scan_index in 0..scan_count { + let start_offset: usize = scan_offsets[scan_index]; + let end_offset: usize = scan_offsets[scan_index + 1]; + let mut current_sum: u32 = 0; + for peak_index in start_offset..end_offset { + let index = scan_count + 2 * peak_index; + let tof_index: u32 = blob.get(index); + current_sum += tof_index; + tof_indices.push(current_sum - 1); + } + } + tof_indices +} diff --git a/src/io/readers/metadata_reader.rs b/src/io/readers/metadata_reader.rs new file mode 100644 index 0000000..a439239 --- /dev/null +++ b/src/io/readers/metadata_reader.rs @@ -0,0 +1,88 @@ +use std::{collections::HashMap, path::Path}; + +use crate::{ + domain_converters::{Frame2RtConverter, Scan2ImConverter, Tof2MzConverter}, + ms_data::Metadata, +}; + +use super::file_readers::sql_reader::{ + metadata::SqlMetadata, ReadableSqlHashMap, SqlReader, +}; + +const OTOF_CONTROL: &str = "Bruker otofControl"; + +pub struct MetadataReader; + +impl MetadataReader { + pub fn new(path: impl AsRef) -> Metadata { + let sql_path = path.as_ref(); + let tdf_sql_reader = SqlReader::open(&sql_path).unwrap(); + let sql_metadata: HashMap = + SqlMetadata::from_sql_reader(&tdf_sql_reader).unwrap(); + let compression_type = sql_metadata + .get("TimsCompressionType") + .unwrap() + .parse() + .unwrap(); + Metadata { + path: path.as_ref().to_path_buf(), + rt_converter: get_rt_converter(&tdf_sql_reader), + im_converter: get_im_converter(&sql_metadata, &tdf_sql_reader), + mz_converter: get_mz_converter(&sql_metadata), + compression_type, + } + } +} + +fn get_rt_converter(tdf_sql_reader: &SqlReader) -> Frame2RtConverter { + let rt_values: Vec = tdf_sql_reader + .read_column_from_table("Time", "Frames") + .unwrap(); + Frame2RtConverter::from_values(rt_values) +} + +fn get_mz_converter(sql_metadata: &HashMap) -> Tof2MzConverter { + let software = sql_metadata.get("AcquisitionSoftware").unwrap(); + let tof_max_index: u32 = sql_metadata + .get("DigitizerNumSamples") + .unwrap() + .parse() + .unwrap(); + let mut mz_min: f64 = sql_metadata + .get("MzAcqRangeLower") + .unwrap() + .parse() + .unwrap(); + let mut mz_max: f64 = sql_metadata + .get("MzAcqRangeUpper") + .unwrap() + .parse() + .unwrap(); + if software == OTOF_CONTROL { + mz_min -= 5.0; + mz_max += 5.0; + } + Tof2MzConverter::from_boundaries(mz_min, mz_max, tof_max_index) +} + +fn get_im_converter( + sql_metadata: &HashMap, + tdf_sql_reader: &SqlReader, +) -> Scan2ImConverter { + let scan_counts: Vec = tdf_sql_reader + .read_column_from_table("NumScans", "Frames") + .unwrap(); + let scan_max_index = *scan_counts.iter().max().unwrap(); + // let scan_max_index = 927; + let im_min: f64 = sql_metadata + .get("OneOverK0AcqRangeLower") + .unwrap() + .parse() + .unwrap(); + let im_max: f64 = sql_metadata + .get("OneOverK0AcqRangeUpper") + .unwrap() + .parse() + .unwrap(); + Scan2ImConverter::from_boundaries(im_min, im_max, scan_max_index) +} diff --git a/src/io/readers/precursor_reader.rs b/src/io/readers/precursor_reader.rs new file mode 100644 index 0000000..23a804b --- /dev/null +++ b/src/io/readers/precursor_reader.rs @@ -0,0 +1,50 @@ +mod minitdf; +mod tdf; + +use core::fmt; +use std::path::{Path, PathBuf}; + +use minitdf::MiniTDFPrecursorReader; +use tdf::TDFPrecursorReader; + +use crate::ms_data::Precursor; + +pub struct PrecursorReader { + precursor_reader: Box, +} + +impl fmt::Debug for PrecursorReader { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "PrecursorReader {{ /* fields omitted */ }}") + } +} + +impl PrecursorReader { + pub fn new(path: impl AsRef) -> Self { + let precursor_reader: Box = + match path.as_ref().extension().and_then(|e| e.to_str()) { + Some("parquet") => Box::new(MiniTDFPrecursorReader::new(path)), + Some("tdf") => Box::new(TDFPrecursorReader::new(path)), + _ => panic!(), + }; + Self { precursor_reader } + } + + pub fn get(&self, index: usize) -> Precursor { + self.precursor_reader.get(index) + } + + pub fn get_path(&self) -> PathBuf { + self.precursor_reader.get_path() + } + + pub fn len(&self) -> usize { + self.precursor_reader.len() + } +} + +trait PrecursorReaderTrait: Sync { + fn get(&self, index: usize) -> Precursor; + fn get_path(&self) -> PathBuf; + fn len(&self) -> usize; +} diff --git a/src/io/readers/precursor_reader/minitdf.rs b/src/io/readers/precursor_reader/minitdf.rs new file mode 100644 index 0000000..0d5ee06 --- /dev/null +++ b/src/io/readers/precursor_reader/minitdf.rs @@ -0,0 +1,50 @@ +use std::path::{Path, PathBuf}; + +use crate::{ + io::readers::file_readers::parquet_reader::{ + precursors::ParquetPrecursor, ReadableParquetTable, + }, + ms_data::Precursor, +}; + +use super::PrecursorReaderTrait; + +#[derive(Debug)] +pub struct MiniTDFPrecursorReader { + path: PathBuf, + parquet_precursors: Vec, +} + +impl MiniTDFPrecursorReader { + pub fn new(path: impl AsRef) -> Self { + let parquet_precursors = + ParquetPrecursor::from_parquet_file(&path).unwrap(); + Self { + path: path.as_ref().to_path_buf(), + parquet_precursors, + } + } +} + +impl PrecursorReaderTrait for MiniTDFPrecursorReader { + fn get(&self, index: usize) -> Precursor { + let x = &self.parquet_precursors[index]; + Precursor { + mz: x.mz, + rt: x.rt, + im: x.im, + charge: Some(x.charge), + intensity: Some(x.intensity), + index: x.index, + frame_index: x.frame_index, + } + } + + fn len(&self) -> usize { + self.parquet_precursors.len() + } + + fn get_path(&self) -> PathBuf { + self.path.clone() + } +} diff --git a/src/io/readers/precursor_reader/tdf.rs b/src/io/readers/precursor_reader/tdf.rs new file mode 100644 index 0000000..8619a1e --- /dev/null +++ b/src/io/readers/precursor_reader/tdf.rs @@ -0,0 +1,60 @@ +mod dda; +mod dia; + +use std::path::{Path, PathBuf}; + +use dda::DDATDFPrecursorReader; +use dia::DIATDFPrecursorReader; + +use crate::{ + io::readers::file_readers::sql_reader::SqlReader, + ms_data::{AcquisitionType, Precursor}, +}; + +use super::PrecursorReaderTrait; + +pub struct TDFPrecursorReader { + precursor_reader: Box, +} + +impl TDFPrecursorReader { + pub fn new(path: impl AsRef) -> Self { + let sql_path = path.as_ref(); + let tdf_sql_reader = SqlReader::open(sql_path).unwrap(); + let sql_frames: Vec = tdf_sql_reader + .read_column_from_table("ScanMode", "Frames") + .unwrap(); + let acquisition_type = if sql_frames.iter().any(|&x| x == 8) { + AcquisitionType::DDAPASEF + } else if sql_frames.iter().any(|&x| x == 9) { + AcquisitionType::DIAPASEF + } else { + AcquisitionType::Unknown + }; + let precursor_reader: Box = + match acquisition_type { + AcquisitionType::DDAPASEF => { + Box::new(DDATDFPrecursorReader::new(path)) + }, + AcquisitionType::DIAPASEF => { + Box::new(DIATDFPrecursorReader::new(path)) + }, + _ => panic!(), + }; + Self { precursor_reader } + } +} + +impl PrecursorReaderTrait for TDFPrecursorReader { + fn get(&self, index: usize) -> Precursor { + self.precursor_reader.get(index) + } + + fn len(&self) -> usize { + self.precursor_reader.len() + } + + fn get_path(&self) -> PathBuf { + self.precursor_reader.get_path() + } +} diff --git a/src/io/readers/precursor_reader/tdf/dda.rs b/src/io/readers/precursor_reader/tdf/dda.rs new file mode 100644 index 0000000..763307a --- /dev/null +++ b/src/io/readers/precursor_reader/tdf/dda.rs @@ -0,0 +1,67 @@ +use std::path::{Path, PathBuf}; + +use crate::{ + domain_converters::{ + ConvertableDomain, Frame2RtConverter, Scan2ImConverter, + }, + io::readers::{ + file_readers::sql_reader::{ + precursors::SqlPrecursor, ReadableSqlTable, SqlReader, + }, + MetadataReader, + }, + ms_data::Precursor, +}; + +use super::PrecursorReaderTrait; + +#[derive(Debug)] +pub struct DDATDFPrecursorReader { + path: PathBuf, + sql_precursors: Vec, + rt_converter: Frame2RtConverter, + im_converter: Scan2ImConverter, +} + +impl DDATDFPrecursorReader { + pub fn new(path: impl AsRef) -> Self { + let sql_path = path.as_ref(); + let tdf_sql_reader = SqlReader::open(sql_path).unwrap(); + let metadata = MetadataReader::new(&path); + let rt_converter: Frame2RtConverter = metadata.rt_converter; + let im_converter: Scan2ImConverter = metadata.im_converter; + let sql_precursors = + SqlPrecursor::from_sql_reader(&tdf_sql_reader).unwrap(); + Self { + path: path.as_ref().to_path_buf(), + sql_precursors, + rt_converter, + im_converter, + } + } +} + +impl PrecursorReaderTrait for DDATDFPrecursorReader { + fn get(&self, index: usize) -> Precursor { + let sql_precursor = &self.sql_precursors[index]; + let frame_id: usize = sql_precursor.precursor_frame; + let scan_id: f64 = sql_precursor.scan_average; + Precursor { + mz: sql_precursor.mz, + rt: self.rt_converter.convert(frame_id as u32), + im: self.im_converter.convert(scan_id), + charge: Some(sql_precursor.charge), + intensity: Some(sql_precursor.intensity), + index: index + 1, + frame_index: frame_id, + } + } + + fn len(&self) -> usize { + self.sql_precursors.len() + } + + fn get_path(&self) -> PathBuf { + self.path.clone() + } +} diff --git a/src/io/readers/precursor_reader/tdf/dia.rs b/src/io/readers/precursor_reader/tdf/dia.rs new file mode 100644 index 0000000..d604769 --- /dev/null +++ b/src/io/readers/precursor_reader/tdf/dia.rs @@ -0,0 +1,87 @@ +use std::path::{Path, PathBuf}; + +use crate::{ + domain_converters::{ + ConvertableDomain, Frame2RtConverter, Scan2ImConverter, + }, + io::readers::{ + file_readers::sql_reader::{ + frame_groups::SqlWindowGroup, ReadableSqlTable, SqlReader, + }, + MetadataReader, QuadrupoleSettingsReader, + }, + ms_data::{Precursor, QuadrupoleSettings}, +}; + +use super::PrecursorReaderTrait; + +#[derive(Debug)] +pub struct DIATDFPrecursorReader { + path: PathBuf, + expanded_quadrupole_settings: Vec, + rt_converter: Frame2RtConverter, + im_converter: Scan2ImConverter, +} + +impl DIATDFPrecursorReader { + pub fn new(path: impl AsRef) -> Self { + let sql_path = path.as_ref(); + let tdf_sql_reader = SqlReader::open(sql_path).unwrap(); + let metadata = MetadataReader::new(&path); + let rt_converter: Frame2RtConverter = metadata.rt_converter; + let im_converter: Scan2ImConverter = metadata.im_converter; + let window_groups = + SqlWindowGroup::from_sql_reader(&tdf_sql_reader).unwrap(); + let quadrupole_settings = + QuadrupoleSettingsReader::new(tdf_sql_reader.get_path()); + let mut expanded_quadrupole_settings: Vec = vec![]; + for window_group in window_groups { + let window = window_group.window_group; + let frame = window_group.frame; + let group = &quadrupole_settings[window as usize - 1]; + for sub_window in 0..group.isolation_mz.len() { + let sub_quad_settings = QuadrupoleSettings { + index: frame, + scan_starts: vec![group.scan_starts[sub_window]], + scan_ends: vec![group.scan_ends[sub_window]], + isolation_mz: vec![group.isolation_mz[sub_window]], + isolation_width: vec![group.isolation_width[sub_window]], + collision_energy: vec![group.collision_energy[sub_window]], + }; + expanded_quadrupole_settings.push(sub_quad_settings) + } + } + Self { + path: path.as_ref().to_path_buf(), + expanded_quadrupole_settings, + rt_converter, + im_converter, + } + } +} + +impl PrecursorReaderTrait for DIATDFPrecursorReader { + fn get(&self, index: usize) -> Precursor { + let quad_settings = &self.expanded_quadrupole_settings[index]; + let scan_id = (quad_settings.scan_starts[0] + + quad_settings.scan_ends[0]) as f32 + / 2.0; + Precursor { + mz: quad_settings.isolation_mz[0], + rt: self.rt_converter.convert(quad_settings.index as u32 - 1), + im: self.im_converter.convert(scan_id), + charge: None, + intensity: None, + index: index, + frame_index: quad_settings.index, + } + } + + fn len(&self) -> usize { + self.expanded_quadrupole_settings.len() + } + + fn get_path(&self) -> PathBuf { + self.path.clone() + } +} diff --git a/src/io/readers/quad_settings_reader.rs b/src/io/readers/quad_settings_reader.rs new file mode 100644 index 0000000..16367a5 --- /dev/null +++ b/src/io/readers/quad_settings_reader.rs @@ -0,0 +1,83 @@ +use std::path::Path; + +use crate::{ms_data::QuadrupoleSettings, utils::vec_utils::argsort}; + +use super::file_readers::sql_reader::{ + quad_settings::SqlQuadSettings, ReadableSqlTable, SqlReader, +}; + +pub struct QuadrupoleSettingsReader { + quadrupole_settings: Vec, + sql_quadrupole_settings: Vec, +} + +impl QuadrupoleSettingsReader { + pub fn new(path: impl AsRef) -> Vec { + let sql_path = path.as_ref(); + let tdf_sql_reader = SqlReader::open(&sql_path).unwrap(); + let sql_quadrupole_settings = + SqlQuadSettings::from_sql_reader(&tdf_sql_reader).unwrap(); + let window_group_count = sql_quadrupole_settings + .iter() + .map(|x| x.window_group) + .max() + .unwrap() as usize; + let quadrupole_settings = (0..window_group_count) + .map(|window_group| { + let mut quad = QuadrupoleSettings::default(); + quad.index = window_group + 1; + quad + }) + .collect(); + let mut quad_reader = Self { + quadrupole_settings, + sql_quadrupole_settings, + }; + quad_reader.update_from_sql_quadrupole_settings(); + quad_reader.resort_groups(); + quad_reader.quadrupole_settings + } + + fn update_from_sql_quadrupole_settings(&mut self) { + for window_group in self.sql_quadrupole_settings.iter() { + let group = window_group.window_group - 1; + self.quadrupole_settings[group] + .scan_starts + .push(window_group.scan_start); + self.quadrupole_settings[group] + .scan_ends + .push(window_group.scan_end); + self.quadrupole_settings[group] + .collision_energy + .push(window_group.collision_energy); + self.quadrupole_settings[group] + .isolation_mz + .push(window_group.mz_center); + self.quadrupole_settings[group] + .isolation_width + .push(window_group.mz_width); + } + } + + fn resort_groups(&mut self) { + self.quadrupole_settings = self + .quadrupole_settings + .iter() + .map(|_window| { + let mut window = _window.clone(); + let order = argsort(&window.scan_starts); + window.isolation_mz = + order.iter().map(|&i| window.isolation_mz[i]).collect(); + window.isolation_width = + order.iter().map(|&i| window.isolation_width[i]).collect(); + window.collision_energy = + order.iter().map(|&i| window.collision_energy[i]).collect(); + window.scan_starts = + order.iter().map(|&i| window.scan_starts[i]).collect(); + window.scan_ends = + order.iter().map(|&i| window.scan_ends[i]).collect(); + window + }) + .collect(); + } +} diff --git a/src/io/readers/spectrum_reader.rs b/src/io/readers/spectrum_reader.rs new file mode 100644 index 0000000..0082ca3 --- /dev/null +++ b/src/io/readers/spectrum_reader.rs @@ -0,0 +1,64 @@ +mod minitdf; +mod tdf; + +use core::fmt; +use minitdf::MiniTDFSpectrumReader; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use std::path::{Path, PathBuf}; +use tdf::TDFSpectrumReader; + +use crate::ms_data::Spectrum; + +pub struct SpectrumReader { + spectrum_reader: Box, +} + +impl fmt::Debug for SpectrumReader { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "SpectrumReader {{ /* fields omitted */ }}") + } +} + +impl SpectrumReader { + pub fn new(path: impl AsRef) -> Self { + let spectrum_reader: Box = + match path.as_ref().extension().and_then(|e| e.to_str()) { + Some("ms2") => Box::new(MiniTDFSpectrumReader::new(path)), + Some("d") => Box::new(TDFSpectrumReader::new(path)), + _ => panic!(), + }; + Self { spectrum_reader } + } + + pub fn get(&self, index: usize) -> Spectrum { + self.spectrum_reader.get(index) + } + + pub fn get_path(&self) -> PathBuf { + self.spectrum_reader.get_path() + } + + pub fn len(&self) -> usize { + self.spectrum_reader.len() + } + + pub fn get_all(&self) -> Vec { + let mut spectra: Vec = (0..self.len()) + .into_par_iter() + .map(|index| self.get(index)) + .collect(); + spectra.sort_by_key(|x| x.precursor.index); + spectra + } + + pub fn calibrate(&mut self) { + self.spectrum_reader.calibrate(); + } +} + +trait SpectrumReaderTrait: Sync { + fn get(&self, index: usize) -> Spectrum; + fn get_path(&self) -> PathBuf; + fn len(&self) -> usize; + fn calibrate(&mut self); +} diff --git a/src/io/readers/spectrum_reader/minitdf.rs b/src/io/readers/spectrum_reader/minitdf.rs new file mode 100644 index 0000000..84a6f4a --- /dev/null +++ b/src/io/readers/spectrum_reader/minitdf.rs @@ -0,0 +1,101 @@ +use std::path::{Path, PathBuf}; + +use crate::{ + io::readers::{ + file_readers::{ + parquet_reader::{ + precursors::ParquetPrecursor, ReadableParquetTable, + }, + tdf_blob_reader::IndexedTdfBlobReader, + }, + PrecursorReader, + }, + ms_data::Spectrum, + utils::find_extension, +}; + +use super::SpectrumReaderTrait; + +#[derive(Debug)] +pub struct MiniTDFSpectrumReader { + path: PathBuf, + precursor_reader: PrecursorReader, + blob_reader: IndexedTdfBlobReader, + collision_energies: Vec, +} + +impl MiniTDFSpectrumReader { + pub fn new(path: impl AsRef) -> Self { + let parquet_file_name = + find_extension(&path, "ms2spectrum.parquet").unwrap(); + let precursor_reader = PrecursorReader::new(&parquet_file_name); + let offsets = ParquetPrecursor::from_parquet_file(&parquet_file_name) + .unwrap() + .iter() + .map(|x| x.offset as usize) + .collect(); + let collision_energies = + ParquetPrecursor::from_parquet_file(&parquet_file_name) + .unwrap() + .iter() + .map(|x| x.collision_energy) + .collect(); + let bin_file_name = find_extension(&path, "bin").unwrap(); + let blob_reader = + IndexedTdfBlobReader::new(&bin_file_name, offsets).unwrap(); + Self { + path: path.as_ref().to_path_buf(), + precursor_reader, + blob_reader, + collision_energies, + } + } +} + +impl SpectrumReaderTrait for MiniTDFSpectrumReader { + fn get(&self, index: usize) -> Spectrum { + let mut spectrum = Spectrum::default(); + spectrum.index = index; + let blob = self.blob_reader.get_blob(index).unwrap(); + if !blob.is_empty() { + let size: usize = blob.len(); + let spectrum_data: Vec = + (0..size).map(|i| blob.get(i)).collect(); + let scan_count: usize = blob.len() / 3; + let tof_indices_bytes: &[u32] = + &spectrum_data[..scan_count as usize * 2]; + let intensities_bytes: &[u32] = + &spectrum_data[scan_count as usize * 2..]; + let mz_values: &[f64] = + bytemuck::cast_slice::(tof_indices_bytes); + let intensity_values: &[f32] = + bytemuck::cast_slice::(intensities_bytes); + spectrum.intensities = + intensity_values.iter().map(|&x| x as f64).collect(); + spectrum.mz_values = mz_values.to_vec(); + } + let precursor = self.precursor_reader.get(index); + spectrum.precursor = precursor; + spectrum.index = precursor.index; + spectrum.collision_energy = self.collision_energies[index]; + spectrum.isolation_mz = precursor.mz; //FIX? + spectrum.isolation_width = if precursor.mz <= 700.0 { + 2.0 + } else if precursor.mz >= 800.0 { + 3.0 + } else { + 2.0 + (precursor.mz - 700.0) / 100.0 + }; //FIX? + spectrum + } + + fn len(&self) -> usize { + self.precursor_reader.len() + } + + fn get_path(&self) -> PathBuf { + self.path.clone() + } + + fn calibrate(&mut self) {} +} diff --git a/src/io/readers/spectrum_reader/tdf.rs b/src/io/readers/spectrum_reader/tdf.rs new file mode 100644 index 0000000..fe1cbcc --- /dev/null +++ b/src/io/readers/spectrum_reader/tdf.rs @@ -0,0 +1,104 @@ +mod dda; +mod dia; +mod raw_spectra; + +use raw_spectra::{RawSpectrum, RawSpectrumReader}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use std::path::{Path, PathBuf}; + +use crate::{ + domain_converters::{ConvertableDomain, Tof2MzConverter}, + io::readers::{ + file_readers::sql_reader::SqlReader, FrameReader, MetadataReader, + PrecursorReader, + }, + ms_data::Spectrum, + utils::find_extension, +}; + +use super::SpectrumReaderTrait; + +const SMOOTHING_WINDOW: u32 = 1; +const CENTROIDING_WINDOW: u32 = 1; +const CALIBRATION_TOLERANCE: f64 = 0.1; + +#[derive(Debug)] +pub struct TDFSpectrumReader { + path: PathBuf, + precursor_reader: PrecursorReader, + mz_reader: Tof2MzConverter, + raw_spectrum_reader: RawSpectrumReader, +} + +impl TDFSpectrumReader { + pub fn new(path_name: impl AsRef) -> Self { + let frame_reader: FrameReader = FrameReader::new(&path_name); + let sql_path = find_extension(&path_name, "analysis.tdf").unwrap(); + let metadata = MetadataReader::new(&sql_path); + let mz_reader: Tof2MzConverter = metadata.mz_converter; + let tdf_sql_reader = SqlReader::open(&sql_path).unwrap(); + let precursor_reader = PrecursorReader::new(&sql_path); + let acquisition_type = frame_reader.get_acquisition(); + let raw_spectrum_reader = RawSpectrumReader::new( + &tdf_sql_reader, + frame_reader, + acquisition_type, + ); + Self { + path: path_name.as_ref().to_path_buf(), + precursor_reader, + mz_reader, + raw_spectrum_reader, + } + } + + pub fn read_single_raw_spectrum(&self, index: usize) -> RawSpectrum { + let raw_spectrum = self.raw_spectrum_reader.get(index); + raw_spectrum + .smooth(SMOOTHING_WINDOW) + .centroid(CENTROIDING_WINDOW) + } +} + +impl SpectrumReaderTrait for TDFSpectrumReader { + fn get(&self, index: usize) -> Spectrum { + let raw_spectrum = self.read_single_raw_spectrum(index); + let spectrum = raw_spectrum + .finalize(self.precursor_reader.get(index), &self.mz_reader); + spectrum + } + + fn len(&self) -> usize { + self.precursor_reader.len() + } + + fn get_path(&self) -> PathBuf { + self.path.clone() + } + + fn calibrate(&mut self) { + let hits: Vec<(f64, u32)> = (0..self.precursor_reader.len()) + .into_par_iter() + .map(|index| { + let spectrum = self.read_single_raw_spectrum(index); + let precursor = self.precursor_reader.get(index); + let precursor_mz: f64 = precursor.mz; + let mut result: Vec<(f64, u32)> = vec![]; + for &tof_index in spectrum.tof_indices.iter() { + let mz = self.mz_reader.convert(tof_index); + if (mz - precursor_mz).abs() < CALIBRATION_TOLERANCE { + let hit = (precursor_mz, tof_index); + result.push(hit); + } + } + result + }) + .reduce(Vec::new, |mut acc, mut vec| { + acc.append(&mut vec); // Concatenate vectors + acc + }); + if hits.len() >= 2 { + self.mz_reader = Tof2MzConverter::from_pairs(&hits); + } + } +} diff --git a/src/io/readers/spectrum_reader/tdf/dda.rs b/src/io/readers/spectrum_reader/tdf/dda.rs new file mode 100644 index 0000000..c5d9eb8 --- /dev/null +++ b/src/io/readers/spectrum_reader/tdf/dda.rs @@ -0,0 +1,99 @@ +use crate::{ + io::readers::{ + file_readers::sql_reader::{ + pasef_frame_msms::SqlPasefFrameMsMs, ReadableSqlTable, SqlReader, + }, + FrameReader, + }, + utils::vec_utils::{argsort, group_and_sum}, +}; + +use super::raw_spectra::{RawSpectrum, RawSpectrumReaderTrait}; + +#[derive(Debug)] +pub struct DDARawSpectrumReader { + order: Vec, + offsets: Vec, + pasef_frames: Vec, + frame_reader: FrameReader, +} + +impl DDARawSpectrumReader { + pub fn new(tdf_sql_reader: &SqlReader, frame_reader: FrameReader) -> Self { + let pasef_frames = + SqlPasefFrameMsMs::from_sql_reader(&tdf_sql_reader).unwrap(); + let pasef_precursors = + &pasef_frames.iter().map(|x| x.precursor).collect(); + let order: Vec = argsort(&pasef_precursors); + let max_precursor = pasef_precursors.iter().max().unwrap(); + let mut offsets: Vec = Vec::with_capacity(max_precursor + 1); + offsets.push(0); + for (offset, &index) in order.iter().enumerate().take(order.len() - 1) { + let second_index: usize = order[offset + 1]; + if pasef_precursors[index] != pasef_precursors[second_index] { + offsets.push(offset + 1) + } + } + offsets.push(order.len()); + Self { + order, + offsets, + pasef_frames, + frame_reader, + } + } + + pub fn iterate_over_pasef_frames( + &self, + index: usize, + ) -> impl Iterator { + let start: usize = self.offsets[index]; + let end: usize = self.offsets[index + 1]; + self.order[start..end] + .iter() + .map(|&x| &self.pasef_frames[x]) + } +} + +impl RawSpectrumReaderTrait for DDARawSpectrumReader { + fn get(&self, index: usize) -> RawSpectrum { + let mut collision_energy = 0.0; + let mut isolation_mz = 0.0; + let mut isolation_width = 0.0; + let mut tof_indices: Vec = vec![]; + let mut intensities: Vec = vec![]; + for pasef_frame in self.iterate_over_pasef_frames(index) { + collision_energy = pasef_frame.collision_energy; + isolation_mz = pasef_frame.isolation_mz; + isolation_width = pasef_frame.isolation_width; + let frame_index: usize = pasef_frame.frame - 1; + let frame = self.frame_reader.get(frame_index); + if frame.intensities.len() == 0 { + continue; + } + let scan_start: usize = pasef_frame.scan_start; + let scan_end: usize = pasef_frame.scan_end; + let offset_start: usize = frame.scan_offsets[scan_start] as usize; + let offset_end: usize = frame.scan_offsets[scan_end] as usize; + let tof_selection: &[u32] = + &frame.tof_indices[offset_start..offset_end]; + let intensity_selection: &[u32] = + &frame.intensities[offset_start..offset_end]; + tof_indices.extend(tof_selection); + intensities.extend(intensity_selection); + } + let (raw_tof_indices, raw_intensities) = group_and_sum( + tof_indices, + intensities.iter().map(|x| *x as u64).collect(), + ); + let raw_spectrum = RawSpectrum { + tof_indices: raw_tof_indices, + intensities: raw_intensities, + index: index, + collision_energy, + isolation_mz, + isolation_width, + }; + raw_spectrum + } +} diff --git a/src/io/readers/spectrum_reader/tdf/dia.rs b/src/io/readers/spectrum_reader/tdf/dia.rs new file mode 100644 index 0000000..493152b --- /dev/null +++ b/src/io/readers/spectrum_reader/tdf/dia.rs @@ -0,0 +1,78 @@ +use crate::{ + io::readers::{ + file_readers::sql_reader::{ + frame_groups::SqlWindowGroup, ReadableSqlTable, SqlReader, + }, + FrameReader, QuadrupoleSettingsReader, + }, + ms_data::QuadrupoleSettings, + utils::vec_utils::group_and_sum, +}; + +use super::raw_spectra::{RawSpectrum, RawSpectrumReaderTrait}; + +#[derive(Debug)] +pub struct DIARawSpectrumReader { + expanded_quadrupole_settings: Vec, + frame_reader: FrameReader, +} + +impl DIARawSpectrumReader { + pub fn new(tdf_sql_reader: &SqlReader, frame_reader: FrameReader) -> Self { + let window_groups = + SqlWindowGroup::from_sql_reader(&tdf_sql_reader).unwrap(); + let quadrupole_settings = + QuadrupoleSettingsReader::new(&tdf_sql_reader.get_path()); + let mut expanded_quadrupole_settings: Vec = vec![]; + for window_group in window_groups { + let window = window_group.window_group; + let frame = window_group.frame; + let group = &quadrupole_settings[window as usize - 1]; + for sub_window in 0..group.isolation_mz.len() { + let sub_quad_settings = QuadrupoleSettings { + index: frame, + scan_starts: vec![group.scan_starts[sub_window]], + scan_ends: vec![group.scan_ends[sub_window]], + isolation_mz: vec![group.isolation_mz[sub_window]], + isolation_width: vec![group.isolation_width[sub_window]], + collision_energy: vec![group.collision_energy[sub_window]], + }; + expanded_quadrupole_settings.push(sub_quad_settings) + } + } + Self { + expanded_quadrupole_settings, + frame_reader, + } + } +} + +impl RawSpectrumReaderTrait for DIARawSpectrumReader { + fn get(&self, index: usize) -> RawSpectrum { + let quad_settings = &self.expanded_quadrupole_settings[index]; + let collision_energy = quad_settings.collision_energy[0]; + let isolation_mz = quad_settings.isolation_mz[0]; + let isolation_width = quad_settings.isolation_width[0]; + let scan_start = quad_settings.scan_starts[0]; + let scan_end = quad_settings.scan_ends[0]; + let frame_index = quad_settings.index - 1; + let frame = self.frame_reader.get(frame_index); + let offset_start = frame.scan_offsets[scan_start] as usize; + let offset_end = frame.scan_offsets[scan_end] as usize; + let tof_indices = &frame.tof_indices[offset_start..offset_end]; + let intensities = &frame.intensities[offset_start..offset_end]; + let (raw_tof_indices, raw_intensities) = group_and_sum( + tof_indices.iter().map(|x| *x).collect(), + intensities.iter().map(|x| *x as u64).collect(), + ); + let raw_spectrum = RawSpectrum { + tof_indices: raw_tof_indices, + intensities: raw_intensities, + index: index, + collision_energy, + isolation_mz, + isolation_width, + }; + raw_spectrum + } +} diff --git a/src/io/readers/spectrum_reader/tdf/raw_spectra.rs b/src/io/readers/spectrum_reader/tdf/raw_spectra.rs new file mode 100644 index 0000000..156b6a4 --- /dev/null +++ b/src/io/readers/spectrum_reader/tdf/raw_spectra.rs @@ -0,0 +1,117 @@ +use core::fmt; + +use crate::{ + domain_converters::{ConvertableDomain, Tof2MzConverter}, + io::readers::{file_readers::sql_reader::SqlReader, FrameReader}, + ms_data::{AcquisitionType, Precursor, Spectrum}, + utils::vec_utils::{filter_with_mask, find_sparse_local_maxima_mask}, +}; + +use super::{dda::DDARawSpectrumReader, dia::DIARawSpectrumReader}; + +#[derive(Debug, PartialEq, Default, Clone)] +pub(crate) struct RawSpectrum { + pub tof_indices: Vec, + pub intensities: Vec, + pub index: usize, + pub collision_energy: f64, + pub isolation_mz: f64, + pub isolation_width: f64, +} + +impl RawSpectrum { + pub fn smooth(mut self, window: u32) -> Self { + let mut smooth_intensities: Vec = self.intensities.clone(); + for (current_index, current_tof) in self.tof_indices.iter().enumerate() + { + let current_intensity: u64 = self.intensities[current_index]; + for (_next_index, next_tof) in + self.tof_indices[current_index + 1..].iter().enumerate() + { + let next_index: usize = _next_index + current_index + 1; + let next_intensity: u64 = self.intensities[next_index]; + if (next_tof - current_tof) <= window { + smooth_intensities[current_index] += next_intensity; + smooth_intensities[next_index] += current_intensity; + } else { + break; + } + } + } + self.intensities = smooth_intensities; + self + } + + pub fn centroid(mut self, window: u32) -> Self { + let local_maxima: Vec = find_sparse_local_maxima_mask( + &self.tof_indices, + &self.intensities, + window, + ); + self.tof_indices = filter_with_mask(&self.tof_indices, &local_maxima); + self.intensities = filter_with_mask(&self.intensities, &local_maxima); + self + } + + pub fn finalize( + &self, + precursor: Precursor, + mz_reader: &Tof2MzConverter, + ) -> Spectrum { + let index = self.index; + let spectrum: Spectrum = Spectrum { + mz_values: self + .tof_indices + .iter() + .map(|&x| mz_reader.convert(x)) + .collect(), + intensities: self.intensities.iter().map(|x| *x as f64).collect(), + precursor: precursor, + index: index, + collision_energy: self.collision_energy, + isolation_mz: self.isolation_mz, + isolation_width: self.isolation_width, + }; + spectrum + } +} + +pub struct RawSpectrumReader { + raw_spectrum_reader: Box, +} + +impl fmt::Debug for RawSpectrumReader { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "RawSpectrumReader {{ /* fields omitted */ }}") + } +} + +impl RawSpectrumReader { + pub fn new( + tdf_sql_reader: &SqlReader, + frame_reader: FrameReader, + acquisition_type: AcquisitionType, + ) -> Self { + let raw_spectrum_reader: Box = + match acquisition_type { + AcquisitionType::DDAPASEF => Box::new( + DDARawSpectrumReader::new(tdf_sql_reader, frame_reader), + ), + AcquisitionType::DIAPASEF => Box::new( + DIARawSpectrumReader::new(tdf_sql_reader, frame_reader), + ), + _ => panic!(), + }; + Self { + raw_spectrum_reader, + } + } + + pub fn get(&self, index: usize) -> RawSpectrum { + self.raw_spectrum_reader.get(index) + } +} + +pub trait RawSpectrumReaderTrait: Sync { + fn get(&self, index: usize) -> RawSpectrum; +} diff --git a/src/io/writers.rs b/src/io/writers.rs new file mode 100644 index 0000000..d626864 --- /dev/null +++ b/src/io/writers.rs @@ -0,0 +1,3 @@ +mod mgf; + +pub use mgf::*; diff --git a/src/io/writers/mgf.rs b/src/io/writers/mgf.rs new file mode 100644 index 0000000..30ab5d2 --- /dev/null +++ b/src/io/writers/mgf.rs @@ -0,0 +1,61 @@ +use std::fs::File; +use std::io::Write; +use std::path::Path; + +use crate::ms_data::Spectrum; + +pub struct MGFWriter; + +impl MGFWriter { + pub fn write_spectra(input_file_path: &str, spectra: &Vec) { + let output_file_path = { + let input_path = Path::new(&input_file_path); + let file_stem = + Path::new(&input_file_path).file_stem().unwrap_or_default(); + let new_file_name = format!("{}.mgf", file_stem.to_string_lossy()); + input_path.with_file_name(new_file_name) + }; + let mut file = + File::create(output_file_path).expect("Failed to create file"); + for spectrum in spectra { + _ = file.write_all("BEGIN IONS\n".as_bytes()); + _ = file.write_all(MGFEntry::write(spectrum).as_bytes()); + _ = file.write_all("END IONS\n".as_bytes()); + } + file.flush().expect("Failed to flush to file"); + } +} + +pub struct MGFEntry; + +impl MGFEntry { + pub fn write_header(spectrum: &Spectrum) -> String { + let precursor = spectrum.precursor; + let title = precursor.index; + let intensity = precursor.intensity.unwrap_or(0.0); + let charge = precursor.charge.unwrap_or(0); + let ms2_data = format!( + "TITLE=index:{}, im:{:.4}, intensity:{:.4}, frame:{}, ce:{:.4}\nPEPMASS={:.4}\nCHARGE={}\nRT={:.2}\n", + title, precursor.im, intensity, precursor.frame_index, spectrum.collision_energy, precursor.mz, charge, precursor.rt + ); + ms2_data + } + + pub fn write_peaks(spectrum: &Spectrum) -> String { + let mut ms2_data: String = String::new(); + for (mz, intensity) in + spectrum.mz_values.iter().zip(spectrum.intensities.iter()) + { + ms2_data.push_str(&format!("{:.4}\t{:.0}\n", mz, intensity)); + } + ms2_data + } + + pub fn write(spectrum: &Spectrum) -> String { + format!( + "{}{}", + Self::write_header(spectrum), + Self::write_peaks(spectrum) + ) + } +} diff --git a/src/lib.rs b/src/lib.rs index c3645c1..085bee8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,8 +4,8 @@ //! //! Two primary data types are exposed: //! -//! * [Spectra](crate::Spectrum): A traditional representation that expresses intensitites in function of mz values for a given precursor. -//! * [Frames](crate::Frame): All recorded data from a single TIMS elution (i.e. at one specific retention_time). +//! * [Spectra](crate::ms_data::Spectrum): A traditional representation that expresses intensitites in function of mz values for a given precursor. +//! * [Frames](crate::ms_data::Frame): All recorded data from a single TIMS elution (i.e. at one specific retention_time). //! //! ## File formats //! @@ -21,24 +21,11 @@ //! * *.ms2spectrum.bin //! * *.ms2spectrum.parquet -mod acquisition; -mod calibration; -mod converters; +pub mod domain_converters; mod errors; mod file_readers; -mod frames; -mod precursors; -mod spectra; -mod vec_utils; +pub mod io; +pub mod ms_data; +mod utils; -pub use crate::{ - acquisition::AcquisitionType, - converters::{ - ConvertableIndex, Frame2RtConverter, Scan2ImConverter, Tof2MzConverter, - }, - errors::*, - file_readers::FileReader, - frames::{Frame, FrameType}, - precursors::{Precursor, QuadrupoleEvent}, - spectra::Spectrum, -}; +pub use crate::{errors::*, file_readers::FileReader}; diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 4efdbe8..0000000 --- a/src/main.rs +++ /dev/null @@ -1,27 +0,0 @@ -use std::env; -use timsrust::{FileReader, Spectrum}; - -fn main() { - let args: Vec = env::args().collect(); - let d_folder_name: &str = &args[1]; - let x = FileReader::new(d_folder_name.to_string()).unwrap(); - let dda_spectra: Vec = x.read_all_spectra(); - let precursor_index: usize; - if args.len() >= 3 { - precursor_index = args[2].parse().unwrap_or(0); - } else { - precursor_index = 1000; - } - - println!("precursor {:?}", dda_spectra[precursor_index].precursor); - println!( - "precursor {:?}", - dda_spectra[precursor_index].mz_values.len() - ); - println!( - "precursor {:?}", - dda_spectra[precursor_index].intensities.len() - ); - println!("precursor {:?}", dda_spectra[precursor_index].mz_values); - println!("precursor {:?}", dda_spectra[precursor_index].intensities); -} diff --git a/src/ms_data.rs b/src/ms_data.rs new file mode 100644 index 0000000..5f9173c --- /dev/null +++ b/src/ms_data.rs @@ -0,0 +1,15 @@ +//! Data structures that represent MS data + +mod acquisition; +mod frames; +mod metadata; +mod precursors; +mod quadrupole; +mod spectra; + +pub use acquisition::*; +pub use frames::*; +pub use metadata::*; +pub use precursors::*; +pub use quadrupole::*; +pub use spectra::*; diff --git a/src/ms_data/acquisition.rs b/src/ms_data/acquisition.rs new file mode 100644 index 0000000..b3e7be0 --- /dev/null +++ b/src/ms_data/acquisition.rs @@ -0,0 +1,11 @@ +/// The kind of acquisition that was used. +#[derive(Debug, PartialEq, Clone, Copy, Default)] +pub enum AcquisitionType { + DDAPASEF, + DIAPASEF, + DiagonalDIAPASEF, + // PRMPASEF, + /// Default value. + #[default] + Unknown, +} diff --git a/src/ms_data/frames.rs b/src/ms_data/frames.rs new file mode 100644 index 0000000..c4858e0 --- /dev/null +++ b/src/ms_data/frames.rs @@ -0,0 +1,44 @@ +use super::{AcquisitionType, QuadrupoleSettings}; +use std::sync::Arc; + +/// A frame with all unprocessed data as it was acquired. +#[derive(Debug, PartialEq, Default, Clone)] +pub struct Frame { + pub scan_offsets: Vec, + pub tof_indices: Vec, + pub intensities: Vec, + pub index: usize, + pub rt: f64, + pub acquisition_type: AcquisitionType, + pub ms_level: MSLevel, + pub quadrupole_settings: Arc, + pub intensity_correction_factor: f64, + pub window_group: u8, +} + +impl Frame { + pub fn get_corrected_intensity(&self, index: usize) -> f64 { + self.intensity_correction_factor * self.intensities[index] as f64 + } +} + +/// The MS level used. +#[derive(Debug, PartialEq, Default, Clone, Copy)] +pub enum MSLevel { + MS1, + MS2, + /// Default value. + #[default] + Unknown, +} + +impl MSLevel { + pub fn read_from_msms_type(msms_type: u8) -> MSLevel { + match msms_type { + 0 => MSLevel::MS1, + 8 => MSLevel::MS2, + 9 => MSLevel::MS2, + _ => MSLevel::Unknown, + } + } +} diff --git a/src/ms_data/metadata.rs b/src/ms_data/metadata.rs new file mode 100644 index 0000000..55766a5 --- /dev/null +++ b/src/ms_data/metadata.rs @@ -0,0 +1,15 @@ +use std::path::PathBuf; + +use crate::domain_converters::{ + Frame2RtConverter, Scan2ImConverter, Tof2MzConverter, +}; + +/// Metadata from a single run. +#[derive(Debug, Clone)] +pub struct Metadata { + pub path: PathBuf, + pub rt_converter: Frame2RtConverter, + pub im_converter: Scan2ImConverter, + pub mz_converter: Tof2MzConverter, + pub compression_type: u8, +} diff --git a/src/ms_data/precursors.rs b/src/ms_data/precursors.rs new file mode 100644 index 0000000..4f99d59 --- /dev/null +++ b/src/ms_data/precursors.rs @@ -0,0 +1,11 @@ +/// The MS1 precursor that got selected for fragmentation. +#[derive(Debug, Default, Clone, Copy, PartialEq)] +pub struct Precursor { + pub mz: f64, + pub rt: f64, + pub im: f64, + pub charge: Option, + pub intensity: Option, + pub index: usize, + pub frame_index: usize, +} diff --git a/src/ms_data/quadrupole.rs b/src/ms_data/quadrupole.rs new file mode 100644 index 0000000..19c8d94 --- /dev/null +++ b/src/ms_data/quadrupole.rs @@ -0,0 +1,10 @@ +/// The quadrupole settings used for fragmentation. +#[derive(Debug, Default, Clone, PartialEq)] +pub struct QuadrupoleSettings { + pub index: usize, + pub scan_starts: Vec, + pub scan_ends: Vec, + pub isolation_mz: Vec, + pub isolation_width: Vec, + pub collision_energy: Vec, +} diff --git a/src/ms_data/spectra.rs b/src/ms_data/spectra.rs new file mode 100644 index 0000000..bd9cf25 --- /dev/null +++ b/src/ms_data/spectra.rs @@ -0,0 +1,49 @@ +use super::Precursor; + +/// An MS2 spectrum with centroided mz values and summed intensities. +#[derive(Debug, PartialEq, Default)] +pub struct Spectrum { + pub mz_values: Vec, + pub intensities: Vec, + pub precursor: Precursor, + pub index: usize, + pub collision_energy: f64, + pub isolation_mz: f64, + pub isolation_width: f64, +} + +impl Spectrum { + pub fn get_top_n(&self, n: usize) -> Self { + let top_n = if n == 0 { self.len() } else { n }; + let mut indexed: Vec<(f64, usize)> = + self.intensities.iter().cloned().zip(0..).collect(); + indexed.sort_by(|a, b| { + b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal) + }); + let mut top_indices: Vec = indexed + .iter() + .take(top_n) + .map(|&(_, index)| index) + .collect(); + top_indices.sort(); + Spectrum { + mz_values: top_indices + .iter() + .map(|&index| self.mz_values[index]) + .collect(), + intensities: top_indices + .iter() + .map(|&index| self.intensities[index]) + .collect(), + precursor: self.precursor, + index: self.index, + collision_energy: self.collision_energy, + isolation_mz: self.isolation_mz, + isolation_width: self.isolation_width, + } + } + + pub fn len(&self) -> usize { + self.mz_values.len() + } +} diff --git a/src/precursors.rs b/src/precursors.rs deleted file mode 100644 index 195beb3..0000000 --- a/src/precursors.rs +++ /dev/null @@ -1,38 +0,0 @@ -/// An MS1 precursor that got selected for fragmentation. -#[derive(Debug, Default, Clone, Copy, PartialEq)] -pub struct Precursor { - pub mz: f64, - pub rt: f64, - pub im: f64, - pub charge: usize, - pub intensity: f64, - pub index: usize, - pub frame_index: usize, - pub collision_energy: f64, -} - -/// A type of quadrupole selection. -#[derive(Debug, Clone, Copy, PartialEq)] -pub enum QuadrupoleEvent { - Precursor(Precursor), - // Window(Window), - // PrecursorList(Vec), - None, -} - -impl Default for QuadrupoleEvent { - fn default() -> Self { - Self::None - } -} - -impl QuadrupoleEvent { - pub fn unwrap_as_precursor(&self) -> Precursor { - match self { - QuadrupoleEvent::Precursor(precursor) => *precursor, - _ => { - panic!("Not a precursor"); - }, - } - } -} diff --git a/src/spectra.rs b/src/spectra.rs deleted file mode 100644 index 9f9eadf..0000000 --- a/src/spectra.rs +++ /dev/null @@ -1,117 +0,0 @@ -use crate::{ - converters::{ConvertableIndex, Tof2MzConverter}, - precursors::QuadrupoleEvent, - vec_utils::{filter_with_mask, find_sparse_local_maxima_mask}, - Precursor, -}; - -pub struct RawSpectrumProcessor { - pub raw_spectrum: RawSpectrum, -} - -impl RawSpectrumProcessor { - pub fn smooth(mut self, window: u32) -> Self { - let mut smooth_intensities: Vec = - self.raw_spectrum.intensities.clone(); - for (current_index, current_tof) in - self.raw_spectrum.tof_indices.iter().enumerate() - { - let current_intensity: u64 = - self.raw_spectrum.intensities[current_index]; - for (_next_index, next_tof) in self.raw_spectrum.tof_indices - [current_index + 1..] - .iter() - .enumerate() - { - let next_index: usize = _next_index + current_index + 1; - let next_intensity: u64 = - self.raw_spectrum.intensities[next_index]; - if (next_tof - current_tof) <= window { - smooth_intensities[current_index] += next_intensity; - smooth_intensities[next_index] += current_intensity; - } else { - break; - } - } - } - self.raw_spectrum.intensities = smooth_intensities; - self.raw_spectrum.processed_state = - RawProcessedSpectrumState::SmoothedProfile; - self - } - - pub fn centroid(mut self, window: u32) -> Self { - let local_maxima: Vec = self.find_local_maxima(window); - self.raw_spectrum.tof_indices = - filter_with_mask(&self.raw_spectrum.tof_indices, &local_maxima); - self.raw_spectrum.intensities = - filter_with_mask(&self.raw_spectrum.intensities, &local_maxima); - self.raw_spectrum.processed_state = - RawProcessedSpectrumState::Centroided; - self - } - - fn find_local_maxima(&self, window: u32) -> Vec { - find_sparse_local_maxima_mask( - &self.raw_spectrum.tof_indices, - &self.raw_spectrum.intensities, - window, - ) - } - - pub fn finalize( - &self, - precursor: Precursor, - mz_reader: &Tof2MzConverter, - ) -> Spectrum { - let index = self.raw_spectrum.index; - let spectrum: Spectrum = Spectrum { - mz_values: self - .raw_spectrum - .tof_indices - .iter() - .map(|&x| mz_reader.convert(x)) - .collect(), - intensities: self - .raw_spectrum - .intensities - .iter() - .map(|x| *x as f64) - .collect(), - precursor: QuadrupoleEvent::Precursor(precursor), - index: index, - }; - spectrum - } -} - -#[derive(Debug, PartialEq, Clone)] -pub enum RawProcessedSpectrumState { - Profile, - SmoothedProfile, - Centroided, - Unprocessed, -} - -impl Default for RawProcessedSpectrumState { - fn default() -> Self { - Self::Unprocessed - } -} - -#[derive(Debug, PartialEq, Default, Clone)] -pub struct RawSpectrum { - pub tof_indices: Vec, - pub intensities: Vec, - pub processed_state: RawProcessedSpectrumState, - pub index: usize, -} - -/// An MS2 spectrum with centroided mz values and summed intensities. -#[derive(Debug, PartialEq, Default)] -pub struct Spectrum { - pub mz_values: Vec, - pub intensities: Vec, - pub precursor: QuadrupoleEvent, - pub index: usize, -} diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..7021ffd --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,26 @@ +use std::{ + fs, + path::{Path, PathBuf}, +}; + +pub mod vec_utils; + +pub fn find_extension( + path: impl AsRef, + extension: &str, +) -> Option { + let extension_lower = extension.to_lowercase(); + for entry in fs::read_dir(&path).ok()? { + if let Ok(entry) = entry { + let file_path = entry.path(); + if let Some(file_name) = + file_path.file_name().and_then(|name| name.to_str()) + { + if file_name.to_lowercase().ends_with(&extension_lower) { + return Some(file_path); + } + } + } + } + None +} diff --git a/src/vec_utils.rs b/src/utils/vec_utils.rs similarity index 94% rename from src/vec_utils.rs rename to src/utils/vec_utils.rs index e6e894e..724fc3c 100644 --- a/src/vec_utils.rs +++ b/src/utils/vec_utils.rs @@ -61,9 +61,8 @@ pub fn find_sparse_local_maxima_mask( } pub fn filter_with_mask(vec: &Vec, mask: &Vec) -> Vec { - vec.iter() - .zip(mask.iter()) - .filter(|(_, y_elem)| **y_elem) - .map(|(&x_elem, _)| x_elem) + (0..vec.len()) + .filter(|&x| mask[x]) + .map(|x| vec[x]) .collect() } diff --git a/tests/frame_readers.rs b/tests/frame_readers.rs index aa7de7e..8804a32 100644 --- a/tests/frame_readers.rs +++ b/tests/frame_readers.rs @@ -1,5 +1,8 @@ -use std::path::Path; -use timsrust::{AcquisitionType, FileReader, Frame, FrameType}; +use std::{path::Path, sync::Arc}; +use timsrust::{ + ms_data::{AcquisitionType, Frame, MSLevel, QuadrupoleSettings}, + FileReader, +}; fn get_local_directory() -> &'static Path { Path::new(std::file!()) @@ -8,7 +11,7 @@ fn get_local_directory() -> &'static Path { } #[test] -fn tdf_reader_frames() { +fn tdf_reader_frames1() { let file_name = "test.d"; let file_path = get_local_directory() .join(file_name) @@ -16,7 +19,7 @@ fn tdf_reader_frames() { .unwrap() .to_string(); let frames: Vec = - FileReader::new(file_path).unwrap().read_all_frames(); + FileReader::new(&file_path).unwrap().read_all_ms1_frames(); let expected: Vec = vec![ Frame { scan_offsets: vec![0, 1, 3, 6, 10], @@ -24,34 +27,73 @@ fn tdf_reader_frames() { intensities: (0..10).map(|x| (x + 1) * 2).collect(), index: 1, rt: 0.1, - frame_type: FrameType::MS1, - }, - Frame { - scan_offsets: vec![0, 5, 11, 18, 26], - tof_indices: (10..36).collect(), - intensities: (10..36).map(|x| (x + 1) * 2).collect(), - index: 2, - rt: 0.2, - frame_type: FrameType::MS2(AcquisitionType::DDAPASEF), + ms_level: MSLevel::MS1, + quadrupole_settings: Arc::new(QuadrupoleSettings::default()), + acquisition_type: AcquisitionType::DDAPASEF, + intensity_correction_factor: 1.0 / 100.0, + window_group: 0, }, + // Frame::default(), Frame { scan_offsets: vec![0, 9, 19, 30, 42], tof_indices: (36..78).collect(), intensities: (36..78).map(|x| (x + 1) * 2).collect(), index: 3, rt: 0.3, - frame_type: FrameType::MS1, + ms_level: MSLevel::MS1, + quadrupole_settings: Arc::new(QuadrupoleSettings::default()), + acquisition_type: AcquisitionType::DDAPASEF, + intensity_correction_factor: 1.0 / 100.0, + window_group: 0, }, + // Frame::default(), + ]; + for i in 0..expected.len() { + assert_eq!(&frames[i], &expected[i]) + } +} + +#[test] +fn tdf_reader_frames2() { + let file_name = "test.d"; + let file_path = get_local_directory() + .join(file_name) + .to_str() + .unwrap() + .to_string(); + let frames: Vec = + FileReader::new(&file_path).unwrap().read_all_ms2_frames(); + let expected: Vec = vec![ + // Frame::default(), + Frame { + scan_offsets: vec![0, 5, 11, 18, 26], + tof_indices: (10..36).collect(), + intensities: (10..36).map(|x| (x + 1) * 2).collect(), + index: 2, + rt: 0.2, + ms_level: MSLevel::MS2, + quadrupole_settings: Arc::new(QuadrupoleSettings::default()), + acquisition_type: AcquisitionType::DDAPASEF, + intensity_correction_factor: 1.0 / 100.0, + window_group: 0, + }, + // Frame::default(), Frame { scan_offsets: vec![0, 13, 27, 42, 58], tof_indices: (78..136).collect(), intensities: (78..136).map(|x| (x + 1) * 2).collect(), index: 4, rt: 0.4, - frame_type: FrameType::MS2(AcquisitionType::DDAPASEF), + ms_level: MSLevel::MS2, + quadrupole_settings: Arc::new(QuadrupoleSettings::default()), + acquisition_type: AcquisitionType::DDAPASEF, + intensity_correction_factor: 1.0 / 100.0, + window_group: 0, }, ]; - for i in 0..frames.len() { + for i in 0..expected.len() { assert_eq!(&frames[i], &expected[i]) } } + +// TODO test for DIA diff --git a/tests/spectrum_readers.rs b/tests/spectrum_readers.rs index a19f228..085013f 100644 --- a/tests/spectrum_readers.rs +++ b/tests/spectrum_readers.rs @@ -1,5 +1,8 @@ use std::path::Path; -use timsrust::{FileReader, Precursor, QuadrupoleEvent, Spectrum}; +use timsrust::{ + ms_data::{Precursor, Spectrum}, + FileReader, +}; fn get_local_directory() -> &'static Path { Path::new(std::file!()) @@ -21,32 +24,36 @@ fn minitdf_reader() { Spectrum { mz_values: vec![100.0, 200.002, 300.03, 400.4], intensities: vec![1.0, 2.0, 3.0, 4.0], - precursor: QuadrupoleEvent::Precursor(Precursor { + precursor: Precursor { mz: 123.4567, rt: 12.345, im: 1.234, - charge: 1, - intensity: 0.0, + charge: Some(1), + intensity: Some(0.0), index: 1, frame_index: 1, - collision_energy: 0.0, - }), + }, index: 1, + collision_energy: 0.0, + isolation_mz: 123.4567, + isolation_width: 2.0, }, Spectrum { mz_values: vec![1100.0, 1200.002, 1300.03, 1400.4], intensities: vec![10.0, 20.0, 30.0, 40.0], - precursor: QuadrupoleEvent::Precursor(Precursor { + precursor: Precursor { mz: 987.6543, rt: 9.876, im: 0.9876, - charge: 2, - intensity: 0.0, + charge: Some(2), + intensity: Some(0.0), index: 2, frame_index: 2, - collision_energy: 0.0, - }), + }, index: 2, + collision_energy: 0.0, + isolation_mz: 987.6543, + isolation_width: 3.0, }, ]; for i in 0..spectra.len() { @@ -68,47 +75,53 @@ fn tdf_reader_dda() { Spectrum { mz_values: vec![199.7633445943076], intensities: vec![162.0], - precursor: QuadrupoleEvent::Precursor(Precursor { + precursor: Precursor { mz: 500.0, rt: 0.2, - im: 1.4989212513484358, - charge: 2, - intensity: 10.0, + im: 1.25, + charge: Some(2), + intensity: Some(10.0), index: 1, frame_index: 1, - collision_energy: 0.0, - }), + }, index: 0, + collision_energy: 0.0, + isolation_mz: 500.5, + isolation_width: 2.0, }, Spectrum { mz_values: vec![169.5419900362706, 695.6972509397959], intensities: vec![120.0, 624.0], - precursor: QuadrupoleEvent::Precursor(Precursor { + precursor: Precursor { mz: 501.0, rt: 0.2, - im: 1.4978425026968716, - charge: 3, - intensity: 10.0, + im: 1.0, + charge: Some(3), + intensity: Some(10.0), index: 2, frame_index: 1, - collision_energy: 0.0, - }), + }, index: 1, + collision_energy: 0.0, + isolation_mz: 501.5, + isolation_width: 2.0, }, Spectrum { mz_values: vec![827.1915846690921], intensities: vec![714.0], - precursor: QuadrupoleEvent::Precursor(Precursor { + precursor: Precursor { mz: 502.0, rt: 0.4, - im: 1.4989212513484358, - charge: 2, - intensity: 10.0, + im: 1.25, + charge: Some(2), + intensity: Some(10.0), index: 3, frame_index: 3, - collision_energy: 0.0, - }), + }, index: 2, + collision_energy: 0.0, + isolation_mz: 502.5, + isolation_width: 2.0, }, ]; for i in 0..spectra.len() { diff --git a/tests/test.d/analysis.tdf2 b/tests/test.d/analysis.tdf2 new file mode 100644 index 0000000..e69de29