From 75391bbba8a55733e1bc645ffd6800cf3e23d9b2 Mon Sep 17 00:00:00 2001 From: Michael Lazear Date: Wed, 19 Jul 2023 14:26:12 -0700 Subject: [PATCH] Fix edge cases in parser (#78) - Add test case, bump version to v0.13.4 --- CHANGELOG.md | 4 + Cargo.lock | 16 +++- crates/sage-cli/Cargo.toml | 2 +- crates/sage/Cargo.toml | 4 +- crates/sage/src/mzml.rs | 184 +++++++++++++++++++++++++++---------- 5 files changed, 155 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9876f23..cb70589 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.13.4] +### Fixed +- Bug in mzML parser, where some older specification-compliant mzMLs would not parse. If your mzMLs previously parsed, then there will be no change in behavior. Added a test case + ## [v0.13.3] ### Fixed - Bug in `database.enzyme.restrict` parameter, where `null` values were being overriden with "P" (causing Trypsin/P to behave like Trypsin) diff --git a/Cargo.lock b/Cargo.lock index aacb6fb..73a3ddb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1450,7 +1450,7 @@ checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" [[package]] name = "sage-cli" -version = "0.13.3" +version = "0.13.4" dependencies = [ "anyhow", "clap", @@ -1485,7 +1485,7 @@ dependencies = [ [[package]] name = "sage-core" -version = "0.13.3" +version = "0.13.4" dependencies = [ "async-compression", "base64 0.13.1", @@ -1749,9 +1749,21 @@ dependencies = [ "mio", "pin-project-lite", "socket2", + "tokio-macros", "winapi", ] +[[package]] +name = "tokio-macros" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tokio-rustls" version = "0.23.4" diff --git a/crates/sage-cli/Cargo.toml b/crates/sage-cli/Cargo.toml index 87e886d..caad239 100644 --- a/crates/sage-cli/Cargo.toml +++ b/crates/sage-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sage-cli" -version = "0.13.3" +version = "0.13.4" authors = ["Michael Lazear { $ev.try_get_attribute($key)? - .ok_or_else(|| MzMLError::Malformed)? + .ok_or(MzMLError::Malformed)? .value }; } + macro_rules! extract_value { + ($ev:expr) => {{ + let s = $ev + .try_get_attribute(b"value")? + .ok_or(MzMLError::Malformed)? + .value; + std::str::from_utf8(&s)?.parse()? + }}; + } + loop { match reader.read_event_into_async(&mut buf).await { Ok(Event::Start(ref ev)) => { @@ -176,8 +186,7 @@ impl MzMLReader { Ok(Event::Empty(ref ev)) => match (state, ev.name().into_inner()) { (Some(State::BinaryDataArray), b"cvParam") => { let accession = extract!(ev, b"accession"); - let accession = std::str::from_utf8(&accession)?; - match accession { + match accession.as_ref() { ZLIB_COMPRESSION => compression = true, NO_COMPRESSION => compression = false, FLOAT_64 => binary_dtype = Dtype::F64, @@ -193,11 +202,9 @@ impl MzMLReader { } (Some(State::Spectrum), b"cvParam") => { let accession = extract!(ev, b"accession"); - let accession = std::str::from_utf8(&accession)?; - match accession { + match accession.as_ref() { MS_LEVEL => { - let level = extract!(ev, b"value"); - let level = std::str::from_utf8(&level)?.parse::()?; + let level = extract_value!(ev); if let Some(filter) = self.ms_level { if level != filter { spectrum = Spectrum::default(); @@ -209,8 +216,7 @@ impl MzMLReader { PROFILE => spectrum.representation = Representation::Profile, CENTROID => spectrum.representation = Representation::Centroid, TOTAL_ION_CURRENT => { - let value = extract!(ev, b"value"); - let value = std::str::from_utf8(&value)?.parse::()?; + let value = extract_value!(ev); if value == 0.0 { // No ion current, break out of current state spectrum = Spectrum::default(); @@ -224,48 +230,35 @@ impl MzMLReader { } (Some(State::Precursor), b"cvParam") => { let accession = extract!(ev, b"accession"); - let accession = std::str::from_utf8(&accession)?; - let value = extract!(ev, b"value"); - let value = std::str::from_utf8(&value)?; - match accession { - ISO_WINDOW_LOWER => { - iso_window_lo = Some(value.parse()?); - } - ISO_WINDOW_UPPER => { - iso_window_hi = Some(value.parse()?); - } + match accession.as_ref() { + ISO_WINDOW_LOWER => iso_window_lo = Some(extract_value!(ev)), + ISO_WINDOW_UPPER => iso_window_hi = Some(extract_value!(ev)), _ => {} } } (Some(State::SelectedIon), b"cvParam") => { let accession = extract!(ev, b"accession"); - let accession = std::str::from_utf8(&accession)?; - let value = extract!(ev, b"value"); - let value = std::str::from_utf8(&value)?; - match accession { + match accession.as_ref() { SELECTED_ION_CHARGE => { - precursor.charge = Some(value.parse()?); + precursor.charge = Some(extract_value!(ev)); } SELECTED_ION_MZ => { - precursor.mz = value.parse()?; + precursor.mz = extract_value!(ev); } SELECTED_ION_INT => { - precursor.intensity = Some(value.parse()?); + precursor.intensity = Some(extract_value!(ev)); } _ => {} } } (Some(State::Scan), b"cvParam") => { let accession = extract!(ev, b"accession"); - let accession = std::str::from_utf8(&accession)?; - let value = extract!(ev, b"value"); - let value = std::str::from_utf8(&value)?; - match accession { + match accession.as_ref() { SCAN_START_TIME => { - spectrum.scan_start_time = value.parse()?; + spectrum.scan_start_time = extract_value!(ev); } ION_INJECTION_TIME => { - spectrum.ion_injection_time = value.parse()?; + spectrum.ion_injection_time = extract_value!(ev); } _ => {} } @@ -451,3 +444,94 @@ impl From for MzMLError { Self::Malformed } } + +#[cfg(test)] +mod test { + use crate::{mass::Tolerance, mzml::Representation}; + + use super::{MzMLError, MzMLReader}; + + #[tokio::test] + async fn parse_spectrum_issue_78() -> Result<(), MzMLError> { + let s = r#" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + AAAAoExpYkAAAACA3MpkQAAAAACph2VAAAAAAE4wZkAAAACAlMdmQAAAAECZAmdAAAAAwP9jaEAAAADgj4ZoQAAAAGC7HWlAAAAAAOXFaUAAAADg+4dqQAAAAMC1pmpAAAAA4IGFa0AAAACAaUZsQAAAACBzYW1AAAAAANCjbUAAAACAQ6duQAAAAIDsxG5AAAAAQKIlb0AAAACA5z9vQAAAAIDuw29AAAAAAJQicEAAAAAg9UZwQAAAAKCeVHBAAAAAIInEcEAAAACAcs5wQAAAAOA6BHFAAAAAADoOcUAAAAAgfcRxQAAAAOA68nFAAAAAoPExckAAAADATKVyQAAAAMC10nJAAAAAwBJHc0AAAAAA7FNzQAAAAIAYkXNAAAAAgJzRc0AAAABgE2R0QAAAAMCrc3RAAAAAgE+zdEAAAAAAhMR0QAAAAIC64XRAAAAA4Cf/dEAAAADgy3B1QAAAAMCVgnVAAAAAoDugdUAAAACAX/Z1QAAAAAAAB3ZAAAAAgO4XdkAAAABAqEJ2QAAAAIDp8nZAAAAAIAgRd0AAAACggzR3QAAAAODwT3dAAAAAIHJsd0AAAAAA4YJ3QAAAAGC91ndAAAAAAL3id0AAAADg0xZ4QAAAAOA5NXhAAAAAYDaPeEAAAACgK7p4QAAAACCm0XhAAAAA4GHkeEAAAADgyPJ4QAAAAOB5/3hAAAAAoFtNeUAAAADA8H15QAAAAGAHtXlAAAAAoD7HeUAAAAAAEtR5QAAAAGCx5XlAAAAA4NEJekAAAAAgtVN6QAAAACDCX3pAAAAAIAqmekAAAACg4OR6QAAAAGDymnxAAAAAICV/fUAAAAAgd6Z9QAAAAKDYA4BAAAAAoCoVgEAAAACA/kOAQAAAAKCpYoBAAAAA4MycgEAAAADA3DyBQAAAAKCbrIFAAAAAoPC6gUAAAADgV22CQAAAACABY4NAAAAAQE+qg0AAAADA0vKDQAAAAEDz+oNAAAAAoIxrhEAAAADg6euEQAAAAIAuDIVAAAAAoOwjhUAAAACgZUuFQAAAAADdm4VAAAAAoCzrh0AAAABgYvWHQAAAAOALCohA + + + + + + 3FlbQDg/ZUB8w3FAV2fMQMiOnkCXfP4/T2I2QC6qskAnhOZA/NU2QCc2QEAI1UhAQcAbQRrziUBmHq5AXutSQWZDbkAZGWdAzt6lQYNptUDSFDNBoY4IQAYaQEDeT7Q/16HGP9GtXUCITrQ/Rxu0Pzhc6j9mpjZAX1X8P7tPQ0AqxS5BZTzZPye+m0B7Sa5AfPsPQRr/W0CYwBRBwDh3QMAmtD/nq6E/bJHGPxJ9UUDsy/dAoCYMQRM2a0BkAR9Boo5pQMV0VEArYu5A4kaMQAyTI0BQPRJAML3TQCKVCED85+tArObGP1BVP0EtJuVAdyKAQFjctkFQa2NBixMTQXyyjUFX8eo/IHelQTdFcEFo1zZAhagsQAO53EBIugRB0M+gQfhBgkH0MsJAbGlIQZXg+EHe6CZBsbA2QHMHOECtW6BAjE2oQUpZckBasZ1AtKl3QEZYIUHkip1AQX7TQPqF60GNuaE/USk2QGLF40Im65ZAmXqlQBGuSUC70KBAAneMQeK3aEB87MVA5NigQE/Wb0BO475A + + + + "#; + let mut spectra = MzMLReader::default().parse(s.as_bytes()).await?; + + assert_eq!(spectra.len(), 1); + let s = spectra.pop().unwrap(); + + assert_eq!(s.id, "spectrum=2442"); + assert_eq!(s.ms_level, 2); + assert_eq!(s.representation, Representation::Centroid); + assert_eq!(s.precursors.len(), 1); + assert_eq!(s.precursors[0].charge, Some(2)); + assert!((s.precursors[0].mz - 457.723968) < 0.0001); + assert_eq!( + s.precursors[0].isolation_window, + Some(Tolerance::Da(-1.5, 0.75)) + ); + assert!((s.scan_start_time - 1503.96166992188) < 0.0001); + assert_eq!(s.ion_injection_time, 0.0); + assert_eq!(s.intensity.len(), s.mz.len()); + Ok(()) + } +}