From ac14be87b3580f88a6a22cbbb6ae679551324732 Mon Sep 17 00:00:00 2001 From: mholt Date: Fri, 2 Feb 2024 06:20:36 -0800 Subject: [PATCH] syncing changes for 1.3.0 --- CHANGELOG.md | 10 ++ Cargo.lock | 56 +++++---- Cargo.toml | 3 +- LICENSE-THIRDPARTY.json | 19 ++- src/block_gen.rs | 8 +- src/data_types/variants.rs | 244 +++++++++++++++++++++++++------------ src/phaser.rs | 15 ++- src/wfa_graph.rs | 40 +++--- 8 files changed, 265 insertions(+), 130 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 078bdfe..4deba2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +# v1.3.0 +## Changes +- Relaxes the requirements for SV deletion and insertion events such that they no longer require an alternate or reference allele, respectively, to have length 1 + +## Internal changes +- The interface for variant creation was modified to reduce panics from invalid variant construction. This modification changes all the return types for the various `Variant::new*(...)` functions from `Variant` to `Result`. + +## Fixed +- SV events with a placeholder ALT sequence (e.g., \, \) are now properly ignored by HiPhase instead of creating an error. + # v1.2.1 ## Fixed - Fixed [a rare issue](https://github.com/PacificBiosciences/pbbioconda/issues/640) where reference alleles with stripped IUPAC codes were throwing errors due to reference mismatch diff --git a/Cargo.lock b/Cargo.lock index 4b51140..b5decdd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -232,7 +232,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -340,7 +340,7 @@ dependencies = [ "proc-macro2", "quote", "scratch", - "syn", + "syn 1.0.102", ] [[package]] @@ -357,7 +357,7 @@ checksum = "0b75aed41bb2e6367cae39e6326ef817a851db13c13e4f3263714ca3cfb8de56" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -374,7 +374,7 @@ checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -400,7 +400,7 @@ checksum = "84278eae0af6e34ff6c1db44c11634a694aafac559ff3080e4db4e4ac35907aa" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -491,7 +491,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -523,7 +523,7 @@ dependencies = [ [[package]] name = "hiphase" -version = "1.2.1" +version = "1.3.0" dependencies = [ "bio", "bit-vec", @@ -541,6 +541,7 @@ dependencies = [ "rustc-hash", "serde", "simple-error", + "thiserror", "threadpool", "vergen", ] @@ -775,7 +776,7 @@ checksum = "01fcc0b8149b4632adc89ac3b7b31a12fb6099a0317a4eb2ebff574ef7de7218" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -933,7 +934,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn", + "syn 1.0.102", "version_check", ] @@ -950,9 +951,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.58" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -965,9 +966,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quote" -version = "1.0.27" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -1127,7 +1128,7 @@ checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", ] [[package]] @@ -1184,7 +1185,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn", + "syn 1.0.102", ] [[package]] @@ -1198,6 +1199,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn" +version = "2.0.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "termcolor" version = "1.1.3" @@ -1209,22 +1221,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.37" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.37" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -1407,7 +1419,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 1.0.102", "wasm-bindgen-shared", ] @@ -1429,7 +1441,7 @@ checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.102", "wasm-bindgen-backend", "wasm-bindgen-shared", ] diff --git a/Cargo.toml b/Cargo.toml index 383a808..72d2ae3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "hiphase" -version = "1.2.1" +version = "1.3.0" authors = ["J. Matthew Holt "] description = "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data" edition = "2021" @@ -31,6 +31,7 @@ rust-htslib = { version = "0.39.5", default-features = false, features = ["stati rustc-hash = "1.1.0" serde = "1.0.147" simple-error = "0.2.3" +thiserror = "1.0.56" threadpool = "1.8.1" [profile.release] diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json index a7f7d8a..d43ad74 100644 --- a/LICENSE-THIRDPARTY.json +++ b/LICENSE-THIRDPARTY.json @@ -496,7 +496,7 @@ }, { "name": "hiphase", - "version": "1.2.1", + "version": "1.3.0", "authors": "J. Matthew Holt ", "repository": null, "license": null, @@ -910,7 +910,7 @@ }, { "name": "proc-macro2", - "version": "1.0.58", + "version": "1.0.78", "authors": "David Tolnay |Alex Crichton ", "repository": "https://github.com/dtolnay/proc-macro2", "license": "Apache-2.0 OR MIT", @@ -928,7 +928,7 @@ }, { "name": "quote", - "version": "1.0.27", + "version": "1.0.35", "authors": "David Tolnay ", "repository": "https://github.com/dtolnay/quote", "license": "Apache-2.0 OR MIT", @@ -1160,6 +1160,15 @@ "license_file": null, "description": "Parser for Rust source code" }, + { + "name": "syn", + "version": "2.0.48", + "authors": "David Tolnay ", + "repository": "https://github.com/dtolnay/syn", + "license": "Apache-2.0 OR MIT", + "license_file": null, + "description": "Parser for Rust source code" + }, { "name": "termcolor", "version": "1.1.3", @@ -1171,7 +1180,7 @@ }, { "name": "thiserror", - "version": "1.0.37", + "version": "1.0.56", "authors": "David Tolnay ", "repository": "https://github.com/dtolnay/thiserror", "license": "Apache-2.0 OR MIT", @@ -1180,7 +1189,7 @@ }, { "name": "thiserror-impl", - "version": "1.0.37", + "version": "1.0.56", "authors": "David Tolnay ", "repository": "https://github.com/dtolnay/thiserror", "license": "Apache-2.0 OR MIT", diff --git a/src/block_gen.rs b/src/block_gen.rs index cc2e4ca..8130249 100644 --- a/src/block_gen.rs +++ b/src/block_gen.rs @@ -152,7 +152,7 @@ pub fn is_phasable_variant(record: &bcf::Record, sample_index: usize, min_qualit VariantType::SvDuplication | VariantType::SvInversion | VariantType::SvBreakend | - VariantType::Unknown=> { Ok(false) } + VariantType::Unknown => { Ok(false) } } } } @@ -231,6 +231,12 @@ pub fn get_variant_type(record: &bcf::Record) -> Result; if so, return Unknown because we do not want to phase it + let alt_allele = std::str::from_utf8(record.alleles()[1]).unwrap_or(""); + if alt_allele.starts_with('<') && alt_allele.ends_with('>') { + return Ok(VariantType::Unknown); + } let svtype_str = std::str::from_utf8(svtype[0]).unwrap(); let sv_tag = match svtype_str { diff --git a/src/data_types/variants.rs b/src/data_types/variants.rs index d540453..0ac63e0 100644 --- a/src/data_types/variants.rs +++ b/src/data_types/variants.rs @@ -15,9 +15,9 @@ pub enum VariantType { Deletion, /// REF and ALT lengths > 1 Indel, - /// Must have two alleles and be tagged with SVTYPE=INS + /// Must have two alleles and be tagged with SVTYPE=INS; ALT >= REF SvInsertion, - /// Must have two alleles and be tagged with SVTYPE=DEL + /// Must have two alleles and be tagged with SVTYPE=DEL; ALT <= REF SvDeletion, /// Must have two alleles and be tagged with SVTYPE=DUP SvDuplication, @@ -40,6 +40,26 @@ pub enum Zygosity { Unknown // make sure Unknown is always the last one in the list } +#[derive(thiserror::Error, Debug)] +pub enum VariantError { + #[error("allele0 length must match reference length when index_allele0=0")] + Allele0RefLen, + #[error("allele{index} must be length 1")] + AlleleLen1{ index: usize }, + #[error("allele{index} is empty (length = 0)")] + EmptyAllele{ index: usize }, + #[error("index_allele0 must be < index_allele1")] + IndexAlleleOrder, + #[error("{variant_type:?} does not support multi-allelic sites")] + MultiAllelicNotAllowed { variant_type: VariantType }, + #[error("reference must have length > 1")] + RefLenGT1, + #[error("SV deletion ALT length must be <= REF length")] + SvDeletionLen, + #[error("SV insertion ALT length must be >= REF length")] + SvInsertionLen +} + /// A variant definition structure. /// It currently assumes that chromosome is fixed and that the variant is a SNP. #[derive(Clone, Debug, Eq, PartialEq)] @@ -85,14 +105,21 @@ impl Variant { /// # Panics /// * if `index_allele0 > index_allele1` /// * if the provided sequences do not match a single-nucleotide variant - pub fn new_snv(vcf_index: usize, position: i64, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Variant { + pub fn new_snv(vcf_index: usize, position: i64, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Result { // we always assume alleles come "sorted" and they are heterozygous - assert!(index_allele0 < index_allele1); - + if index_allele0 >= index_allele1 { + return Err(VariantError::IndexAlleleOrder); + } + // SNV alleles must be length 1 - assert_eq!(allele0.len(), 1); - assert_eq!(allele1.len(), 1); - Variant { + if allele0.len() != 1 { + return Err(VariantError::AlleleLen1 { index: 0 }); + } + if allele1.len() != 1 { + return Err(VariantError::AlleleLen1 { index: 1 }); + } + + Ok(Variant { vcf_index, variant_type: VariantType::Snv, position, @@ -104,7 +131,7 @@ impl Variant { index_allele0, index_allele1, is_ignored: false - } + }) } /// Creates a new deletion variant. @@ -121,25 +148,35 @@ impl Variant { /// * if `index_allele0 > index_allele1` /// * if the provided sequences do not match a deletion variant /// * if the reference allele is passed in and it does not have the same length as `ref_len` - pub fn new_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Variant { + pub fn new_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Result { // we always assume alleles come "sorted" and they are heterozygous - assert!(index_allele0 < index_allele1); + if index_allele0 >= index_allele1 { + return Err(VariantError::IndexAlleleOrder); + } // reference length must be greater than 1 to be a deletion - assert!(ref_len > 1); + if ref_len <= 1 { + return Err(VariantError::RefLenGT1); + } if index_allele0 == 0 { // this allele is also the reference allele - assert_eq!(allele0.len(), ref_len); + if allele0.len() != ref_len { + return Err(VariantError::Allele0RefLen); + } } else { // this allele is not the reference, must be a multi-allelic site; but all deletion alts have len = 1 - assert!(allele0.len() == 1); + if allele0.len() != 1 { + return Err(VariantError::AlleleLen1 { index: 0 }); + } } + // this one must always be length 1 - assert!(allele1.len() == 1); + if allele1.len() != 1{ + return Err(VariantError::AlleleLen1 { index: 1 }); + } // make sure the alleles start with the same thing - // assert_eq!(allele0[0], allele1[0]); if allele0[0] != allele1[0] { /* Counter example to requiring alleleX[0] be equal; this is rare, but it does seem to happen @@ -147,7 +184,7 @@ impl Variant { */ trace!("Deletion alleles are unexpected: {position}, {ref_len}, {allele0:?}, {allele1:?}"); } - Variant { + Ok(Variant { vcf_index, variant_type: VariantType::Deletion, position, @@ -159,7 +196,7 @@ impl Variant { index_allele0, index_allele1, is_ignored: false - } + }) } /// Creates a new insertion variant. @@ -174,28 +211,36 @@ impl Variant { /// # Panics /// * if `index_allele0 > index_allele1` /// * if the provided sequences do not match an insertion variant - pub fn new_insertion(vcf_index: usize, position: i64, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Variant { + pub fn new_insertion(vcf_index: usize, position: i64, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Result { // we always assume alleles come "sorted" and they are heterozygous - assert!(index_allele0 < index_allele1); + if index_allele0 >= index_allele1 { + return Err(VariantError::IndexAlleleOrder); + } if index_allele0 == 0 { // if reference allele is present, it must be length 1 for this type - assert_eq!(allele0.len(), 1); + if allele0.len() != 1 { + return Err(VariantError::AlleleLen1 { index: 0 }); + } } else { // allele0 isn't reference, so it must be >= 1 due to multi-allelics // chr1 2122634 . T C,TG 14.1 - assert!(!allele0.is_empty(), "{position} {allele0:?}"); + if allele0.is_empty() { + return Err(VariantError::EmptyAllele { index: 0 }); + } } // we have to do >= because of some multi-allelics: // chr1 286158 . A ATG,G 34.4 - assert!(!allele1.is_empty(), "{position} {allele1:?}"); + if allele1.is_empty() { + return Err(VariantError::EmptyAllele { index: 1 }); + } // make sure the alleles start with the same thing if allele0[0] != allele1[0] { // no counter example searched for yet, but probably exists, we'll leave this trace for now trace!("Insertion alleles are unexpected: {position}, 1, {allele0:?}, {allele1:?}"); } - Variant { + Ok(Variant { vcf_index, variant_type: VariantType::Insertion, position, @@ -207,7 +252,7 @@ impl Variant { index_allele0, index_allele1, is_ignored: false - } + }) } /// Creates a new indel variant. @@ -224,28 +269,39 @@ impl Variant { /// * if `index_allele0 > index_allele1` /// * if the provided sequences do not match an indel variant /// * if the reference allele is passed in and it does not have the same length as `ref_len` - pub fn new_indel(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Variant { + pub fn new_indel(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Result { // we always assume alleles come "sorted" and they are heterozygous - assert!(index_allele0 < index_allele1); + if index_allele0 >= index_allele1 { + return Err(VariantError::IndexAlleleOrder); + } // reference length must be greater than 1 to be an indel, but ALTs can really be any length after that (>=1 anyways) - assert!(ref_len > 1); + if ref_len <= 1 { + return Err(VariantError::RefLenGT1); + } if index_allele0 == 0 { // this allele is also the reference allele - assert_eq!(allele0.len(), ref_len); + if allele0.len() != ref_len { + return Err(VariantError::Allele0RefLen); + } } else { // it's not a reference allele, since this is an indel, length can be anything >= 1 - assert!(!allele0.is_empty()); + if allele0.is_empty() { + return Err(VariantError::EmptyAllele { index: 0 }); + } } + // this one just has to be >= 1 - assert!(!allele1.is_empty()); + if allele1.is_empty() { + return Err(VariantError::EmptyAllele { index: 1 }); + } // there's no real reason to believe in any shared sequence between alleles // we've seen it not work above, not worth even trying to codify warning here IMO // assert!(???) - Variant { + Ok(Variant { vcf_index, variant_type: VariantType::Indel, position, @@ -257,7 +313,7 @@ impl Variant { index_allele0, index_allele1, is_ignored: false - } + }) } /// Creates a new SV deletion variant. @@ -274,19 +330,31 @@ impl Variant { /// * if `index_allele0 > index_allele1` /// * if the provided sequences do not match a deletion variant /// * if the reference allele is passed in and it does not have the same length as `ref_len` - pub fn new_sv_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Variant { + pub fn new_sv_deletion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Result { // we always assume alleles come "sorted" and they are heterozygous - assert!(index_allele0 < index_allele1); + if index_allele0 >= index_allele1 { + return Err(VariantError::IndexAlleleOrder); + } - // this is one difference from plain Deletion - assert_eq!(index_allele0, 0); - assert_eq!(index_allele1, 1); + // this is one difference from plain Deletion, must be 0 and 1 in VCF + if index_allele0 != 0 || index_allele1 != 1 { + return Err(VariantError::MultiAllelicNotAllowed { variant_type: VariantType::SvDeletion }) + } // this allele is also the reference allele - assert_eq!(allele0.len(), ref_len); + if allele0.len() != ref_len { + return Err(VariantError::Allele0RefLen); + } + + // restriction lifted such that now allele0 must be >= allele1 + if allele1.len() > allele0.len() { + return Err(VariantError::SvDeletionLen); + } - // this one must always be length 1 - assert!(allele1.len() == 1); + // allele1 is always <= allele0, so just make sure it's not empty + if allele1.is_empty() { + return Err(VariantError::EmptyAllele { index: 1 }); + } // make sure the alleles start with the same thing if allele0[0] != allele1[0] { @@ -296,7 +364,7 @@ impl Variant { */ trace!("Deletion alleles are unexpected: {}, {}, {:?}, {:?}", position, ref_len, allele0, allele1); } - Variant { + Ok(Variant { vcf_index, variant_type: VariantType::SvDeletion, position, @@ -308,7 +376,7 @@ impl Variant { index_allele0, index_allele1, is_ignored: false - } + }) } /// Creates a new SV insertion variant. @@ -316,6 +384,7 @@ impl Variant { /// # Arguments /// * `vcf_index` - the index of the source VCF file /// * `position` - the coordinate of the variant in a contig + /// * `ref_len` - the length of the reference allele /// * `allele0` - the first allele (usually REF) /// * `allele1` - the second allele (usually ALT[0]) /// * `index_allele0` - the index for allele0, typically 0 for REF @@ -323,31 +392,42 @@ impl Variant { /// # Panics /// * if `index_allele0 > index_allele1` /// * if the provided sequences do not match an insertion variant - pub fn new_sv_insertion(vcf_index: usize, position: i64, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Variant { + pub fn new_sv_insertion(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Result { // we always assume alleles come "sorted" and they are heterozygous - assert!(index_allele0 < index_allele1); + if index_allele0 >= index_allele1 { + return Err(VariantError::IndexAlleleOrder); + } - // this is one difference from plain Insertion - assert_eq!(index_allele0, 0); - assert_eq!(index_allele1, 1); + // this is one difference from plain Insertion, must be 0 and 1 in VCF + if index_allele0 != 0 || index_allele1 != 1 { + return Err(VariantError::MultiAllelicNotAllowed { variant_type: VariantType::SvInsertion }) + } - // if reference allele is present, it must be length 1 for this type - assert_eq!(allele0.len(), 1); + // this allele is also the reference allele + if allele0.len() != ref_len { + return Err(VariantError::Allele0RefLen); + } + + // restriction lifted such that now allele1 must be >= allele0 + if allele1.len() < allele0.len() { + return Err(VariantError::SvInsertionLen); + } - // we have to do >= because of some multi-allelics: - // chr1 286158 . A ATG,G 34.4 - assert!(!allele1.is_empty(), "{position} {allele1:?}"); + // allele0 is always <= allele1, so just make sure it's not empty + if allele0.is_empty() { + return Err(VariantError::EmptyAllele { index: 0 }); + } // make sure the alleles start with the same thing if allele0[0] != allele1[0] { // no counter example searched for yet, but probably exists, we'll leave this trace for now trace!("Insertion alleles are unexpected: {}, {}, {:?}, {:?}", position, 1, allele0, allele1); } - Variant { + Ok(Variant { vcf_index, variant_type: VariantType::SvInsertion, position, - ref_len: 1, + ref_len, prefix_len: 0, postfix_len: 0, allele0, @@ -355,7 +435,7 @@ impl Variant { index_allele0, index_allele1, is_ignored: false - } + }) } /// Creates a new tandem repeat variant, functionally these act very similar to indel types. @@ -372,24 +452,30 @@ impl Variant { /// * if `index_allele0 > index_allele1` /// * if the provided sequences do not match a tandem repeat variant /// * if the reference allele is passed in and it does not have the same length as `ref_len` - pub fn new_tandem_repeat(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Variant { + pub fn new_tandem_repeat(vcf_index: usize, position: i64, ref_len: usize, allele0: Vec, allele1: Vec, index_allele0: u8, index_allele1: u8) -> Result { // we always assume alleles come "sorted" and they are heterozygous - assert!(index_allele0 < index_allele1); + if index_allele0 >= index_allele1 { + return Err(VariantError::IndexAlleleOrder); + } // all alleles must be >= 1 for tandem repeats, most are longer though - assert!(ref_len >= 1); + if allele0.is_empty() { + return Err(VariantError::EmptyAllele { index: 0 }); + } + if allele1.is_empty() { + return Err(VariantError::EmptyAllele { index: 1 }); + } if index_allele0 == 0 { - // this allele is also the reference allele - assert_eq!(allele0.len(), ref_len); + // this allele is also the reference allele, make sure the lengths match + if allele0.len() != ref_len { + return Err(VariantError::Allele0RefLen); + } } else { - // it's not a reference allele, since this is an indel, length can be anything >= 1 - assert!(!allele0.is_empty()); + // it's not a reference allele, so nothing to check here } - // this one just has to be >= 1 - assert!(!allele1.is_empty()); - Variant { + Ok(Variant { vcf_index, variant_type: VariantType::TandemRepeat, position, @@ -401,7 +487,7 @@ impl Variant { index_allele0, index_allele1, is_ignored: false - } + }) } /// This will add a prefix to each allele, generally reference genome sequence that will allow for better matching. @@ -556,7 +642,9 @@ impl Variant { /// This will return the index allele for a given haplotype index. /// Input must always be 0 or 1, but it might get converted to something else at multi-allelic sites. /// # Arguments - /// * `index` - must be 0 or 1 + /// * `index` - must be 0, 1, or 2 (unknown) + /// # Panics + /// * if anything other than 0, 1, or 2 is provided pub fn convert_index(&self, index: u8) -> u8 { if index == 0 { self.index_allele0 @@ -581,7 +669,7 @@ mod tests { 0, 1, b"A".to_vec(), b"C".to_vec(), 0, 1 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::Snv); assert_eq!(variant.position(), 1); assert_eq!(variant.get_ref_len(), 1); @@ -601,7 +689,7 @@ mod tests { 0, 10, 3, b"AGT".to_vec(), b"A".to_vec(), 0, 1 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::Deletion); assert_eq!(variant.position(), 10); assert_eq!(variant.get_ref_len(), 3); @@ -614,7 +702,7 @@ mod tests { 0, 10, 4, b"C".to_vec(), b"A".to_vec(), 1, 2 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::Deletion); assert_eq!(variant.position(), 10); assert_eq!(variant.get_ref_len(), 4); @@ -631,7 +719,7 @@ mod tests { 0, 20, b"A".to_vec(), b"AGT".to_vec(), 0, 1 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::Insertion); assert_eq!(variant.position(), 20); assert_eq!(variant.get_ref_len(), 1); @@ -647,7 +735,7 @@ mod tests { 0, 20, 2, b"A".to_vec(), b"AGT".to_vec(), 1, 2 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::Indel); assert_eq!(variant.position(), 20); assert_eq!(variant.get_ref_len(), 2); @@ -659,10 +747,10 @@ mod tests { #[test] fn test_sv_insertion() { let variant = Variant::new_sv_insertion( - 0, 20, + 0, 20, 1, b"A".to_vec(), b"AGT".to_vec(), 0, 1 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::SvInsertion); assert_eq!(variant.position(), 20); assert_eq!(variant.get_ref_len(), 1); @@ -679,7 +767,7 @@ mod tests { 0, 10, 3, b"AGT".to_vec(), b"A".to_vec(), 0, 1 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::SvDeletion); assert_eq!(variant.position(), 10); assert_eq!(variant.get_ref_len(), 3); @@ -697,7 +785,7 @@ mod tests { b"AAAC".to_vec(), b"AAACAAAC".to_vec(), 0, 1 - ); + ).unwrap(); assert_eq!(variant.get_type(), VariantType::TandemRepeat); assert_eq!(variant.position(), 10); assert_eq!(variant.get_ref_len(), 4); @@ -714,7 +802,7 @@ mod tests { 0, 20, 2, b"A".to_vec(), b"AGT".to_vec(), 1, 2 - ); + ).unwrap(); // make sure no fixins yet assert_eq!(variant.get_prefix_len(), 0); diff --git a/src/phaser.rs b/src/phaser.rs index e94aa90..0a05243 100644 --- a/src/phaser.rs +++ b/src/phaser.rs @@ -173,7 +173,7 @@ fn load_variant_calls( let allele0: Vec = all_alleles[index_allele0 as usize].to_vec(); let allele1: Vec = all_alleles[index_allele1 as usize].to_vec(); - let mut new_variant = match variant_type { + let new_variant_result = match variant_type { VariantType::Snv => { Variant::new_snv( pop_index, position, allele0, allele1, index_allele0, index_allele1 @@ -207,7 +207,7 @@ fn load_variant_calls( }, VariantType::SvInsertion => { Variant::new_sv_insertion( - pop_index, position, allele0, allele1, index_allele0, index_allele1 + pop_index, position, ref_len, allele0, allele1, index_allele0, index_allele1 ) }, VariantType::TandemRepeat => { @@ -226,6 +226,13 @@ fn load_variant_calls( } }; + let mut new_variant = match new_variant_result { + Ok(nv) => nv, + Err(e) => { + bail!("Error processing variant in VCF#{pop_index} at {}:{} : {e}", region.get_chrom(), position+1); + } + }; + if reference_buffer > 0 && !is_homozygous { // we have a reference genome and a desire buffer, extend our alleles let mut ref_prefix_start: usize = if (position as usize) > reference_buffer { @@ -678,7 +685,7 @@ pub fn create_unphased_result(phase_problem: &PhaseBlock) -> (PhaseResult, Haplo vec![1], 0, 1 - ); + ).unwrap(); variant_calls.extend(std::iter::repeat(dummy_variant).take(vi_count)); } @@ -836,7 +843,7 @@ mod tests { ).unwrap(); // we should just have the one variant in the block with the translated IUPAC code - let mut expected_variant = Variant::new_indel(0, 4, 4, b"ANGT".to_vec(), b"AT".to_vec(), 0, 1); + let mut expected_variant = Variant::new_indel(0, 4, 4, b"ANGT".to_vec(), b"AT".to_vec(), 0, 1).unwrap(); expected_variant.add_reference_prefix(b"GT"); expected_variant.add_reference_postfix(b"AC"); diff --git a/src/wfa_graph.rs b/src/wfa_graph.rs index 20b065c..1284db3 100644 --- a/src/wfa_graph.rs +++ b/src/wfa_graph.rs @@ -821,7 +821,7 @@ mod tests { #[test] fn test_simple_snv() { let reference = "AAA".as_bytes(); - let variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + let variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap()]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); @@ -846,8 +846,8 @@ mod tests { let reference = "AAAAA".as_bytes(); let variants = vec![ // vcf_index, position, allele0, allele1, index_allele0, index_allele1 - Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1), - Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1) + Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap(), + Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap() ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = @@ -886,8 +886,8 @@ mod tests { let reference = "ACGTA".as_bytes(); let variants = vec![ // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1 - Variant::new_deletion(0, 1, 2, "CG".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1), - Variant::new_deletion(0, 2, 2, "GT".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1) + Variant::new_deletion(0, 1, 2, "CG".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap(), + Variant::new_deletion(0, 2, 2, "GT".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1).unwrap() ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = @@ -925,8 +925,8 @@ mod tests { let reference = "ACGTA".as_bytes(); let variants = vec![ // vcf_index, position, allele0, allele1, index_allele0, index_allele1 - Variant::new_insertion(0, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1), - Variant::new_insertion(1, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1) + Variant::new_insertion(0, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1).unwrap(), + Variant::new_insertion(1, 2, "G".as_bytes().to_vec(), "GT".as_bytes().to_vec(), 0, 1).unwrap() ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = @@ -958,7 +958,7 @@ mod tests { let reference = "ACGTA".as_bytes(); let variants = vec![ // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1 - Variant::new_indel(0, 2, 2, "G".as_bytes().to_vec(), "GTT".as_bytes().to_vec(), 1, 2) + Variant::new_indel(0, 2, 2, "G".as_bytes().to_vec(), "GTT".as_bytes().to_vec(), 1, 2).unwrap() ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = @@ -991,7 +991,9 @@ mod tests { // we prepended and appended "AA" to the basic SNV test let reference = "AAAAAAA".as_bytes(); // variant coordinate shifted +2 - let variants = vec![Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + let variants = vec![ + Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap() + ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = WFAGraph::from_reference_variants(&reference, &variants, 2, reference.len()-2).unwrap(); @@ -1016,12 +1018,12 @@ mod tests { let variants = vec![ // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1 // 3: GTTG>G - Variant::new_deletion(0, 3, 4, "GTTG".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1), + Variant::new_deletion(0, 3, 4, "GTTG".as_bytes().to_vec(), "G".as_bytes().to_vec(), 0, 1).unwrap(), // 4: TT>T - Variant::new_deletion(0, 4, 2, "TT".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1), + Variant::new_deletion(0, 4, 2, "TT".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1).unwrap(), // vcf_index, position, allele0, allele1, index_allele0, index_allele1 // 6: G>[A,C] - Variant::new_snv(0, 6, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 1, 2) + Variant::new_snv(0, 6, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 1, 2).unwrap() ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = @@ -1076,8 +1078,8 @@ mod tests { // the first one should be ignored, the second one included let variants = vec![ // vcf_index, position, allele0, allele1, index_allele0, index_allele1 - Variant::new_snv(0, (ref_start-1) as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1), - Variant::new_snv(0, ref_start as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1) + Variant::new_snv(0, (ref_start-1) as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1).unwrap(), + Variant::new_snv(0, ref_start as i64, "A".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1).unwrap() ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = @@ -1100,7 +1102,7 @@ mod tests { let reference = "ACGTA".as_bytes(); let variants = vec![ // vcf_index, position, ref_len, allele0, allele1, index_allele0, index_allele1 - Variant::new_deletion(0, 3, 3, "TAG".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1) + Variant::new_deletion(0, 3, 3, "TAG".as_bytes().to_vec(), "T".as_bytes().to_vec(), 0, 1).unwrap() ]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = @@ -1117,8 +1119,8 @@ mod tests { fn test_hom_variants() { // add a hom variant at base 1; these can be traversed, but provide no index lookup because they are not a "variant" let reference = "AAAAA".as_bytes(); - let variants = vec![Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; - let hom_variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + let variants = vec![Variant::new_snv(0, 3, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap()]; + let hom_variants = vec![Variant::new_snv(0, 1, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap()]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = WFAGraph::from_reference_variants_with_hom(&reference, &variants, &hom_variants, 0, reference.len()).unwrap(); @@ -1144,7 +1146,7 @@ mod tests { #[test] fn test_variant_at_start() { let reference = "AAA".as_bytes(); - let variants = vec![Variant::new_snv(0, 0, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + let variants = vec![Variant::new_snv(0, 0, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap()]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap(); @@ -1166,7 +1168,7 @@ mod tests { #[test] fn test_variant_at_end() { let reference = "AAA".as_bytes(); - let variants = vec![Variant::new_snv(0, 2, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1)]; + let variants = vec![Variant::new_snv(0, 2, "A".as_bytes().to_vec(), "C".as_bytes().to_vec(), 0, 1).unwrap()]; let (graph, node_to_alleles): (WFAGraph, NodeAlleleMap) = WFAGraph::from_reference_variants(&reference, &variants, 0, reference.len()).unwrap();