diff --git a/misc/fix_glnexus.py b/misc/fix_glnexus.py new file mode 100755 index 00000000..f404ee85 --- /dev/null +++ b/misc/fix_glnexus.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Fix GLNexus output for input to `mehari annotate seqvars`. + +First, note that multiallelic sites must be corrected by a call to +`bcftools norm` already as an input to this script. + +What we will do is splitting the `FORMAT/RNC` field with a comma +as noodles-vcf expects a list of characters for rather than a +string of length 2 for `##FORMAT=`. +""" + +from pathlib import Path +import sys +from typing import Annotated + +import typer +import vcfpy + + +def main( + path_in: Annotated[Path, typer.Option(help="Path to input VCF")], + path_out: Annotated[Path, typer.Option(help="Path to output VCF")], + quiet: Annotated[bool, typer.Option(help="Disable verbose output")] = False, +): + """ + Fix GLNexus output to be compatible with `noodles-vcf`. + + cf: https://github.com/varfish-org/mehari/issues/356 + """ + if not quiet: + print(f"Opening input file {path_in}", file=sys.stderr) + reader = vcfpy.Reader.from_path(path_in) + header = reader.header.copy() + for line in header.lines: + if line.key == "FORMAT" and line.mapping.get("ID") == "GQ": + line.mapping["Number"] = 1 + line.mapping["Type"] = "Integer" + if not quiet: + print(f"Opening output file {path_out}", file=sys.stderr) + writer = vcfpy.Writer.from_path(path_out, header) + if not quiet: + print("Processing records...", file=sys.stderr) + with reader, writer: + for idx, record in enumerate(reader): + if idx % 10_000 == 0: + print( + f" at {idx} records {record.CHROM}:{record.POS}", file=sys.stderr + ) + for call in record.calls: + if "RNC" in call.data: + if ( + isinstance(call.data["RNC"], list) + and len(call.data["RNC"]) == 1 + ): + call.data["RNC"] = list(call.data["RNC"][0]) + writer.write_record(record) + if not quiet: + print("... done", file=sys.stderr) + print("All done. Have a nice day!", file=sys.stderr) + + +if __name__ == "__main__": + typer.run(main) diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs index 02a8621d..a951699d 100644 --- a/src/annotate/seqvars/csq.rs +++ b/src/annotate/seqvars/csq.rs @@ -1056,7 +1056,7 @@ mod test { // Compare to SnpEff annotated variants for OPA1, touching special cases. #[test] fn annotate_opa1_hand_picked_vars() -> Result<(), anyhow::Error> { - annotate_opa1_vars("tests/data/annotate/vars/opa1.hand_picked.tsv", true) + annotate_opa1_vars("tests/data/annotate/seqvars/opa1.hand_picked.tsv", true) } // Compare to SnpEff annotated ClinVar variants for OPA1 (slow). @@ -1064,7 +1064,7 @@ mod test { #[test] fn annotate_opa1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> { annotate_opa1_vars( - "tests/data/annotate/vars/clinvar.excerpt.snpeff.opa1.tsv", + "tests/data/annotate/seqvars/clinvar.excerpt.snpeff.opa1.tsv", true, ) } @@ -1074,7 +1074,7 @@ mod test { #[test] fn annotate_opa1_clinvar_vars_vep() -> Result<(), anyhow::Error> { annotate_opa1_vars( - "tests/data/annotate/vars/clinvar.excerpt.vep.opa1.tsv", + "tests/data/annotate/seqvars/clinvar.excerpt.vep.opa1.tsv", true, ) } @@ -1095,7 +1095,7 @@ mod test { // Compare to SnpEff annotated variants for BRCA1, touching special cases. #[test] fn annotate_brca1_hand_picked_vars() -> Result<(), anyhow::Error> { - annotate_brca1_vars("tests/data/annotate/vars/brca1.hand_picked.tsv", true) + annotate_brca1_vars("tests/data/annotate/seqvars/brca1.hand_picked.tsv", true) } // Compare to SnpEff annotated ClinVar variants for BRCA1 (slow). @@ -1103,7 +1103,7 @@ mod test { #[test] fn annotate_brca1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> { annotate_brca1_vars( - "tests/data/annotate/vars/clinvar.excerpt.snpeff.brca1.tsv", + "tests/data/annotate/seqvars/clinvar.excerpt.snpeff.brca1.tsv", true, ) } @@ -1113,7 +1113,7 @@ mod test { #[test] fn annotate_brca1_clinvar_vars_vep() -> Result<(), anyhow::Error> { annotate_brca1_vars( - "tests/data/annotate/vars/clinvar.excerpt.vep.brca1.tsv", + "tests/data/annotate/seqvars/clinvar.excerpt.vep.brca1.tsv", true, ) } diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 94447ce1..89ffaebe 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -654,7 +654,7 @@ impl VarFishSeqvarTsvWriter { P: AsRef, { Self { - inner: if p.as_ref().extension().unwrap() == "gz" { + inner: if p.as_ref().extension().unwrap_or_default() == "gz" { Box::new(GzEncoder::new( File::create(p).unwrap(), Compression::default(), @@ -1501,7 +1501,17 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<( if let Some(record) = records.next() { let mut vcf_record = record?; - // TODO: ignores all but the first alternative allele! + // We currently can only process records with one alternate allele. + if vcf_record.alternate_bases().len() != 1 { + tracing::error!( + "Found record with more than one alternate allele. This is currently not supported. \ + Please use `bcftools norm` to split multi-allelic records. Record: {:?}", + &vcf_record + ); + anyhow::bail!("multi-allelic records not supported"); + } + + // Get first alternate allele record. let vcf_var = keys::Var::from_vcf_allele(&vcf_record, 0); // Skip records with a deletion as alternative allele. @@ -1769,19 +1779,20 @@ mod test { transcript_source: TranscriptSource::Both, transcript_picking: false, path_db: String::from("tests/data/annotate/db"), - path_input_vcf: String::from("tests/data/db/create/badly_formed_vcf_entry.vcf"), + path_input_vcf: String::from("tests/data/annotate/seqvars/badly_formed_vcf_entry.vcf"), output: PathOutput { path_output_vcf: None, path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()), }, max_var_count: None, - path_input_ped: String::from("tests/data/db/create/badly_formed_vcf_entry.ped"), + path_input_ped: String::from("tests/data/annotate/seqvars/badly_formed_vcf_entry.ped"), }; run(&args_common, &args)?; let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?; - let expected = std::fs::read_to_string("tests/data/db/create/badly_formed_vcf_entry.tsv")?; + let expected = + std::fs::read_to_string("tests/data/annotate/seqvars/badly_formed_vcf_entry.tsv")?; assert_eq!(&expected, &actual); Ok(()) @@ -1805,19 +1816,59 @@ mod test { transcript_source: TranscriptSource::Both, transcript_picking: false, path_db: String::from("tests/data/annotate/db"), - path_input_vcf: String::from("tests/data/db/create/mitochondrial_variants.vcf"), + path_input_vcf: String::from("tests/data/annotate/seqvars/mitochondrial_variants.vcf"), + output: PathOutput { + path_output_vcf: None, + path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()), + }, + max_var_count: None, + path_input_ped: String::from("tests/data/annotate/seqvars/mitochondrial_variants.ped"), + }; + + run(&args_common, &args)?; + + let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?; + let expected = + std::fs::read_to_string("tests/data/annotate/seqvars/mitochondrial_variants.tsv")?; + assert_eq!(&expected, &actual); + + Ok(()) + } + + /// Test some variants originating from Clair3 on ONT data merged via GLNexus. + /// + /// Note that we currently re-use the GRCh37 databases in + /// `tests/data/annotate/db/grch37` for GRCh38 via symlink. As the below + /// only is a smoke test, this is sufficient. However, for other tests, + /// this will pose a problem. + #[test] + fn test_clair3_glnexus_variants() -> Result<(), anyhow::Error> { + let temp = TempDir::default(); + let path_out = temp.join("output.tsv"); + + let args_common = crate::common::Args { + verbose: Verbosity::new(0, 1), + }; + let args = Args { + genome_release: None, + report_all_transcripts: true, + transcript_source: TranscriptSource::Both, + transcript_picking: false, + path_db: String::from("tests/data/annotate/db"), + path_input_vcf: String::from("tests/data/annotate/seqvars/clair3-glnexus-min.vcf"), output: PathOutput { path_output_vcf: None, path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()), }, max_var_count: None, - path_input_ped: String::from("tests/data/db/create/mitochondrial_variants.ped"), + path_input_ped: String::from("tests/data/annotate/seqvars/clair3-glnexus-min.ped"), }; run(&args_common, &args)?; let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?; - let expected = std::fs::read_to_string("tests/data/db/create/mitochondrial_variants.tsv")?; + let expected = + std::fs::read_to_string("tests/data/annotate/seqvars/clair3-glnexus-min.tsv")?; assert_eq!(&expected, &actual); Ok(()) diff --git a/tests/data/db/create/badly_formed_vcf_entry.ped b/tests/data/annotate/seqvars/badly_formed_vcf_entry.ped similarity index 100% rename from tests/data/db/create/badly_formed_vcf_entry.ped rename to tests/data/annotate/seqvars/badly_formed_vcf_entry.ped diff --git a/tests/data/db/create/badly_formed_vcf_entry.tsv b/tests/data/annotate/seqvars/badly_formed_vcf_entry.tsv similarity index 100% rename from tests/data/db/create/badly_formed_vcf_entry.tsv rename to tests/data/annotate/seqvars/badly_formed_vcf_entry.tsv diff --git a/tests/data/db/create/badly_formed_vcf_entry.vcf b/tests/data/annotate/seqvars/badly_formed_vcf_entry.vcf similarity index 100% rename from tests/data/db/create/badly_formed_vcf_entry.vcf rename to tests/data/annotate/seqvars/badly_formed_vcf_entry.vcf diff --git a/tests/data/annotate/seqvars/bootstrap.sh b/tests/data/annotate/seqvars/bootstrap.sh new file mode 100644 index 00000000..045bfe93 --- /dev/null +++ b/tests/data/annotate/seqvars/bootstrap.sh @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e94d663b3b630fbcbff03414e05f3cc0729a02d02ce17e805952c6c435cc0958 +size 1976 diff --git a/tests/data/annotate/vars/brca1.hand_picked.tsv b/tests/data/annotate/seqvars/brca1.hand_picked.tsv similarity index 100% rename from tests/data/annotate/vars/brca1.hand_picked.tsv rename to tests/data/annotate/seqvars/brca1.hand_picked.tsv diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.multiallelic.vcf b/tests/data/annotate/seqvars/clair3-glnexus-min.multiallelic.vcf new file mode 100644 index 00000000..b70381c4 --- /dev/null +++ b/tests/data/annotate/seqvars/clair3-glnexus-min.multiallelic.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acdcc6065fcab6156f630b1fe27afb9cedd09adac0f84215a879120ae84a3162 +size 5548 diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf b/tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf new file mode 100644 index 00000000..41e5f0e4 --- /dev/null +++ b/tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8adde9396d07ce7379922e57f7c9ca6e0b96ea7b4d1203075f3cb2d017c03f1 +size 5436 diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.ped b/tests/data/annotate/seqvars/clair3-glnexus-min.ped new file mode 100644 index 00000000..ea53d5d8 --- /dev/null +++ b/tests/data/annotate/seqvars/clair3-glnexus-min.ped @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add3ba9cd9bf167ea30092263d9c1af9349353efdaf43c46d762016eee729e4c +size 40 diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.tsv b/tests/data/annotate/seqvars/clair3-glnexus-min.tsv new file mode 100644 index 00000000..54760a06 --- /dev/null +++ b/tests/data/annotate/seqvars/clair3-glnexus-min.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82cbc30da246c9ecbfcf41cf166713d0af30245d97d8b2d51a534354453745a +size 3709 diff --git a/tests/data/annotate/seqvars/clair3-glnexus-min.vcf b/tests/data/annotate/seqvars/clair3-glnexus-min.vcf new file mode 100644 index 00000000..74db4010 --- /dev/null +++ b/tests/data/annotate/seqvars/clair3-glnexus-min.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ae1ba6ba0989506e92fbf1fef393fc4eb7e66cba2add3a8dbfb540a7571fbe +size 5791 diff --git a/tests/data/annotate/vars/clinvar.excerpt.snpeff.brca1.tsv b/tests/data/annotate/seqvars/clinvar.excerpt.snpeff.brca1.tsv similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.snpeff.brca1.tsv rename to tests/data/annotate/seqvars/clinvar.excerpt.snpeff.brca1.tsv diff --git a/tests/data/annotate/vars/clinvar.excerpt.snpeff.opa1.tsv b/tests/data/annotate/seqvars/clinvar.excerpt.snpeff.opa1.tsv similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.snpeff.opa1.tsv rename to tests/data/annotate/seqvars/clinvar.excerpt.snpeff.opa1.tsv diff --git a/tests/data/annotate/vars/clinvar.excerpt.snpeff.vcf.gz b/tests/data/annotate/seqvars/clinvar.excerpt.snpeff.vcf.gz similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.snpeff.vcf.gz rename to tests/data/annotate/seqvars/clinvar.excerpt.snpeff.vcf.gz diff --git a/tests/data/annotate/vars/clinvar.excerpt.snpeff.vcf.gz.tbi b/tests/data/annotate/seqvars/clinvar.excerpt.snpeff.vcf.gz.tbi similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.snpeff.vcf.gz.tbi rename to tests/data/annotate/seqvars/clinvar.excerpt.snpeff.vcf.gz.tbi diff --git a/tests/data/annotate/vars/clinvar.excerpt.vcf b/tests/data/annotate/seqvars/clinvar.excerpt.vcf similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.vcf rename to tests/data/annotate/seqvars/clinvar.excerpt.vcf diff --git a/tests/data/annotate/vars/clinvar.excerpt.vcf.gz b/tests/data/annotate/seqvars/clinvar.excerpt.vcf.gz similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.vcf.gz rename to tests/data/annotate/seqvars/clinvar.excerpt.vcf.gz diff --git a/tests/data/annotate/vars/clinvar.excerpt.vcf.gz.tbi b/tests/data/annotate/seqvars/clinvar.excerpt.vcf.gz.tbi similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.vcf.gz.tbi rename to tests/data/annotate/seqvars/clinvar.excerpt.vcf.gz.tbi diff --git a/tests/data/annotate/vars/clinvar.excerpt.vep.brca1.tsv b/tests/data/annotate/seqvars/clinvar.excerpt.vep.brca1.tsv similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.vep.brca1.tsv rename to tests/data/annotate/seqvars/clinvar.excerpt.vep.brca1.tsv diff --git a/tests/data/annotate/vars/clinvar.excerpt.vep.opa1.tsv b/tests/data/annotate/seqvars/clinvar.excerpt.vep.opa1.tsv similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.vep.opa1.tsv rename to tests/data/annotate/seqvars/clinvar.excerpt.vep.opa1.tsv diff --git a/tests/data/annotate/vars/clinvar.excerpt.vep.vcf.gz b/tests/data/annotate/seqvars/clinvar.excerpt.vep.vcf.gz similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.vep.vcf.gz rename to tests/data/annotate/seqvars/clinvar.excerpt.vep.vcf.gz diff --git a/tests/data/annotate/vars/clinvar.excerpt.vep.vcf.gz.tbi b/tests/data/annotate/seqvars/clinvar.excerpt.vep.vcf.gz.tbi similarity index 100% rename from tests/data/annotate/vars/clinvar.excerpt.vep.vcf.gz.tbi rename to tests/data/annotate/seqvars/clinvar.excerpt.vep.vcf.gz.tbi diff --git a/tests/data/db/create/mitochondrial_variants.ped b/tests/data/annotate/seqvars/mitochondrial_variants.ped similarity index 100% rename from tests/data/db/create/mitochondrial_variants.ped rename to tests/data/annotate/seqvars/mitochondrial_variants.ped diff --git a/tests/data/db/create/mitochondrial_variants.tsv b/tests/data/annotate/seqvars/mitochondrial_variants.tsv similarity index 100% rename from tests/data/db/create/mitochondrial_variants.tsv rename to tests/data/annotate/seqvars/mitochondrial_variants.tsv diff --git a/tests/data/db/create/mitochondrial_variants.vcf b/tests/data/annotate/seqvars/mitochondrial_variants.vcf similarity index 100% rename from tests/data/db/create/mitochondrial_variants.vcf rename to tests/data/annotate/seqvars/mitochondrial_variants.vcf diff --git a/tests/data/annotate/vars/opa1.hand_picked.tsv b/tests/data/annotate/seqvars/opa1.hand_picked.tsv similarity index 100% rename from tests/data/annotate/vars/opa1.hand_picked.tsv rename to tests/data/annotate/seqvars/opa1.hand_picked.tsv diff --git a/tests/data/annotate/vars/postproc-snpeff.sh b/tests/data/annotate/seqvars/postproc-snpeff.sh similarity index 100% rename from tests/data/annotate/vars/postproc-snpeff.sh rename to tests/data/annotate/seqvars/postproc-snpeff.sh diff --git a/tests/data/annotate/vars/postproc-vep.sh b/tests/data/annotate/seqvars/postproc-vep.sh similarity index 100% rename from tests/data/annotate/vars/postproc-vep.sh rename to tests/data/annotate/seqvars/postproc-vep.sh diff --git a/tests/data/annotate/vars/bootstrap.sh b/tests/data/annotate/vars/bootstrap.sh deleted file mode 100644 index 44d6d83f..00000000 --- a/tests/data/annotate/vars/bootstrap.sh +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bfe8941d2ca9db2c99f7beb13526579b4a87051374f5d7077f0c81bea85f89f5 -size 1845