Skip to content

Commit

Permalink
feat: script fix_glnexus.py prepare GLNexus output for noodles (#356) (
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Feb 21, 2024
1 parent 2db5e3c commit 98d8233
Show file tree
Hide file tree
Showing 31 changed files with 147 additions and 17 deletions.
64 changes: 64 additions & 0 deletions misc/fix_glnexus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""
Fix GLNexus output for input to `mehari annotate seqvars`.
First, note that multiallelic sites must be corrected by a call to
`bcftools norm` already as an input to this script.
What we will do is splitting the `FORMAT/RNC` field with a comma
as noodles-vcf expects a list of characters for rather than a
string of length 2 for `##FORMAT=<ID=RNC,Number=2,Type=Character>`.
"""

from pathlib import Path
import sys
from typing import Annotated

import typer
import vcfpy


def main(
path_in: Annotated[Path, typer.Option(help="Path to input VCF")],
path_out: Annotated[Path, typer.Option(help="Path to output VCF")],
quiet: Annotated[bool, typer.Option(help="Disable verbose output")] = False,
):
"""
Fix GLNexus output to be compatible with `noodles-vcf`.
cf: https://github.com/varfish-org/mehari/issues/356
"""
if not quiet:
print(f"Opening input file {path_in}", file=sys.stderr)
reader = vcfpy.Reader.from_path(path_in)
header = reader.header.copy()
for line in header.lines:
if line.key == "FORMAT" and line.mapping.get("ID") == "GQ":
line.mapping["Number"] = 1
line.mapping["Type"] = "Integer"
if not quiet:
print(f"Opening output file {path_out}", file=sys.stderr)
writer = vcfpy.Writer.from_path(path_out, header)
if not quiet:
print("Processing records...", file=sys.stderr)
with reader, writer:
for idx, record in enumerate(reader):
if idx % 10_000 == 0:
print(
f" at {idx} records {record.CHROM}:{record.POS}", file=sys.stderr
)
for call in record.calls:
if "RNC" in call.data:
if (
isinstance(call.data["RNC"], list)
and len(call.data["RNC"]) == 1
):
call.data["RNC"] = list(call.data["RNC"][0])
writer.write_record(record)
if not quiet:
print("... done", file=sys.stderr)
print("All done. Have a nice day!", file=sys.stderr)


if __name__ == "__main__":
typer.run(main)
12 changes: 6 additions & 6 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1056,15 +1056,15 @@ mod test {
// Compare to SnpEff annotated variants for OPA1, touching special cases.
#[test]
fn annotate_opa1_hand_picked_vars() -> Result<(), anyhow::Error> {
annotate_opa1_vars("tests/data/annotate/vars/opa1.hand_picked.tsv", true)
annotate_opa1_vars("tests/data/annotate/seqvars/opa1.hand_picked.tsv", true)
}

// Compare to SnpEff annotated ClinVar variants for OPA1 (slow).
#[ignore]
#[test]
fn annotate_opa1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> {
annotate_opa1_vars(
"tests/data/annotate/vars/clinvar.excerpt.snpeff.opa1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.snpeff.opa1.tsv",
true,
)
}
Expand All @@ -1074,7 +1074,7 @@ mod test {
#[test]
fn annotate_opa1_clinvar_vars_vep() -> Result<(), anyhow::Error> {
annotate_opa1_vars(
"tests/data/annotate/vars/clinvar.excerpt.vep.opa1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.vep.opa1.tsv",
true,
)
}
Expand All @@ -1095,15 +1095,15 @@ mod test {
// Compare to SnpEff annotated variants for BRCA1, touching special cases.
#[test]
fn annotate_brca1_hand_picked_vars() -> Result<(), anyhow::Error> {
annotate_brca1_vars("tests/data/annotate/vars/brca1.hand_picked.tsv", true)
annotate_brca1_vars("tests/data/annotate/seqvars/brca1.hand_picked.tsv", true)
}

// Compare to SnpEff annotated ClinVar variants for BRCA1 (slow).
#[ignore]
#[test]
fn annotate_brca1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> {
annotate_brca1_vars(
"tests/data/annotate/vars/clinvar.excerpt.snpeff.brca1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.snpeff.brca1.tsv",
true,
)
}
Expand All @@ -1113,7 +1113,7 @@ mod test {
#[test]
fn annotate_brca1_clinvar_vars_vep() -> Result<(), anyhow::Error> {
annotate_brca1_vars(
"tests/data/annotate/vars/clinvar.excerpt.vep.brca1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.vep.brca1.tsv",
true,
)
}
Expand Down
67 changes: 59 additions & 8 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -654,7 +654,7 @@ impl VarFishSeqvarTsvWriter {
P: AsRef<Path>,
{
Self {
inner: if p.as_ref().extension().unwrap() == "gz" {
inner: if p.as_ref().extension().unwrap_or_default() == "gz" {
Box::new(GzEncoder::new(
File::create(p).unwrap(),
Compression::default(),
Expand Down Expand Up @@ -1501,7 +1501,17 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
if let Some(record) = records.next() {
let mut vcf_record = record?;

// TODO: ignores all but the first alternative allele!
// We currently can only process records with one alternate allele.
if vcf_record.alternate_bases().len() != 1 {
tracing::error!(
"Found record with more than one alternate allele. This is currently not supported. \
Please use `bcftools norm` to split multi-allelic records. Record: {:?}",
&vcf_record
);
anyhow::bail!("multi-allelic records not supported");
}

// Get first alternate allele record.
let vcf_var = keys::Var::from_vcf_allele(&vcf_record, 0);

// Skip records with a deletion as alternative allele.
Expand Down Expand Up @@ -1769,19 +1779,20 @@ mod test {
transcript_source: TranscriptSource::Both,
transcript_picking: false,
path_db: String::from("tests/data/annotate/db"),
path_input_vcf: String::from("tests/data/db/create/badly_formed_vcf_entry.vcf"),
path_input_vcf: String::from("tests/data/annotate/seqvars/badly_formed_vcf_entry.vcf"),
output: PathOutput {
path_output_vcf: None,
path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()),
},
max_var_count: None,
path_input_ped: String::from("tests/data/db/create/badly_formed_vcf_entry.ped"),
path_input_ped: String::from("tests/data/annotate/seqvars/badly_formed_vcf_entry.ped"),
};

run(&args_common, &args)?;

let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?;
let expected = std::fs::read_to_string("tests/data/db/create/badly_formed_vcf_entry.tsv")?;
let expected =
std::fs::read_to_string("tests/data/annotate/seqvars/badly_formed_vcf_entry.tsv")?;
assert_eq!(&expected, &actual);

Ok(())
Expand All @@ -1805,19 +1816,59 @@ mod test {
transcript_source: TranscriptSource::Both,
transcript_picking: false,
path_db: String::from("tests/data/annotate/db"),
path_input_vcf: String::from("tests/data/db/create/mitochondrial_variants.vcf"),
path_input_vcf: String::from("tests/data/annotate/seqvars/mitochondrial_variants.vcf"),
output: PathOutput {
path_output_vcf: None,
path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()),
},
max_var_count: None,
path_input_ped: String::from("tests/data/annotate/seqvars/mitochondrial_variants.ped"),
};

run(&args_common, &args)?;

let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?;
let expected =
std::fs::read_to_string("tests/data/annotate/seqvars/mitochondrial_variants.tsv")?;
assert_eq!(&expected, &actual);

Ok(())
}

/// Test some variants originating from Clair3 on ONT data merged via GLNexus.
///
/// Note that we currently re-use the GRCh37 databases in
/// `tests/data/annotate/db/grch37` for GRCh38 via symlink. As the below
/// only is a smoke test, this is sufficient. However, for other tests,
/// this will pose a problem.
#[test]
fn test_clair3_glnexus_variants() -> Result<(), anyhow::Error> {
let temp = TempDir::default();
let path_out = temp.join("output.tsv");

let args_common = crate::common::Args {
verbose: Verbosity::new(0, 1),
};
let args = Args {
genome_release: None,
report_all_transcripts: true,
transcript_source: TranscriptSource::Both,
transcript_picking: false,
path_db: String::from("tests/data/annotate/db"),
path_input_vcf: String::from("tests/data/annotate/seqvars/clair3-glnexus-min.vcf"),
output: PathOutput {
path_output_vcf: None,
path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()),
},
max_var_count: None,
path_input_ped: String::from("tests/data/db/create/mitochondrial_variants.ped"),
path_input_ped: String::from("tests/data/annotate/seqvars/clair3-glnexus-min.ped"),
};

run(&args_common, &args)?;

let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?;
let expected = std::fs::read_to_string("tests/data/db/create/mitochondrial_variants.tsv")?;
let expected =
std::fs::read_to_string("tests/data/annotate/seqvars/clair3-glnexus-min.tsv")?;
assert_eq!(&expected, &actual);

Ok(())
Expand Down
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/bootstrap.sh
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.ped
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.tsv
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.vcf
Git LFS file not shown
File renamed without changes.
File renamed without changes.
3 changes: 0 additions & 3 deletions tests/data/annotate/vars/bootstrap.sh

This file was deleted.

0 comments on commit 98d8233

Please sign in to comment.