Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: script fix_glnexus.py prepare GLNexus output for noodles (#356) #361

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions misc/fix_glnexus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""
Fix GLNexus output for input to `mehari annotate seqvars`.

First, note that multiallelic sites must be corrected by a call to
`bcftools norm` already as an input to this script.

What we will do is splitting the `FORMAT/RNC` field with a comma
as noodles-vcf expects a list of characters for rather than a
string of length 2 for `##FORMAT=<ID=RNC,Number=2,Type=Character>`.
"""

from pathlib import Path
import sys
from typing import Annotated

import typer
import vcfpy


def main(
path_in: Annotated[Path, typer.Option(help="Path to input VCF")],
path_out: Annotated[Path, typer.Option(help="Path to output VCF")],
quiet: Annotated[bool, typer.Option(help="Disable verbose output")] = False,
):
"""
Fix GLNexus output to be compatible with `noodles-vcf`.

cf: https://github.com/varfish-org/mehari/issues/356
"""
if not quiet:
print(f"Opening input file {path_in}", file=sys.stderr)
reader = vcfpy.Reader.from_path(path_in)
header = reader.header.copy()
for line in header.lines:
if line.key == "FORMAT" and line.mapping.get("ID") == "GQ":
line.mapping["Number"] = 1
line.mapping["Type"] = "Integer"
if not quiet:
print(f"Opening output file {path_out}", file=sys.stderr)
writer = vcfpy.Writer.from_path(path_out, header)
if not quiet:
print("Processing records...", file=sys.stderr)
with reader, writer:
for idx, record in enumerate(reader):
if idx % 10_000 == 0:
print(
f" at {idx} records {record.CHROM}:{record.POS}", file=sys.stderr
)
for call in record.calls:
if "RNC" in call.data:
if (
isinstance(call.data["RNC"], list)
and len(call.data["RNC"]) == 1
):
call.data["RNC"] = list(call.data["RNC"][0])
writer.write_record(record)
if not quiet:
print("... done", file=sys.stderr)
print("All done. Have a nice day!", file=sys.stderr)


if __name__ == "__main__":
typer.run(main)
12 changes: 6 additions & 6 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
@@ -1056,15 +1056,15 @@ mod test {
// Compare to SnpEff annotated variants for OPA1, touching special cases.
#[test]
fn annotate_opa1_hand_picked_vars() -> Result<(), anyhow::Error> {
annotate_opa1_vars("tests/data/annotate/vars/opa1.hand_picked.tsv", true)
annotate_opa1_vars("tests/data/annotate/seqvars/opa1.hand_picked.tsv", true)
}

// Compare to SnpEff annotated ClinVar variants for OPA1 (slow).
#[ignore]
#[test]
fn annotate_opa1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> {
annotate_opa1_vars(
"tests/data/annotate/vars/clinvar.excerpt.snpeff.opa1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.snpeff.opa1.tsv",
true,
)
}
@@ -1074,7 +1074,7 @@ mod test {
#[test]
fn annotate_opa1_clinvar_vars_vep() -> Result<(), anyhow::Error> {
annotate_opa1_vars(
"tests/data/annotate/vars/clinvar.excerpt.vep.opa1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.vep.opa1.tsv",
true,
)
}
@@ -1095,15 +1095,15 @@ mod test {
// Compare to SnpEff annotated variants for BRCA1, touching special cases.
#[test]
fn annotate_brca1_hand_picked_vars() -> Result<(), anyhow::Error> {
annotate_brca1_vars("tests/data/annotate/vars/brca1.hand_picked.tsv", true)
annotate_brca1_vars("tests/data/annotate/seqvars/brca1.hand_picked.tsv", true)
}

// Compare to SnpEff annotated ClinVar variants for BRCA1 (slow).
#[ignore]
#[test]
fn annotate_brca1_clinvar_vars_snpeff() -> Result<(), anyhow::Error> {
annotate_brca1_vars(
"tests/data/annotate/vars/clinvar.excerpt.snpeff.brca1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.snpeff.brca1.tsv",
true,
)
}
@@ -1113,7 +1113,7 @@ mod test {
#[test]
fn annotate_brca1_clinvar_vars_vep() -> Result<(), anyhow::Error> {
annotate_brca1_vars(
"tests/data/annotate/vars/clinvar.excerpt.vep.brca1.tsv",
"tests/data/annotate/seqvars/clinvar.excerpt.vep.brca1.tsv",
true,
)
}
67 changes: 59 additions & 8 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
@@ -654,7 +654,7 @@ impl VarFishSeqvarTsvWriter {
P: AsRef<Path>,
{
Self {
inner: if p.as_ref().extension().unwrap() == "gz" {
inner: if p.as_ref().extension().unwrap_or_default() == "gz" {
Box::new(GzEncoder::new(
File::create(p).unwrap(),
Compression::default(),
@@ -1501,7 +1501,17 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
if let Some(record) = records.next() {
let mut vcf_record = record?;

// TODO: ignores all but the first alternative allele!
// We currently can only process records with one alternate allele.
if vcf_record.alternate_bases().len() != 1 {
tracing::error!(
"Found record with more than one alternate allele. This is currently not supported. \
Please use `bcftools norm` to split multi-allelic records. Record: {:?}",
&vcf_record
);
anyhow::bail!("multi-allelic records not supported");
}

// Get first alternate allele record.
let vcf_var = keys::Var::from_vcf_allele(&vcf_record, 0);

// Skip records with a deletion as alternative allele.
@@ -1769,19 +1779,20 @@ mod test {
transcript_source: TranscriptSource::Both,
transcript_picking: false,
path_db: String::from("tests/data/annotate/db"),
path_input_vcf: String::from("tests/data/db/create/badly_formed_vcf_entry.vcf"),
path_input_vcf: String::from("tests/data/annotate/seqvars/badly_formed_vcf_entry.vcf"),
output: PathOutput {
path_output_vcf: None,
path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()),
},
max_var_count: None,
path_input_ped: String::from("tests/data/db/create/badly_formed_vcf_entry.ped"),
path_input_ped: String::from("tests/data/annotate/seqvars/badly_formed_vcf_entry.ped"),
};

run(&args_common, &args)?;

let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?;
let expected = std::fs::read_to_string("tests/data/db/create/badly_formed_vcf_entry.tsv")?;
let expected =
std::fs::read_to_string("tests/data/annotate/seqvars/badly_formed_vcf_entry.tsv")?;
assert_eq!(&expected, &actual);

Ok(())
@@ -1805,19 +1816,59 @@ mod test {
transcript_source: TranscriptSource::Both,
transcript_picking: false,
path_db: String::from("tests/data/annotate/db"),
path_input_vcf: String::from("tests/data/db/create/mitochondrial_variants.vcf"),
path_input_vcf: String::from("tests/data/annotate/seqvars/mitochondrial_variants.vcf"),
output: PathOutput {
path_output_vcf: None,
path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()),
},
max_var_count: None,
path_input_ped: String::from("tests/data/annotate/seqvars/mitochondrial_variants.ped"),
};

run(&args_common, &args)?;

let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?;
let expected =
std::fs::read_to_string("tests/data/annotate/seqvars/mitochondrial_variants.tsv")?;
assert_eq!(&expected, &actual);

Ok(())
}

/// Test some variants originating from Clair3 on ONT data merged via GLNexus.
///
/// Note that we currently re-use the GRCh37 databases in
/// `tests/data/annotate/db/grch37` for GRCh38 via symlink. As the below
/// only is a smoke test, this is sufficient. However, for other tests,
/// this will pose a problem.
#[test]
fn test_clair3_glnexus_variants() -> Result<(), anyhow::Error> {
let temp = TempDir::default();
let path_out = temp.join("output.tsv");

let args_common = crate::common::Args {
verbose: Verbosity::new(0, 1),
};
let args = Args {
genome_release: None,
report_all_transcripts: true,
transcript_source: TranscriptSource::Both,
transcript_picking: false,
path_db: String::from("tests/data/annotate/db"),
path_input_vcf: String::from("tests/data/annotate/seqvars/clair3-glnexus-min.vcf"),
output: PathOutput {
path_output_vcf: None,
path_output_tsv: Some(path_out.into_os_string().into_string().unwrap()),
},
max_var_count: None,
path_input_ped: String::from("tests/data/db/create/mitochondrial_variants.ped"),
path_input_ped: String::from("tests/data/annotate/seqvars/clair3-glnexus-min.ped"),
};

run(&args_common, &args)?;

let actual = std::fs::read_to_string(args.output.path_output_tsv.unwrap())?;
let expected = std::fs::read_to_string("tests/data/db/create/mitochondrial_variants.tsv")?;
let expected =
std::fs::read_to_string("tests/data/annotate/seqvars/clair3-glnexus-min.tsv")?;
assert_eq!(&expected, &actual);

Ok(())
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/bootstrap.sh
Git LFS file not shown
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.original.vcf
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.ped
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.tsv
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/seqvars/clair3-glnexus-min.vcf
Git LFS file not shown
File renamed without changes.
File renamed without changes.
3 changes: 0 additions & 3 deletions tests/data/annotate/vars/bootstrap.sh

This file was deleted.

Loading