Skip to content

Commit d12f1e1

Browse files
committed
feat: script fix_glnexus.py prepare GLNexus output for noodles (#356)
1 parent ac5116b commit d12f1e1

5 files changed

+74
-4
lines changed

misc/fix_glnexus.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Fix GLNexus output for input to `mehari annotate seqvars`.
4+
5+
First, note that multiallelic sites must be corrected by a call to
6+
`bcftools norm` already as an input to this script.
7+
8+
What we will do is splitting the `FORMAT/RNC` field with a comma
9+
as noodles-vcf expects a list of characters for rather than a
10+
string of length 2 for `##FORMAT=<ID=RNC,Number=2,Type=Character>`.
11+
"""
12+
13+
from pathlib import Path
14+
import sys
15+
from typing import Annotated
16+
17+
import typer
18+
import vcfpy
19+
20+
21+
def main(
22+
path_in: Annotated[Path, typer.Option(help="Path to input VCF")],
23+
path_out: Annotated[Path, typer.Option(help="Path to output VCF")],
24+
quiet: Annotated[bool, typer.Option(help="Disable verbose output")] = False,
25+
):
26+
"""
27+
Fix GLNexus output to be compatible with `noodles-vcf`.
28+
29+
cf: https://github.com/varfish-org/mehari/issues/356
30+
"""
31+
if not quiet:
32+
print(f"Opening input file {path_in}", file=sys.stderr)
33+
reader = vcfpy.Reader.from_path(path_in)
34+
header = reader.header.copy()
35+
for line in header.lines:
36+
if line.key == "FORMAT" and line.mapping.get("ID") == "GQ":
37+
line.mapping["Number"] = 1
38+
line.mapping["Type"] = "Integer"
39+
if not quiet:
40+
print(f"Opening output file {path_out}", file=sys.stderr)
41+
writer = vcfpy.Writer.from_path(path_out, header)
42+
if not quiet:
43+
print("Processing records...", file=sys.stderr)
44+
with reader, writer:
45+
for idx, record in enumerate(reader):
46+
if idx % 10_000 == 0:
47+
print(
48+
f" at {idx} records {record.CHROM}:{record.POS}", file=sys.stderr
49+
)
50+
for call in record.calls:
51+
if "RNC" in call.data:
52+
if (
53+
isinstance(call.data["RNC"], list)
54+
and len(call.data["RNC"]) == 1
55+
):
56+
call.data["RNC"] = list(call.data["RNC"][0])
57+
writer.write_record(record)
58+
if not quiet:
59+
print("... done", file=sys.stderr)
60+
print("All done. Have a nice day!", file=sys.stderr)
61+
62+
63+
if __name__ == "__main__":
64+
typer.run(main)
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:f8adde9396d07ce7379922e57f7c9ca6e0b96ea7b4d1203075f3cb2d017c03f1
3-
size 5436
2+
oid sha256:acdcc6065fcab6156f630b1fe27afb9cedd09adac0f84215a879120ae84a3162
3+
size 5548
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f8adde9396d07ce7379922e57f7c9ca6e0b96ea7b4d1203075f3cb2d017c03f1
3+
size 5436
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:d82cbc30da246c9ecbfcf41cf166713d0af30245d97d8b2d51a534354453745a
3+
size 3709
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:6a9770c6d67d60c1aea6c5ac2e22c949bd7bced118e632151844535031547500
3-
size 5677
2+
oid sha256:b7ae1ba6ba0989506e92fbf1fef393fc4eb7e66cba2add3a8dbfb540a7571fbe
3+
size 5791

0 commit comments

Comments
 (0)