Skip to content

Commit aba47c9

Browse files
authored
feat: adding support for ClinCNV (#253) (#257)
Note that the TSV files must be converted to VCF first using the helper script in misc/convert_clincnv.py.
1 parent de832b2 commit aba47c9

File tree

8 files changed

+395
-15
lines changed

8 files changed

+395
-15
lines changed

misc/convert_clincnv.py

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
#!/usr/bin/env python
2+
"""Helper script to convert ClinCNV files to VCF format for import."""
3+
4+
import csv
5+
import enum
6+
from typing import Annotated
7+
from pathlib import Path
8+
import logging
9+
10+
from bioutils import assemblies
11+
import pydantic
12+
import logzero
13+
from logzero import logger
14+
import typer
15+
import vcfpy
16+
17+
18+
@enum.unique
19+
class GenomeRelease(enum.Enum):
20+
"""Enumeration for genome releases"""
21+
22+
#: GRCh37 release
23+
GRCH37 = "GRCh37"
24+
#: GRCh38 release
25+
GRCH38 = "GRCh38"
26+
27+
28+
#: Canonical contig names.
29+
CANONICAL_CONTIGS = [
30+
*[str(i) for i in range(1, 23)],
31+
"X", "Y", "MT"
32+
]
33+
34+
def get_clincnv_version(path: Path) -> str:
35+
"""Get ClinCNV version from file."""
36+
with path.open("rt") as inputf:
37+
for line in inputf:
38+
if line.startswith("##ClinCNV version:"):
39+
return line.strip().split(": ")[1].strip()
40+
elif not line.startswith("#"):
41+
break
42+
raise RuntimeError("Could not determine ClinCNV version")
43+
44+
45+
def create_header(
46+
sample_name: str, genome_release: GenomeRelease, clincnv_version: str
47+
) -> vcfpy.Header:
48+
"""Create header with the given values."""
49+
header = vcfpy.Header(samples=vcfpy.SamplesInfos([sample_name]))
50+
header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))
51+
header.add_line(vcfpy.HeaderLine(key="source", value=f"ClinCNV {clincnv_version}"))
52+
# FILTER
53+
header.add_filter_line({"ID": "PASS", "Description": "All filters passed"})
54+
header.add_filter_line(
55+
{
56+
"ID": "LowQual",
57+
"Description": "Loglikelyhood (reported as GQ) is less than 20q",
58+
}
59+
)
60+
# INFO
61+
header.add_info_line(
62+
{
63+
"ID": "SVLEN",
64+
"Number": ".",
65+
"Type": "Integer",
66+
"Description": "Difference in length between REF and ALT alleles",
67+
}
68+
)
69+
header.add_info_line(
70+
{
71+
"ID": "END",
72+
"Number": 1,
73+
"Type": "Integer",
74+
"Description": "End position of the variant described in this record",
75+
}
76+
)
77+
header.add_info_line(
78+
{
79+
"ID": "SVTYPE",
80+
"Number": 1,
81+
"Type": "String",
82+
"Description": "Type of structural variant",
83+
}
84+
)
85+
# FORMAT
86+
header.add_format_line(
87+
{
88+
"ID": "CN",
89+
"Number": 1,
90+
"Type": "Integer",
91+
"Description": "Segment most-likely copy-number call",
92+
}
93+
)
94+
header.add_format_line(
95+
{
96+
"ID": "GT",
97+
"Number": 1,
98+
"Type": "String",
99+
"Description": "Segment genotype 0 or 1",
100+
}
101+
)
102+
header.add_format_line(
103+
{"ID": "NP", "Number": 1, "Type": "Integer", "Description": "Number of regions"}
104+
)
105+
header.add_format_line(
106+
{
107+
"ID": "GQ",
108+
"Number": 1,
109+
"Type": "Integer",
110+
"Description": "Loglikelyhood of call",
111+
}
112+
)
113+
# CONTIG
114+
if genome_release == GenomeRelease.GRCH37:
115+
# we need p13 as we want chrMT
116+
assembly_info = assemblies.get_assembly("GRCh37.p12")
117+
else:
118+
assembly_info = assemblies.get_assembly("GRCh38")
119+
for seq in assembly_info["sequences"]:
120+
if seq["name"].replace("chr", "") in CANONICAL_CONTIGS:
121+
header.add_contig_line(
122+
{
123+
"ID": seq["name"] if genome_release == GenomeRelease.GRCH37 else f"chr{seq['name']}",
124+
"length": seq["length"],
125+
"assembly": genome_release.value,
126+
}
127+
)
128+
129+
return header
130+
131+
132+
class ClinCnvRecord(pydantic.BaseModel):
133+
chr: str
134+
start: int
135+
end: int
136+
cn_change: int = pydantic.Field(validation_alias="CN_change")
137+
loglikelihood: int
138+
no_of_regions: int
139+
length_kb: str = pydantic.Field(validation_alias="length_KB")
140+
potential_af: str = pydantic.Field(validation_alias="potential_AF")
141+
genes: str
142+
qvalue: str
143+
overlap_af_genomes_imgag: str = pydantic.Field(validation_alias="overlap af_genomes_imgag")
144+
cn_pathogenic: str
145+
dosage_sensitive_disease_genes: str
146+
clinvar_cnvs: str
147+
omim: str
148+
gene_info: str
149+
ngsd_pathogenic_cnvs: str
150+
151+
152+
153+
def run_processing(
154+
path_in: Path,
155+
path_out: Path,
156+
header: vcfpy.Header,
157+
):
158+
"""Run the actual processing"""
159+
logger.info(" - skipping header")
160+
with path_in.open("rt") as inputf:
161+
# skip over ## header
162+
while True:
163+
pos = inputf.tell()
164+
line = inputf.readline()
165+
if line is None:
166+
break
167+
if not line.startswith("##"):
168+
break
169+
inputf.seek(pos)
170+
inputf.read(1) # skip comment from #chr... header
171+
172+
reader = csv.DictReader(inputf, delimiter="\t")
173+
with vcfpy.Writer.from_path(path_out, header) as writer:
174+
for raw_record in reader:
175+
clincnv_record = ClinCnvRecord(**raw_record)
176+
record = vcfpy.Record(
177+
CHROM=clincnv_record.chr,
178+
POS=clincnv_record.start,
179+
ID=[],
180+
QUAL=None,
181+
REF="N",
182+
ALT=[vcfpy.SymbolicAllele("DEL" if clincnv_record.cn_change < 2 else "DUP")],
183+
FILTER=[] if clincnv_record.loglikelihood >= 20 else ["LowQual"],
184+
INFO={
185+
"END": clincnv_record.end,
186+
"SVTYPE": "DEL" if clincnv_record.cn_change <2 else "DUP",
187+
"SVLEN": [clincnv_record.end - clincnv_record.start + 1],
188+
},
189+
FORMAT=["GT", "CN", "GQ", "NP"],
190+
calls=[vcfpy.Call(sample=header.samples.names[0], data={
191+
"GT": "1",
192+
"CN": clincnv_record.cn_change,
193+
"GQ": clincnv_record.loglikelihood,
194+
"NP": clincnv_record.no_of_regions,
195+
})]
196+
)
197+
198+
writer.write_record(record)
199+
200+
201+
def main(
202+
path_in: Annotated[Path, typer.Option(help="Path to input TSV file")],
203+
path_out: Annotated[Path, typer.Option(help="Path to output VCF")],
204+
sample_name: Annotated[str, typer.Option(help="Sample name to use in VCF header")],
205+
genome_release: Annotated[
206+
GenomeRelease,
207+
typer.Option(
208+
help="Genome release to use for VCF header",
209+
show_default=True,
210+
),
211+
],
212+
verbose: Annotated[
213+
bool, typer.Option("--verbose", "-v", help="Enable verbose output")
214+
] = False,
215+
):
216+
"""Convert ClinCNV TSV file to VCF format.
217+
218+
Notes:
219+
220+
Loglikelyhood is written out as in GQ tag.
221+
"""
222+
# Setup logging
223+
if verbose: # pragma: no cover
224+
level = logging.DEBUG
225+
else:
226+
# Remove module name and line number if not running in debug mode.s
227+
formatter = logzero.LogFormatter(
228+
fmt="%(color)s[%(levelname)1.1s %(asctime)s]%(end_color)s %(message)s"
229+
)
230+
logzero.formatter(formatter)
231+
level = logging.INFO
232+
logzero.loglevel(level=level)
233+
234+
logger.info("Starting conversion...")
235+
logger.info(" - getting ClinCNV version")
236+
clincnv_version = get_clincnv_version(path_in)
237+
logger.info(" - creating header")
238+
header = create_header(sample_name, genome_release, clincnv_version)
239+
logger.info(" - processing records")
240+
run_processing(path_in, path_out, header)
241+
logger.info("... done with conversion")
242+
243+
logger.info("All done. Have a nice day!")
244+
245+
246+
if __name__ == "__main__":
247+
typer.run(main)

misc/convert_clincnv.requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
vcfpy
2+
typer
3+
bioutils
4+
logzero
5+
pydantic

misc/fix_freebayes.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def main(
1818
Fix FreeBayes VCF files to be compatible with the VCF4.2 standard.
1919
2020
- Ensure the FORMAT=GQ field is an Integer.
21+
- If AD is missing, derive AD from DP and AO.
2122
"""
2223
if not quiet:
2324
print(f"Opening input file {path_in}", file=sys.stderr)
@@ -35,12 +36,17 @@ def main(
3536
with reader, writer:
3637
for idx, record in enumerate(reader):
3738
if idx % 10_000 == 0:
38-
print(f" at {idx} records {record.CHROM}:{record.POS}", file=sys.stderr)
39-
if idx > 100_000:
40-
break
39+
print(
40+
f" at {idx} records {record.CHROM}:{record.POS}", file=sys.stderr
41+
)
42+
if "AD" not in record.FORMAT and "AO" in record.FORMAT and "DP" in record.FORMAT:
43+
record.FORMAT.append("AD")
4144
for call in record.calls:
4245
if "GQ" in call.data:
4346
call.data["GQ"] = int(call.data["GQ"])
47+
if "AD" not in call.data and "AO" in call.data and "DP" in call.data:
48+
assert len(call.data["AO"]) == 1
49+
call.data["AD"] = [call.data["DP"] - call.data["AO"][0], call.data["AO"][0]]
4450
writer.write_record(record)
4551
if not quiet:
4652
print("... done", file=sys.stderr)

src/annotate/seqvars/mod.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -615,26 +615,26 @@ impl GenotypeCalls {
615615
result.push_str(&format!("\"\"\"gt\"\"\":\"\"\"{}\"\"\"", gt));
616616
}
617617

618-
if prev {
619-
result.push(',');
620-
}
621618
if let Some(ad) = &entry.ad {
619+
if prev {
620+
result.push(',');
621+
}
622622
prev = true;
623623
result.push_str(&format!("\"\"\"ad\"\"\":{}", ad));
624624
}
625625

626-
if prev {
627-
result.push(',');
628-
}
629626
if let Some(dp) = &entry.dp {
627+
if prev {
628+
result.push(',');
629+
}
630630
prev = true;
631631
result.push_str(&format!("\"\"\"dp\"\"\":{}", dp));
632632
}
633633

634-
if prev {
635-
result.push(',');
636-
}
637634
if let Some(gq) = &entry.gq {
635+
if prev {
636+
result.push(',');
637+
}
638638
// prev = true;
639639
result.push_str(&format!("\"\"\"gq\"\"\":{}", gq));
640640
}

0 commit comments

Comments
 (0)