Skip to content

Commit ed35cde

Browse files
authored
Updated Cyrius to call all star alleles (up to *139) (#3)
Removed the --knownFunction and --includeNewStar options.
1 parent caa0fe5 commit ed35cde

39 files changed

+598
-1097
lines changed

README.md

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,18 @@
11
# Cyrius: WGS-based CYP2D6 genotyper
2-
Cyrius is a tool to genotype CYP2D6 from a whole-genome sequencing (WGS) BAM file. Cyrius uses a novel method to solve the problems caused by the high sequence similarity with the pseudogene paralog CYP2D7 and thus is able to detect all star alleles, particularly those that contain structural variants, accurately. Please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2020.05.05.077966v1) for details about the method.
2+
Cyrius is a tool to genotype CYP2D6 from a whole-genome sequencing (WGS) BAM file. Cyrius uses a novel method to solve the problems caused by the high sequence similarity with the pseudogene paralog CYP2D7 and thus is able to detect all star alleles, particularly those that contain structural variants, accurately. Please refer to our [preprint](https://www.biorxiv.org/content/10.1101/2020.05.05.077966v2) for details about the method.
33

44
## Running the program
55

66
This Python3 program can be run as follows:
77
```bash
8-
star_caller.py --manifest MANIFEST_FILE \
9-
--genome [19/37/38] \
10-
--prefix OUTPUT_FILE_PREFIX \
11-
--outDir OUTPUT_DIRECTORY \
12-
--threads NUMBER_THREADS
8+
python3 star_caller.py --manifest MANIFEST_FILE \
9+
--genome [19/37/38] \
10+
--prefix OUTPUT_FILE_PREFIX \
11+
--outDir OUTPUT_DIRECTORY \
12+
--threads NUMBER_THREADS
1313
```
1414
The manifest is a text file in which each line should list the absolute path to an input BAM/CRAM file.
15-
For CRAM input, it’s suggested to provide the path to the reference fasta file with `--reference` in the command.
16-
Additionally, there is an option `--knownFunction` to call only star alleles with known functions, as well as an option `--includeNewStar` to call all star alleles including the newly added, uncurated ones (\*115-\*139) in PharmVar.
15+
For CRAM input, it’s suggested to provide the path to the reference fasta file with `--reference` in the command.
1716

1817
## Interpreting the output
1918

caller/call_cn.py renamed to caller/call_variants.py

Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,15 @@
3333
process_raw_call_gc,
3434
process_raw_call_denovo,
3535
)
36-
from depth_calling.haplotype import get_haplotypes_from_bam, extract_hap
36+
from depth_calling.haplotype import (
37+
get_haplotypes_from_bam,
38+
get_haplotypes_from_bam_single_region,
39+
extract_hap,
40+
)
41+
from depth_calling.snp_count import (
42+
get_supporting_reads,
43+
get_supporting_reads_single_region,
44+
)
3745

3846

3947
INTRON1_BP_APPROX = 42130500
@@ -93,6 +101,11 @@
93101
"g.42129042T>C",
94102
"g.42129174C>A",
95103
"g.42129180A>T",
104+
"g.42127526C>T",
105+
"g.42128325A>G",
106+
"g.42126877G>A",
107+
"g.42127973T>C",
108+
"g.42127556T>C",
96109
]
97110

98111

@@ -197,7 +210,7 @@ def good_read(read):
197210
return read.is_secondary == 0 and read.is_supplementary == 0
198211

199212

200-
def get_allele_counts_42128936(bamfile_handle, genome):
213+
def get_allele_counts_var42128936(bamfile_handle, genome):
201214
"""
202215
Search for the insertions at 42128936 defining
203216
*30/*40/*58 in read sequences
@@ -223,6 +236,23 @@ def get_allele_counts_42128936(bamfile_handle, genome):
223236
return (ref_read, long_ins_read, short_ins_read)
224237

225238

239+
def update_var42128936(
240+
var_list, var_alt, var_ref, ref_read, long_ins_read, short_ins_read
241+
):
242+
"""
243+
Update variant read counts for the long and short insertions at g.42128936.
244+
"""
245+
if "g.42128936-42128937insGGGGCGAAAGGGGCGAAA" in var_list:
246+
long_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAAGGGGCGAAA")
247+
var_alt[long_ins_index] = long_ins_read
248+
var_ref[long_ins_index] = short_ins_read + ref_read
249+
if "g.42128936-42128937insGGGGCGAAA" in var_list:
250+
short_ins_index = var_list.index("g.42128936-42128937insGGGGCGAAA")
251+
var_alt[short_ins_index] = short_ins_read
252+
var_ref[short_ins_index] = long_ins_read + ref_read
253+
return var_alt, var_ref
254+
255+
226256
def call_exon9gc(d6_count, d7_count, full_length_cn):
227257
"""
228258
Call exon 9 conversion
@@ -257,28 +287,82 @@ def call_exon9gc(d6_count, d7_count, full_length_cn):
257287
return None
258288

259289

260-
def call_var42126938(bamfile, cnvtag, site42126938, base_db, target_positions):
290+
def call_var42126938(bamfile, full_length_cn, base_db):
261291
"""
262-
Call variant g.42126938C>T (gene conversion variant in homology region)
292+
Call variant g.42126938C>T (gene conversion variant in homology region)
263293
based on read depth and phased haplotypes
264294
"""
265-
dcn = {"star5": 3, "cn2": 4}
266-
assert cnvtag in dcn
267-
full_length_cn = dcn[cnvtag]
268-
d6_cn = call_cn_snp(full_length_cn, [site42126938[0]], [site42126938[1]], 0.8)[0]
269295
var_called = []
270296
# Whether g.42126938C>T is on the same haplotype as g.42126611C>G
271297
G_haplotype = False
298+
snp_d6, snp_d7 = get_supporting_reads(
299+
bamfile, base_db.dsnp1, base_db.dsnp2, base_db.nchr, base_db.dindex
300+
)
301+
d6_d7_base_count = [snp_d6[-1], snp_d7[-1]]
302+
d6_cn = call_cn_snp(
303+
full_length_cn, [d6_d7_base_count[0]], [d6_d7_base_count[1]], 0.8
304+
)[0]
272305
if d6_cn is not None and d6_cn < full_length_cn - 2:
273-
haplotype_per_read = get_haplotypes_from_bam(bamfile, base_db, target_positions)
306+
haplotype_per_read = get_haplotypes_from_bam(
307+
bamfile, base_db, range(len(base_db.dsnp1))
308+
)
274309
recombinant_read_count = extract_hap(haplotype_per_read, [0, 2])
275310
if "12" in recombinant_read_count and sum(recombinant_read_count["12"]) > 1:
276311
G_hap_count = extract_hap(haplotype_per_read, [1, 2])
277312
for _ in range(full_length_cn - 2 - d6_cn):
278313
var_called.append("g.42126938C>T")
279314
if "12" in G_hap_count and sum(G_hap_count["12"]) > 1:
280315
G_haplotype = True
281-
return var_called, G_haplotype
316+
return d6_d7_base_count, var_called, G_haplotype
317+
318+
319+
def call_var42127526_var42127556(bamfile, cnvtag, base_db):
320+
"""
321+
Call variants g.42127526C>T and g.42127556T>C (gene conversion variants in homology region)
322+
based on read depth and phased haplotypes
323+
"""
324+
var_called = []
325+
var_ref, var_alt, var_ref_forward, var_ref_reverse = get_supporting_reads_single_region(
326+
bamfile, base_db.dsnp1, base_db.nchr, base_db.dindex
327+
)
328+
var7526_count = [var_ref[0], var_alt[0]]
329+
var7556_count = [var_ref[1], var_alt[1]]
330+
if cnvtag in CNVTAG_LOOKUP_TABLE:
331+
d6_cn = CNVTAG_LOOKUP_TABLE[cnvtag].exon9_to_intron1
332+
var7526_cn = call_cn_snp(d6_cn, [var7526_count[1]], [var7526_count[0]])[0]
333+
var7556_cn = call_cn_snp(d6_cn, [var7556_count[1]], [var7556_count[0]])[0]
334+
haplotype_per_read = get_haplotypes_from_bam_single_region(
335+
bamfile, base_db, range(len(base_db.dsnp1))
336+
)
337+
recombinant_read_count = extract_hap(haplotype_per_read, [0, 1, 2])
338+
if "211" in recombinant_read_count and sum(recombinant_read_count["211"]) > 1:
339+
for _ in range(var7526_cn):
340+
var_called.append("g.42127526C>T")
341+
elif "221" in recombinant_read_count and sum(recombinant_read_count["221"]) > 1:
342+
for _ in range(min(var7526_cn, var7556_cn)):
343+
var_called.append("g.42127526C>T")
344+
var_called.append("g.42127556T>C")
345+
return var7526_count, var7556_count, var_called
346+
347+
348+
def call_var42127803hap(bamfile, cnvtag, base_db):
349+
"""
350+
Call haplotype with regard to g.42127803C>T and g.42127941G>A
351+
"""
352+
diff_haplotype = False
353+
if cnvtag == "cn2":
354+
haplotype_per_read = get_haplotypes_from_bam_single_region(
355+
bamfile, base_db, range(len(base_db.dsnp1))
356+
)
357+
recombinant_read_count = extract_hap(haplotype_per_read, [0, 1])
358+
if (
359+
"12" in recombinant_read_count
360+
and sum(recombinant_read_count["12"]) > 1
361+
and "21" in recombinant_read_count
362+
and sum(recombinant_read_count["21"]) > 1
363+
):
364+
diff_haplotype = True
365+
return diff_haplotype
282366

283367

284368
def get_called_variants(var_list, cn_prob_processed, starting_index=0):

caller/cnv_hybrid.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,11 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_
115115
and exon9gc_call_stringent <= exon9_intron4_sites_consensus
116116
):
117117
exon9region_sites_consensus = exon9gc_call_stringent
118+
elif (
119+
exon9region_sites_consensus > exon9gc_call_stringent
120+
and exon9gc_call_stringent >= exon9_intron4_sites_consensus
121+
):
122+
exon9region_sites_consensus = exon9gc_call_stringent
118123
else:
119124
exon9region_sites = [
120125
a

caller/construct_star_table.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222

2323

2424
# Exon 9 gene conversion
25-
EXON9GC_ALLELES = ["*36", "*4N", "*57", "*83"]
26-
EXON9GC_PAIR_ALLELES = {"*36": "*10", "*4N": "*4A"}
25+
EXON9GC_ALLELES = ["*36", "*4.013", "*57", "*83"]
26+
EXON9GC_PAIR_ALLELES = {"*36": "*10", "*4.013": "*4"}
2727

2828

2929
def make_hap_dic(variant_list, star_set, hap_dic):
@@ -51,7 +51,7 @@ def get_hap_table(hap_table):
5151
for line in f:
5252
at = line.strip().split()
5353
star_id = at[0]
54-
variant_list = sorted(at[1:-2])
54+
variant_list = sorted(at[1:-1])
5555
var_list_joined = "_".join(variant_list)
5656
dhap.setdefault(var_list_joined, star_id)
5757
dstar.setdefault(star_id, var_list_joined)

0 commit comments

Comments
 (0)