Skip to content

Commit

Permalink
Minor update (#8)
Browse files Browse the repository at this point in the history
-Update README with troubleshooting tips
-Support remote URLs (tested Amazon s3)
-Minor update to variant calling
  • Loading branch information
xiao-chen-xc authored Dec 16, 2020
1 parent ed35cde commit 0a12bcc
Show file tree
Hide file tree
Showing 12 changed files with 74 additions and 48 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,10 @@ A .json file is also produced that contains more information about each sample.
| Raw_star_allele | Raw star allele call |
| d67_snp_call | CYP2D6 copy number call at CYP2D6/7 differentiating sites |
| d67_snp_raw | Raw CYP2D6 copy number at CYP2D6/7 differentiating sites |

## Troubleshooting

Common causes for Cyrius to produce no-calls are:
-Low sequencing depth. We suggest a sequencing depth of 30x, which is the standard practice recommended by clinical genome sequencing.
-The depth of the CYP2D6/CYP2D7 region is much lower than the rest of the genome, most likely because reads are aligned to alternative contigs. If your reference genome includes alternative contigs, we suggest alt-aware alignment so that alignments to the primary assembly take precedence over alternative contigs.
-The majority of reads in CYP2D6/CYP2D7 region have a mapping quality of zero. This is probably due to some post-processing tools like bwa-postalt that modifies the mapQ in the BAM. We recommend using the BAM file before such post-processing steps as input to Cyrius.
51 changes: 30 additions & 21 deletions caller/cnv_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_
sites downstream of but close to the gene), exon9 to intron1 sites and sites
upstream of intron1.
"""
exon9region_sites_consensus = None
exon9_intron4_sites_consensus = None
intron4_intron1_sites_consensus = None
intron1_upstream_sites_consensus = None

exon9_intron4_sites = [
a
for a in cn_call_per_site[EXON9_END_POSITION:INTRON4_BP_POSITION]
Expand All @@ -51,11 +56,12 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_
exon9_intron4_sites_counter = sorted(
Counter(exon9_intron4_sites).items(), key=lambda kv: kv[1], reverse=True
)
exon9_intron4_sites_consensus = (
exon9_intron4_sites_counter[0][0]
if exon9_intron4_sites_counter[0][1] >= EXON9_TO_INTRON4_SITES_MIN
else None
)
if exon9_intron4_sites_counter != []:
exon9_intron4_sites_consensus = (
exon9_intron4_sites_counter[0][0]
if exon9_intron4_sites_counter[0][1] >= EXON9_TO_INTRON4_SITES_MIN
else None
)

intron4_intron1_sites = [
a
Expand All @@ -65,11 +71,13 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_
intron4_intron1_sites_counter = sorted(
Counter(intron4_intron1_sites).items(), key=lambda kv: kv[1], reverse=True
)
intron4_intron1_sites_consensus = (
intron4_intron1_sites_counter[0][0]
if intron4_intron1_sites_counter[0][1] >= INTRON4_TO_INTRON1_SITES_MIN
else None
)

if intron4_intron1_sites_counter != []:
intron4_intron1_sites_consensus = (
intron4_intron1_sites_counter[0][0]
if intron4_intron1_sites_counter[0][1] >= INTRON4_TO_INTRON1_SITES_MIN
else None
)

if (
exon9_intron4_sites_consensus is None
Expand All @@ -88,19 +96,19 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_
intron1_upstream_sites_counter = sorted(
Counter(intron1_upstream_sites).items(), key=lambda kv: kv[1], reverse=True
)
intron1_upstream_sites_consensus = (
intron1_upstream_sites_counter[0][0]
if intron1_upstream_sites_counter[0][1] >= INTRON1_UPSTREAM_SITES_MIN
else None
)
if intron1_upstream_sites_counter != []:
intron1_upstream_sites_consensus = (
intron1_upstream_sites_counter[0][0]
if intron1_upstream_sites_counter[0][1] >= INTRON1_UPSTREAM_SITES_MIN
else None
)
if intron1_upstream_sites_consensus is None:
if (
intron1_upstream_sites.count(total_cn - 2)
>= INTRON1_UPSTREAM_SITES_MIN_LOOSE
):
intron1_upstream_sites_consensus = total_cn - 2

exon9region_sites_consensus = None
if spacer_cn is not None:
# spacer CN indicates the number of copies starting with CYP2D7
exon9region_sites_consensus = total_cn - spacer_cn
Expand Down Expand Up @@ -129,11 +137,12 @@ def get_cnvtag(total_cn, rawv, cn_call_per_site, exon9gc_call_stringent, spacer_
exon9region_sites_counter = sorted(
Counter(exon9region_sites).items(), key=lambda kv: kv[1], reverse=True
)
exon9region_sites_consensus = (
exon9region_sites_counter[0][0]
if exon9region_sites_counter[0][1] >= EXON9REGION_SITES_MIN
else None
)
if exon9region_sites_counter != []:
exon9region_sites_consensus = (
exon9region_sites_counter[0][0]
if exon9region_sites_counter[0][1] >= EXON9REGION_SITES_MIN
else None
)
if exon9region_sites_consensus is None and exon9gc_call_stringent is not None:
exon9region_sites_consensus = exon9gc_call_stringent

Expand Down
17 changes: 12 additions & 5 deletions caller/match_star_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,17 +363,24 @@ def get_final_call_clean(final_call, cnvcall, spacer_cn):
genotype += "*4"
return genotype
elif split_call == ["*4", "*4"]:
if cn == 1:
return "*4/*68+*4"
if cn == 2:
return "*68+*4/*68+*4"
elif cn == 3:
return "*68+*4/*68+*68+*4"
elif cn == 4:
return "*68+*68+*4/*68+*68+*4"
if cnvcall == "star68":
if split_call[0] == "*4":
return split_call[1] + "/*68+*4"
if split_call[1] == "*4":
return split_call[0] + "/*68+*4"
# *45 is found in tandem with *68 in Africans
if "*45" in split_call:
var = [a for a in split_call if a != "*45"]
if len(var) == 1:
genotype = var[0] + "/"
for _ in range(cn):
genotype += "*68+"
genotype += "*45"
return genotype

if cnvcall == "dup_star68":
var = [a for a in split_call if a not in ["*4", "*68"]]
if len(var) == 2 and len(set(var)) == 1:
Expand Down
3 changes: 3 additions & 0 deletions data/CYP2D6_target_variant_19.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name
chr22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA gene_conversion g.42126611C>G
chr22 42522879 A 42536592 G snp g.42126877G>A
chr22 42522916 G 42536629 c,t snp_multi g.42126914C>G
chr22 42522916 T 42536629 c,g snp_multi g.42126914C>T
chr22 42522928 A 42536641 G snp g.42126926G>A
chr22 42523459 T 42537172 C snp g.42127457C>T
chr22 42523475 T 42537188 C snp g.42127473C>T
Expand Down Expand Up @@ -84,6 +86,7 @@ chr22 42525821 T 42539492 G gene_conversion g.42129819G>T
chr22 42525823 A 42539494 G snp g.42129821G>A
chr22 42525829 G 42539500 C snp g.42129827C>G
chr22 42525838 A 42539509 G snp g.42129836G>A
chr22 42525908 A 42539579 G snp g.42129906G>A
chr22 42526656 CAAG 42540327 CAG gene_conversion g.42130655-42130656insA
chr22 42526669 T 42540341 C snp g.42130667C>T
chr22 42526670 T 42540342 C snp g.42130668C>T
Expand Down
3 changes: 3 additions & 0 deletions data/CYP2D6_target_variant_37.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name
22 42522613 GTCACCAGGAAA 42536326 CTCACCAGGAAA gene_conversion g.42126611C>G
22 42522879 A 42536592 G snp g.42126877G>A
22 42522916 G 42536629 c,t snp_multi g.42126914C>G
22 42522916 T 42536629 c,g snp_multi g.42126914C>T
22 42522928 A 42536641 G snp g.42126926G>A
22 42523459 T 42537172 C snp g.42127457C>T
22 42523475 T 42537188 C snp g.42127473C>T
Expand Down Expand Up @@ -84,6 +86,7 @@
22 42525823 A 42539494 G snp g.42129821G>A
22 42525829 G 42539500 C snp g.42129827C>G
22 42525838 A 42539509 G snp g.42129836G>A
22 42525908 A 42539579 G snp g.42129906G>A
22 42526656 CAAG 42540327 CAG gene_conversion g.42130655-42130656insA
22 42526669 T 42540341 C snp g.42130667C>T
22 42526670 T 42540342 C snp g.42130668C>T
Expand Down
3 changes: 3 additions & 0 deletions data/CYP2D6_target_variant_38.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#chr pos_CYP2D6 base_ALT pos_CYP2D7 base_REF variant_type variant_name
chr22 42126611 GTCACCAGGAAA 42140315 CTCACCAGGAAA gene_conversion g.42126611C>G
chr22 42126877 A 42140581 G snp g.42126877G>A
chr22 42126914 G 42140618 c,t snp_multi g.42126914C>G
chr22 42126914 T 42140618 c,g snp_multi g.42126914C>T
chr22 42126926 A 42140630 G snp g.42126926G>A
chr22 42127457 T 42141162 C snp g.42127457C>T
chr22 42127473 T 42141178 C snp g.42127473C>T
Expand Down Expand Up @@ -84,6 +86,7 @@ chr22 42129819 T 42143491 G gene_conversion g.42129819G>T
chr22 42129821 A 42143493 G snp g.42129821G>A
chr22 42129827 G 42143499 C snp g.42129827C>G
chr22 42129836 A 42143508 G snp g.42129836G>A
chr22 42129906 A 42143578 G snp g.42129906G>A
chr22 42130654 CAAG 42144326 CAG gene_conversion g.42130655-42130656insA
chr22 42130667 T 42144340 C snp g.42130667C>T
chr22 42130668 T 42144341 C snp g.42130668C>T
Expand Down
3 changes: 0 additions & 3 deletions data/CYP2D6_target_variant_homology_region_19.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@ chr22 42522751 T 42536464 C snp g.42126749C>T
chr22 42522754 T 42536467 C snp g.42126752C>T
chr22 42522898 A 42536611 G snp g.42126896G>A
chr22 42522906 G 42536619 A snp g.42126904A>G
chr22 42522916 G 42536629 c,t snp_multi g.42126914C>G
chr22 42522916 T 42536629 c,g snp_multi g.42126914C>T
chr22 42522958 G 42536671 T snp g.42126956T>G
chr22 42522982 TGAGAGT 42536695 TGAGT snp g.42126981-42126982insGA
chr22 42525889 G 42539560 A snp g.42129887A>G
chr22 42525908 A 42539579 G snp g.42129906G>A
chr22 42525912 G 42539583 C snp g.42129910C>G
3 changes: 0 additions & 3 deletions data/CYP2D6_target_variant_homology_region_37.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
22 42522754 T 42536467 C snp g.42126752C>T
22 42522898 A 42536611 G snp g.42126896G>A
22 42522906 G 42536619 A snp g.42126904A>G
22 42522916 G 42536629 c,t snp_multi g.42126914C>G
22 42522916 T 42536629 c,g snp_multi g.42126914C>T
22 42522958 G 42536671 T snp g.42126956T>G
22 42522982 TGAGAGT 42536695 TGAGT snp g.42126981-42126982insGA
22 42525889 G 42539560 A snp g.42129887A>G
22 42525908 A 42539579 G snp g.42129906G>A
22 42525912 G 42539583 C snp g.42129910C>G
3 changes: 0 additions & 3 deletions data/CYP2D6_target_variant_homology_region_38.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@ chr22 42126749 T 42140453 C snp g.42126749C>T
chr22 42126752 T 42140456 C snp g.42126752C>T
chr22 42126896 A 42140600 G snp g.42126896G>A
chr22 42126904 G 42140608 A snp g.42126904A>G
chr22 42126914 G 42140618 c,t snp_multi g.42126914C>G
chr22 42126914 T 42140618 c,g snp_multi g.42126914C>T
chr22 42126956 G 42140660 T snp g.42126956T>G
chr22 42126980 TGAGAGT 42140684 TGAGT snp g.42126981-42126982insGA
chr22 42129887 G 42143559 A snp g.42129887A>G
chr22 42129906 A 42143578 G snp g.42129906G>A
chr22 42129910 G 42143582 C snp g.42129910C>G
24 changes: 15 additions & 9 deletions depth_calling/bin_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,17 +203,23 @@ def count_reads_and_prepare_for_normalization(
lregion = [
(region[0], region[1], region[2], gc) for (region, gc) in region_dic["norm"]
]
get_normed_depth_bam = partial(
get_normalization_region_values, bam=bamf, reference=reference
)
region_groups = partition(lregion, nCores)
pool = mp.Pool(nCores)
result = pool.map(get_normed_depth_bam, region_groups)
pool.close()
for result_group in result:
for region_out in result_group:

if nCores == 1:
for region_out in get_normalization_region_values(lregion, bamf, reference):
counts_for_normalization.append(region_out[0])
gc_for_normalization.append(region_out[1])
else:
get_normed_depth_bam = partial(
get_normalization_region_values, bam=bamf, reference=reference
)
region_groups = partition(lregion, nCores)
pool = mp.Pool(nCores)
result = pool.map(get_normed_depth_bam, region_groups)
pool.close()
for result_group in result:
for region_out in result_group:
counts_for_normalization.append(region_out[0])
gc_for_normalization.append(region_out[1])

bamfile.close()

Expand Down
1 change: 0 additions & 1 deletion depth_calling/haplotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ def get_bases_per_read(
stepper="nofilter",
ignore_overlaps=False,
ignore_orphan=False,
min_base_quality=1,
):
site_position = pileupcolumn.pos + 1
if site_position == snp_position:
Expand Down
4 changes: 1 addition & 3 deletions star_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,9 +516,7 @@ def main():
count_file = None
if path_count_file is not None:
count_file = os.path.join(path_count_file, sample_id + "_count.txt")
if os.path.exists(bam_name) == 0 or (
count_file is not None and os.path.exists(count_file) == 0
):
if "://" not in bam_name and os.path.exists(bam_name) == 0:
logging.warning("Input file for sample %s does not exist.", sample_id)
else:
logging.info(
Expand Down

0 comments on commit 0a12bcc

Please sign in to comment.