Skip to content

Commit 86c0988

Browse files
authored
Merge pull request #89 from chrovis/fix/fix-exon-intron-boundary
Return protein-hgvs nil when variant overlaps exon/intron boundaries
2 parents 77042d6 + 982a784 commit 86c0988

File tree

3 files changed

+51
-48
lines changed

3 files changed

+51
-48
lines changed

src/varity/vcf_to_hgvs/protein.clj

+47-37
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,18 @@
1414
[varity.ref-gene :as rg]
1515
[varity.vcf-to-hgvs.common :refer [diff-bases] :as common]))
1616

17+
(defn- overlap-exon-intron-boundary?
18+
[exon-ranges pos ref alt]
19+
(let [nref (count ref)
20+
nalt (count alt)]
21+
(and (not (= 1 nref nalt))
22+
(not= 1 (count exon-ranges))
23+
(some (fn [[s e]]
24+
(and (not= s e)
25+
(or (and (< pos s) (<= s (+ pos nref -1)))
26+
(and (<= pos e) (< e (+ pos nref -1))))))
27+
exon-ranges))))
28+
1729
(defn alt-exon-ranges
1830
"Returns exon ranges a variant applied."
1931
[exon-ranges pos ref alt]
@@ -25,35 +37,27 @@
2537
:else :same)
2638
tpos (+ pos (min nref nalt))
2739
d (Math/abs (- nref nalt))]
28-
(when (and (not (= 1 nref nalt))
29-
(not= 1 (count exon-ranges))
30-
(some (fn [[s e]]
31-
(and (not= s e)
32-
(or (and (< pos s) (<= s (+ pos nref -1)))
33-
(and (<= pos e) (< e (+ pos nref -1))))))
34-
exon-ranges))
35-
(throw
36-
(ex-info
37-
"Variants overlapping a boundary of exon/intron are unsupported"
38-
{:exon-ranges exon-ranges, :pos pos, :ref ref, :alt alt})))
39-
(->> exon-ranges
40-
(keep (fn [[s e]]
41-
(case typ
42-
:ins (cond
43-
(< tpos s) [(+ s d) (+ e d)]
44-
(<= s tpos e) [s (+ e d)]
45-
:else [s e])
46-
:del (let [dels tpos
47-
dele (dec (+ tpos d))]
48-
(cond
49-
(< dele s) [(- s d) (- e d)]
50-
(<= dels s) (when (< dele e) [dels (- e d)])
51-
(<= dels e) (if (< dele e)
52-
[s (- e d)]
53-
[s (dec dels)])
54-
:else [s e]))
55-
:same [s e])))
56-
vec)))
40+
(if (overlap-exon-intron-boundary? exon-ranges pos ref alt)
41+
(do (log/warn "Variants overlapping a boundary of exon/intron are unsupported")
42+
nil)
43+
(->> exon-ranges
44+
(keep (fn [[s e]]
45+
(case typ
46+
:ins (cond
47+
(< tpos s) [(+ s d) (+ e d)]
48+
(<= s tpos e) [s (+ e d)]
49+
:else [s e])
50+
:del (let [dels tpos
51+
dele (dec (+ tpos d))]
52+
(cond
53+
(< dele s) [(- s d) (- e d)]
54+
(<= dels s) (when (< dele e) [dels (- e d)])
55+
(<= dels e) (if (< dele e)
56+
[s (- e d)]
57+
[s (dec dels)])
58+
:else [s e]))
59+
:same [s e])))
60+
vec))))
5761

5862
(defn exon-sequence
5963
"Extracts bases in exon from supplied sequence, returning the sequence of
@@ -197,12 +201,14 @@
197201
:c-ter-adjusted-alt-prot-seq (codon/amino-acid-sequence
198202
(cond-> ter-site-adjusted-alt-seq
199203
(= strand :reverse) util-seq/revcomp))
200-
:alt-rg (-> rg
201-
(assoc :exon-ranges alt-exon-ranges*)
202-
(update :cds-start apply-offset*)
203-
(update :cds-end apply-offset*)
204-
(update :tx-end apply-offset*))
205-
:ref-include-ter-site ref-include-ter-site}))
204+
:alt-rg (when alt-exon-ranges*
205+
(-> rg
206+
(assoc :exon-ranges alt-exon-ranges*)
207+
(update :cds-start apply-offset*)
208+
(update :cds-end apply-offset*)
209+
(update :tx-end apply-offset*)))
210+
:ref-include-ter-site ref-include-ter-site
211+
:overlap-exon-intron-boundary (overlap-exon-intron-boundary? exon-ranges pos ref alt)}))
206212

207213
(defn- protein-position
208214
"Converts genomic position to protein position. If pos is outside of CDS,
@@ -255,6 +261,9 @@
255261
(= ref-exon-seq alt-exon-seq)
256262
{:type :no-effect, :pos 1, :ref nil, :alt nil}
257263

264+
(:overlap-exon-intron-boundary seq-info)
265+
{:type :overlap-exon-intron-boundary, :pos nil, :ref nil, :alt nil}
266+
258267
(pos? (mod (count ref-exon-seq) 3))
259268
(do (log/warnf "CDS length is indivisible by 3: %d (%s, %s)"
260269
(count ref-exon-seq) (:name rg) (:name2 rg))
@@ -459,7 +468,7 @@
459468
(let [seq-info (read-sequence-info seq-rdr rg pos ref alt)]
460469
(when-let [pvariant (->protein-variant rg pos ref alt seq-info options)]
461470
(let [{ppos :pos, pref :ref, palt :alt}
462-
(if-not (#{:no-effect :unknown} (:type pvariant))
471+
(if-not (#{:no-effect :unknown :overlap-exon-intron-boundary} (:type pvariant))
463472
(common/apply-3'-rule pvariant (:ref-prot-seq seq-info))
464473
pvariant)]
465474
(case (:type pvariant)
@@ -472,7 +481,8 @@
472481
:frame-shift (protein-frame-shift ppos seq-info)
473482
:extension (protein-extension ppos pref palt seq-info)
474483
:no-effect (mut/protein-no-effect)
475-
:unknown (mut/protein-unknown-mutation))))))
484+
:unknown (mut/protein-unknown-mutation)
485+
:overlap-exon-intron-boundary nil)))))
476486

477487
(defn ->hgvs
478488
([variant seq-rdr rg]

test/varity/vcf_to_hgvs/protein_test.clj

+1-4
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,7 @@
1717
6 "XX" "X" [[2 4] [7 10]]
1818
9 "XXX" "XXX" [[2 4] [8 11]])
1919
;; Variants overlapping a boundary of exon/intron
20-
(are [p r a] (thrown-with-msg?
21-
Exception
22-
#"unsupported"
23-
(#'prot/alt-exon-ranges [[2 4] [8 11]] p r a))
20+
(are [p r a] (nil? (#'prot/alt-exon-ranges [[2 4] [8 11]] p r a))
2421
3 "XXX" "XXX"
2522
6 "XXX" "X"
2623
3 "XXX" "X"

test/varity/vcf_to_hgvs_test.clj

+3-7
Original file line numberDiff line numberDiff line change
@@ -286,15 +286,11 @@
286286
(vcf-variant->protein-hgvs {:chr "chr7", :pos 55191823, :ref "T", :alt "G"}
287287
test-ref-seq-file rgidx)))))
288288

289-
(cavia-testing "throws Exception if inputs overlap exon/intron boundaries"
289+
(cavia-testing "case that inputs overlap exon/intron boundaries"
290290
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
291291
(are [chr pos ref alt]
292-
(thrown-with-msg?
293-
Exception
294-
#"unsupported"
295-
(vcf-variant->protein-hgvs
296-
{:chr chr, :pos pos, :ref ref, :alt alt}
297-
test-ref-seq-file rgidx))
292+
(= [] (vcf-variant->protein-hgvs {:chr chr, :pos pos, :ref ref, :alt alt}
293+
test-ref-seq-file rgidx))
298294
;; Two variants at the each side of a GT dinucleotide splice donor site
299295
"chr1" 26773716 "CGGTGA" "CCAGGTGT"
300296
"chr1" 26773714 "AACGGTGAG" "AGCGGT"

0 commit comments

Comments
 (0)