Skip to content

Commit 2f5fc0d

Browse files
committed
fix: add in-frame check process to indel for around ter site variant
1 parent 4847fa7 commit 2f5fc0d

File tree

3 files changed

+105
-3
lines changed

3 files changed

+105
-3
lines changed

src/varity/vcf_to_hgvs/protein.clj

+47-3
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,22 @@
177177
(and (= pos ter-start-pos) (<= ter-end-pos pos-end))
178178
(and (= pos-end ter-start-pos) (<= pos ter-end-pos)))))
179179

180+
(defn- ref-include-from-ter-upstream-and-over-ter-end?
  "Returns true when the deleted part of `ref` starts strictly before the
  first base of the termination codon and ends strictly past its last base,
  both taken in transcript orientation.

  The termination codon is the three genomic positions `cds-end - 2` to
  `cds-end` on the forward strand and `cds-start` to `cds-start + 2` on any
  other strand value. The deleted span runs from `pos + offset` for
  `(count del)` bases, where `del` and `offset` are the first and third
  elements returned by `diff-bases`; a variant with no deleted bases is
  treated as the single position `pos + offset`."
  [{:keys [strand cds-start cds-end]} pos ref alt]
  (let [[deleted _ offset _] (diff-bases ref alt)
        del-start (+ pos offset)
        ;; Last deleted genomic position; collapses to del-start when
        ;; nothing is deleted (pure insertion).
        del-end (+ del-start (max 0 (dec (count deleted))))]
    (if (= strand :forward)
      ;; Forward: upstream means lower coordinates, past the end means higher.
      (and (< del-start (- cds-end 2))
           (< cds-end del-end))
      ;; Otherwise the codon is read right-to-left: its first base is
      ;; cds-start + 2 and its last base is cds-start.
      (and (< (+ cds-start 2) del-end)
           (< del-start cds-start)))))
195+
180196
(defn- ter-site-same-pos?
181197
[ref-prot-seq alt-prot-seq]
182198
(and (string/includes? ref-prot-seq "*")
@@ -260,6 +276,25 @@
260276
:else
261277
pos-start*)))
262278

279+
(defn- in-frame?
  "Returns true when the variant keeps the reading frame.

  `diff-bases` supplies the deleted bases, the inserted bases and the offset
  of the first differing base from `pos`. When the deleted span reaches past
  the CDS boundary on the termination side (`cds-end` on the forward strand,
  `cds-start` otherwise), only the deleted bases lying between the span and
  that boundary are counted. The variant is in-frame when exactly one base is
  replaced by exactly one base, or when the difference between the counted
  deleted bases and the inserted bases is a multiple of three."
  [pos ref alt {:keys [cds-start cds-end strand] :as _rg}]
  (let [[deleted inserted offset] (diff-bases ref alt)
        n-deleted (count deleted)
        n-inserted (count inserted)
        forward? (= strand :forward)
        del-start (+ pos offset)
        del-end (+ del-start (dec n-deleted))
        ;; Genomic coordinate of the CDS boundary on the termination side.
        ter-boundary (if forward? cds-end cds-start)
        ;; The anchor position `pos` (not del-start) is compared here.
        crosses-ter? (< pos ter-boundary del-end)
        ;; Number of deleted bases counted for the frame calculation.
        n-deleted-in-cds (cond
                           (not crosses-ter?) n-deleted
                           forward? (inc (- cds-end del-start))
                           :else (inc (- del-end cds-start)))]
    (or (= n-deleted n-inserted 1)
        (zero? (rem (- n-deleted-in-cds n-inserted) 3)))))
297+
263298
(defn- apply-offset
264299
[pos ref alt cds-start cds-end exon-ranges pos*]
265300
(let [[del ins offset _] (diff-bases ref alt)
@@ -288,6 +323,7 @@
288323
ref-include-utr-ini-site-boundary (include-utr-ini-site-boundary? rg pos ref alt)
289324
ref-include-ter-site (include-ter-site? rg pos ref alt)
290325
ref-include-from-ter-start-and-over-ter-end (ref-include-from-ter-start-and-over-ter-end? rg pos ref alt)
326+
ref-include-from-ter-upstream-and-over-ter-end (ref-include-from-ter-upstream-and-over-ter-end? rg pos ref alt)
291327
frameshift-within-cds (frameshift-within-cds? rg pos ref alt)
292328
alt-seq (common/alt-sequence ref-seq tx-start pos ref alt)
293329
alt-exon-ranges* (alt-exon-ranges exon-ranges pos ref alt)
@@ -301,7 +337,8 @@
301337
alt-up-exon-seq (make-alt-up-exon-seq alt-up-exon-seq tx-start (dec alt-cds-start) alt-exon-ranges* strand)
302338
alt-down-exon-seq (make-alt-down-exon-seq alt-down-exon-seq (inc alt-cds-end) alt-tx-end alt-exon-ranges* strand)
303339
ter-site-adjusted-alt-seq (make-ter-site-adjusted-alt-seq alt-cds-exon-seq alt-up-exon-seq alt-down-exon-seq
304-
strand cds-start cds-end pos ref ref-include-ter-site)]
340+
strand cds-start cds-end pos ref ref-include-ter-site)
341+
in-frame (in-frame? pos ref alt rg)]
305342
{:ref-exon-seq ref-cds-exon-seq
306343
:ref-prot-seq (codon/amino-acid-sequence (cond-> ref-cds-exon-seq
307344
(= strand :reverse) util-seq/revcomp))
@@ -327,8 +364,10 @@
327364
:ref-include-utr-ini-site-boundary ref-include-utr-ini-site-boundary
328365
:ref-include-ter-site ref-include-ter-site
329366
:ref-include-from-ter-start-and-over-ter-end ref-include-from-ter-start-and-over-ter-end
367+
:ref-include-from-ter-upstream-and-over-ter-end ref-include-from-ter-upstream-and-over-ter-end
330368
:frameshift-within-cds frameshift-within-cds
331-
:utr-variant (utr-variant? cds-start cds-end pos ref alt)})))
369+
:utr-variant (utr-variant? cds-start cds-end pos ref alt)
370+
:in-frame in-frame})))
332371

333372
(defn- protein-position
334373
"Converts genomic position to protein position. If pos is outside of CDS,
@@ -621,7 +660,8 @@
621660

622661
(defn- protein-indel
623662
[ppos pref palt {:keys [ref-prot-seq c-ter-adjusted-alt-prot-seq
624-
ref-include-ter-site frameshift-within-cds] :as seq-info}]
663+
ref-include-ter-site frameshift-within-cds
664+
ref-include-from-ter-upstream-and-over-ter-end in-frame] :as seq-info}]
625665
(let [[pref* palt* ppos*] (if ref-include-ter-site
626666
(let [{adjusted-ppos :ppos} (get-first-diff-aa-info ppos ref-prot-seq c-ter-adjusted-alt-prot-seq)
627667
ppos (or adjusted-ppos ppos)
@@ -658,6 +698,10 @@
658698

659699
(empty? ins)
660700
(protein-deletion ppos* pref* palt*)
701+
702+
(and ref-include-from-ter-upstream-and-over-ter-end
703+
(not in-frame))
704+
(protein-frame-shift ppos* seq-info)
661705

662706
alt-retain-ter-site?
663707
(mut/protein-indel (mut/->long-amino-acid (first del))

test/varity/vcf_to_hgvs/protein_test.clj

+57
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,28 @@
170170
false? :reverse 11 "C" "CATG"
171171
false? :reverse 8 "CGTC" "C")))
172172

173+
(deftest ref-include-from-ter-upstream-and-over-ter-end?-test
  ;; CDS is 10-21, so the termination codon is 19-21 on the forward strand
  ;; and 10-12 on the reverse strand.
  (let [rg (fn [strand] {:strand strand :cds-start 10 :cds-end 21})
        spans-ter? (fn [strand pos ref alt]
                     (#'prot/ref-include-from-ter-upstream-and-over-ter-end?
                      (rg strand) pos ref alt))]
    ;; Deletion starts upstream of the ter codon and runs past its end.
    (are [strand pos ref alt] (true? (spans-ter? strand pos ref alt))
      :forward 17 "AATAAG" "A"
      :forward 17 "AATAAG" "AGC"
      :reverse 8  "CGTCAC" "C"
      :reverse 8  "CGTCAC" "CTG")
    ;; Everything else: stops at the ter end, starts inside the codon,
    ;; substitutions and insertions.
    (are [strand pos ref alt] (false? (spans-ter? strand pos ref alt))
      :forward 17 "AATAA" "A"
      :forward 18 "A"     "T"
      :forward 19 "T"     "TCCCT"
      :forward 18 "ATA"   "A"
      :reverse 9  "GTCAC" "G"
      :reverse 10 "T"     "A"
      :reverse 11 "C"     "CATG"
      :reverse 8  "CGTC"  "C")))
194+
173195
(deftest ter-site-same-pos?-test
174196
(are [p ref alt] (p (#'prot/ter-site-same-pos? ref alt))
175197
true? "MTGA*" "MTGA*"
@@ -256,6 +278,41 @@
256278
100 115 99
257279
75 110 50)))
258280

281+
(deftest in-frame?-test
  ;; All cases share a CDS of 101-300; only the strand varies.
  (let [rg-for (fn [strand] {:cds-start 101 :cds-end 300 :strand strand})
        keeps-frame? (fn [pos ref alt strand]
                       (#'prot/in-frame? pos ref alt (rg-for strand)))]
    (testing "within cds"
      ;; Substitutions and indels whose net length change is a multiple of 3.
      (are [pos ref alt strand] (true? (keeps-frame? pos ref alt strand))
        200 "A"    "T"      :forward
        200 "AGGC" "A"      :forward
        200 "A"    "ATCG"   :forward
        200 "AGT"  "ACCCTG" :forward
        150 "T"    "A"      :reverse
        150 "GCTC" "G"      :reverse
        150 "T"    "TCCG"   :reverse
        150 "TGG"  "TCCAAC" :reverse)
      ;; Net length change not divisible by 3.
      (are [pos ref alt strand] (false? (keeps-frame? pos ref alt strand))
        200 "AGG" "A"     :forward
        200 "A"   "ATC"   :forward
        200 "AGT" "ACCCT" :forward
        150 "GCT" "G"     :reverse
        150 "T"   "TCC"   :reverse
        150 "TGG" "TCCAA" :reverse))
    (testing "over ter site"
      ;; Only the deleted bases up to the CDS boundary are counted.
      (are [pos ref alt strand] (true? (keeps-frame? pos ref alt strand))
        294 "CAGTTGAAG" "C"     :forward
        294 "CAGTTGAAG" "CGTC"  :forward
        296 "GTTGAAG"   "GCCAA" :forward
        99  "GTTTACGA"  "G"     :reverse
        99  "GTTTACGA"  "GCCT"  :reverse
        99  "GTTTACGAC" "GA"    :reverse)
      (are [pos ref alt strand] (false? (keeps-frame? pos ref alt strand))
        295 "AGTTGAAG"   "A"    :forward
        295 "AGTTGAAG"   "ACTC" :forward
        294 "CAGTTGAAG"  "CGT"  :forward
        99  "GTTTACG"    "G"    :reverse
        96  "CAAGTTTACG" "C"    :reverse
        99  "GTTTACGA"   "GAT"  :reverse))))
315+
259316
(deftest apply-offset-test
260317
(testing "ref not include exon terminal"
261318
(let [ref "GCTGACC"

test/varity/vcf_to_hgvs_test.clj

+1
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@
275275
"chr10" 87965466 "AGTCT" "A" '("p.V403Efs*12" "p.V576Efs*12" "p.V206Efs*12") ; not actual example (+)
276276
"chr10" 87965465 "AAGTCT" "A" '("p.V403Nfs*17" "p.V576Nfs*17" "p.V206Nfs*17") ; not actual example (+)
277277
"chr10" 87965463 "AAAAGTCT" "A" '("p.K402Efs*12" "p.K575Efs*12" "p.K205Efs*12") ; not actual example (+)
278+
"chr13" 24421117 "ACTTAGC" "A" '("p.G1724Vfs*3") ; not actual example (-)
278279

279280
;; Extension
280281
"chr2" 189011772 "T" "C" '("p.*1467Qext*45") ; cf. ClinVar 101338

0 commit comments

Comments
 (0)