Skip to content

Commit 7803933

Browse files
committed
fix: add three-prime-rule option to vcf->hgvs
1 parent 2f5fc0d commit 7803933

File tree

2 files changed

+82
-24
lines changed

2 files changed

+82
-24
lines changed

src/varity/vcf_to_hgvs.clj

+43-22
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,16 @@
3030
(some? (re-matches #"(NM_|ENST)\d+(\.\d+)?" (:name rg))))
3131

3232
(defn select-variant
33-
[var seq-rdr rg]
33+
[var seq-rdr rg & {:keys [three-prime-rule]}]
3434
(if-let [nvar (normalize-variant var seq-rdr rg)]
3535
(let [var-start-cds-coord (rg/cds-coord (:pos var) rg)
3636
var-end-cds-coord (rg/cds-coord (+ (:pos var) (max (count (:ref var)) (count (:alt var)))) rg)
3737
nvar-start-cds-coord (rg/cds-coord (:pos nvar) rg)
38-
nvar-end-cds-coord (rg/cds-coord (+ (:pos nvar) (max (count (:ref nvar)) (count (:alt nvar)))) rg)]
39-
(if (= (:region var-start-cds-coord) (:region nvar-start-cds-coord)
40-
(:region var-end-cds-coord) (:region nvar-end-cds-coord))
38+
nvar-end-cds-coord (rg/cds-coord (+ (:pos nvar) (max (count (:ref nvar)) (count (:alt nvar)))) rg)
39+
restrict-cds (get three-prime-rule :restrict-cds true)]
40+
(if (or (= (:region var-start-cds-coord) (:region nvar-start-cds-coord)
41+
(:region var-end-cds-coord) (:region nvar-end-cds-coord))
42+
(not restrict-cds))
4143
nvar
4244
var))
4345
var))
@@ -119,7 +121,8 @@
119121
(filter coding-dna-ref-gene?)
120122
(map (fn [rg]
121123
(assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
122-
seq-rdr rg)
124+
seq-rdr rg
125+
:three-prime-rule (:three-prime-rule options))
123126
:rg rg)))
124127
(map (fn [{:keys [rg] :as m}]
125128
(when (:verbose? options)
@@ -135,7 +138,8 @@
135138
(let [options (merge default-options options)]
136139
(if (valid-ref? seq-rdr chr pos ref)
137140
(let [nv (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
138-
seq-rdr rg)]
141+
seq-rdr rg
142+
:three-prime-rule (:three-prime-rule options))]
139143
(when (:verbose? options)
140144
(print-debug-info nv seq-rdr rg))
141145
(coding-dna/->hgvs (assoc nv :rg rg) seq-rdr rg options))
@@ -189,7 +193,8 @@
189193
(filter coding-dna-ref-gene?)
190194
(map (fn [rg]
191195
(assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
192-
seq-rdr rg)
196+
seq-rdr rg
197+
:three-prime-rule (:three-prime-rule options))
193198
:rg rg)))
194199
(filter #(cds-affected? % (:rg %)))
195200
(keep (fn [{:keys [rg] :as m}]
@@ -206,7 +211,8 @@
206211
(let [options (merge default-options options)]
207212
(if (valid-ref? seq-rdr chr pos ref)
208213
(let [nv (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
209-
seq-rdr rg)]
214+
seq-rdr rg
215+
:three-prime-rule (:three-prime-rule options))]
210216
(when (cds-affected? nv rg)
211217
(when (:verbose? options)
212218
(print-debug-info nv seq-rdr rg))
@@ -264,15 +270,23 @@
264270
(->> (rg/ref-genes chr pos rgidx (:tx-margin options))
265271
(filter coding-dna-ref-gene?)
266272
(map (fn [rg]
267-
(assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
268-
seq-rdr rg)
269-
:rg rg)))
270-
(map (fn [{:keys [rg] :as m}]
273+
{:coding-dna (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
274+
seq-rdr rg
275+
:three-prime-rule (get-in options [:three-prime-rule :coding-dna]))
276+
:rg rg
277+
:type :coding-dna)
278+
:protein (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
279+
seq-rdr rg
280+
:three-prime-rule (get-in options [:three-prime-rule :protein]))
281+
:rg rg
282+
:type :protein)}))
283+
(map (fn [{:keys [coding-dna protein]}]
271284
(when (:verbose? options)
272-
(print-debug-info m seq-rdr rg))
273-
{:coding-dna (coding-dna/->hgvs m seq-rdr rg options)
274-
:protein (if (cds-affected? m rg)
275-
(prot/->hgvs m seq-rdr rg options))}))
285+
(print-debug-info coding-dna seq-rdr (:rg coding-dna))
286+
(print-debug-info protein seq-rdr (:rg protein)))
287+
{:coding-dna (coding-dna/->hgvs coding-dna seq-rdr (:rg coding-dna) options)
288+
:protein (when (cds-affected? coding-dna (:rg coding-dna))
289+
(prot/->hgvs protein seq-rdr (:rg protein) options))}))
276290
distinct)
277291
(throw (ex-info "ref is not found on the position."
278292
{:type ::invalid-ref
@@ -282,14 +296,21 @@
282296
[{:keys [pos ref alt]} seq-rdr {:keys [chr] :as rg} & [options]]
283297
(let [options (merge default-options options)]
284298
(if (valid-ref? seq-rdr chr pos ref)
285-
(let [nv (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
286-
seq-rdr rg)]
299+
(let [nv (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
300+
seq-rdr rg
301+
:three-prime-rule (get-in options [:three-prime-rule :coding-dna]))
302+
:type :coding-dna)
303+
pnv (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
304+
seq-rdr rg
305+
:three-prime-rule (get-in options [:three-prime-rule :protein]))
306+
:type :protein)]
287307
(when (:verbose? options)
288-
(print-debug-info nv seq-rdr rg))
289-
308+
(print-debug-info nv seq-rdr rg)
309+
(print-debug-info pnv seq-rdr rg))
310+
(println (:three-prime-rule options))
290311
{:coding-dna (coding-dna/->hgvs nv seq-rdr rg options)
291-
:protein (if (cds-affected? nv rg)
292-
(prot/->hgvs nv seq-rdr rg options))})
312+
:protein (when (cds-affected? nv rg)
313+
(prot/->hgvs pnv seq-rdr rg options))})
293314
(throw (ex-info "ref is not found on the position."
294315
{:type ::invalid-ref
295316
:variant {:chr chr, :pos pos, :ref ref, :alt alt}})))))

test/varity/vcf_to_hgvs_test.clj

+39-2
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,17 @@
169169
;; tx-margin
170170
"chr5" 1295113 "G" "A" {:tx-margin 5000} '("NM_001193376:c.-124C>T"
171171
"NM_198253:c.-124C>T")
172-
"chr5" 1295113 "G" "A" {:tx-margin 0} '())))
172+
"chr5" 1295113 "G" "A" {:tx-margin 0} '()
173+
174+
;; three-prime-rule
175+
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds true}} '("NM_006437:c.5169_*3delAGGCTAAGTC")
176+
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds false}} '("NM_006437:c.5170_*4delGGCTAAGTCA")
177+
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds true} :prefer-insertion? true :prefer-deletion? true} '("NM_000720:c.-1_2delGAT"
178+
"NM_001128839:c.-1_2delGAT"
179+
"NM_001128840:c.-1_2delGAT")
180+
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds false} :prefer-insertion? true :prefer-deletion? true} '("NM_000720:c.20_22delTGA"
181+
"NM_001128839:c.20_22delTGA"
182+
"NM_001128840:c.20_22delTGA"))))
173183
(cavia-testing "throws Exception if inputs are illegal"
174184
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
175185
(is (thrown? Exception
@@ -350,10 +360,16 @@
350360

351361
;; prefer-extension-for-initial-codon-alt?
352362
"chr10" 121593814 "CCATGGT" "C" {:prefer-extension-for-initial-codon-alt? true} '("p.M1Vext-17") ;; not actual example (-)
353-
"chr2" 197434979 "AGTCTTGGCGATCTTCGCCATTTT" "A" {:prefer-extension-for-initial-codon-alt? true} '("p.M1Sext-?")))) ;; not actual example (-)
363+
"chr2" 197434979 "AGTCTTGGCGATCTTCGCCATTTT" "A" {:prefer-extension-for-initial-codon-alt? true} '("p.M1Sext-?") ;; not actual example (-)
354364
;; prefer-extension-for-initial-codon-alt?, initiation codon is altered to termination codon
355365
"chr9" 27109592 "T" "TTTA" {:prefer-extension-for-initial-codon-alt? true} '("p.?") ;; not actual example (+)
356366

367+
;; three-prime-rule
368+
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds true}} '("p.G1724Nfs*6")
369+
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds false}} '("p.G1724delinsNETEF")
370+
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds true} :prefer-insertion? true :prefer-deletion? true} '("p.?")
371+
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds false} :prefer-insertion? true :prefer-deletion? true} '("p.M7del"))))
372+
357373
(cavia-testing "throws Exception if inputs are illegal"
358374
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
359375
(is (thrown? Exception
@@ -403,3 +419,24 @@
403419
"NR_024540"
404420
"ENSP00000496776.1"
405421
"ENSP00000496776")))
422+
423+
(defn- vcf-variant->hgvs-texts
424+
[variant seq-rdr rgidx & [options]]
425+
(map (fn [{:keys [coding-dna protein]}]
426+
{:coding-dna (hgvs/format coding-dna {:show-bases? true
427+
:range-format :coord})
428+
:protein (hgvs/format protein {:amino-acid-format :short
429+
:show-ter-site? true
430+
:ter-format :short})})
431+
(vcf-variant->hgvs variant seq-rdr rgidx options)))
432+
433+
(defslowtest vcf-variant->hgvs-test
434+
(cavia-testing "options"
435+
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
436+
(are [chr pos ref alt opts e]
437+
(= (vcf-variant->hgvs-texts {:chr chr, :pos pos, :ref ref, :alt alt}
438+
test-ref-seq-file rgidx (merge {:prefer-insertion? true
439+
:prefer-deletion? true}
440+
opts)) e)
441+
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:coding-dna {:restrict-cds true} :protein {:restrict-cds true}}} '({:coding-dna "NM_006437:c.5169_*3delAGGCTAAGTC", :protein "p.G1724Nfs*6"})
442+
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:coding-dna {:restrict-cds false} :protein {:restrict-cds false}}} '({:coding-dna "NM_006437:c.5170_*4delGGCTAAGTCA", :protein "p.G1724delinsNETEF"})))))

0 commit comments

Comments
 (0)