From 7c2971b903bf0bad7a7c28bd5cf29338a8e2dbf1 Mon Sep 17 00:00:00 2001 From: Nobuaki Karasawa <nkarasawa@xcoo.jp> Date: Mon, 20 Jan 2025 09:05:06 +0900 Subject: [PATCH 1/3] fix: return no-effect when ter site is inserted around ter site --- src/varity/vcf_to_hgvs/protein.clj | 85 ++++++++++++++++-------------- test/varity/vcf_to_hgvs_test.clj | 1 + 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj index c4e0d54..bd53fee 100644 --- a/src/varity/vcf_to_hgvs/protein.clj +++ b/src/varity/vcf_to_hgvs/protein.clj @@ -179,8 +179,11 @@ (defn- ter-site-same-pos? [ref-prot-seq alt-prot-seq] - (let [ter-site-pos (dec (count ref-prot-seq))] - (= \* (get alt-prot-seq ter-site-pos)))) + (let [ref-ter-pos (count ref-prot-seq) + alt-ter-pos (inc (count (first (string/split alt-prot-seq #"\*"))))] + (and (string/includes? ref-prot-seq "*") + (string/includes? alt-prot-seq "*") + (= ref-ter-pos alt-ter-pos)))) (defn- cds-start-upstream? [cds-start pos ref alt] @@ -443,7 +446,9 @@ :else :frame-shift) - (and (pos? npref) (= (first palt-only) \*)) :substitution + (and (pos? npref) (= (first palt-only) \*)) (if (ter-site-same-pos? ref-prot-seq alt-prot-seq*) + :no-effect + :substitution) (not= ref-prot-rest alt-prot-rest) (cond (or (and (= (first alt-prot-rest) \*) (>= nprefo npalto) @@ -572,45 +577,45 @@ (coord/unknown-coordinate)))))))) (defn- protein-extension - [ppos pref palt {:keys [ref-prot-seq alt-tx-prot-seq c-ter-adjusted-alt-prot-seq ini-offset prefer-extension-for-initial-codon-alt?] :as seq-info}] - (cond - (and (not= ppos 1) - (ter-site-same-pos? ref-prot-seq c-ter-adjusted-alt-prot-seq)) - (mut/protein-no-effect) + [ppos pref palt {:keys [ref-prot-seq alt-tx-prot-seq ini-offset prefer-extension-for-initial-codon-alt?] :as seq-info}] + (let [alt-prot-seq* (format-alt-prot-seq seq-info)] + (cond + (and (not= ppos 1) + (ter-site-same-pos? ref-prot-seq alt-prot-seq*)) + (mut/protein-no-effect) - (and (= ppos 1) (not prefer-extension-for-initial-codon-alt?)) - (mut/protein-unknown-mutation) + (and (= ppos 1) (not prefer-extension-for-initial-codon-alt?)) + (mut/protein-unknown-mutation) - :else - (let [[_ ins offset _] (diff-bases (or pref "") (or palt "")) - alt-prot-seq* (format-alt-prot-seq seq-info) - ini-site ((comp str first) ref-prot-seq) - first-diff-aa-info (if (= ppos 1) - {:ppos 1 - :pref ini-site} - (get-first-diff-aa-info ppos - ref-prot-seq - alt-prot-seq*)) - rest-seq (if (= ppos 1) - (-> alt-tx-prot-seq - (subs 0 ini-offset) - reverse - (#(apply str %))) - (subs alt-prot-seq* (:ppos first-diff-aa-info))) - alt-aa (mut/->long-amino-acid (if (= ppos 1) - (or (last ins) (first rest-seq)) - (:palt first-diff-aa-info))) - alt-aa-offset (if (and (= ppos 1) (nil? (last ins))) -1 0) - new-aa-pos (some-> (string/index-of rest-seq (:pref first-diff-aa-info)) inc (+ alt-aa-offset))] - (if (and (= ppos 1) (= alt-aa "Ter")) - (mut/protein-unknown-mutation) - (mut/protein-extension (if (= ppos 1) (mut/->long-amino-acid ini-site) "Ter") - (coord/protein-coordinate (if (= ppos 1) 1 (+ ppos offset))) - alt-aa - (if (= ppos 1) :upstream :downstream) - (if new-aa-pos - (coord/protein-coordinate new-aa-pos) - (coord/unknown-coordinate))))))) + :else + (let [[_ ins offset _] (diff-bases (or pref "") (or palt "")) + ini-site ((comp str first) ref-prot-seq) + first-diff-aa-info (if (= ppos 1) + {:ppos 1 + :pref ini-site} + (get-first-diff-aa-info ppos + ref-prot-seq + alt-prot-seq*)) + rest-seq (if (= ppos 1) + (-> alt-tx-prot-seq + (subs 0 ini-offset) + reverse + (#(apply str %))) + (subs alt-prot-seq* (:ppos first-diff-aa-info))) + alt-aa (mut/->long-amino-acid (if (= ppos 1) + (or (last ins) (first rest-seq)) + (:palt first-diff-aa-info))) + alt-aa-offset (if (and (= ppos 1) (nil? (last ins))) -1 0) + new-aa-pos (some-> (string/index-of rest-seq (:pref first-diff-aa-info)) inc (+ alt-aa-offset))] + (if (and (= ppos 1) (= alt-aa "Ter")) + (mut/protein-unknown-mutation) + (mut/protein-extension (if (= ppos 1) (mut/->long-amino-acid ini-site) "Ter") + (coord/protein-coordinate (if (= ppos 1) 1 (+ ppos offset))) + alt-aa + (if (= ppos 1) :upstream :downstream) + (if new-aa-pos + (coord/protein-coordinate new-aa-pos) + (coord/unknown-coordinate)))))))) (defn- protein-indel [ppos pref palt {:keys [ref-prot-seq c-ter-adjusted-alt-prot-seq diff --git a/test/varity/vcf_to_hgvs_test.clj b/test/varity/vcf_to_hgvs_test.clj index b59f4d9..2d667ec 100644 --- a/test/varity/vcf_to_hgvs_test.clj +++ b/test/varity/vcf_to_hgvs_test.clj @@ -301,6 +301,7 @@ "chr11" 14279340 "G" "A" '("p.=") ; not actual example (-) "chr7" 55019277 "G" "GTC" '("p.=") ; not actual example (+) "chr17" 21042835 "T" "TG" '("p.=") ; not actual example (-) + "chr13" 24421121 "A" "ATTA" '("p.=") ; not actual example (-) ;; unknown "chr12" 40393453 "G" "A" '("p.?") ; not actual example (+) From 4610278093d9379c0e2c26d8a8acb082275cebc4 Mon Sep 17 00:00:00 2001 From: Nobuaki Karasawa <nkarasawa@xcoo.jp> Date: Tue, 21 Jan 2025 15:55:11 +0900 Subject: [PATCH 2/3] fix: fix protein-extension condition --- src/varity/vcf_to_hgvs/protein.clj | 10 ++++++---- test/varity/vcf_to_hgvs_test.clj | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj index bd53fee..f5d5794 100644 --- a/src/varity/vcf_to_hgvs/protein.clj +++ b/src/varity/vcf_to_hgvs/protein.clj @@ -428,8 +428,12 @@ pref-only palt-only) ini-site-affected (ini-site-affected? ref-exon-seq alt-exon-seq) + first-diff-aa-is-ter-site (first-diff-aa-is-ter-site? base-ppos + ref-prot-seq + alt-prot-seq*) t (cond - ref-include-from-ter-start-and-over-ter-end :frame-shift + (and ref-include-from-ter-start-and-over-ter-end + (not first-diff-aa-is-ter-site)) :frame-shift (= (+ base-ppos offset) (count ref-prot-seq)) (if (and (= "" pref-only palt-only) (ter-site-same-pos? ref-prot-seq alt-prot-seq*)) :no-effect @@ -455,9 +459,7 @@ (= palt (subs pref 0 (count palt)))) (= (first palt-only) \*)) :fs-ter-substitution ref-include-ter-site :indel - (first-diff-aa-is-ter-site? base-ppos - ref-prot-seq - alt-prot-seq*) :extension + first-diff-aa-is-ter-site :extension :else :frame-shift) (or (and (zero? nprefo) (zero? npalto)) (and (= nprefo 1) (= npalto 1))) :substitution diff --git a/test/varity/vcf_to_hgvs_test.clj b/test/varity/vcf_to_hgvs_test.clj index 2d667ec..2e4c204 100644 --- a/test/varity/vcf_to_hgvs_test.clj +++ b/test/varity/vcf_to_hgvs_test.clj @@ -280,6 +280,8 @@ "chr11" 125655318 "TGA" "TAT" '("p.*477Yext*17" "p.*443Yext*17" "p.*477Yext*24") "chr10" 8074014 "C" "CATGGGTT" '("p.*445Yext*64" "p.*444Yext*64") ; not actual example (+) "chr10" 87965468 "TC" "T" '("p.*404Eext*11" "p.*577Eext*11" "p.*207Eext*11") ; not actual example (+) + "chrX" 15823239 "ATAA" "A" '("p.*483Text*?") ; not actual example (+) + "chr13" 24421118 "CTTA" "C" '("p.*1725Vext*2") ; not actual example (-) ;; NOTE: There are very few correct examples... ;; Extension without termination site From aaf40df800f5f9b8e641a366d6a674e6d2fa01e3 Mon Sep 17 00:00:00 2001 From: Nobuaki Karasawa <nkarasawa@xcoo.jp> Date: Tue, 21 Jan 2025 15:56:54 +0900 Subject: [PATCH 3/3] refactor: tweak ter site pos check fn --- src/varity/vcf_to_hgvs/protein.clj | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj index f5d5794..2e5b5a4 100644 --- a/src/varity/vcf_to_hgvs/protein.clj +++ b/src/varity/vcf_to_hgvs/protein.clj @@ -179,10 +179,10 @@ (defn- ter-site-same-pos? [ref-prot-seq alt-prot-seq] - (let [ref-ter-pos (count ref-prot-seq) - alt-ter-pos (inc (count (first (string/split alt-prot-seq #"\*"))))] - (and (string/includes? ref-prot-seq "*") - (string/includes? alt-prot-seq "*") + (and (string/includes? ref-prot-seq "*") + (string/includes? alt-prot-seq "*") + (let [ref-ter-pos (count ref-prot-seq) + alt-ter-pos (inc (count (first (string/split alt-prot-seq #"\*"))))] (= ref-ter-pos alt-ter-pos)))) (defn- cds-start-upstream?