From 7c2971b903bf0bad7a7c28bd5cf29338a8e2dbf1 Mon Sep 17 00:00:00 2001
From: Nobuaki Karasawa <nkarasawa@xcoo.jp>
Date: Mon, 20 Jan 2025 09:05:06 +0900
Subject: [PATCH 1/3] fix: return no-effect when a ter site is inserted
 around the ter site

---
 src/varity/vcf_to_hgvs/protein.clj | 85 ++++++++++++++++--------------
 test/varity/vcf_to_hgvs_test.clj   |  1 +
 2 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj
index c4e0d54..bd53fee 100644
--- a/src/varity/vcf_to_hgvs/protein.clj
+++ b/src/varity/vcf_to_hgvs/protein.clj
@@ -179,8 +179,11 @@
 
 (defn- ter-site-same-pos?
   [ref-prot-seq alt-prot-seq]
-  (let [ter-site-pos (dec (count ref-prot-seq))]
-    (= \* (get alt-prot-seq ter-site-pos))))
+  (let [ref-ter-pos (count ref-prot-seq)
+        alt-ter-pos (inc (count (first (string/split alt-prot-seq #"\*"))))]
+    (and (string/includes? ref-prot-seq "*")
+         (string/includes? alt-prot-seq "*")
+         (= ref-ter-pos alt-ter-pos))))
 
 (defn- cds-start-upstream?
   [cds-start pos ref alt]
@@ -443,7 +446,9 @@
 
                                            :else
                                            :frame-shift)
-              (and (pos? npref) (= (first palt-only) \*)) :substitution
+              (and (pos? npref) (= (first palt-only) \*)) (if (ter-site-same-pos? ref-prot-seq alt-prot-seq*)
+                                                            :no-effect
+                                                            :substitution)
               (not= ref-prot-rest alt-prot-rest) (cond
                                                    (or (and (= (first alt-prot-rest) \*)
                                                             (>= nprefo npalto)
@@ -572,45 +577,45 @@
                                      (coord/unknown-coordinate))))))))
 
 (defn- protein-extension
-  [ppos pref palt {:keys [ref-prot-seq alt-tx-prot-seq c-ter-adjusted-alt-prot-seq ini-offset prefer-extension-for-initial-codon-alt?] :as seq-info}]
-  (cond
-    (and (not= ppos 1)
-         (ter-site-same-pos? ref-prot-seq c-ter-adjusted-alt-prot-seq))
-    (mut/protein-no-effect)
+  [ppos pref palt {:keys [ref-prot-seq alt-tx-prot-seq ini-offset prefer-extension-for-initial-codon-alt?] :as seq-info}]
+  (let [alt-prot-seq* (format-alt-prot-seq seq-info)]
+    (cond
+      (and (not= ppos 1)
+           (ter-site-same-pos? ref-prot-seq alt-prot-seq*))
+      (mut/protein-no-effect)
 
-    (and (= ppos 1) (not prefer-extension-for-initial-codon-alt?))
-    (mut/protein-unknown-mutation)
+      (and (= ppos 1) (not prefer-extension-for-initial-codon-alt?))
+      (mut/protein-unknown-mutation)
 
-    :else
-    (let [[_ ins offset _] (diff-bases (or pref "") (or palt ""))
-          alt-prot-seq* (format-alt-prot-seq seq-info)
-          ini-site ((comp str first) ref-prot-seq)
-          first-diff-aa-info (if (= ppos 1)
-                               {:ppos 1
-                                :pref ini-site}
-                               (get-first-diff-aa-info ppos
-                                                       ref-prot-seq
-                                                       alt-prot-seq*))
-          rest-seq (if (= ppos 1)
-                     (-> alt-tx-prot-seq
-                         (subs 0 ini-offset)
-                         reverse
-                         (#(apply str %)))
-                     (subs alt-prot-seq* (:ppos first-diff-aa-info)))
-          alt-aa (mut/->long-amino-acid (if (= ppos 1)
-                                          (or (last ins) (first rest-seq))
-                                          (:palt first-diff-aa-info)))
-          alt-aa-offset (if (and (= ppos 1) (nil? (last ins))) -1 0)
-          new-aa-pos (some-> (string/index-of rest-seq (:pref first-diff-aa-info)) inc (+ alt-aa-offset))]
-      (if (and (= ppos 1) (= alt-aa "Ter"))
-        (mut/protein-unknown-mutation)
-        (mut/protein-extension (if (= ppos 1) (mut/->long-amino-acid ini-site) "Ter")
-                               (coord/protein-coordinate (if (= ppos 1) 1 (+ ppos offset)))
-                               alt-aa
-                               (if (= ppos 1) :upstream :downstream)
-                               (if new-aa-pos
-                                 (coord/protein-coordinate new-aa-pos)
-                                 (coord/unknown-coordinate)))))))
+      :else
+      (let [[_ ins offset _] (diff-bases (or pref "") (or palt ""))
+            ini-site ((comp str first) ref-prot-seq)
+            first-diff-aa-info (if (= ppos 1)
+                                 {:ppos 1
+                                  :pref ini-site}
+                                 (get-first-diff-aa-info ppos
+                                                         ref-prot-seq
+                                                         alt-prot-seq*))
+            rest-seq (if (= ppos 1)
+                       (-> alt-tx-prot-seq
+                           (subs 0 ini-offset)
+                           reverse
+                           (#(apply str %)))
+                       (subs alt-prot-seq* (:ppos first-diff-aa-info)))
+            alt-aa (mut/->long-amino-acid (if (= ppos 1)
+                                            (or (last ins) (first rest-seq))
+                                            (:palt first-diff-aa-info)))
+            alt-aa-offset (if (and (= ppos 1) (nil? (last ins))) -1 0)
+            new-aa-pos (some-> (string/index-of rest-seq (:pref first-diff-aa-info)) inc (+ alt-aa-offset))]
+        (if (and (= ppos 1) (= alt-aa "Ter"))
+          (mut/protein-unknown-mutation)
+          (mut/protein-extension (if (= ppos 1) (mut/->long-amino-acid ini-site) "Ter")
+                                 (coord/protein-coordinate (if (= ppos 1) 1 (+ ppos offset)))
+                                 alt-aa
+                                 (if (= ppos 1) :upstream :downstream)
+                                 (if new-aa-pos
+                                   (coord/protein-coordinate new-aa-pos)
+                                   (coord/unknown-coordinate))))))))
 
 (defn- protein-indel
   [ppos pref palt {:keys [ref-prot-seq c-ter-adjusted-alt-prot-seq
diff --git a/test/varity/vcf_to_hgvs_test.clj b/test/varity/vcf_to_hgvs_test.clj
index b59f4d9..2d667ec 100644
--- a/test/varity/vcf_to_hgvs_test.clj
+++ b/test/varity/vcf_to_hgvs_test.clj
@@ -301,6 +301,7 @@
         "chr11" 14279340 "G" "A" '("p.=") ; not actual example (-)
         "chr7" 55019277 "G" "GTC" '("p.=") ; not actual example (+)
         "chr17" 21042835 "T" "TG" '("p.=") ; not actual example (-)
+        "chr13" 24421121 "A" "ATTA" '("p.=") ; not actual example (-)
 
         ;; unknown
         "chr12" 40393453 "G" "A" '("p.?") ; not actual example (+)

From 4610278093d9379c0e2c26d8a8acb082275cebc4 Mon Sep 17 00:00:00 2001
From: Nobuaki Karasawa <nkarasawa@xcoo.jp>
Date: Tue, 21 Jan 2025 15:55:11 +0900
Subject: [PATCH 2/3] fix: avoid frame-shift when first differing aa is ter site

---
 src/varity/vcf_to_hgvs/protein.clj | 10 ++++++----
 test/varity/vcf_to_hgvs_test.clj   |  2 ++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj
index bd53fee..f5d5794 100644
--- a/src/varity/vcf_to_hgvs/protein.clj
+++ b/src/varity/vcf_to_hgvs/protein.clj
@@ -428,8 +428,12 @@
                                                      pref-only
                                                      palt-only)
           ini-site-affected (ini-site-affected? ref-exon-seq alt-exon-seq)
+          first-diff-aa-is-ter-site (first-diff-aa-is-ter-site? base-ppos
+                                                                ref-prot-seq
+                                                                alt-prot-seq*)
           t (cond
-              ref-include-from-ter-start-and-over-ter-end :frame-shift
+              (and ref-include-from-ter-start-and-over-ter-end
+                   (not first-diff-aa-is-ter-site)) :frame-shift
               (= (+ base-ppos offset) (count ref-prot-seq)) (if (and (= "" pref-only palt-only)
                                                                      (ter-site-same-pos? ref-prot-seq alt-prot-seq*))
                                                               :no-effect
@@ -455,9 +459,7 @@
                                                             (= palt (subs pref 0 (count palt))))
                                                        (= (first palt-only) \*)) :fs-ter-substitution
                                                    ref-include-ter-site :indel
-                                                   (first-diff-aa-is-ter-site? base-ppos
-                                                                               ref-prot-seq
-                                                                               alt-prot-seq*) :extension
+                                                   first-diff-aa-is-ter-site :extension
                                                    :else :frame-shift)
               (or (and (zero? nprefo) (zero? npalto))
                   (and (= nprefo 1) (= npalto 1))) :substitution
diff --git a/test/varity/vcf_to_hgvs_test.clj b/test/varity/vcf_to_hgvs_test.clj
index 2d667ec..2e4c204 100644
--- a/test/varity/vcf_to_hgvs_test.clj
+++ b/test/varity/vcf_to_hgvs_test.clj
@@ -280,6 +280,8 @@
         "chr11" 125655318 "TGA" "TAT" '("p.*477Yext*17" "p.*443Yext*17" "p.*477Yext*24")
         "chr10" 8074014 "C" "CATGGGTT" '("p.*445Yext*64" "p.*444Yext*64") ; not actual example (+)
         "chr10" 87965468 "TC" "T" '("p.*404Eext*11" "p.*577Eext*11" "p.*207Eext*11") ; not actual example (+)
+        "chrX" 15823239 "ATAA" "A" '("p.*483Text*?") ; not actual example (+)
+        "chr13" 24421118 "CTTA" "C" '("p.*1725Vext*2") ; not actual example (-)
         ;; NOTE: There are very few correct examples...
 
         ;; Extension without termination site

From aaf40df800f5f9b8e641a366d6a674e6d2fa01e3 Mon Sep 17 00:00:00 2001
From: Nobuaki Karasawa <nkarasawa@xcoo.jp>
Date: Tue, 21 Jan 2025 15:56:54 +0900
Subject: [PATCH 3/3] refactor: tweak ter site pos check fn

---
 src/varity/vcf_to_hgvs/protein.clj | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj
index f5d5794..2e5b5a4 100644
--- a/src/varity/vcf_to_hgvs/protein.clj
+++ b/src/varity/vcf_to_hgvs/protein.clj
@@ -179,10 +179,10 @@
 
 (defn- ter-site-same-pos?
   [ref-prot-seq alt-prot-seq]
-  (let [ref-ter-pos (count ref-prot-seq)
-        alt-ter-pos (inc (count (first (string/split alt-prot-seq #"\*"))))]
-    (and (string/includes? ref-prot-seq "*")
-         (string/includes? alt-prot-seq "*")
+  (and (string/includes? ref-prot-seq "*")
+       (string/includes? alt-prot-seq "*")
+       (let [ref-ter-pos (count ref-prot-seq)
+             alt-ter-pos (inc (count (first (string/split alt-prot-seq #"\*"))))]
          (= ref-ter-pos alt-ter-pos))))
 
 (defn- cds-start-upstream?