From f76cc96dc29e03c3d41f4d1d77ffc2547889df97 Mon Sep 17 00:00:00 2001
From: Vladyslav Levchenko <akadlevchenko@gmail.com>
Date: Tue, 24 Sep 2024 19:31:01 +0300
Subject: [PATCH 1/3] updated cadd tool(merge issues)

---
 tests/pipeline.ipynb     |  39 +++++++++++++
 tests/tools/cadd/cadd.py | 115 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100644 tests/tools/cadd/cadd.py

diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb
index 23df568..a9d9289 100644
--- a/tests/pipeline.ipynb
+++ b/tests/pipeline.ipynb
@@ -1693,6 +1693,45 @@
     "display(results)"
    ],
    "id": "ba435cd29d565f7d"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "from tests.tools.cadd.cadd import add_cadd_eval_column\n",
+    "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n",
+    "from api import (store_database_for_eys_gene,\n",
+    "                 parse_lovd,\n",
+    "                 set_lovd_dtypes,\n",
+    "                 LOVD_PATH,\n",
+    "                 GNOMAD_PATH)\n",
+    "import pandas as pd\n",
+    "\n",
+    "store_database_for_eys_gene('lovd', False)\n",
+    "store_database_for_eys_gene('gnomad', False)\n",
+    "\n",
+    "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
+    "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n",
+    "\n",
+    "set_lovd_dtypes(lovd_data)\n",
+    "set_gnomad_dtypes(gnomad_data)\n",
+    "\n",
+    "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n",
+    "\n",
+    "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
+    "                       variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38','chromosome','position_g_start','position_g_end']],\n",
+    "                       on='id',\n",
+    "                       how='left')\n",
+    "\n",
+    "gnomad_data = gnomad_data.copy()\n",
+    "data = merge_gnomad_lovd(lovd_data, gnomad_data)\n",
+    "first_100_rows = data.head(100).copy()\n",
+    "result_data = add_cadd_eval_column(first_100_rows)\n",
+    "result_data"
+   ],
+   "id": "1df284690ce590f4"
   }
  ],
  "metadata": {
diff --git a/tests/tools/cadd/cadd.py b/tests/tools/cadd/cadd.py
new file mode 100644
index 0000000..5b13ba8
--- /dev/null
+++ b/tests/tools/cadd/cadd.py
@@ -0,0 +1,115 @@
+""" Module provides interface to web APIs of CADD tool. """
+import argparse
+
+import pandas as pd
+import requests
+
+
+class BadResponseException(Exception):
+    """Custom exception for bad responses."""
+
+
+class DownloadError(Exception):
+    """Custom exception for download errors."""
+
+
+def fetch_cadd_scores(cadd_version, chromosome, start, end=None):
+    """
+    Fetches CADD (Combined Annotation Dependent Depletion) scores for either a single SNV or a range of genomic
+    positions.
+
+    :param str cadd_version: Version of the CADD model used, e.g., "v1.3" or "GRCh38-v1.7".
+    :param int chromosome: Chromosome number where the SNV or genomic region is located.
+    :param int start: Genomic start position (or single position for SNV) of the region.
+    :param int end: (Optional) Genomic end position of the region. If not provided, fetches a single SNV.
+    :return: A dictionary containing CADD scores and annotations for the specified SNV or region, or None if an
+    error occurs.
+    """
+
+    if end:
+        url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}-{end}"
+    else:
+        url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}"
+
+    try:
+        response = requests.get(url, timeout=30)
+        if response.status_code == 200:
+            data = response.json()
+            return data
+        raise BadResponseException(f"Error: {response.status_code} - {response.text}")
+    except requests.exceptions.Timeout as exc:
+        raise DownloadError("Error: Timeout occurred while trying to reach the server.") from exc
+    except requests.exceptions.RequestException as req_err:
+        raise DownloadError(f"Error: {req_err}") from req_err
+    except ValueError as exc:
+        raise BadResponseException("Error: Invalid JSON format in response.") from exc
+
+
+def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"):
+    """
+    Evaluates the CADD score for a given row in the DataFrame and returns the highest PHRED score evaluation.
+    Handles cases where the response is malformed or incomplete.
+
+    :param row: A row from the DataFrame.
+    :param str cadd_version: The CADD version to use for fetching the score.
+    :return: A string indicating the evaluation result based on the highest PHRED score, or an error message.
+    """
+    position = row.loc["hg38_gnomad_format"]
+    chromosome = row.loc["chromosome"]
+    if pd.isna(chromosome) or pd.isna(position):
+        chromosome = row.loc["Chromosome_gnomad"]
+        position= row.loc["Position_gnomad"]
+    else:
+        position = row.loc["hg38_gnomad_format"].split('-')[1]
+
+    score = fetch_cadd_scores(cadd_version, chromosome, position)
+
+    if score is None or not isinstance(score, list) or len(score) < 2:
+        return "CADD score unavailable or invalid format"
+
+    try:
+        score_df = pd.DataFrame(score[1:], columns=score[0])
+    except (IndexError, ValueError) as e:
+        return f"Error processing CADD score: {e}"
+
+    if "PHRED" not in score_df.columns:
+        return "PHRED score unavailable"
+
+    sorted_df = score_df.sort_values(by="PHRED", ascending=False)
+    highest_score_row = sorted_df.iloc[0]
+
+    return highest_score_row.loc['PHRED']
+
+
+def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"):
+    """
+    Adds a column 'cadd_eval' to the DataFrame based on CADD score evaluations for each row.
+
+    :param data: The merged DataFrame with genomic data.
+    :param str cadd_version: The version of the CADD model to use for score fetching.
+    :return: The updated DataFrame with the 'cadd_eval' column.
+    """
+    data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version)
+    return data
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.")
+    parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'")
+    parser.add_argument("chromosome", type=int, help="Chromosome number")
+    parser.add_argument("--position", type=int, help="Genomic position (for single SNV)")
+    parser.add_argument("--start", type=int,
+                        help="Genomic start position (for a range of positions)")
+    parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)")
+
+    args = parser.parse_args()
+
+    if args.position:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.position)
+        print(result)
+    elif args.start and args.end:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end)
+        print(result)
+    else:
+        print("Please provide either '--position' for single SNV \
+              or '--start' and '--end' for a range of positions.")

From 20dacd50adfa3aabf9df64537461a2a8d15f8004 Mon Sep 17 00:00:00 2001
From: Vladyslav Levchenko <akadlevchenko@gmail.com>
Date: Wed, 25 Sep 2024 23:01:13 +0300
Subject: [PATCH 2/3] fix exceptions and move CLI from cadd/cadd.py to tests/tools/cadd_script.py

---
 tests/tools/cadd/cadd.py   | 47 ++++++++++++++------------------------
 tests/tools/cadd_script.py | 24 +++++++++++++++++++
 2 files changed, 41 insertions(+), 30 deletions(-)
 create mode 100644 tests/tools/cadd_script.py

diff --git a/tests/tools/cadd/cadd.py b/tests/tools/cadd/cadd.py
index 5b13ba8..61f8655 100644
--- a/tests/tools/cadd/cadd.py
+++ b/tests/tools/cadd/cadd.py
@@ -1,6 +1,4 @@
 """ Module provides interface to web APIs of CADD tool. """
-import argparse
-
 import pandas as pd
 import requests
 
@@ -36,13 +34,23 @@ def fetch_cadd_scores(cadd_version, chromosome, start, end=None):
         if response.status_code == 200:
             data = response.json()
             return data
-        raise BadResponseException(f"Error: {response.status_code} - {response.text}")
+        raise BadResponseException(
+            f"Error: Received status code {response.status_code} - {response.reason}: {response.text}")
+
     except requests.exceptions.Timeout as exc:
-        raise DownloadError("Error: Timeout occurred while trying to reach the server.") from exc
+        raise DownloadError(
+            "Error: Timeout occurred while trying to reach the server. "
+            "Please check your internet connection or the server status.") from exc
+
     except requests.exceptions.RequestException as req_err:
-        raise DownloadError(f"Error: {req_err}") from req_err
+        raise DownloadError(
+            f"Error: An unexpected error occurred while making the request. "
+            f"Details: {req_err}") from req_err
+
     except ValueError as exc:
-        raise BadResponseException("Error: Invalid JSON format in response.") from exc
+        raise BadResponseException(
+            "Error: Invalid JSON format in response. "
+            "Please ensure the server is returning valid JSON.") from exc
 
 
 def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"):
@@ -65,15 +73,15 @@ def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"):
     score = fetch_cadd_scores(cadd_version, chromosome, position)
 
     if score is None or not isinstance(score, list) or len(score) < 2:
-        return "CADD score unavailable or invalid format"
+        raise ValueError("CADD score unavailable or invalid format")
 
     try:
         score_df = pd.DataFrame(score[1:], columns=score[0])
     except (IndexError, ValueError) as e:
-        return f"Error processing CADD score: {e}"
+        raise ValueError(f"Error processing CADD score: {e}") from e
 
     if "PHRED" not in score_df.columns:
-        return "PHRED score unavailable"
+        raise KeyError("PHRED score unavailable")
 
     sorted_df = score_df.sort_values(by="PHRED", ascending=False)
     highest_score_row = sorted_df.iloc[0]
@@ -92,24 +100,3 @@ def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"):
     data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version)
     return data
 
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.")
-    parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'")
-    parser.add_argument("chromosome", type=int, help="Chromosome number")
-    parser.add_argument("--position", type=int, help="Genomic position (for single SNV)")
-    parser.add_argument("--start", type=int,
-                        help="Genomic start position (for a range of positions)")
-    parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)")
-
-    args = parser.parse_args()
-
-    if args.position:
-        result = fetch_cadd_scores(args.version, args.chromosome, args.position)
-        print(result)
-    elif args.start and args.end:
-        result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end)
-        print(result)
-    else:
-        print("Please provide either '--position' for single SNV \
-              or '--start' and '--end' for a range of positions.")
diff --git a/tests/tools/cadd_script.py b/tests/tools/cadd_script.py
new file mode 100644
index 0000000..87527db
--- /dev/null
+++ b/tests/tools/cadd_script.py
@@ -0,0 +1,24 @@
+import argparse
+
+from tests.tools.cadd.cadd import fetch_cadd_scores
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.")
+    parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'")
+    parser.add_argument("chromosome", type=int, help="Chromosome number")
+    parser.add_argument("--position", type=int, help="Genomic position (for single SNV)")
+    parser.add_argument("--start", type=int,
+                        help="Genomic start position (for a range of positions)")
+    parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)")
+
+    args = parser.parse_args()
+
+    if args.position:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.position)
+        print(result)
+    elif args.start and args.end:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end)
+        print(result)
+    else:
+        print("Please provide either '--position' for single SNV \
+              or '--start' and '--end' for a range of positions.")
\ No newline at end of file

From 50305aed01eb3796e49a9c0136fa4145968ef459 Mon Sep 17 00:00:00 2001
From: Vladyslav Levchenko <akadlevchenko@gmail.com>
Date: Sun, 29 Sep 2024 14:17:12 +0300
Subject: [PATCH 3/3] file position fix

---
 tests/pipeline.ipynb           |  5 ++--
 tests/tools/{cadd => }/cadd.py | 43 +++++++++++++++++++++++++++-------
 tests/tools/cadd_script.py     | 24 -------------------
 3 files changed, 37 insertions(+), 35 deletions(-)
 rename tests/tools/{cadd => }/cadd.py (69%)
 delete mode 100644 tests/tools/cadd_script.py

diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb
index a9d9289..ca70d11 100644
--- a/tests/pipeline.ipynb
+++ b/tests/pipeline.ipynb
@@ -15,7 +15,6 @@
    },
    "source": [
     "import pandas as pd\n",
-    "import requests\n",
     "\n",
     "from api.data import (store_database_for_eys_gene,\n",
     "                      parse_lovd,\n",
@@ -1680,7 +1679,7 @@
    "outputs": [],
    "execution_count": null,
    "source": [
-    "from api.tools import get_revel_scores\n",
+    "\n",
     "\n",
     "chromosome = 6\n",
     "position = 65655758\n",
@@ -1700,7 +1699,7 @@
    "outputs": [],
    "execution_count": null,
    "source": [
-    "from tests.tools.cadd.cadd import add_cadd_eval_column\n",
+    "from tests.tools.cadd import add_cadd_eval_column\n",
     "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n",
     "from api import (store_database_for_eys_gene,\n",
     "                 parse_lovd,\n",
diff --git a/tests/tools/cadd/cadd.py b/tests/tools/cadd.py
similarity index 69%
rename from tests/tools/cadd/cadd.py
rename to tests/tools/cadd.py
index 61f8655..a82e7bb 100644
--- a/tests/tools/cadd/cadd.py
+++ b/tests/tools/cadd.py
@@ -1,6 +1,7 @@
 """ Module provides interface to web APIs of CADD tool. """
-import pandas as pd
+import argparse
 import requests
+import pandas as pd
 
 
 class BadResponseException(Exception):
@@ -13,14 +14,16 @@ class DownloadError(Exception):
 
 def fetch_cadd_scores(cadd_version, chromosome, start, end=None):
     """
-    Fetches CADD (Combined Annotation Dependent Depletion) scores for either a single SNV or a range of genomic
-    positions.
+    Fetches CADD (Combined Annotation Dependent Depletion)
+    scores for either a single SNV or a range of genomic positions.
 
     :param str cadd_version: Version of the CADD model used, e.g., "v1.3" or "GRCh38-v1.7".
     :param int chromosome: Chromosome number where the SNV or genomic region is located.
     :param int start: Genomic start position (or single position for SNV) of the region.
-    :param int end: (Optional) Genomic end position of the region. If not provided, fetches a single SNV.
-    :return: A dictionary containing CADD scores and annotations for the specified SNV or region, or None if an
+    :param int end: (Optional) Genomic end position of the region.
+    If not provided, fetches a single SNV.
+    :return: The parsed JSON response with CADD scores and annotations
+    for the specified SNV or region; an exception is raised if an
     error occurs.
     """
 
@@ -35,7 +38,8 @@ def fetch_cadd_scores(cadd_version, chromosome, start, end=None):
             data = response.json()
             return data
         raise BadResponseException(
-            f"Error: Received status code {response.status_code} - {response.reason}: {response.text}")
+            f"Error: Received status code {response.status_code} - "
+            f"{response.reason}: {response.text}")
 
     except requests.exceptions.Timeout as exc:
         raise DownloadError(
@@ -55,12 +59,14 @@ def fetch_cadd_scores(cadd_version, chromosome, start, end=None):
 
 def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"):
     """
-    Evaluates the CADD score for a given row in the DataFrame and returns the highest PHRED score evaluation.
+    Evaluates the CADD score for a given row in the
+    DataFrame and returns the highest PHRED score.
     Handles cases where the response is malformed or incomplete.
 
     :param row: A row from the DataFrame.
     :param str cadd_version: The CADD version to use for fetching the score.
-    :return: A string indicating the evaluation result based on the highest PHRED score, or an error message.
+    :return: The highest PHRED value among the fetched records;
+    raises ValueError or KeyError if the response is malformed.
     """
     position = row.loc["hg38_gnomad_format"]
     chromosome = row.loc["chromosome"]
@@ -100,3 +106,24 @@ def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"):
     data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version)
     return data
 
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.")
+    parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'")
+    parser.add_argument("chromosome", type=int, help="Chromosome number")
+    parser.add_argument("--position", type=int, help="Genomic position (for single SNV)")
+    parser.add_argument("--start", type=int,
+                        help="Genomic start position (for a range of positions)")
+    parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)")
+
+    args = parser.parse_args()
+
+    if args.position:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.position)
+        print(result)
+    elif args.start and args.end:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end)
+        print(result)
+    else:
+        print("Please provide either '--position' for single SNV \
+              or '--start' and '--end' for a range of positions.")
diff --git a/tests/tools/cadd_script.py b/tests/tools/cadd_script.py
deleted file mode 100644
index 87527db..0000000
--- a/tests/tools/cadd_script.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import argparse
-
-from tests.tools.cadd.cadd import fetch_cadd_scores
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.")
-    parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'")
-    parser.add_argument("chromosome", type=int, help="Chromosome number")
-    parser.add_argument("--position", type=int, help="Genomic position (for single SNV)")
-    parser.add_argument("--start", type=int,
-                        help="Genomic start position (for a range of positions)")
-    parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)")
-
-    args = parser.parse_args()
-
-    if args.position:
-        result = fetch_cadd_scores(args.version, args.chromosome, args.position)
-        print(result)
-    elif args.start and args.end:
-        result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end)
-        print(result)
-    else:
-        print("Please provide either '--position' for single SNV \
-              or '--start' and '--end' for a range of positions.")
\ No newline at end of file