updated cadd tool(merge issues)

Strexas · Sep 24, 2024 · f76cc96 · f76cc96
1 parent dd78258
commit f76cc96
Show file tree

Hide file tree

Showing 2 changed files with 154 additions and 0 deletions.
diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb
@@ -1693,6 +1693,45 @@
     "display(results)"
    ],
    "id": "ba435cd29d565f7d"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "from tests.tools.cadd.cadd import add_cadd_eval_column\n",
+    "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n",
+    "from api import (store_database_for_eys_gene,\n",
+    "                 parse_lovd,\n",
+    "                 set_lovd_dtypes,\n",
+    "                 LOVD_PATH,\n",
+    "                 GNOMAD_PATH)\n",
+    "import pandas as pd\n",
+    "\n",
+    "store_database_for_eys_gene('lovd', False)\n",
+    "store_database_for_eys_gene('gnomad', False)\n",
+    "\n",
+    "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
+    "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n",
+    "\n",
+    "set_lovd_dtypes(lovd_data)\n",
+    "set_gnomad_dtypes(gnomad_data)\n",
+    "\n",
+    "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n",
+    "\n",
+    "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
+    "                       variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38','chromosome','position_g_start','position_g_end']],\n",
+    "                       on='id',\n",
+    "                       how='left')\n",
+    "\n",
+    "gnomad_data = gnomad_data.copy()\n",
+    "data = merge_gnomad_lovd(lovd_data, gnomad_data)\n",
+    "first_100_rows = data.head(100).copy()\n",
+    "result_data = add_cadd_eval_column(first_100_rows)\n",
+    "result_data"
+   ],
+   "id": "1df284690ce590f4"
   }
  ],
  "metadata": {

diff --git a/tests/tools/cadd/cadd.py b/tests/tools/cadd/cadd.py
@@ -0,0 +1,115 @@
+""" Module provides interface to web APIs of CADD tool. """
+import argparse
+
+import pandas as pd
+import requests
+
+
+class BadResponseException(Exception):
+    """Custom exception for bad responses."""
+
+
+class DownloadError(Exception):
+    """Custom exception for download errors."""
+
+
+def fetch_cadd_scores(cadd_version, chromosome, start, end=None):
+    """
+    Fetches CADD (Combined Annotation Dependent Depletion) scores for either a single SNV or a range of genomic
+    positions.
+
+    :param str cadd_version: Version of the CADD model used, e.g., "v1.3" or "GRCh38-v1.7".
+    :param int chromosome: Chromosome number where the SNV or genomic region is located.
+    :param int start: Genomic start position (or single position for SNV) of the region.
+    :param int end: (Optional) Genomic end position of the region. If not provided, fetches a single SNV.
+    :return: A dictionary containing CADD scores and annotations for the specified SNV or region, or None if an
+    error occurs.
+    """
+
+    if end:
+        url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}-{end}"
+    else:
+        url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}"
+
+    try:
+        response = requests.get(url, timeout=30)
+        if response.status_code == 200:
+            data = response.json()
+            return data
+        raise BadResponseException(f"Error: {response.status_code} - {response.text}")
+    except requests.exceptions.Timeout as exc:
+        raise DownloadError("Error: Timeout occurred while trying to reach the server.") from exc
+    except requests.exceptions.RequestException as req_err:
+        raise DownloadError(f"Error: {req_err}") from req_err
+    except ValueError as exc:
+        raise BadResponseException("Error: Invalid JSON format in response.") from exc
+
+
+def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"):
+    """
+    Evaluates the CADD score for a given row in the DataFrame and returns the highest PHRED score evaluation.
+    Handles cases where the response is malformed or incomplete.
+
+    :param row: A row from the DataFrame.
+    :param str cadd_version: The CADD version to use for fetching the score.
+    :return: A string indicating the evaluation result based on the highest PHRED score, or an error message.
+    """
+    position = row.loc["hg38_gnomad_format"]
+    chromosome = row.loc["chromosome"]
+    if pd.isna(chromosome) or pd.isna(position):
+        chromosome = row.loc["Chromosome_gnomad"]
+        position= row.loc["Position_gnomad"]
+    else:
+        position = row.loc["hg38_gnomad_format"].split('-')[1]
+
+    score = fetch_cadd_scores(cadd_version, chromosome, position)
+
+    if score is None or not isinstance(score, list) or len(score) < 2:
+        return "CADD score unavailable or invalid format"
+
+    try:
+        score_df = pd.DataFrame(score[1:], columns=score[0])
+    except (IndexError, ValueError) as e:
+        return f"Error processing CADD score: {e}"
+
+    if "PHRED" not in score_df.columns:
+        return "PHRED score unavailable"
+
+    sorted_df = score_df.sort_values(by="PHRED", ascending=False)
+    highest_score_row = sorted_df.iloc[0]
+
+    return highest_score_row.loc['PHRED']
+
+
+def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"):
+    """
+    Adds a column 'cadd_eval' to the DataFrame based on CADD score evaluations for each row.
+
+    :param data: The merged DataFrame with genomic data.
+    :param str cadd_version: The version of the CADD model to use for score fetching.
+    :return: The updated DataFrame with the 'cadd_eval' column.
+    """
+    data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version)
+    return data
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.")
+    parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'")
+    parser.add_argument("chromosome", type=int, help="Chromosome number")
+    parser.add_argument("--position", type=int, help="Genomic position (for single SNV)")
+    parser.add_argument("--start", type=int,
+                        help="Genomic start position (for a range of positions)")
+    parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)")
+
+    args = parser.parse_args()
+
+    if args.position:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.position)
+        print(result)
+    elif args.start and args.end:
+        result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end)
+        print(result)
+    else:
+        print("Please provide either '--position' for single SNV \
+              or '--start' and '--end' for a range of positions.")