Skip to content

VLE/KBE-4/cadd_integration #65

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 40 additions & 2 deletions tests/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
},
"source": [
"import pandas as pd\n",
"import requests\n",
"\n",
"from api.data import (store_database_for_eys_gene,\n",
" parse_lovd,\n",
Expand Down Expand Up @@ -1680,7 +1679,7 @@
"outputs": [],
"execution_count": null,
"source": [
"from api.tools import get_revel_scores\n",
"\n",
"\n",
"chromosome = 6\n",
"position = 65655758\n",
Expand All @@ -1693,6 +1692,45 @@
"display(results)"
],
"id": "ba435cd29d565f7d"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"from tests.tools.cadd import add_cadd_eval_column\n",
"from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n",
"from api import (store_database_for_eys_gene,\n",
" parse_lovd,\n",
" set_lovd_dtypes,\n",
" LOVD_PATH,\n",
" GNOMAD_PATH)\n",
"import pandas as pd\n",
"\n",
"store_database_for_eys_gene('lovd', False)\n",
"store_database_for_eys_gene('gnomad', False)\n",
"\n",
"lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
"gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n",
"\n",
"set_lovd_dtypes(lovd_data)\n",
"set_gnomad_dtypes(gnomad_data)\n",
"\n",
"variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n",
"\n",
"lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
" variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38','chromosome','position_g_start','position_g_end']],\n",
" on='id',\n",
" how='left')\n",
"\n",
"gnomad_data = gnomad_data.copy()\n",
"data = merge_gnomad_lovd(lovd_data, gnomad_data)\n",
"first_100_rows = data.head(100).copy()\n",
"result_data = add_cadd_eval_column(first_100_rows)\n",
"result_data"
],
"id": "1df284690ce590f4"
}
],
"metadata": {
Expand Down
129 changes: 129 additions & 0 deletions tests/tools/cadd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
""" Module provides interface to web APIs of CADD tool. """
import argparse
import requests
import pandas as pd


class BadResponseException(Exception):
"""Custom exception for bad responses."""


class DownloadError(Exception):
"""Custom exception for download errors."""


def fetch_cadd_scores(cadd_version, chromosome, start, end=None):
"""
Fetches CADD (Combined Annotation Dependent Depletion)
scores for either a single SNV or a range of genomic positions.

:param str cadd_version: Version of the CADD model used, e.g., "v1.3" or "GRCh38-v1.7".
:param int chromosome: Chromosome number where the SNV or genomic region is located.
:param int start: Genomic start position (or single position for SNV) of the region.
:param int end: (Optional) Genomic end position of the region.
If not provided, fetches a single SNV.
:return: A dictionary containing CADD scores and annotations
for the specified SNV or region, or None if an
error occurs.
"""

if end:
url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}-{end}"
else:
url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}"

try:
response = requests.get(url, timeout=30)
if response.status_code == 200:
data = response.json()
return data
raise BadResponseException(
f"Error: Received status code {response.status_code} - "
f"{response.reason}: {response.text}")

except requests.exceptions.Timeout as exc:
raise DownloadError(
"Error: Timeout occurred while trying to reach the server. "
"Please check your internet connection or the server status.") from exc

except requests.exceptions.RequestException as req_err:
raise DownloadError(
f"Error: An unexpected error occurred while making the request. "
f"Details: {req_err}") from req_err

except ValueError as exc:
raise BadResponseException(
"Error: Invalid JSON format in response. "
"Please ensure the server is returning valid JSON.") from exc


def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"):
"""
Evaluates the CADD score for a given row in the
DataFrame and returns the highest PHRED score evaluation.
Handles cases where the response is malformed or incomplete.

:param row: A row from the DataFrame.
:param str cadd_version: The CADD version to use for fetching the score.
:return: A string indicating the evaluation result based
on the highest PHRED score, or an error message.
"""
position = row.loc["hg38_gnomad_format"]
chromosome = row.loc["chromosome"]
if pd.isna(chromosome) or pd.isna(position):
chromosome = row.loc["Chromosome_gnomad"]
position= row.loc["Position_gnomad"]
else:
position = row.loc["hg38_gnomad_format"].split('-')[1]

score = fetch_cadd_scores(cadd_version, chromosome, position)

if score is None or not isinstance(score, list) or len(score) < 2:
raise ValueError("CADD score unavailable or invalid format")

try:
score_df = pd.DataFrame(score[1:], columns=score[0])
except (IndexError, ValueError) as e:
raise ValueError(f"Error processing CADD score: {e}") from e

if "PHRED" not in score_df.columns:
raise KeyError("PHRED score unavailable")

sorted_df = score_df.sort_values(by="PHRED", ascending=False)
highest_score_row = sorted_df.iloc[0]

return highest_score_row.loc['PHRED']


def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"):
"""
Adds a column 'cadd_eval' to the DataFrame based on CADD score evaluations for each row.

:param data: The merged DataFrame with genomic data.
:param str cadd_version: The version of the CADD model to use for score fetching.
:return: The updated DataFrame with the 'cadd_eval' column.
"""
data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version)
return data


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.")
parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'")
parser.add_argument("chromosome", type=int, help="Chromosome number")
parser.add_argument("--position", type=int, help="Genomic position (for single SNV)")
parser.add_argument("--start", type=int,
help="Genomic start position (for a range of positions)")
parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)")

args = parser.parse_args()

if args.position:
result = fetch_cadd_scores(args.version, args.chromosome, args.position)
print(result)
elif args.start and args.end:
result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end)
print(result)
else:
print("Please provide either '--position' for single SNV \
or '--start' and '--end' for a range of positions.")
Loading