From f76cc96dc29e03c3d41f4d1d77ffc2547889df97 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Tue, 24 Sep 2024 19:31:01 +0300 Subject: [PATCH 1/3] updated cadd tool(merge issues) --- tests/pipeline.ipynb | 39 +++++++++++++ tests/tools/cadd/cadd.py | 115 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 tests/tools/cadd/cadd.py diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 23df568..a9d9289 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -1693,6 +1693,45 @@ "display(results)" ], "id": "ba435cd29d565f7d" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from tests.tools.cadd.cadd import add_cadd_eval_column\n", + "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", + "from api import (store_database_for_eys_gene,\n", + " parse_lovd,\n", + " set_lovd_dtypes,\n", + " LOVD_PATH,\n", + " GNOMAD_PATH)\n", + "import pandas as pd\n", + "\n", + "store_database_for_eys_gene('lovd', False)\n", + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", + "\n", + "set_lovd_dtypes(lovd_data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38','chromosome','position_g_start','position_g_end']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()\n", + "data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "first_100_rows = data.head(100).copy()\n", + "result_data = add_cadd_eval_column(first_100_rows)\n", + "result_data" + ], + "id": "1df284690ce590f4" } ], "metadata": { diff --git a/tests/tools/cadd/cadd.py b/tests/tools/cadd/cadd.py new file mode 100644 index 0000000..5b13ba8 --- /dev/null +++ b/tests/tools/cadd/cadd.py @@ -0,0 +1,115 @@ +""" Module provides interface to web APIs of CADD tool. """ +import argparse + +import pandas as pd +import requests + + +class BadResponseException(Exception): + """Custom exception for bad responses.""" + + +class DownloadError(Exception): + """Custom exception for download errors.""" + + +def fetch_cadd_scores(cadd_version, chromosome, start, end=None): + """ + Fetches CADD (Combined Annotation Dependent Depletion) scores for either a single SNV or a range of genomic + positions. + + :param str cadd_version: Version of the CADD model used, e.g., "v1.3" or "GRCh38-v1.7". + :param int chromosome: Chromosome number where the SNV or genomic region is located. + :param int start: Genomic start position (or single position for SNV) of the region. + :param int end: (Optional) Genomic end position of the region. If not provided, fetches a single SNV. + :return: A dictionary containing CADD scores and annotations for the specified SNV or region, or None if an + error occurs. + """ + + if end: + url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}-{end}" + else: + url = f"https://cadd.gs.washington.edu/api/v1.0/{cadd_version}/{chromosome}:{start}" + + try: + response = requests.get(url, timeout=30) + if response.status_code == 200: + data = response.json() + return data + raise BadResponseException(f"Error: {response.status_code} - {response.text}") + except requests.exceptions.Timeout as exc: + raise DownloadError("Error: Timeout occurred while trying to reach the server.") from exc + except requests.exceptions.RequestException as req_err: + raise DownloadError(f"Error: {req_err}") from req_err + except ValueError as exc: + raise BadResponseException("Error: Invalid JSON format in response.") from exc + + +def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"): + """ + Evaluates the CADD score for a given row in the DataFrame and returns the highest PHRED score evaluation. + Handles cases where the response is malformed or incomplete. + + :param row: A row from the DataFrame. + :param str cadd_version: The CADD version to use for fetching the score. + :return: A string indicating the evaluation result based on the highest PHRED score, or an error message. + """ + position = row.loc["hg38_gnomad_format"] + chromosome = row.loc["chromosome"] + if pd.isna(chromosome) or pd.isna(position): + chromosome = row.loc["Chromosome_gnomad"] + position= row.loc["Position_gnomad"] + else: + position = row.loc["hg38_gnomad_format"].split('-')[1] + + score = fetch_cadd_scores(cadd_version, chromosome, position) + + if score is None or not isinstance(score, list) or len(score) < 2: + return "CADD score unavailable or invalid format" + + try: + score_df = pd.DataFrame(score[1:], columns=score[0]) + except (IndexError, ValueError) as e: + return f"Error processing CADD score: {e}" + + if "PHRED" not in score_df.columns: + return "PHRED score unavailable" + + sorted_df = score_df.sort_values(by="PHRED", ascending=False) + highest_score_row = sorted_df.iloc[0] + + return highest_score_row.loc['PHRED'] + + +def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"): + """ + Adds a column 'cadd_eval' to the DataFrame based on CADD score evaluations for each row. + + :param data: The merged DataFrame with genomic data. + :param str cadd_version: The version of the CADD model to use for score fetching. + :return: The updated DataFrame with the 'cadd_eval' column. + """ + data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version) + return data + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.") + parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'") + parser.add_argument("chromosome", type=int, help="Chromosome number") + parser.add_argument("--position", type=int, help="Genomic position (for single SNV)") + parser.add_argument("--start", type=int, + help="Genomic start position (for a range of positions)") + parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)") + + args = parser.parse_args() + + if args.position: + result = fetch_cadd_scores(args.version, args.chromosome, args.position) + print(result) + elif args.start and args.end: + result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end) + print(result) + else: + print("Please provide either '--position' for single SNV \ + or '--start' and '--end' for a range of positions.") From 20dacd50adfa3aabf9df64537461a2a8d15f8004 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Wed, 25 Sep 2024 23:01:13 +0300 Subject: [PATCH 2/3] exceptions fix and moved cadd_scripts.py to cadd folder --- tests/tools/cadd/cadd.py | 47 ++++++++++++++------------------------ tests/tools/cadd_script.py | 24 +++++++++++++++++++ 2 files changed, 41 insertions(+), 30 deletions(-) create mode 100644 tests/tools/cadd_script.py diff --git a/tests/tools/cadd/cadd.py b/tests/tools/cadd/cadd.py index 5b13ba8..61f8655 100644 --- a/tests/tools/cadd/cadd.py +++ b/tests/tools/cadd/cadd.py @@ -1,6 +1,4 @@ """ Module provides interface to web APIs of CADD tool. """ -import argparse - import pandas as pd import requests @@ -36,13 +34,23 @@ def fetch_cadd_scores(cadd_version, chromosome, start, end=None): if response.status_code == 200: data = response.json() return data - raise BadResponseException(f"Error: {response.status_code} - {response.text}") + raise BadResponseException( + f"Error: Received status code {response.status_code} - {response.reason}: {response.text}") + except requests.exceptions.Timeout as exc: - raise DownloadError("Error: Timeout occurred while trying to reach the server.") from exc + raise DownloadError( + "Error: Timeout occurred while trying to reach the server. " + "Please check your internet connection or the server status.") from exc + except requests.exceptions.RequestException as req_err: - raise DownloadError(f"Error: {req_err}") from req_err + raise DownloadError( + f"Error: An unexpected error occurred while making the request. " + f"Details: {req_err}") from req_err + except ValueError as exc: - raise BadResponseException("Error: Invalid JSON format in response.") from exc + raise BadResponseException( + "Error: Invalid JSON format in response. " + "Please ensure the server is returning valid JSON.") from exc def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"): @@ -65,15 +73,15 @@ def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"): score = fetch_cadd_scores(cadd_version, chromosome, position) if score is None or not isinstance(score, list) or len(score) < 2: - return "CADD score unavailable or invalid format" + raise ValueError("CADD score unavailable or invalid format") try: score_df = pd.DataFrame(score[1:], columns=score[0]) except (IndexError, ValueError) as e: - return f"Error processing CADD score: {e}" + raise ValueError(f"Error processing CADD score: {e}") from e if "PHRED" not in score_df.columns: - return "PHRED score unavailable" + raise KeyError("PHRED score unavailable") sorted_df = score_df.sort_values(by="PHRED", ascending=False) highest_score_row = sorted_df.iloc[0] @@ -92,24 +100,3 @@ def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"): data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version) return data - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.") - parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'") - parser.add_argument("chromosome", type=int, help="Chromosome number") - parser.add_argument("--position", type=int, help="Genomic position (for single SNV)") - parser.add_argument("--start", type=int, - help="Genomic start position (for a range of positions)") - parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)") - - args = parser.parse_args() - - if args.position: - result = fetch_cadd_scores(args.version, args.chromosome, args.position) - print(result) - elif args.start and args.end: - result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end) - print(result) - else: - print("Please provide either '--position' for single SNV \ - or '--start' and '--end' for a range of positions.") diff --git a/tests/tools/cadd_script.py b/tests/tools/cadd_script.py new file mode 100644 index 0000000..87527db --- /dev/null +++ b/tests/tools/cadd_script.py @@ -0,0 +1,24 @@ +import argparse + +from tests.tools.cadd.cadd import fetch_cadd_scores + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.") + parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'") + parser.add_argument("chromosome", type=int, help="Chromosome number") + parser.add_argument("--position", type=int, help="Genomic position (for single SNV)") + parser.add_argument("--start", type=int, + help="Genomic start position (for a range of positions)") + parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)") + + args = parser.parse_args() + + if args.position: + result = fetch_cadd_scores(args.version, args.chromosome, args.position) + print(result) + elif args.start and args.end: + result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end) + print(result) + else: + print("Please provide either '--position' for single SNV \ + or '--start' and '--end' for a range of positions.") \ No newline at end of file From 50305aed01eb3796e49a9c0136fa4145968ef459 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Sun, 29 Sep 2024 14:17:12 +0300 Subject: [PATCH 3/3] file position fix --- tests/pipeline.ipynb | 5 ++-- tests/tools/{cadd => }/cadd.py | 43 +++++++++++++++++++++++++++------- tests/tools/cadd_script.py | 24 ------------------- 3 files changed, 37 insertions(+), 35 deletions(-) rename tests/tools/{cadd => }/cadd.py (69%) delete mode 100644 tests/tools/cadd_script.py diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index a9d9289..ca70d11 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -15,7 +15,6 @@ }, "source": [ "import pandas as pd\n", - "import requests\n", "\n", "from api.data import (store_database_for_eys_gene,\n", " parse_lovd,\n", @@ -1680,7 +1679,7 @@ "outputs": [], "execution_count": null, "source": [ - "from api.tools import get_revel_scores\n", + "\n", "\n", "chromosome = 6\n", "position = 65655758\n", @@ -1700,7 +1699,7 @@ "outputs": [], "execution_count": null, "source": [ - "from tests.tools.cadd.cadd import add_cadd_eval_column\n", + "from tests.tools.cadd import add_cadd_eval_column\n", "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", "from api import (store_database_for_eys_gene,\n", " parse_lovd,\n", diff --git a/tests/tools/cadd/cadd.py b/tests/tools/cadd.py similarity index 69% rename from tests/tools/cadd/cadd.py rename to tests/tools/cadd.py index 61f8655..a82e7bb 100644 --- a/tests/tools/cadd/cadd.py +++ b/tests/tools/cadd.py @@ -1,6 +1,7 @@ """ Module provides interface to web APIs of CADD tool. """ -import pandas as pd +import argparse import requests +import pandas as pd class BadResponseException(Exception): @@ -13,14 +14,16 @@ class DownloadError(Exception): def fetch_cadd_scores(cadd_version, chromosome, start, end=None): """ - Fetches CADD (Combined Annotation Dependent Depletion) scores for either a single SNV or a range of genomic - positions. + Fetches CADD (Combined Annotation Dependent Depletion) + scores for either a single SNV or a range of genomic positions. :param str cadd_version: Version of the CADD model used, e.g., "v1.3" or "GRCh38-v1.7". :param int chromosome: Chromosome number where the SNV or genomic region is located. :param int start: Genomic start position (or single position for SNV) of the region. - :param int end: (Optional) Genomic end position of the region. If not provided, fetches a single SNV. - :return: A dictionary containing CADD scores and annotations for the specified SNV or region, or None if an + :param int end: (Optional) Genomic end position of the region. + If not provided, fetches a single SNV. + :return: A dictionary containing CADD scores and annotations + for the specified SNV or region, or None if an error occurs. """ @@ -35,7 +38,8 @@ def fetch_cadd_scores(cadd_version, chromosome, start, end=None): data = response.json() return data raise BadResponseException( - f"Error: Received status code {response.status_code} - {response.reason}: {response.text}") + f"Error: Received status code {response.status_code} - " + f"{response.reason}: {response.text}") except requests.exceptions.Timeout as exc: raise DownloadError( @@ -55,12 +59,14 @@ def fetch_cadd_scores(cadd_version, chromosome, start, end=None): def evaluate_cadd_score(row, cadd_version="GRCh38-v1.7"): """ - Evaluates the CADD score for a given row in the DataFrame and returns the highest PHRED score evaluation. + Evaluates the CADD score for a given row in the + DataFrame and returns the highest PHRED score evaluation. Handles cases where the response is malformed or incomplete. :param row: A row from the DataFrame. :param str cadd_version: The CADD version to use for fetching the score. - :return: A string indicating the evaluation result based on the highest PHRED score, or an error message. + :return: A string indicating the evaluation result based + on the highest PHRED score, or an error message. """ position = row.loc["hg38_gnomad_format"] chromosome = row.loc["chromosome"] @@ -100,3 +106,24 @@ def add_cadd_eval_column(data, cadd_version="GRCh38-v1.7"): data["cadd_eval(PHRED)"] = data.apply(evaluate_cadd_score, axis=1, cadd_version=cadd_version) return data + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.") + parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'") + parser.add_argument("chromosome", type=int, help="Chromosome number") + parser.add_argument("--position", type=int, help="Genomic position (for single SNV)") + parser.add_argument("--start", type=int, + help="Genomic start position (for a range of positions)") + parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)") + + args = parser.parse_args() + + if args.position: + result = fetch_cadd_scores(args.version, args.chromosome, args.position) + print(result) + elif args.start and args.end: + result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end) + print(result) + else: + print("Please provide either '--position' for single SNV \ + or '--start' and '--end' for a range of positions.") diff --git a/tests/tools/cadd_script.py b/tests/tools/cadd_script.py deleted file mode 100644 index 87527db..0000000 --- a/tests/tools/cadd_script.py +++ /dev/null @@ -1,24 +0,0 @@ -import argparse - -from tests.tools.cadd.cadd import fetch_cadd_scores - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Fetch CADD scores for genomic positions.") - parser.add_argument("version", help="CADD version, e.g., 'v1.3' or 'GRCh38-v1.7'") - parser.add_argument("chromosome", type=int, help="Chromosome number") - parser.add_argument("--position", type=int, help="Genomic position (for single SNV)") - parser.add_argument("--start", type=int, - help="Genomic start position (for a range of positions)") - parser.add_argument("--end", type=int, help="Genomic end position (for a range of positions)") - - args = parser.parse_args() - - if args.position: - result = fetch_cadd_scores(args.version, args.chromosome, args.position) - print(result) - elif args.start and args.end: - result = fetch_cadd_scores(args.version, args.chromosome, args.start, args.end) - print(result) - else: - print("Please provide either '--position' for single SNV \ - or '--start' and '--end' for a range of positions.") \ No newline at end of file