From 197864c56ecd4d396ddac539c55680bcce5c17c8 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Thu, 22 Aug 2024 20:28:25 +0300 Subject: [PATCH] Removed from last branch --- api/__init__.py | 2 - api/data/__init__.py | 3 -- api/data/refactoring.py | 115 ---------------------------------------- tests/pipeline.ipynb | 115 ---------------------------------------- 4 files changed, 235 deletions(-) diff --git a/api/__init__.py b/api/__init__.py index fb618dd..459952b 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -56,6 +56,4 @@ parse_lovd, from_clinvar_name_to_cdna_position, save_lovd_as_vcf, - request_clinvar_api_data, - get_variant_ids_from_clinvar_name_api, ) diff --git a/api/data/__init__.py b/api/data/__init__.py index 9598171..7cd3997 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -56,8 +56,5 @@ parse_lovd, from_clinvar_name_to_cdna_position, save_lovd_as_vcf, - request_clinvar_api_data, - get_variant_ids_from_clinvar_name_api, - extract_nested_json, request_gnomad_api_data, ) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 8057842..1ac916b 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -159,121 +159,6 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def get_variant_ids_from_clinvar_name_api(name, count=100): - """ - Extracts variant ids from ClinVar `name` variable. /n - key of dictionary is the size of the list of ids. - - :param str name: name of variant - :param int count: number of ids to extract - :returns: ids of variants - :rtype: str - """ - - result = {} - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=clinvar&term=" - clinvar_url = f"{base_url}{name}&retmode=json&retmax={count}" - - request = requests.get(clinvar_url) - - if request.status_code != 200: - raise ValueError(f"Request failed with status code {request.status_code}") - - data = request.json() - - result['idlist'] = data['esearchresult']['idlist'] - result['count'] = data['esearchresult']['count'] - - return result - - -def extract_nested_json(flat_parsed, parsed_from, required_column, prefix, join_prefix): - """ - Extracts nested JSON data from dictionary. - - :param DataFrame parsed_from: normalised JSON data - :param str required_column: column to extract - :param str prefix: prefix for extracted columns - """ - - data_set = parsed_from.get(required_column, []) - for idx, data in enumerate(data_set): - flat_data = pd.json_normalize(data, sep='_') - flat_data = flat_data.add_prefix(f'{prefix}_{idx}_') - flat_parsed = flat_parsed.join(flat_data, rsuffix=f'_{idx}_{join_prefix}') - - -def request_clinvar_api_data(gene_id): - """ - Requests ClinVar API for data about variant with given id. - Converts it to pandas dataframe. - - :param str gene_id: id of variant (may be multiple) - :returns: DataFrame from ClinVar API - :rtype: DataFrame - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id=" - clinvar_url = f"{base_url}{gene_id}&retmode=json" - - request = requests.get(clinvar_url) - - if request.status_code != 200: - raise ValueError(f"Request failed with status code {request.status_code}") - - results = request.json()['result'] - - flattened_data = [] - - for uid in results['uids']: - entry = results[uid] - - flattened_entry = pd.json_normalize(entry, sep='_') - - variation_set = flattened_entry.at[0, 'variation_set'] - for idx, var_set in enumerate(variation_set): - flat_var_set = pd.json_normalize(var_set, sep='_') - flat_var_set = flat_var_set.add_prefix(f'variation_set_{idx}_') - - extract_nested_json(flat_var_set, var_set, 'variation_loc', f'variation_set_{idx}_loc', 'loc') - extract_nested_json(flat_var_set, var_set, 'variation_xrefs', f'variation_set_{idx}_xrefs', 'xrefs') - extract_nested_json(flat_var_set, var_set, 'allele_freq_set', f'variation_set_{idx}_allele_freq', 'allele_freq') - - flat_var_set = flat_var_set.drop( - columns=[f'variation_set_{idx}_variation_loc', - f'variation_set_{idx}_variation_xrefs', - f'variation_set_{idx}_allele_freq_set']) - flattened_entry = flattened_entry.join(flat_var_set, rsuffix=f'_{idx}_vs') - - # this extraction is different from the previous ones - - genes = flattened_entry.at[0, 'genes'] - for idx, gene in enumerate(genes): - flat_genes = pd.json_normalize(gene, sep='_') - flat_genes = flat_genes.add_prefix(f'gene_{idx}_') - flattened_entry = flattened_entry.join(flat_genes, rsuffix=f'_{idx}_g') - - germline_classification_trait_set = flattened_entry.at[0, - 'germline_classification_trait_set'] - for idx, germline_set in enumerate(germline_classification_trait_set): - flat_germline_set = pd.json_normalize(germline_set, sep='_') - flat_germline_set = flat_germline_set.add_prefix(f'germline_set_{idx}_') - - extract_nested_json(flat_germline_set, germline_set, 'trait_xrefs', f'germline_set_{idx}_trait_xrefs', 'trait_xrefs') - - flat_germline_set = flat_germline_set.drop(columns=[f'germline_set_{idx}_trait_xrefs']) - flattened_entry = flattened_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls') - - flattened_entry = flattened_entry.drop(columns=['variation_set', - 'genes', - 'germline_classification_trait_set']) - - flattened_data.append(flattened_entry) - - df = pd.concat(flattened_data, ignore_index=True) - - return df - - def request_gnomad_api_data(gene_name, to_file=True): """ Requests gnomAD API for data about a specific gene containing: diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index a838cd4..044e76d 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -132,121 +132,6 @@ "outputs": [], "execution_count": null }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "variation_ids = '148002'\n", - "\n", - "frames = request_clinvar_api_data(variation_ids)\n", - "\n", - "display(frames)" - ], - "id": "b21c3487476b684f", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "clinvar_data = pd.read_csv(\"C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_results.txt\", sep='\\t')\n", - "\n", - "display(clinvar_data)" - ], - "id": "8cb4bbe3f35562d5", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "Explanation of whats happening in the code below:\n", - "\n", - "Function to get all the ids from a gene name:\n", - "```python\n", - "get_variant_ids_from_clinvar_name_api(name: str, count: int)\n", - "```\n", - "\n", - "function gets the ids from the clinvar api, the name is the gene name and the count is the maximum number of ids to get (api's limit is 500)\n", - "\n", - "function returns a dictionary with the count and the list of ids:\n", - "\n", - "```json\n", - "{\n", - " 'count': int,\n", - " 'idlist': List[str]\n", - "}\n", - "```\n", - "\n", - "if the count is greater than the api's limit, the function will split the list of ids into smaller lists of 500 and then request the data from the api in chunks of 500 ids:\n", - "\n", - "```python\n", - "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", - "```\n", - "\n", - "then the function will request the data from the api and concatenate the dataframes into a single dataframe:\n", - "\n", - "```python\n", - "frames = request_clinvar_api_data(join)\n", - "variations = pd.concat([variations, frames], ignore_index=True)\n", - "```\n", - "\n", - "The variant extraction function contains a lot of nested lists and dictionaries, so the function will flatten the data and then concatenate the dataframes into a single dataframe\n", - "\n", - "**NOTE**\n", - "\n", - "> joining function may have been implemented wrong due to the waiting time of the api.\n" - ], - "id": "976f9632a8ef29e3" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "\n", - "variations = pd.DataFrame()\n", - "\n", - "max = 500\n", - "name = \"EYS\"\n", - "count = 2147483647\n", - "\n", - "id_array = get_variant_ids_from_clinvar_name_api(name, count)\n", - "size = int(id_array['count'])\n", - "id_list = id_array['idlist']\n", - "\n", - "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", - "\n", - "track = 0\n", - "for lists in id_lists:\n", - " join = \",\".join(lists)\n", - " frame = request_clinvar_api_data(join)\n", - " \n", - " variations = pd.concat([variations, frame], ignore_index=True)\n", - " \n", - " print(f\"{track + 1}/{len(id_lists)}\")\n", - " track += 1\n", - "\n", - "display(variations)\n" - ], - "id": "129175e3a2e568be", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "clinvar_data = pd.read_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_result.txt', sep='\\t')\n", - "\n", - "display(clinvar_data)" - ], - "id": "c85507a3e2c584da", - "outputs": [], - "execution_count": null - }, { "metadata": { "ExecuteTime": {