Removed from last branch

Strexas · Aug 22, 2024 · 197864c · 197864c
1 parent 5c30ce1
commit 197864c
Show file tree

Hide file tree

Showing 4 changed files with 0 additions and 235 deletions.
diff --git a/api/__init__.py b/api/__init__.py
@@ -56,6 +56,4 @@
     parse_lovd,
     from_clinvar_name_to_cdna_position,
     save_lovd_as_vcf,
-    request_clinvar_api_data,
-    get_variant_ids_from_clinvar_name_api,
 )
diff --git a/api/data/__init__.py b/api/data/__init__.py
@@ -56,8 +56,5 @@
     parse_lovd,
     from_clinvar_name_to_cdna_position,
     save_lovd_as_vcf,
-    request_clinvar_api_data,
-    get_variant_ids_from_clinvar_name_api,
-    extract_nested_json,
     request_gnomad_api_data,
 )
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -159,121 +159,6 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
             f.write("\n")
 
 
-def get_variant_ids_from_clinvar_name_api(name, count=100):
-    """
-    Extracts variant ids from ClinVar by searching for the given `name`.
-    The result is a dictionary holding the list of ids and the reported count.
-
-    :param str name: name of variant
-    :param int count: number of ids to extract
-    :returns: dictionary with `idlist` and `count` taken from the esearch result
-    :rtype: dict
-    """
-
-    result = {}
-    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=clinvar&term="
-    clinvar_url = f"{base_url}{name}&retmode=json&retmax={count}"
-
-    request = requests.get(clinvar_url)
-
-    if request.status_code != 200:
-        raise ValueError(f"Request failed with status code {request.status_code}")
-
-    data = request.json()
-
-    result['idlist'] = data['esearchresult']['idlist']
-    result['count'] = data['esearchresult']['count']
-
-    return result
-
-
-def extract_nested_json(flat_parsed, parsed_from, required_column, prefix, join_prefix):
-    """
-    Extracts nested JSON data from dictionary.
-
-    :param DataFrame parsed_from: normalised JSON data
-    :param str required_column: column to extract
-    :param str prefix: prefix for extracted columns
-    """
-
-    data_set = parsed_from.get(required_column, [])
-    for idx, data in enumerate(data_set):
-        flat_data = pd.json_normalize(data, sep='_')
-        flat_data = flat_data.add_prefix(f'{prefix}_{idx}_')
-        flat_parsed = flat_parsed.join(flat_data, rsuffix=f'_{idx}_{join_prefix}')
-
-
-def request_clinvar_api_data(gene_id):
-    """
-    Requests ClinVar API for data about variant with given id.
-    Converts it to pandas dataframe.
-
-    :param str gene_id: id of variant (may be multiple)
-    :returns: DataFrame from ClinVar API
-    :rtype: DataFrame
-    """
-    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id="
-    clinvar_url = f"{base_url}{gene_id}&retmode=json"
-
-    request = requests.get(clinvar_url)
-
-    if request.status_code != 200:
-        raise ValueError(f"Request failed with status code {request.status_code}")
-
-    results = request.json()['result']
-
-    flattened_data = []
-
-    for uid in results['uids']:
-        entry = results[uid]
-
-        flattened_entry = pd.json_normalize(entry, sep='_')
-
-        variation_set = flattened_entry.at[0, 'variation_set']
-        for idx, var_set in enumerate(variation_set):
-            flat_var_set = pd.json_normalize(var_set, sep='_')
-            flat_var_set = flat_var_set.add_prefix(f'variation_set_{idx}_')
-
-            extract_nested_json(flat_var_set, var_set, 'variation_loc', f'variation_set_{idx}_loc', 'loc')
-            extract_nested_json(flat_var_set, var_set, 'variation_xrefs', f'variation_set_{idx}_xrefs', 'xrefs')
-            extract_nested_json(flat_var_set, var_set, 'allele_freq_set', f'variation_set_{idx}_allele_freq', 'allele_freq')
-
-            flat_var_set = flat_var_set.drop(
-                columns=[f'variation_set_{idx}_variation_loc',
-                         f'variation_set_{idx}_variation_xrefs',
-                         f'variation_set_{idx}_allele_freq_set'])
-            flattened_entry = flattened_entry.join(flat_var_set, rsuffix=f'_{idx}_vs')
-
-        # this extraction is different from the previous ones
-
-        genes = flattened_entry.at[0, 'genes']
-        for idx, gene in enumerate(genes):
-            flat_genes = pd.json_normalize(gene, sep='_')
-            flat_genes = flat_genes.add_prefix(f'gene_{idx}_')
-            flattened_entry = flattened_entry.join(flat_genes, rsuffix=f'_{idx}_g')
-
-        germline_classification_trait_set = flattened_entry.at[0,
-        'germline_classification_trait_set']
-        for idx, germline_set in enumerate(germline_classification_trait_set):
-            flat_germline_set = pd.json_normalize(germline_set, sep='_')
-            flat_germline_set = flat_germline_set.add_prefix(f'germline_set_{idx}_')
-
-            extract_nested_json(flat_germline_set, germline_set, 'trait_xrefs', f'germline_set_{idx}_trait_xrefs', 'trait_xrefs')
-
-            flat_germline_set = flat_germline_set.drop(columns=[f'germline_set_{idx}_trait_xrefs'])
-            flattened_entry = flattened_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls')
-
-        flattened_entry = flattened_entry.drop(columns=['variation_set',
-                                                        'genes',
-                                                        'germline_classification_trait_set'])
-
-        flattened_data.append(flattened_entry)
-
-    df = pd.concat(flattened_data, ignore_index=True)
-
-    return df
-
-
 def request_gnomad_api_data(gene_name, to_file=True):
     """
     Requests gnomAD API for data about a specific gene containing:

diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb
@@ -132,121 +132,6 @@
    "outputs": [],
    "execution_count": null
   },
-  {
-   "metadata": {},
-   "cell_type": "code",
-   "source": [
-    "variation_ids = '148002'\n",
-    "\n",
-    "frames = request_clinvar_api_data(variation_ids)\n",
-    "\n",
-    "display(frames)"
-   ],
-   "id": "b21c3487476b684f",
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "metadata": {},
-   "cell_type": "code",
-   "source": [
-    "clinvar_data = pd.read_csv(\"C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_results.txt\", sep='\\t')\n",
-    "\n",
-    "display(clinvar_data)"
-   ],
-   "id": "8cb4bbe3f35562d5",
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "metadata": {},
-   "cell_type": "markdown",
-   "source": [
-    "Explanation of what's happening in the code below:\n",
-    "\n",
-    "Function to get all the ids from a gene name:\n",
-    "```python\n",
-    "get_variant_ids_from_clinvar_name_api(name: str, count: int)\n",
-    "```\n",
-    "\n",
-    "The function gets the ids from the ClinVar API; `name` is the gene name and `count` is the maximum number of ids to get (the API's limit is 500).\n",
-    "\n",
-    "The function returns a dictionary with the count and the list of ids:\n",
-    "\n",
-    "```json\n",
-    "{\n",
-    "    'count': int,\n",
-    "    'idlist': List[str]\n",
-    "}\n",
-    "```\n",
-    "\n",
-    "If the count is greater than the API's limit, the code below splits the list of ids into smaller lists of at most 500 ids, so that the data can be requested from the API in chunks:\n",
-    "\n",
-    "```python\n",
-    "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n",
-    "```\n",
-    "\n",
-    "Then, for each chunk, the code requests the data from the API and concatenates the resulting dataframes into a single dataframe:\n",
-    "\n",
-    "```python\n",
-    "frames = request_clinvar_api_data(join)\n",
-    "variations = pd.concat([variations, frames], ignore_index=True)\n",
-    "```\n",
-    "\n",
-    "The variant extraction function contains a lot of nested lists and dictionaries, so the function will flatten the data and then concatenate the dataframes into a single dataframe\n",
-    "\n",
-    "**NOTE**\n",
-    "\n",
-    "> joining function may have been implemented wrong due to the waiting time of the api.\n"
-   ],
-   "id": "976f9632a8ef29e3"
-  },
-  {
-   "metadata": {},
-   "cell_type": "code",
-   "source": [
-    "import pandas as pd\n",
-    "\n",
-    "variations = pd.DataFrame()\n",
-    "\n",
-    "max = 500\n",
-    "name = \"EYS\"\n",
-    "count = 2147483647\n",
-    "\n",
-    "id_array = get_variant_ids_from_clinvar_name_api(name, count)\n",
-    "size = int(id_array['count'])\n",
-    "id_list = id_array['idlist']\n",
-    "\n",
-    "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n",
-    "\n",
-    "track = 0\n",
-    "for lists in id_lists:\n",
-    "    join = \",\".join(lists)\n",
-    "    frame = request_clinvar_api_data(join)\n",
-    "    \n",
-    "    variations = pd.concat([variations, frame], ignore_index=True)\n",
-    "    \n",
-    "    print(f\"{track + 1}/{len(id_lists)}\")\n",
-    "    track += 1\n",
-    "\n",
-    "display(variations)\n"
-   ],
-   "id": "129175e3a2e568be",
-   "outputs": [],
-   "execution_count": null
-  },
-  {
-   "metadata": {},
-   "cell_type": "code",
-   "source": [
-    "clinvar_data = pd.read_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_result.txt', sep='\\t')\n",
-    "\n",
-    "display(clinvar_data)"
-   ],
-   "id": "c85507a3e2c584da",
-   "outputs": [],
-   "execution_count": null
-  },
   {
    "metadata": {
     "ExecuteTime": {