Skip to content

Commit

Permalink
Removed from last branch
Browse files Browse the repository at this point in the history
  • Loading branch information
KajusC committed Aug 22, 2024
1 parent 5c30ce1 commit 197864c
Show file tree
Hide file tree
Showing 4 changed files with 0 additions and 235 deletions.
2 changes: 0 additions & 2 deletions api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,4 @@
parse_lovd,
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
request_clinvar_api_data,
get_variant_ids_from_clinvar_name_api,
)
3 changes: 0 additions & 3 deletions api/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,5 @@
parse_lovd,
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
request_clinvar_api_data,
get_variant_ids_from_clinvar_name_api,
extract_nested_json,
request_gnomad_api_data,
)
115 changes: 0 additions & 115 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,121 +159,6 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
f.write("\n")


def get_variant_ids_from_clinvar_name_api(name, count=100):
    """
    Search ClinVar (NCBI E-utilities ``esearch``) for `name` and return the
    matching variant ids.

    The search term and the result limit are handed to ``requests`` as query
    parameters, so characters such as spaces, ``&`` or ``#`` in `name` are
    URL-encoded instead of corrupting the query string.

    :param str name: ClinVar search term (e.g. a gene name)
    :param int count: maximum number of ids to request (sent as ``retmax``)
    :returns: dictionary with two keys taken from the ``esearchresult`` part
        of the response: ``idlist`` (the list of ids) and ``count`` (the
        ``count`` value reported by the API)
    :rtype: dict
    :raises ValueError: if the response status code is not 200
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "clinvar",
        "term": name,
        "retmode": "json",
        "retmax": count,
    }

    response = requests.get(base_url, params=params)

    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}")

    search_result = response.json()['esearchresult']

    return {
        'idlist': search_result['idlist'],
        'count': search_result['count'],
    }


def extract_nested_json(flat_parsed, parsed_from, required_column, prefix, join_prefix):
    """
    Flatten a nested list of JSON objects and add the result to `flat_parsed`.

    Every element found under `required_column` in `parsed_from` is normalised
    into its own set of columns named ``{prefix}_{idx}_<field>`` and joined
    (on the index) onto `flat_parsed`. The new columns are written into
    `flat_parsed` **in place**, because callers do not use a return value;
    previously the joined frame was only bound to a local name and discarded,
    which made the function a no-op.

    :param DataFrame flat_parsed: frame that receives the extracted columns
        (modified in place)
    :param dict parsed_from: raw JSON object that holds the nested list
    :param str required_column: key in `parsed_from` whose list is extracted;
        a missing key leaves `flat_parsed` untouched
    :param str prefix: prefix for the extracted column names
    :param str join_prefix: used to build the suffix ``_{idx}_{join_prefix}``
        given to an extracted column whose name already exists in `flat_parsed`
    :returns: None
    """
    for idx, data in enumerate(parsed_from.get(required_column, [])):
        flat_data = pd.json_normalize(data, sep='_').add_prefix(f'{prefix}_{idx}_')

        # Let pandas resolve index alignment and name clashes exactly as the
        # original join did, then copy only the newly created columns back so
        # the caller's DataFrame object itself is updated.
        joined = flat_parsed.join(flat_data, rsuffix=f'_{idx}_{join_prefix}')
        for column in joined.columns:
            if column not in flat_parsed.columns:
                flat_parsed[column] = joined[column]


def _join_flattened_records(flattened_entry, records, prefix, nested_columns, suffix):
    """
    Flatten every record in `records` and join it onto `flattened_entry`.

    :param DataFrame flattened_entry: frame the flattened records are joined to
    :param list records: list of JSON objects to flatten
    :param str prefix: columns of record ``idx`` are named ``{prefix}_{idx}_<field>``
    :param dict nested_columns: maps a key of the record that holds a nested
        list to the label used for its extracted columns and join suffix
    :param str suffix: clashing columns of record ``idx`` get ``_{idx}_{suffix}``
    :returns: `flattened_entry` with the columns of all records joined on
    :rtype: DataFrame
    """
    for idx, record in enumerate(records):
        record_prefix = f'{prefix}_{idx}'
        flat_record = pd.json_normalize(record, sep='_').add_prefix(f'{record_prefix}_')

        for column, label in nested_columns.items():
            extract_nested_json(flat_record, record, column, f'{record_prefix}_{label}', label)

        # The raw nested lists are not wanted in the result; a record may
        # lack some of them, so missing columns are not an error.
        flat_record = flat_record.drop(
            columns=[f'{record_prefix}_{column}' for column in nested_columns],
            errors='ignore',
        )
        flattened_entry = flattened_entry.join(flat_record, rsuffix=f'_{idx}_{suffix}')

    return flattened_entry


def request_clinvar_api_data(gene_id):
    """
    Request ClinVar (NCBI E-utilities ``esummary``) data for the given ids and
    convert it to a pandas DataFrame with one row per returned uid.

    The nested ``variation_set``, ``genes`` and
    ``germline_classification_trait_set`` lists of every entry are flattened
    into prefixed columns and the raw list columns are dropped. Entries that
    lack one of these lists are kept; they simply get no columns for it.

    :param str gene_id: id of the variant; several ids may be passed as one
        comma-separated string
    :returns: DataFrame built from the ClinVar API response (empty when the
        response lists no uids)
    :rtype: DataFrame
    :raises ValueError: if the response status code is not 200
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    # Passing the values as params lets requests URL-encode them.
    params = {"db": "clinvar", "id": gene_id, "retmode": "json"}

    response = requests.get(base_url, params=params)

    if response.status_code != 200:
        raise ValueError(f"Request failed with status code {response.status_code}")

    results = response.json()['result']

    flattened_data = []

    for uid in results['uids']:
        entry = results[uid]

        flattened_entry = pd.json_normalize(entry, sep='_')

        flattened_entry = _join_flattened_records(
            flattened_entry,
            entry.get('variation_set') or [],
            'variation_set',
            {
                'variation_loc': 'loc',
                'variation_xrefs': 'xrefs',
                'allele_freq_set': 'allele_freq',
            },
            'vs',
        )

        # genes carry no nested lists that need a second extraction step
        flattened_entry = _join_flattened_records(
            flattened_entry,
            entry.get('genes') or [],
            'gene',
            {},
            'g',
        )

        flattened_entry = _join_flattened_records(
            flattened_entry,
            entry.get('germline_classification_trait_set') or [],
            'germline_set',
            {'trait_xrefs': 'trait_xrefs'},
            'gls',
        )

        flattened_entry = flattened_entry.drop(
            columns=['variation_set',
                     'genes',
                     'germline_classification_trait_set'],
            errors='ignore',
        )

        flattened_data.append(flattened_entry)

    # pd.concat raises on an empty list; no uids simply means no rows.
    if not flattened_data:
        return pd.DataFrame()

    return pd.concat(flattened_data, ignore_index=True)


def request_gnomad_api_data(gene_name, to_file=True):
"""
Requests gnomAD API for data about a specific gene containing:
Expand Down
115 changes: 0 additions & 115 deletions tests/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -132,121 +132,6 @@
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"variation_ids = '148002'\n",
"\n",
"frames = request_clinvar_api_data(variation_ids)\n",
"\n",
"display(frames)"
],
"id": "b21c3487476b684f",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"clinvar_data = pd.read_csv(\"C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_results.txt\", sep='\\t')\n",
"\n",
"display(clinvar_data)"
],
"id": "8cb4bbe3f35562d5",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"Explanation of what's happening in the code below:\n",
"\n",
"Function to get all the ids from a gene name:\n",
"```python\n",
"get_variant_ids_from_clinvar_name_api(name: str, count: int)\n",
"```\n",
"\n",
"The function gets the ids from the ClinVar API; `name` is the gene name and `count` is the maximum number of ids to get (the API's limit is 500).\n",
"\n",
"function returns a dictionary with the count and the list of ids:\n",
"\n",
"```json\n",
"{\n",
" 'count': int,\n",
" 'idlist': List[str]\n",
"}\n",
"```\n",
"\n",
"If the count is greater than the API's limit, the code below splits the list of ids into chunks of at most 500 ids and requests the data from the API one chunk at a time:\n",
"\n",
"```python\n",
"id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n",
"```\n",
"\n",
"then the function will request the data from the api and concatenate the dataframes into a single dataframe:\n",
"\n",
"```python\n",
"frames = request_clinvar_api_data(join)\n",
"variations = pd.concat([variations, frames], ignore_index=True)\n",
"```\n",
"\n",
"The variant extraction function contains a lot of nested lists and dictionaries, so the function will flatten the data and then concatenate the dataframes into a single dataframe\n",
"\n",
"**NOTE**\n",
"\n",
"> joining function may have been implemented wrong due to the waiting time of the api.\n"
],
"id": "976f9632a8ef29e3"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"variations = pd.DataFrame()\n",
"\n",
"max = 500\n",
"name = \"EYS\"\n",
"count = 2147483647\n",
"\n",
"id_array = get_variant_ids_from_clinvar_name_api(name, count)\n",
"size = int(id_array['count'])\n",
"id_list = id_array['idlist']\n",
"\n",
"id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n",
"\n",
"track = 0\n",
"for lists in id_lists:\n",
" join = \",\".join(lists)\n",
" frame = request_clinvar_api_data(join)\n",
" \n",
" variations = pd.concat([variations, frame], ignore_index=True)\n",
" \n",
" print(f\"{track + 1}/{len(id_lists)}\")\n",
" track += 1\n",
"\n",
"display(variations)\n"
],
"id": "129175e3a2e568be",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"clinvar_data = pd.read_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_result.txt', sep='\\t')\n",
"\n",
"display(clinvar_data)"
],
"id": "c85507a3e2c584da",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
Expand Down

0 comments on commit 197864c

Please sign in to comment.