From 4d3575a2e77834118bf4d4a51c2ccf6ae95f83c5 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Wed, 28 Aug 2024 21:33:19 +0300 Subject: [PATCH] Downloaded data from API, formatted code --- api/data/__init__.py | 7 +- api/data/refactoring.py | 110 ++++++------ tests/pipeline.ipynb | 359 ++++++++++++++++------------------------ 3 files changed, 203 insertions(+), 273 deletions(-) diff --git a/api/data/__init__.py b/api/data/__init__.py index 7cd3997..bd40c79 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -28,7 +28,9 @@ LOVD_TABLES_DATA_TYPES, # Paths for database downloads - DATABASES_DOWNLOAD_PATHS + DATABASES_DOWNLOAD_PATHS, + + GNOMAD_PATH, ) # DATA COLLECTION IMPORT @@ -57,4 +59,7 @@ from_clinvar_name_to_cdna_position, save_lovd_as_vcf, request_gnomad_api_data, + merge_gnomad_lovd, + parse_gnomad, + set_gnomad_dtypes, ) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 8aa880d..f2fd6cd 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -247,7 +247,18 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def request_gnomad_api_data(gene_name, to_file=True): +def process_population_data(df, pop_data, name, pop_ids, index): + for pop_id in pop_ids: + df.loc[index, f'{name}_ac_{pop_id}'] = 0 + df.loc[index, f'{name}_an_{pop_id}'] = 0 + if isinstance(pop_data, list): + for pop in pop_data: + variant_id = pop['id'] + df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] + df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] + + +def request_gnomad_api_data(gene_name): """ Requests gnomAD API for data about a specific gene containing: - variant_id @@ -305,44 +316,38 @@ def request_gnomad_api_data(gene_name, to_file=True): }} """ - response = requests.post(url, json={'query': query}) - if response.status_code == 200: - data = response.json()['data']['gene']['variants'] - - df = pd.json_normalize(data) - - df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0) - df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0) - - df['cDNA change'] = df['hgvsc'].fillna(0) - df['Protein change'] = df['hgvsp'].fillna(0) - - df['Allele Frequency'] = df['total_ac'] / df['total_an'] - df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0) - exome_populations = df['exome.populations'] - genome_populations = df['genome.populations'] - ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] - - def process_population_data(pop_data, name, pop_ids, index): - for pop_id in pop_ids: - df.loc[index, f'{name}_ac_{pop_id}'] = 0 - df.loc[index, f'{name}_an_{pop_id}'] = 0 - if type(pop_data) == list: - for pop in pop_data: - id = pop['id'] - df.loc[index, f'{name}_ac_{id}'] = pop['ac'] - df.loc[index, f'{name}_an_{id}'] = pop['an'] - - for i in range(len(exome_populations)): - exome_pop = exome_populations[i] - process_population_data(exome_pop, 'exome', ids, i) - genome_pop = genome_populations[i] - process_population_data(genome_pop, 'genome', ids, i) - - for id in ids: - df[f'Allele_Frequency_{id}'] = (df[f'exome_ac_{id}'].fillna(0) + df[f'genome_ac_{id}'].fillna(0)) / ( - df[f'exome_an_{id}'].fillna(0) + df[f'genome_an_{id}'].fillna(0)) - population_mapping = { + response = requests.post(url, json={'query': query}, timeout=300)# timeout set to 5 minutes + + if response.status_code != 200: + print('Error:', response.status_code) + return None + + data = response.json()['data']['gene']['variants'] + + df = pd.json_normalize(data) + + df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0) + df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0) + + df['HGVS Consequence'] = df['hgvsc'].fillna(0) # cDNA change + df['Protein Consequence'] = df['hgvsp'].fillna(0) # Protein change + + df['Allele Frequency'] = df['total_ac'] / df['total_an'] + df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0) + exome_populations = df['exome.populations'] + genome_populations = df['genome.populations'] + ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] + + for i in range(len(exome_populations)): + exome_pop = exome_populations[i] + process_population_data(df, exome_pop, 'exome', ids, i) + genome_pop = genome_populations[i] + process_population_data(df, genome_pop, 'genome', ids, i) + + for variant_id in ids: + df[f'Allele_Frequency_{variant_id}'] = (df[f'exome_ac_{variant_id}'].fillna(0) + df[f'genome_ac_{variant_id}'].fillna(0)) / ( + df[f'exome_an_{variant_id}'].fillna(0) + df[f'genome_an_{variant_id}'].fillna(0)) + population_mapping = { 'afr': 'African/African American', 'eas': 'East Asian', 'asj': 'Ashkenazi Jew', @@ -355,22 +360,19 @@ def process_population_data(pop_data, name, pop_ids, index): 'remaining': 'Remaining', '': '' } - for i in range(len(df)): - max = 0 - maxid = '' - for id in ids: - if df.loc[i, f'Allele_Frequency_{id}'] > max: - max = df.loc[i, f'Allele_Frequency_{id}'] - maxid = id - df.loc[i, 'Popmax'] = max - df.loc[i, 'Popmax population'] = population_mapping[maxid] - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id', + for i in range(len(df)): + max_pop = 0 + maxid = '' + for variant_id in ids: + if df.loc[i, f'Allele_Frequency_{variant_id}'] > max_pop: + max_pop = df.loc[i, f'Allele_Frequency_{variant_id}'] + maxid = variant_id + df.loc[i, 'Popmax'] = max_pop + df.loc[i, 'Popmax population'] = population_mapping[maxid] + not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id', 'cDNA change', 'Protein change'] - df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1) - if to_file: - df.to_csv('variants.csv', index=True) + df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1) - else: - print('Error:', response.status_code) + df.rename(columns={'variant_id': 'gnomAD ID'}, inplace=True) return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index fb86c24..6734e80 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,10 +7,6 @@ "collapsed": true, "jupyter": { "outputs_hidden": true - }, - "ExecuteTime": { - "end_time": "2024-08-22T17:20:23.240355Z", - "start_time": "2024-08-22T17:20:21.651097Z" } }, "source": [ @@ -19,30 +15,28 @@ "\n", "from api.data import (store_database_for_eys_gene,\n", " parse_lovd,\n", + " parse_gnomad,\n", " LOVD_PATH,\n", " set_lovd_dtypes,\n", - " request_clinvar_api_data,\n", - " get_variant_ids_from_clinvar_name_api,\n", + " set_gnomad_dtypes,\n", " request_gnomad_api_data,\n", + " merge_gnomad_lovd,\n", + " GNOMAD_PATH,\n", " )\n", "from api.data import save_lovd_as_vcf\n", "\n", + "\n", "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": 1 + "execution_count": null }, { "cell_type": "code", "id": "f49f7691a27aa7b4", "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-08-11T16:16:57.305309Z", - "start_time": "2024-08-11T16:16:56.668571Z" - } + "collapsed": false }, - "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], @@ -53,12 +47,10 @@ "cell_type": "code", "id": "cf5c45c0f7b9de0f", "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false } - }, "source": [ "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" @@ -67,62 +59,177 @@ "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "id": "8a089e29bfc8c119", + "source": [ + "gnomad_data = request_gnomad_api_data(\"EYS\")\n", + "\n", + "display(gnomad_data)" + ], + "id": "64482c033c794fb4", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-28T18:11:25.802540Z", + "start_time": "2024-08-28T18:11:25.715039Z" + } + }, + "cell_type": "code", + "source": [ + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "gnomad_data_2 = parse_gnomad(GNOMAD_PATH +'/gnomad_data.csv')" + ], + "id": "60f3f3074a9b19f4", + "outputs": [], + "execution_count": 24 + }, + { "metadata": {}, + "cell_type": "code", + "source": "display(gnomad_data_2)", + "id": "9d3e4d6b5f7be127", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-28T18:11:35.536411Z", + "start_time": "2024-08-28T18:11:35.258009Z" + } + }, + "cell_type": "code", "source": [ - "for i in data:\n", - " print(i)\n", - " display(data[i])" + "gnomad_data_2.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv', index=False)\n", + "gnomad_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_api.csv', index=False)" + ], + "id": "2e869f5c77dbe3d3", + "outputs": [], + "execution_count": 26 + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "len(gnomad_data_2), len(gnomad_data)\n", + "\n", + "print(len(gnomad_data_2) - len(gnomad_data))" ], + "id": "9efafb201061c146", "outputs": [], "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "id": "ef07740b2fa63e42", + "source": [ + "gnomad_data_2.rename(columns={'gnomAD ID': 'variant_id'}, inplace=True)\n", + "\n", + "missing_from_api = []\n", + "\n", + "for i in gnomad_data['variant_id']:\n", + " if(i in gnomad_data_2['variant_id'].values):\n", + " continue\n", + " missing_from_api.append(i)\n", + "\n", + "len(missing_from_api)\n", + "\n", + "missing_data = gnomad_data.loc[gnomad_data['variant_id'].isin(missing_from_api)]\n", + "\n", + "missing_data" + ], + "id": "d0eb0a6db96d31c8", + "outputs": [], + "execution_count": null + }, + { "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-08-28T18:06:31.488622Z", + "start_time": "2024-08-28T18:06:31.471299Z" } }, + "cell_type": "code", + "source": "missing_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_missing.csv', index=False)", + "id": "388120b03b094511", + "outputs": [], + "execution_count": 23 + }, + { + "metadata": {}, + "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()\n", + "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "final_data" + ], + "id": "96453d88e353aeb1", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ "for i in data:\n", " print(i)\n", - " display(data[i].info())" + " display(data[i])" ], + "id": "8a089e29bfc8c119", "outputs": [], "execution_count": null }, { - "cell_type": "code", - "id": "c968af1617be40db", "metadata": {}, + "cell_type": "code", "source": [ - "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")" + "set_lovd_dtypes(data)\n", + "for i in data:\n", + " print(i)\n", + " display(data[i].info())" ], + "id": "ef07740b2fa63e42", "outputs": [], "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "id": "c7ff16903e0c52bd", + "source": "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")", + "id": "c968af1617be40db", + "outputs": [], + "execution_count": null + }, + { "metadata": {}, + "cell_type": "code", "source": [ "from subprocess import Popen\n", "\n", "process = Popen(\"spliceai -I ./lovd.vcf -O ./lovd_output.vcf -R ../tools/spliceai/hg38.fa -A grch38\".split())\n", "process.wait()" ], + "id": "c7ff16903e0c52bd", "outputs": [], "execution_count": null }, { - "cell_type": "code", - "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", "metadata": {}, + "cell_type": "code", "source": [ "from api.tools import get_revel_scores\n", "\n", @@ -133,201 +240,17 @@ "\n", "display(results)" ], + "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", "outputs": [], "execution_count": null }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-22T17:23:41.828469Z", - "start_time": "2024-08-22T17:21:09.627424Z" - } - }, - "cell_type": "code", - "source": [ - "gnomad_from_api = request_gnomad_api_data(\"EYS\", False)\n", - "\n", - "display(gnomad_from_api)" - ], - "id": "64482c033c794fb4", - "outputs": [ - { - "data": { - "text/plain": [ - " variant_id cDNA change ... Popmax Popmax population\n", - "0 6-63720525-A-G c.*71T>C ... 0.000016 African/African American\n", - "1 6-63720525-A-T c.*71T>A ... 0.000192 East Asian\n", - "2 6-63720525-A-C c.*71T>G ... 0.000000 \n", - "3 6-63720526-T-A c.*70A>T ... 0.000020 South Asian\n", - "4 6-63720527-G-T c.*69C>A ... 0.000000 \n", - "... ... ... ... ... ...\n", - "14295 6-65495479-G-T c.-69C>A ... 0.000000 \n", - "14296 6-65495479-G-A c.-69C>T ... 0.000031 African/African American\n", - "14297 6-65495482-A-G c.-72T>C ... 0.000070 Admixed American\n", - "14298 6-65495484-T-G c.-74A>C ... 0.000060 South Asian\n", - "14299 6-65495485-T-C c.-75A>G ... 0.000012 South Asian\n", - "\n", - "[14300 rows x 7 columns]" - ], - "text/html": [ - "
\n", - " | variant_id | \n", - "cDNA change | \n", - "Protein change | \n", - "Allele Frequency | \n", - "Homozygote Count | \n", - "Popmax | \n", - "Popmax population | \n", - "
---|---|---|---|---|---|---|---|
0 | \n", - "6-63720525-A-G | \n", - "c.*71T>C | \n", - "0 | \n", - "1.807419e-06 | \n", - "0.0 | \n", - "0.000016 | \n", - "African/African American | \n", - "
1 | \n", - "6-63720525-A-T | \n", - "c.*71T>A | \n", - "0 | \n", - "6.573844e-06 | \n", - "0.0 | \n", - "0.000192 | \n", - "East Asian | \n", - "
2 | \n", - "6-63720525-A-C | \n", - "c.*71T>G | \n", - "0 | \n", - "0.000000e+00 | \n", - "0.0 | \n", - "0.000000 | \n", - "\n", - " |
3 | \n", - "6-63720526-T-A | \n", - "c.*70A>T | \n", - "0 | \n", - "1.045299e-06 | \n", - "0.0 | \n", - "0.000020 | \n", - "South Asian | \n", - "
4 | \n", - "6-63720527-G-T | \n", - "c.*69C>A | \n", - "0 | \n", - "0.000000e+00 | \n", - "0.0 | \n", - "0.000000 | \n", - "\n", - " |
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
14295 | \n", - "6-65495479-G-T | \n", - "c.-69C>A | \n", - "0 | \n", - "0.000000e+00 | \n", - "0.0 | \n", - "0.000000 | \n", - "\n", - " |
14296 | \n", - "6-65495479-G-A | \n", - "c.-69C>T | \n", - "0 | \n", - "1.446349e-06 | \n", - "0.0 | \n", - "0.000031 | \n", - "African/African American | \n", - "
14297 | \n", - "6-65495482-A-G | \n", - "c.-72T>C | \n", - "0 | \n", - "2.629510e-06 | \n", - "0.0 | \n", - "0.000070 | \n", - "Admixed American | \n", - "
14298 | \n", - "6-65495484-T-G | \n", - "c.-74A>C | \n", - "0 | \n", - "3.645085e-06 | \n", - "0.0 | \n", - "0.000060 | \n", - "South Asian | \n", - "
14299 | \n", - "6-65495485-T-C | \n", - "c.-75A>G | \n", - "0 | \n", - "7.310070e-07 | \n", - "0.0 | \n", - "0.000012 | \n", - "South Asian | \n", - "
14300 rows × 7 columns
\n", - "