From 252a468d4d4ed164721b571955f485ad7c5a9841 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:02:23 +0300 Subject: [PATCH] Moved function to collection.py; added path parameter with overriding option; implemented one log function; Changed that its only getting EYS --- api/data/__init__.py | 3 +- api/data/collection.py | 151 ++++ api/data/refactoring.py | 141 ---- tests/pipeline.ipynb | 1503 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 1624 insertions(+), 174 deletions(-) diff --git a/api/data/__init__.py b/api/data/__init__.py index bd40c79..7bfdfbe 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -46,9 +46,11 @@ download_lovd_database_for_eys_gene, download_genes_lovd, download_database_for_eys_gene, + download_data_from_gnomad_eys, # Functions for storing databases store_database_for_eys_gene + ) # DATA REFACTORING IMPORT @@ -58,7 +60,6 @@ parse_lovd, from_clinvar_name_to_cdna_position, save_lovd_as_vcf, - request_gnomad_api_data, merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes, diff --git a/api/data/collection.py b/api/data/collection.py index 8bb8312..0d43d40 100644 --- a/api/data/collection.py +++ b/api/data/collection.py @@ -6,6 +6,7 @@ import time import requests +import pandas as pd from requests import RequestException from selenium import webdriver @@ -189,3 +190,153 @@ def store_database_for_eys_gene(database_name, override=False): else: download_database_for_eys_gene(database_name, override) +def prepare_popmax_calculation(df, pop_data, name, pop_ids, index): + """ + prepares the calculation of popmax and popmax population for a variant. + genome and exome data of ac and an. + + :param DataFrame df: DataFrame containing gnomAD data + :param dict pop_data: dictionary containing population data + :param str name: name of the population + :param list[str] pop_ids: list of population ids + :param int index: index of the variant + """ + + for pop_id in pop_ids: + df.loc[index, f'{name}_ac_{pop_id}'] = 0 + df.loc[index, f'{name}_an_{pop_id}'] = 0 + if isinstance(pop_data, list): + for pop in pop_data: + variant_id = pop['id'] + df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] + df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] + + +def download_data_from_gnomad_eys(path, override=False): + """ + Requests gnomAD API for data about a specific gene containing: + - variant_id + - cDNA change + - protein change + - allele frequency + - homozygote count + - popmax + - popmax population + + :param str gene_name: name of gene + :param bool to_file: if True, saves data to variants.csv + :returns: DataFrame from gnomAD API + :rtype: DataFrame + """ + + url = 'https://gnomad.broadinstitute.org/api' + query = f""" + query{{ + gene(gene_symbol: "EYS", reference_genome: GRCh38) {{ + variants(dataset: gnomad_r4) + {{ + variant_id + chrom + pos + ref + hgvsc + hgvsp + alt + exome {{ + ac + an + ac_hom + populations + {{ + id + ac + an + }} + }} + genome + {{ + ac + an + ac_hom + populations + {{ + id + ac + an + }} + }} + }} + }} + }} + """ + + response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes + + if response.status_code != 200: + if not os.path.isfile(path): + f = open('logs.txt', 'x') + f.write(response.text) + else: + f = open('logs.txt', 'a') + f.write(response.text) + + data = response.json()['data']['gene']['variants'] + + df = pd.json_normalize(data) + + df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) + df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) + + df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change + df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change + + df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] + df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0) + exome_populations = df.loc[:, 'exome.populations'] + genome_populations = df.loc[:, 'genome.populations'] + population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] + + for i in range(len(exome_populations)): + exome_pop = exome_populations[i] + prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i) + genome_pop = genome_populations[i] + prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i) + + for population_id in population_ids: + df.loc[:, f'Allele_Frequency_{population_id}'] = ( + (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / + (df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0))) + population_mapping = { + 'afr': 'African/African American', + 'eas': 'East Asian', + 'asj': 'Ashkenazi Jew', + 'sas': 'South Asian', + 'nfe': 'European (non-Finnish)', + 'fin': 'European (Finnish)', + 'mid': 'Middle Eastern', + 'amr': 'Admixed American', + 'ami': "Amish", + 'remaining': 'Remaining', + '': '' + } + + for i in range(df.shape[0]): + max_pop = 0 + max_id = '' + for population_id in population_ids: + if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop: + max_pop = df.loc[i, f'Allele_Frequency_{population_id}'] + max_id = population_id + df.loc[i, 'Popmax'] = max_pop + df.loc[i, 'Popmax population'] = population_mapping[max_id] + not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', + 'variant_id', 'cDNA change', 'Protein change', 'gnomAD ID'] + + df.rename(columns={'variant_id': 'gnomAD ID'}) + + df = df.filter(not_to_drop, axis="columns") + + if not os.path.isfile(path) or override: + df.to_csv(path, index=False) + + return df \ No newline at end of file diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 51f9a4c..fe268f1 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -247,145 +247,4 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def prepare_popmax_calculation(df, pop_data, name, pop_ids, index): - """ - prepares the calculation of popmax and popmax population for a variant. - genome and exome data of ac and an. - - :param DataFrame df: DataFrame containing gnomAD data - :param dict pop_data: dictionary containing population data - :param str name: name of the population - :param list[str] pop_ids: list of population ids - :param int index: index of the variant - """ - - for pop_id in pop_ids: - df.loc[index, f'{name}_ac_{pop_id}'] = 0 - df.loc[index, f'{name}_an_{pop_id}'] = 0 - if isinstance(pop_data, list): - for pop in pop_data: - variant_id = pop['id'] - df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] - df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] - - -def request_gnomad_api_data(gene_name): - """ - Requests gnomAD API for data about a specific gene containing: - - variant_id - - cDNA change - - protein change - - allele frequency - - homozygote count - - popmax - - popmax population - - :param str gene_name: name of gene - :param bool to_file: if True, saves data to variants.csv - :returns: DataFrame from gnomAD API - :rtype: DataFrame - """ - - url = 'https://gnomad.broadinstitute.org/api' - query = f""" - query{{ - gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{ - variants(dataset: gnomad_r4) - {{ - variant_id - chrom - pos - ref - hgvsc - hgvsp - alt - exome {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - genome - {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - }} - }} - }} - """ - response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes - - if response.status_code != 200: - print('Error:', response.status_code) - - data = response.json()['data']['gene']['variants'] - - df = pd.json_normalize(data) - - df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) - df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) - - df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change - df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change - - df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] - df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0) - exome_populations = df.loc[:, 'exome.populations'] - genome_populations = df.loc[:, 'genome.populations'] - population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] - - for i in range(len(exome_populations)): - exome_pop = exome_populations[i] - prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i) - genome_pop = genome_populations[i] - prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i) - - for population_id in population_ids: - df.loc[:, f'Allele_Frequency_{population_id}'] = ( - (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / ( - df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0))) - population_mapping = { - 'afr': 'African/African American', - 'eas': 'East Asian', - 'asj': 'Ashkenazi Jew', - 'sas': 'South Asian', - 'nfe': 'European (non-Finnish)', - 'fin': 'European (Finnish)', - 'mid': 'Middle Eastern', - 'amr': 'Admixed American', - 'ami': "Amish", - 'remaining': 'Remaining', - '': '' - } - - for i in range(df.shape[0]): - max_pop = 0 - max_id = '' - for population_id in population_ids: - if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop: - max_pop = df.loc[i, f'Allele_Frequency_{population_id}'] - max_id = population_id - df.loc[i, 'Popmax'] = max_pop - df.loc[i, 'Popmax population'] = population_mapping[max_id] - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', - 'variant_id', 'cDNA change', 'Protein change'] - - df = df.filter(not_to_drop, axis="columns") - - df.rename(columns={'variant_id': 'gnomAD ID'}) - - return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 71cf21d..4d97d59 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,6 +7,10 @@ "collapsed": true, "jupyter": { "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.547065Z", + "start_time": "2024-09-14T17:48:44.657414Z" } }, "source": [ @@ -19,7 +23,7 @@ " LOVD_PATH,\n", " set_lovd_dtypes,\n", " set_gnomad_dtypes,\n", - " request_gnomad_api_data,\n", + " download_data_from_gnomad_eys,\n", " merge_gnomad_lovd,\n", " GNOMAD_PATH,\n", " )\n", @@ -29,19 +33,31 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": null + "execution_count": 1 }, { "cell_type": "code", "id": "f49f7691a27aa7b4", "metadata": { - "collapsed": false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.552787Z", + "start_time": "2024-09-14T17:48:45.549075Z" + } }, "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + } + ], + "execution_count": 2 }, { "cell_type": "code", @@ -50,28 +66,186 @@ "collapsed": false, "jupyter": { "outputs_hidden": false + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:53.637253Z", + "start_time": "2024-09-14T17:48:45.553796Z" } }, "source": [ + "download_path = 'C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv'\n", "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" ], "outputs": [], - "execution_count": null + "execution_count": 3 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.853647Z", + "start_time": "2024-09-14T17:48:53.638260Z" + } + }, "cell_type": "code", "source": [ - "gnomad_data = request_gnomad_api_data(\"EYS\")\n", + "gnomad_data = download_data_from_gnomad_eys(path=download_path, override=False)\n", "\n", "display(gnomad_data)" ], "id": "64482c033c794fb4", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
00.000016African/African American0.01.807419e-066-63720525-A-G
10.000192East Asian0.06.573844e-066-63720525-A-T
20.0000000.00.000000e+006-63720525-A-C
30.000020South Asian0.01.045299e-066-63720526-T-A
40.0000000.00.000000e+006-63720527-G-T
..................
142950.0000000.00.000000e+006-65495479-G-T
142960.000031African/African American0.01.446349e-066-65495479-G-A
142970.000070Admixed American0.02.629510e-066-65495482-A-G
142980.000060South Asian0.03.645085e-066-65495484-T-G
142990.000012South Asian0.07.310070e-076-65495485-T-C
\n", + "

14300 rows × 5 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 4 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.927680Z", + "start_time": "2024-09-14T17:51:24.854656Z" + } + }, "cell_type": "code", "source": [ "store_database_for_eys_gene('gnomad', False)\n", @@ -80,29 +254,988 @@ ], "id": "60f3f3074a9b19f4", "outputs": [], - "execution_count": null + "execution_count": 5 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.982839Z", + "start_time": "2024-09-14T17:51:24.928689Z" + } + }, "cell_type": "code", "source": "display(gnomad_data_2)", "id": "9d3e4d6b5f7be127", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " gnomAD ID ... Hemizygote Count Remaining\n", + "0 6-63720525-A-G ... 0\n", + "1 6-63720525-A-T ... 0\n", + "2 6-63720526-T-A ... 0\n", + "3 6-63720531-C-CAA ... 0\n", + "4 6-63720531-C-G ... 0\n", + "... ... ... ...\n", + "11083 6-65495478-G-A ... 0\n", + "11084 6-65495479-G-A ... 0\n", + "11085 6-65495482-A-G ... 0\n", + "11086 6-65495484-T-G ... 0\n", + "11087 6-65495485-T-C ... 0\n", + "\n", + "[11088 rows x 72 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gnomAD IDChromosomePositionrsIDsReferenceAlternateSourceFilters - exomesFilters - genomesTranscriptHGVS ConsequenceProtein ConsequenceTranscript ConsequenceVEP AnnotationClinVar Clinical SignificanceClinVar Variation IDFlagsAllele CountAllele NumberAllele FrequencyHomozygote CountHemizygote CountFilters - jointGroupMax FAF groupGroupMax FAF frequencycaddrevel_maxspliceai_ds_maxpangolin_largest_dsphylopsift_maxpolyphen_maxAllele Count African/African AmericanAllele Number African/African AmericanHomozygote Count African/African AmericanHemizygote Count African/African AmericanAllele Count Admixed AmericanAllele Number Admixed AmericanHomozygote Count Admixed AmericanHemizygote Count Admixed AmericanAllele Count Ashkenazi JewishAllele Number Ashkenazi JewishHomozygote Count Ashkenazi JewishHemizygote Count Ashkenazi JewishAllele Count East AsianAllele Number East AsianHomozygote Count East AsianHemizygote Count East AsianAllele Count European (Finnish)Allele Number European (Finnish)Homozygote Count European (Finnish)Hemizygote Count European (Finnish)Allele Count Middle EasternAllele Number Middle EasternHomozygote Count Middle EasternHemizygote Count Middle EasternAllele Count European (non-Finnish)Allele Number European (non-Finnish)Homozygote Count European (non-Finnish)Hemizygote Count European (non-Finnish)Allele Count AmishAllele Number AmishHomozygote Count AmishHemizygote Count AmishAllele Count South AsianAllele Number South AsianHomozygote Count South AsianHemizygote Count South AsianAllele Count RemainingAllele Number RemainingHomozygote Count RemainingHemizygote Count Remaining
06-63720525-A-G663720525rs1768331164AGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.*71T>CNaNc.*71T>C3_prime_UTR_variantNaNNaNNaN211065501.807419e-0600PASSNaNNaN0.238NaN0.000.00-2.180NaNNaN162852000317320002004800037994000543340003468001795766000912000553620004408200
16-63720525-A-T663720525rs1768331164ATgnomAD GenomesNaNPASSENST00000503581.6c.*71T>ANaNc.*71T>A3_prime_UTR_variantNaNNaNNaN111065509.037097e-0700PASSNaNNaN0.193NaN0.000.00-2.180NaNNaN062852000317320002004800137994000543340003468000795766000912000553620004408200
26-63720526-T-A663720526NaNTAgnomAD ExomesPASSNaNENST00000503581.6c.*70A>TNaNc.*70A>T3_prime_UTR_variantNaNNaNNaN111088949.017995e-0700PASSNaNNaN1.430NaN0.000.000.276NaNNaN062994000317640002007600037960000543760003462000797828000912001553600004416200
36-63720531-C-CAA663720531NaNCCAAgnomAD ExomesPASSNaNENST00000503581.6c.*63_*64dupNaNc.*63_*64dup3_prime_UTR_variantNaNNaNNaN312140502.471068e-0600PASSnfe3.700000e-074.830NaN0.000.000.460NaNNaN064966000320560002075400038418000546620003736002893326000912000575200014770000
46-63720531-C-G663720531rs927390284CGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.*65G>CNaNc.*65G>C3_prime_UTR_variantNaNNaNNaN512139344.118840e-0600PASSafr5.120000e-065.760NaN0.000.000.460NaNNaN264844000320380002075400038430000546620003758003893334000912000575240004767800
...........................................................................................................................................................................................................................
110836-65495478-G-A665495478rs530118054GAgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.-68C>TNaNc.-68C>T5_prime_UTR_variantUncertain significance357751.0NaN14215369929.238825e-0500PASSnfe8.983000e-055.460NaN0.00-0.110.424NaNNaN473554000576320002897600443932002491720015926001191128372000912004883520086016400
110846-65495479-G-A665495479rs1766225632GAgnomAD ExomesPASSNaNENST00000503581.6c.-69C>TNaNc.-69C>T5_prime_UTR_variantNaNNaNNaN215351021.302845e-0600PASSNaNNaN5.920NaN0.00-0.070.437NaNNaN1735680005753000028954001439280004922800059060001126618000912000883160006014200
110856-65495482-A-G665495482rs1766225707AGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.-72T>CNaNc.-72T>C5_prime_UTR_variantNaNNaNNaN415211962.629510e-0600PASSamr2.337000e-056.350NaN0.000.010.715NaNNaN0731660045698400028844000437180004894600058860001115326000912000878280005958600
110866-65495484-T-G665495484rs1766225807TGgnomAD ExomesPASSNaNENST00000503581.6c.-74A>CNaNc.-74A>C5_prime_UTR_variantNaNNaNNaN515240443.280745e-0600PASSsas2.164000e-055.000NaN0.01-0.010.725NaNNaN0733240005689400028836000436480004888400058700001118110000912005878180005974800
110876-65495485-T-C665495485NaNTCgnomAD ExomesPASSNaNENST00000503581.6c.-75A>GNaNc.-75A>G5_prime_UTR_variantNaNNaNNaN115203226.577554e-0700PASSNaNNaN10.300NaN0.00-0.091.170NaNNaN0732920005669400028814000435540004881800058480001115146000912001876360005960800
\n", + "

11088 rows × 72 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 6 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:25.045318Z", + "start_time": "2024-09-14T17:51:24.983847Z" + } + }, "cell_type": "code", - "source": [ - "gnomad_data_2.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv', index=False)\n", - "gnomad_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_api.csv', index=False)" - ], + "source": "gnomad_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_api.csv', index=False)", "id": "2e869f5c77dbe3d3", "outputs": [], - "execution_count": null + "execution_count": 7 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:25.050214Z", + "start_time": "2024-09-14T17:51:25.046323Z" + } + }, "cell_type": "code", "source": [ "len(gnomad_data_2), len(gnomad_data)\n", @@ -110,37 +1243,343 @@ "print(len(gnomad_data_2) - len(gnomad_data))" ], "id": "9efafb201061c146", - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-3212\n" + ] + } + ], + "execution_count": 8 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:52:20.264323Z", + "start_time": "2024-09-14T17:52:20.251983Z" + } + }, "cell_type": "code", "source": "gnomad_data", "id": "96283480cccf641", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
00.000016African/African American0.01.807419e-066-63720525-A-G
10.000192East Asian0.06.573844e-066-63720525-A-T
20.0000000.00.000000e+006-63720525-A-C
30.000020South Asian0.01.045299e-066-63720526-T-A
40.0000000.00.000000e+006-63720527-G-T
..................
142950.0000000.00.000000e+006-65495479-G-T
142960.000031African/African American0.01.446349e-066-65495479-G-A
142970.000070Admixed American0.02.629510e-066-65495482-A-G
142980.000060South Asian0.03.645085e-066-65495484-T-G
142990.000012South Asian0.07.310070e-076-65495485-T-C
\n", + "

14300 rows × 5 columns

\n", + "
" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 11 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:52:47.517346Z", + "start_time": "2024-09-14T17:52:45.257745Z" + } + }, "cell_type": "code", "source": [ "missing_from_api = []\n", "\n", - "for i in gnomad_data['gnomAD ID']:\n", + "for i in gnomad_data['variant_id']:\n", " if(i in gnomad_data_2['gnomAD ID'].values):\n", " continue\n", " missing_from_api.append(i)\n", "\n", "len(missing_from_api)\n", "\n", - "missing_data = gnomad_data.loc[gnomad_data['gnomAD ID'].isin(missing_from_api)]\n", + "missing_data = gnomad_data.loc[gnomad_data['variant_id'].isin(missing_from_api)]\n", "\n", "missing_data" ], "id": "d0eb0a6db96d31c8", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "2 0.0 ... 0.0 6-63720525-A-C\n", + "4 0.0 ... 0.0 6-63720527-G-T\n", + "5 0.0 ... 0.0 6-63720527-G-A\n", + "8 0.0 ... 0.0 6-63720531-C-A\n", + "14 0.0 ... 0.0 6-63720535-G-T\n", + "... ... ... ... ... ...\n", + "14279 0.0 ... 0.0 6-65495463-G-T\n", + "14287 0.0 ... 0.0 6-65495471-T-C\n", + "14291 0.0 ... 0.0 6-65495477-C-A\n", + "14292 0.0 ... 0.0 6-65495477-C-CACAACTTTACTT\n", + "14295 0.0 ... 0.0 6-65495479-G-T\n", + "\n", + "[3212 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
20.00.00.06-63720525-A-C
40.00.00.06-63720527-G-T
50.00.00.06-63720527-G-A
80.00.00.06-63720531-C-A
140.00.00.06-63720535-G-T
..................
142790.00.00.06-65495463-G-T
142870.00.00.06-65495471-T-C
142910.00.00.06-65495477-C-A
142920.00.00.06-65495477-C-CACAACTTTACTT
142950.00.00.06-65495479-G-T
\n", + "

3212 rows × 5 columns

\n", + "
" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 12 }, { "metadata": {},