diff --git a/api/data/__init__.py b/api/data/__init__.py index e69de29..7bfdfbe 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -0,0 +1,66 @@ +""" +Package for data collection purposes provides both collection and refactoring functionality. + +Data from LOVD, ClinVar and GnomAd databases can be downloaded using this package. GnomAd and +ClinVar are limited with EYS gene, but it is possible to download data for any gene in LOVD. + +All necessary functionality can be imported directly from data without +specifying the module. + +data collection pipeline example is established for project's specific usage. +""" + +# CONSTANTS IMPORT +from .constants import ( + # URLs for LOVD database + LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS, + + # URLs for gnomAD database + GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS, + + # URLs for ClinVar database + CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS, + + # Paths for data storage + DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH, + + # Data types for tables + LOVD_TABLES_DATA_TYPES, + + # Paths for database downloads + DATABASES_DOWNLOAD_PATHS, + + GNOMAD_PATH, +) + +# DATA COLLECTION IMPORT +from .collection import ( + # Custom exceptions + BadResponseException, + DownloadError, + + # Custom utility functions + get_file_from_url, + + # Functions for downloading databases + download_lovd_database_for_eys_gene, + download_genes_lovd, + download_database_for_eys_gene, + download_data_from_gnomad_eys, + + # Functions for storing databases + store_database_for_eys_gene + +) + +# DATA REFACTORING IMPORT +from .refactoring import ( + # Functions for refactoring data + set_lovd_dtypes, + parse_lovd, + from_clinvar_name_to_cdna_position, + save_lovd_as_vcf, + merge_gnomad_lovd, + parse_gnomad, + set_gnomad_dtypes, +) diff --git a/api/data/downloading.py b/api/data/downloading.py index cec13b1..0d43d40 100644 --- a/api/data/downloading.py +++ b/api/data/downloading.py @@ -6,6 +6,7 @@ import time import 
# gnomAD population ids requested for every variant, in the order they are
# scanned when looking for the population with the highest allele frequency.
_GNOMAD_POPULATION_IDS = (
    'afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining',
)

# Human readable names for the gnomAD population ids. The empty key is used
# when no population has an allele frequency above zero.
_GNOMAD_POPULATION_NAMES = {
    'afr': 'African/African American',
    'eas': 'East Asian',
    'asj': 'Ashkenazi Jew',
    'sas': 'South Asian',
    'nfe': 'European (non-Finnish)',
    'fin': 'European (Finnish)',
    'mid': 'Middle Eastern',
    'amr': 'Admixed American',
    'ami': "Amish",
    'remaining': 'Remaining',
    '': '',
}

# GraphQL query for all EYS variants in gnomAD r4 (GRCh38), including the
# per-population allele counts/numbers for both exome and genome data.
_GNOMAD_EYS_QUERY = """
query{
  gene(gene_symbol: "EYS", reference_genome: GRCh38) {
    variants(dataset: gnomad_r4)
    {
      variant_id
      chrom
      pos
      ref
      hgvsc
      hgvsp
      alt
      exome {
        ac
        an
        ac_hom
        populations
        {
          id
          ac
          an
        }
      }
      genome
      {
        ac
        an
        ac_hom
        populations
        {
          id
          ac
          an
        }
      }
    }
  }
}
"""


def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
    """
    Writes per-population allele count (ac) and allele number (an) of one
    variant into ``df`` so that popmax can be calculated afterwards.

    For every id in ``pop_ids`` the columns ``{name}_ac_{id}`` and
    ``{name}_an_{id}`` are set to 0 for the given row. If ``pop_data`` is a
    list, the values reported there overwrite the zeros. Any other value
    (e.g. NaN when the variant has no exome/genome data) leaves the zeros.

    :param DataFrame df: DataFrame containing gnomAD data, modified in place
    :param list[dict] pop_data: population entries with keys 'id', 'ac', 'an'
    :param str name: prefix of the created columns ('exome' or 'genome')
    :param list[str] pop_ids: list of population ids
    :param int index: index label of the variant row in df
    """

    for pop_id in pop_ids:
        df.loc[index, f'{name}_ac_{pop_id}'] = 0
        df.loc[index, f'{name}_an_{pop_id}'] = 0

    if not isinstance(pop_data, list):
        return

    for pop in pop_data:
        reported_id = pop['id']
        df.loc[index, f'{name}_ac_{reported_id}'] = pop['ac']
        df.loc[index, f'{name}_an_{reported_id}'] = pop['an']


def _gnomad_variants_to_frame(variants):
    """
    Converts the list of variants returned by the gnomAD API to a DataFrame
    with allele frequency, homozygote count, popmax and popmax population.

    :param list[dict] variants: variants of a gene as returned by gnomAD API
    :returns: DataFrame with one row per variant
    :rtype: DataFrame
    """

    # Imported locally so this block does not depend on the file header.
    import pandas as pd

    # 'variant_id' is kept for existing callers, 'gnomAD ID' is the same value
    # under the name used by the gnomAD CSV export.
    output_columns = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
                      'variant_id', 'gnomAD ID', 'HGVS Consequence', 'Protein Consequence']

    if not variants:
        # json_normalize of an empty list has none of the expected columns.
        return pd.DataFrame(columns=output_columns)

    df = pd.json_normalize(variants)

    df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0)
    df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0)

    df['HGVS Consequence'] = df['hgvsc'].fillna(0)  # cDNA change
    df['Protein Consequence'] = df['hgvsp'].fillna(0)  # Protein change

    df['Allele Frequency'] = df['total_ac'] / df['total_an']
    df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0)

    population_pairs = zip(df['exome.populations'], df['genome.populations'])
    for i, (exome_pop, genome_pop) in enumerate(population_pairs):
        prepare_popmax_calculation(df, exome_pop, 'exome', _GNOMAD_POPULATION_IDS, i)
        prepare_popmax_calculation(df, genome_pop, 'genome', _GNOMAD_POPULATION_IDS, i)

    frequency_columns = []
    for population_id in _GNOMAD_POPULATION_IDS:
        column = f'Allele_Frequency_{population_id}'
        frequency_columns.append(column)
        df[column] = (
            (df[f'exome_ac_{population_id}'].fillna(0)
             + df[f'genome_ac_{population_id}'].fillna(0))
            / (df[f'exome_an_{population_id}'].fillna(0)
               + df[f'genome_an_{population_id}'].fillna(0)))

    # Scan a plain array instead of doing one df.loc lookup per cell. NaN
    # (0 / 0) never compares greater, so such populations are skipped and the
    # first population with the highest frequency wins.
    popmax_values = []
    popmax_names = []
    for frequencies in df[frequency_columns].to_numpy(dtype=float):
        max_pop = 0.0
        max_id = ''
        for population_id, frequency in zip(_GNOMAD_POPULATION_IDS, frequencies):
            if frequency > max_pop:
                max_pop = frequency
                max_id = population_id
        popmax_values.append(max_pop)
        popmax_names.append(_GNOMAD_POPULATION_NAMES[max_id])

    df['Popmax'] = popmax_values
    df['Popmax population'] = popmax_names

    # DataFrame.rename returns a new frame, so the id is copied explicitly.
    df['gnomAD ID'] = df['variant_id']

    return df.filter(output_columns, axis="columns")


def download_data_from_gnomad_eys(path, override=False):
    """
    Requests gnomAD API for data about EYS gene containing:
    - variant_id / gnomAD ID
    - cDNA change (HGVS Consequence)
    - protein change (Protein Consequence)
    - allele frequency
    - homozygote count
    - popmax
    - popmax population

    If the API does not answer with status 200, the response body is appended
    to logs.txt and an HTTP error status is raised as an exception.

    :param str path: path of the csv file the data is saved to
    :param bool override: if True, an already existing file at path is overwritten
    :returns: DataFrame from gnomAD API
    :rtype: DataFrame
    :raises requests.HTTPError: if gnomAD API responds with an error status
    """

    # Imported locally so this block does not depend on the file header.
    import os
    import requests

    url = 'https://gnomad.broadinstitute.org/api'

    response = requests.post(url, json={'query': _GNOMAD_EYS_QUERY},
                             timeout=300)  # timeout set to 5 minutes

    if response.status_code != 200:
        # Append mode creates the log when it is missing and never fails
        # because it already exists.
        with open('logs.txt', 'a', encoding='utf-8') as log_file:
            log_file.write(response.text)
        response.raise_for_status()

    data = response.json()['data']['gene']['variants']

    df = _gnomad_variants_to_frame(data)

    if override or not os.path.isfile(path):
        df.to_csv(path, index=False)

    return df
- - :param DataFrame df: DataFrame containing gnomAD data - :param dict pop_data: dictionary containing population data - :param str name: name of the population - :param list[str] pop_ids: list of population ids - :param int index: index of the variant - """ - - for pop_id in pop_ids: - df.loc[index, f'{name}_ac_{pop_id}'] = 0 - df.loc[index, f'{name}_an_{pop_id}'] = 0 - if isinstance(pop_data, list): - for pop in pop_data: - variant_id = pop['id'] - df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] - df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] - - -def request_gnomad_api_data(gene_name): - """ - Requests gnomAD API for data about a specific gene containing: - - variant_id - - cDNA change - - protein change - - allele frequency - - homozygote count - - popmax - - popmax population - - :param str gene_name: name of gene - :param bool to_file: if True, saves data to variants.csv - :returns: DataFrame from gnomAD API - :rtype: DataFrame - """ - - url = 'https://gnomad.broadinstitute.org/api' - query = f""" - query{{ - gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{ - variants(dataset: gnomad_r4) - {{ - variant_id - chrom - pos - ref - hgvsc - hgvsp - alt - exome {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - genome - {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - }} - }} - }} - """ - - response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes - - if response.status_code != 200: - print('Error:', response.status_code) - - data = response.json()['data']['gene']['variants'] - - df = pd.json_normalize(data) - - df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) - df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) - - df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change - df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change - - df.loc[:, 
'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] - df.loc[:, 'Homozygote Count'] = (df.loc[:, 'exome.ac_hom'].fillna(0) - + df.loc[:, 'genome.ac_hom'].fillna(0)) - exome_populations = df.loc[:, 'exome.populations'] - genome_populations = df.loc[:, 'genome.populations'] - population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] - - for i in range(exome_populations.shape[0]): - exome_pop = exome_populations[i] - prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i) - genome_pop = genome_populations[i] - prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i) - - for population_id in population_ids: - df.loc[:, f'Allele_Frequency_{population_id}'] = ( - (df.loc[:, f'exome_ac_{population_id}'].fillna(0) - + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) - / (df.loc[:, f'exome_an_{population_id}'].fillna(0) - + df.loc[:, f'genome_an_{population_id}'].fillna(0))) - - find_popmax_in_gnomad(df) - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', - 'variant_id', 'cDNA change', 'Protein change'] - - df = df.filter(not_to_drop, axis="columns") - - df.rename(columns={'variant_id': 'gnomAD ID'}) - - return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 3275f3f..23df568 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,6 +7,10 @@ "collapsed": true, "jupyter": { "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.547065Z", + "start_time": "2024-09-14T17:48:44.657414Z" } }, "source": [ @@ -19,7 +23,7 @@ " LOVD_PATH,\n", " set_lovd_dtypes,\n", " set_gnomad_dtypes,\n", - " request_gnomad_api_data,\n", + " download_data_from_gnomad_eys,\n", " merge_gnomad_lovd,\n", " GNOMAD_PATH,\n", " )\n", @@ -29,19 +33,31 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": null + "execution_count": 1 }, { "cell_type": "code", "id": "f49f7691a27aa7b4", "metadata": { - "collapsed": 
false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.552787Z", + "start_time": "2024-09-14T17:48:45.549075Z" + } }, "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + } + ], + "execution_count": 2 }, { "cell_type": "code", @@ -50,28 +66,186 @@ "collapsed": false, "jupyter": { "outputs_hidden": false + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:53.637253Z", + "start_time": "2024-09-14T17:48:45.553796Z" } }, "source": [ + "download_path = 'C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv'\n", "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" ], "outputs": [], - "execution_count": null + "execution_count": 3 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.853647Z", + "start_time": "2024-09-14T17:48:53.638260Z" + } + }, "cell_type": "code", "source": [ - "gnomad_data = request_gnomad_api_data(\"EYS\")\n", + "gnomad_data = download_data_from_gnomad_eys(path=download_path, override=False)\n", "\n", "display(gnomad_data)" ], "id": "64482c033c794fb4", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 
3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
00.000016African/African American0.01.807419e-066-63720525-A-G
10.000192East Asian0.06.573844e-066-63720525-A-T
20.0000000.00.000000e+006-63720525-A-C
30.000020South Asian0.01.045299e-066-63720526-T-A
40.0000000.00.000000e+006-63720527-G-T
..................
142950.0000000.00.000000e+006-65495479-G-T
142960.000031African/African American0.01.446349e-066-65495479-G-A
142970.000070Admixed American0.02.629510e-066-65495482-A-G
142980.000060South Asian0.03.645085e-066-65495484-T-G
142990.000012South Asian0.07.310070e-076-65495485-T-C
\n", + "

14300 rows × 5 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 4 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.927680Z", + "start_time": "2024-09-14T17:51:24.854656Z" + } + }, "cell_type": "code", "source": [ "store_database_for_eys_gene('gnomad', False)\n", @@ -80,29 +254,988 @@ ], "id": "60f3f3074a9b19f4", "outputs": [], - "execution_count": null + "execution_count": 5 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.982839Z", + "start_time": "2024-09-14T17:51:24.928689Z" + } + }, "cell_type": "code", "source": "display(gnomad_data_2)", "id": "9d3e4d6b5f7be127", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " gnomAD ID ... Hemizygote Count Remaining\n", + "0 6-63720525-A-G ... 0\n", + "1 6-63720525-A-T ... 0\n", + "2 6-63720526-T-A ... 0\n", + "3 6-63720531-C-CAA ... 0\n", + "4 6-63720531-C-G ... 0\n", + "... ... ... ...\n", + "11083 6-65495478-G-A ... 0\n", + "11084 6-65495479-G-A ... 0\n", + "11085 6-65495482-A-G ... 0\n", + "11086 6-65495484-T-G ... 0\n", + "11087 6-65495485-T-C ... 0\n", + "\n", + "[11088 rows x 72 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gnomAD IDChromosomePositionrsIDsReferenceAlternateSourceFilters - exomesFilters - genomesTranscriptHGVS ConsequenceProtein ConsequenceTranscript ConsequenceVEP AnnotationClinVar Clinical SignificanceClinVar Variation IDFlagsAllele CountAllele NumberAllele FrequencyHomozygote CountHemizygote CountFilters - jointGroupMax FAF groupGroupMax FAF frequencycaddrevel_maxspliceai_ds_maxpangolin_largest_dsphylopsift_maxpolyphen_maxAllele Count African/African AmericanAllele Number African/African AmericanHomozygote Count African/African AmericanHemizygote Count African/African AmericanAllele Count Admixed AmericanAllele Number Admixed AmericanHomozygote Count Admixed AmericanHemizygote Count Admixed AmericanAllele Count Ashkenazi JewishAllele Number Ashkenazi JewishHomozygote Count Ashkenazi JewishHemizygote Count Ashkenazi JewishAllele Count East AsianAllele Number East AsianHomozygote Count East AsianHemizygote Count East AsianAllele Count European (Finnish)Allele Number European (Finnish)Homozygote Count European (Finnish)Hemizygote Count European (Finnish)Allele Count Middle EasternAllele Number Middle EasternHomozygote Count Middle EasternHemizygote Count Middle EasternAllele Count European (non-Finnish)Allele Number European (non-Finnish)Homozygote Count European (non-Finnish)Hemizygote Count European (non-Finnish)Allele Count AmishAllele Number AmishHomozygote Count AmishHemizygote Count AmishAllele Count South AsianAllele Number South AsianHomozygote Count South AsianHemizygote Count South AsianAllele Count RemainingAllele Number RemainingHomozygote Count RemainingHemizygote Count Remaining
06-63720525-A-G663720525rs1768331164AGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.*71T>CNaNc.*71T>C3_prime_UTR_variantNaNNaNNaN211065501.807419e-0600PASSNaNNaN0.238NaN0.000.00-2.180NaNNaN162852000317320002004800037994000543340003468001795766000912000553620004408200
16-63720525-A-T663720525rs1768331164ATgnomAD GenomesNaNPASSENST00000503581.6c.*71T>ANaNc.*71T>A3_prime_UTR_variantNaNNaNNaN111065509.037097e-0700PASSNaNNaN0.193NaN0.000.00-2.180NaNNaN062852000317320002004800137994000543340003468000795766000912000553620004408200
26-63720526-T-A663720526NaNTAgnomAD ExomesPASSNaNENST00000503581.6c.*70A>TNaNc.*70A>T3_prime_UTR_variantNaNNaNNaN111088949.017995e-0700PASSNaNNaN1.430NaN0.000.000.276NaNNaN062994000317640002007600037960000543760003462000797828000912001553600004416200
36-63720531-C-CAA663720531NaNCCAAgnomAD ExomesPASSNaNENST00000503581.6c.*63_*64dupNaNc.*63_*64dup3_prime_UTR_variantNaNNaNNaN312140502.471068e-0600PASSnfe3.700000e-074.830NaN0.000.000.460NaNNaN064966000320560002075400038418000546620003736002893326000912000575200014770000
46-63720531-C-G663720531rs927390284CGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.*65G>CNaNc.*65G>C3_prime_UTR_variantNaNNaNNaN512139344.118840e-0600PASSafr5.120000e-065.760NaN0.000.000.460NaNNaN264844000320380002075400038430000546620003758003893334000912000575240004767800
...........................................................................................................................................................................................................................
110836-65495478-G-A665495478rs530118054GAgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.-68C>TNaNc.-68C>T5_prime_UTR_variantUncertain significance357751.0NaN14215369929.238825e-0500PASSnfe8.983000e-055.460NaN0.00-0.110.424NaNNaN473554000576320002897600443932002491720015926001191128372000912004883520086016400
110846-65495479-G-A665495479rs1766225632GAgnomAD ExomesPASSNaNENST00000503581.6c.-69C>TNaNc.-69C>T5_prime_UTR_variantNaNNaNNaN215351021.302845e-0600PASSNaNNaN5.920NaN0.00-0.070.437NaNNaN1735680005753000028954001439280004922800059060001126618000912000883160006014200
110856-65495482-A-G665495482rs1766225707AGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6c.-72T>CNaNc.-72T>C5_prime_UTR_variantNaNNaNNaN415211962.629510e-0600PASSamr2.337000e-056.350NaN0.000.010.715NaNNaN0731660045698400028844000437180004894600058860001115326000912000878280005958600
110866-65495484-T-G665495484rs1766225807TGgnomAD ExomesPASSNaNENST00000503581.6c.-74A>CNaNc.-74A>C5_prime_UTR_variantNaNNaNNaN515240443.280745e-0600PASSsas2.164000e-055.000NaN0.01-0.010.725NaNNaN0733240005689400028836000436480004888400058700001118110000912005878180005974800
110876-65495485-T-C665495485NaNTCgnomAD ExomesPASSNaNENST00000503581.6c.-75A>GNaNc.-75A>G5_prime_UTR_variantNaNNaNNaN115203226.577554e-0700PASSNaNNaN10.300NaN0.00-0.091.170NaNNaN0732920005669400028814000435540004881800058480001115146000912001876360005960800
\n", + "

11088 rows × 72 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 6 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:25.045318Z", + "start_time": "2024-09-14T17:51:24.983847Z" + } + }, "cell_type": "code", - "source": [ - "gnomad_data_2.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv', index=False)\n", - "gnomad_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_api.csv', index=False)" - ], + "source": "gnomad_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_api.csv', index=False)", "id": "2e869f5c77dbe3d3", "outputs": [], - "execution_count": null + "execution_count": 7 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:25.050214Z", + "start_time": "2024-09-14T17:51:25.046323Z" + } + }, "cell_type": "code", "source": [ "len(gnomad_data_2), len(gnomad_data)\n", @@ -110,37 +1243,343 @@ "print(len(gnomad_data_2) - len(gnomad_data))" ], "id": "9efafb201061c146", - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-3212\n" + ] + } + ], + "execution_count": 8 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:52:20.264323Z", + "start_time": "2024-09-14T17:52:20.251983Z" + } + }, "cell_type": "code", "source": "gnomad_data", "id": "96283480cccf641", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 
0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
00.000016African/African American0.01.807419e-066-63720525-A-G
10.000192East Asian0.06.573844e-066-63720525-A-T
20.0000000.00.000000e+006-63720525-A-C
30.000020South Asian0.01.045299e-066-63720526-T-A
40.0000000.00.000000e+006-63720527-G-T
..................
142950.0000000.00.000000e+006-65495479-G-T
142960.000031African/African American0.01.446349e-066-65495479-G-A
142970.000070Admixed American0.02.629510e-066-65495482-A-G
142980.000060South Asian0.03.645085e-066-65495484-T-G
142990.000012South Asian0.07.310070e-076-65495485-T-C
\n", + "

14300 rows × 5 columns

\n", + "
" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 11 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:52:47.517346Z", + "start_time": "2024-09-14T17:52:45.257745Z" + } + }, "cell_type": "code", "source": [ "missing_from_api = []\n", "\n", - "for i in gnomad_data['gnomAD ID']:\n", + "for i in gnomad_data['variant_id']:\n", " if(i in gnomad_data_2['gnomAD ID'].values):\n", " continue\n", " missing_from_api.append(i)\n", "\n", "len(missing_from_api)\n", "\n", - "missing_data = gnomad_data.loc[gnomad_data['gnomAD ID'].isin(missing_from_api)]\n", + "missing_data = gnomad_data.loc[gnomad_data['variant_id'].isin(missing_from_api)]\n", "\n", "missing_data" ], "id": "d0eb0a6db96d31c8", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "2 0.0 ... 0.0 6-63720525-A-C\n", + "4 0.0 ... 0.0 6-63720527-G-T\n", + "5 0.0 ... 0.0 6-63720527-G-A\n", + "8 0.0 ... 0.0 6-63720531-C-A\n", + "14 0.0 ... 0.0 6-63720535-G-T\n", + "... ... ... ... ... ...\n", + "14279 0.0 ... 0.0 6-65495463-G-T\n", + "14287 0.0 ... 0.0 6-65495471-T-C\n", + "14291 0.0 ... 0.0 6-65495477-C-A\n", + "14292 0.0 ... 0.0 6-65495477-C-CACAACTTTACTT\n", + "14295 0.0 ... 0.0 6-65495479-G-T\n", + "\n", + "[3212 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
20.00.00.06-63720525-A-C
40.00.00.06-63720527-G-T
50.00.00.06-63720527-G-A
80.00.00.06-63720531-C-A
140.00.00.06-63720535-G-T
..................
142790.00.00.06-65495463-G-T
142870.00.00.06-65495471-T-C
142910.00.00.06-65495477-C-A
142920.00.00.06-65495477-C-CACAACTTTACTT
142950.00.00.06-65495479-G-T
\n", + "

3212 rows × 5 columns

\n", + "
" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 12 }, { "metadata": {},