From 252a468d4d4ed164721b571955f485ad7c5a9841 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Sat, 14 Sep 2024 21:02:23 +0300 Subject: [PATCH] Moved function to collection.py; added path parameter with overriding option; implemented one log function; Changed that its only getting EYS --- api/data/__init__.py | 3 +- api/data/collection.py | 151 ++++ api/data/refactoring.py | 141 ---- tests/pipeline.ipynb | 1503 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 1624 insertions(+), 174 deletions(-) diff --git a/api/data/__init__.py b/api/data/__init__.py index bd40c79..7bfdfbe 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -46,9 +46,11 @@ download_lovd_database_for_eys_gene, download_genes_lovd, download_database_for_eys_gene, + download_data_from_gnomad_eys, # Functions for storing databases store_database_for_eys_gene + ) # DATA REFACTORING IMPORT @@ -58,7 +60,6 @@ parse_lovd, from_clinvar_name_to_cdna_position, save_lovd_as_vcf, - request_gnomad_api_data, merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes, diff --git a/api/data/collection.py b/api/data/collection.py index 8bb8312..0d43d40 100644 --- a/api/data/collection.py +++ b/api/data/collection.py @@ -6,6 +6,7 @@ import time import requests +import pandas as pd from requests import RequestException from selenium import webdriver @@ -189,3 +190,153 @@ def store_database_for_eys_gene(database_name, override=False): else: download_database_for_eys_gene(database_name, override) +def prepare_popmax_calculation(df, pop_data, name, pop_ids, index): + """ + prepares the calculation of popmax and popmax population for a variant. + genome and exome data of ac and an. + + :param DataFrame df: DataFrame containing gnomAD data + :param dict pop_data: dictionary containing population data + :param str name: name of the population + :param list[str] pop_ids: list of population ids + :param int index: index of the variant + """ + + for pop_id in pop_ids: + df.loc[index, f'{name}_ac_{pop_id}'] = 0 + df.loc[index, f'{name}_an_{pop_id}'] = 0 + if isinstance(pop_data, list): + for pop in pop_data: + variant_id = pop['id'] + df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] + df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] + + +def download_data_from_gnomad_eys(path, override=False): + """ + Requests gnomAD API for data about a specific gene containing: + - variant_id + - cDNA change + - protein change + - allele frequency + - homozygote count + - popmax + - popmax population + + :param str gene_name: name of gene + :param bool to_file: if True, saves data to variants.csv + :returns: DataFrame from gnomAD API + :rtype: DataFrame + """ + + url = 'https://gnomad.broadinstitute.org/api' + query = f""" + query{{ + gene(gene_symbol: "EYS", reference_genome: GRCh38) {{ + variants(dataset: gnomad_r4) + {{ + variant_id + chrom + pos + ref + hgvsc + hgvsp + alt + exome {{ + ac + an + ac_hom + populations + {{ + id + ac + an + }} + }} + genome + {{ + ac + an + ac_hom + populations + {{ + id + ac + an + }} + }} + }} + }} + }} + """ + + response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes + + if response.status_code != 200: + if not os.path.isfile(path): + f = open('logs.txt', 'x') + f.write(response.text) + else: + f = open('logs.txt', 'a') + f.write(response.text) + + data = response.json()['data']['gene']['variants'] + + df = pd.json_normalize(data) + + df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) + df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) + + df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change + df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change + + df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] + df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0) + exome_populations = df.loc[:, 'exome.populations'] + genome_populations = df.loc[:, 'genome.populations'] + population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] + + for i in range(len(exome_populations)): + exome_pop = exome_populations[i] + prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i) + genome_pop = genome_populations[i] + prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i) + + for population_id in population_ids: + df.loc[:, f'Allele_Frequency_{population_id}'] = ( + (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / + (df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0))) + population_mapping = { + 'afr': 'African/African American', + 'eas': 'East Asian', + 'asj': 'Ashkenazi Jew', + 'sas': 'South Asian', + 'nfe': 'European (non-Finnish)', + 'fin': 'European (Finnish)', + 'mid': 'Middle Eastern', + 'amr': 'Admixed American', + 'ami': "Amish", + 'remaining': 'Remaining', + '': '' + } + + for i in range(df.shape[0]): + max_pop = 0 + max_id = '' + for population_id in population_ids: + if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop: + max_pop = df.loc[i, f'Allele_Frequency_{population_id}'] + max_id = population_id + df.loc[i, 'Popmax'] = max_pop + df.loc[i, 'Popmax population'] = population_mapping[max_id] + not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', + 'variant_id', 'cDNA change', 'Protein change', 'gnomAD ID'] + + df.rename(columns={'variant_id': 'gnomAD ID'}) + + df = df.filter(not_to_drop, axis="columns") + + if not os.path.isfile(path) or override: + df.to_csv(path, index=False) + + return df \ No newline at end of file diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 51f9a4c..fe268f1 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -247,145 +247,4 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def prepare_popmax_calculation(df, pop_data, name, pop_ids, index): - """ - prepares the calculation of popmax and popmax population for a variant. - genome and exome data of ac and an. - - :param DataFrame df: DataFrame containing gnomAD data - :param dict pop_data: dictionary containing population data - :param str name: name of the population - :param list[str] pop_ids: list of population ids - :param int index: index of the variant - """ - - for pop_id in pop_ids: - df.loc[index, f'{name}_ac_{pop_id}'] = 0 - df.loc[index, f'{name}_an_{pop_id}'] = 0 - if isinstance(pop_data, list): - for pop in pop_data: - variant_id = pop['id'] - df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] - df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] - - -def request_gnomad_api_data(gene_name): - """ - Requests gnomAD API for data about a specific gene containing: - - variant_id - - cDNA change - - protein change - - allele frequency - - homozygote count - - popmax - - popmax population - - :param str gene_name: name of gene - :param bool to_file: if True, saves data to variants.csv - :returns: DataFrame from gnomAD API - :rtype: DataFrame - """ - - url = 'https://gnomad.broadinstitute.org/api' - query = f""" - query{{ - gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{ - variants(dataset: gnomad_r4) - {{ - variant_id - chrom - pos - ref - hgvsc - hgvsp - alt - exome {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - genome - {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - }} - }} - }} - """ - response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes - - if response.status_code != 200: - print('Error:', response.status_code) - - data = response.json()['data']['gene']['variants'] - - df = pd.json_normalize(data) - - df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) - df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) - - df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change - df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change - - df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] - df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0) - exome_populations = df.loc[:, 'exome.populations'] - genome_populations = df.loc[:, 'genome.populations'] - population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] - - for i in range(len(exome_populations)): - exome_pop = exome_populations[i] - prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i) - genome_pop = genome_populations[i] - prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i) - - for population_id in population_ids: - df.loc[:, f'Allele_Frequency_{population_id}'] = ( - (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / ( - df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0))) - population_mapping = { - 'afr': 'African/African American', - 'eas': 'East Asian', - 'asj': 'Ashkenazi Jew', - 'sas': 'South Asian', - 'nfe': 'European (non-Finnish)', - 'fin': 'European (Finnish)', - 'mid': 'Middle Eastern', - 'amr': 'Admixed American', - 'ami': "Amish", - 'remaining': 'Remaining', - '': '' - } - - for i in range(df.shape[0]): - max_pop = 0 - max_id = '' - for population_id in population_ids: - if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop: - max_pop = df.loc[i, f'Allele_Frequency_{population_id}'] - max_id = population_id - df.loc[i, 'Popmax'] = max_pop - df.loc[i, 'Popmax population'] = population_mapping[max_id] - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', - 'variant_id', 'cDNA change', 'Protein change'] - - df = df.filter(not_to_drop, axis="columns") - - df.rename(columns={'variant_id': 'gnomAD ID'}) - - return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 71cf21d..4d97d59 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,6 +7,10 @@ "collapsed": true, "jupyter": { "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.547065Z", + "start_time": "2024-09-14T17:48:44.657414Z" } }, "source": [ @@ -19,7 +23,7 @@ " LOVD_PATH,\n", " set_lovd_dtypes,\n", " set_gnomad_dtypes,\n", - " request_gnomad_api_data,\n", + " download_data_from_gnomad_eys,\n", " merge_gnomad_lovd,\n", " GNOMAD_PATH,\n", " )\n", @@ -29,19 +33,31 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": null + "execution_count": 1 }, { "cell_type": "code", "id": "f49f7691a27aa7b4", "metadata": { - "collapsed": false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.552787Z", + "start_time": "2024-09-14T17:48:45.549075Z" + } }, "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + } + ], + "execution_count": 2 }, { "cell_type": "code", @@ -50,28 +66,186 @@ "collapsed": false, "jupyter": { "outputs_hidden": false + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:53.637253Z", + "start_time": "2024-09-14T17:48:45.553796Z" } }, "source": [ + "download_path = 'C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv'\n", "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" ], "outputs": [], - "execution_count": null + "execution_count": 3 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.853647Z", + "start_time": "2024-09-14T17:48:53.638260Z" + } + }, "cell_type": "code", "source": [ - "gnomad_data = request_gnomad_api_data(\"EYS\")\n", + "gnomad_data = download_data_from_gnomad_eys(path=download_path, override=False)\n", "\n", "display(gnomad_data)" ], "id": "64482c033c794fb4", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + " | Popmax | \n", + "Popmax population | \n", + "Homozygote Count | \n", + "Allele Frequency | \n", + "variant_id | \n", + "
---|---|---|---|---|---|
0 | \n", + "0.000016 | \n", + "African/African American | \n", + "0.0 | \n", + "1.807419e-06 | \n", + "6-63720525-A-G | \n", + "
1 | \n", + "0.000192 | \n", + "East Asian | \n", + "0.0 | \n", + "6.573844e-06 | \n", + "6-63720525-A-T | \n", + "
2 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720525-A-C | \n", + "
3 | \n", + "0.000020 | \n", + "South Asian | \n", + "0.0 | \n", + "1.045299e-06 | \n", + "6-63720526-T-A | \n", + "
4 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720527-G-T | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
14295 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-65495479-G-T | \n", + "
14296 | \n", + "0.000031 | \n", + "African/African American | \n", + "0.0 | \n", + "1.446349e-06 | \n", + "6-65495479-G-A | \n", + "
14297 | \n", + "0.000070 | \n", + "Admixed American | \n", + "0.0 | \n", + "2.629510e-06 | \n", + "6-65495482-A-G | \n", + "
14298 | \n", + "0.000060 | \n", + "South Asian | \n", + "0.0 | \n", + "3.645085e-06 | \n", + "6-65495484-T-G | \n", + "
14299 | \n", + "0.000012 | \n", + "South Asian | \n", + "0.0 | \n", + "7.310070e-07 | \n", + "6-65495485-T-C | \n", + "
14300 rows × 5 columns
\n", + "\n", + " | gnomAD ID | \n", + "Chromosome | \n", + "Position | \n", + "rsIDs | \n", + "Reference | \n", + "Alternate | \n", + "Source | \n", + "Filters - exomes | \n", + "Filters - genomes | \n", + "Transcript | \n", + "HGVS Consequence | \n", + "Protein Consequence | \n", + "Transcript Consequence | \n", + "VEP Annotation | \n", + "ClinVar Clinical Significance | \n", + "ClinVar Variation ID | \n", + "Flags | \n", + "Allele Count | \n", + "Allele Number | \n", + "Allele Frequency | \n", + "Homozygote Count | \n", + "Hemizygote Count | \n", + "Filters - joint | \n", + "GroupMax FAF group | \n", + "GroupMax FAF frequency | \n", + "cadd | \n", + "revel_max | \n", + "spliceai_ds_max | \n", + "pangolin_largest_ds | \n", + "phylop | \n", + "sift_max | \n", + "polyphen_max | \n", + "Allele Count African/African American | \n", + "Allele Number African/African American | \n", + "Homozygote Count African/African American | \n", + "Hemizygote Count African/African American | \n", + "Allele Count Admixed American | \n", + "Allele Number Admixed American | \n", + "Homozygote Count Admixed American | \n", + "Hemizygote Count Admixed American | \n", + "Allele Count Ashkenazi Jewish | \n", + "Allele Number Ashkenazi Jewish | \n", + "Homozygote Count Ashkenazi Jewish | \n", + "Hemizygote Count Ashkenazi Jewish | \n", + "Allele Count East Asian | \n", + "Allele Number East Asian | \n", + "Homozygote Count East Asian | \n", + "Hemizygote Count East Asian | \n", + "Allele Count European (Finnish) | \n", + "Allele Number European (Finnish) | \n", + "Homozygote Count European (Finnish) | \n", + "Hemizygote Count European (Finnish) | \n", + "Allele Count Middle Eastern | \n", + "Allele Number Middle Eastern | \n", + "Homozygote Count Middle Eastern | \n", + "Hemizygote Count Middle Eastern | \n", + "Allele Count European (non-Finnish) | \n", + "Allele Number European (non-Finnish) | \n", + "Homozygote Count European (non-Finnish) | \n", + "Hemizygote Count European (non-Finnish) | \n", + "Allele Count Amish | \n", + "Allele Number Amish | \n", + "Homozygote Count Amish | \n", + "Hemizygote Count Amish | \n", + "Allele Count South Asian | \n", + "Allele Number South Asian | \n", + "Homozygote Count South Asian | \n", + "Hemizygote Count South Asian | \n", + "Allele Count Remaining | \n", + "Allele Number Remaining | \n", + "Homozygote Count Remaining | \n", + "Hemizygote Count Remaining | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "6-63720525-A-G | \n", + "6 | \n", + "63720525 | \n", + "rs1768331164 | \n", + "A | \n", + "G | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.*71T>C | \n", + "NaN | \n", + "c.*71T>C | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "2 | \n", + "1106550 | \n", + "1.807419e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "0.238 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "-2.180 | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "62852 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "31732 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20048 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "37994 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54334 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3468 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "795766 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "55362 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "44082 | \n", + "0 | \n", + "0 | \n", + "
1 | \n", + "6-63720525-A-T | \n", + "6 | \n", + "63720525 | \n", + "rs1768331164 | \n", + "A | \n", + "T | \n", + "gnomAD Genomes | \n", + "NaN | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.*71T>A | \n", + "NaN | \n", + "c.*71T>A | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "1106550 | \n", + "9.037097e-07 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "0.193 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "-2.180 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "62852 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "31732 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20048 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "37994 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54334 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3468 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "795766 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "55362 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "44082 | \n", + "0 | \n", + "0 | \n", + "
2 | \n", + "6-63720526-T-A | \n", + "6 | \n", + "63720526 | \n", + "NaN | \n", + "T | \n", + "A | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.*70A>T | \n", + "NaN | \n", + "c.*70A>T | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "1108894 | \n", + "9.017995e-07 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "1.430 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "0.276 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "62994 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "31764 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20076 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "37960 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54376 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3462 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "797828 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "55360 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "44162 | \n", + "0 | \n", + "0 | \n", + "
3 | \n", + "6-63720531-C-CAA | \n", + "6 | \n", + "63720531 | \n", + "NaN | \n", + "C | \n", + "CAA | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.*63_*64dup | \n", + "NaN | \n", + "c.*63_*64dup | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "3 | \n", + "1214050 | \n", + "2.471068e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "nfe | \n", + "3.700000e-07 | \n", + "4.830 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "0.460 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "64966 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "32056 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20754 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "38418 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54662 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3736 | \n", + "0 | \n", + "0 | \n", + "2 | \n", + "893326 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57520 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "47700 | \n", + "0 | \n", + "0 | \n", + "
4 | \n", + "6-63720531-C-G | \n", + "6 | \n", + "63720531 | \n", + "rs927390284 | \n", + "C | \n", + "G | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.*65G>C | \n", + "NaN | \n", + "c.*65G>C | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "5 | \n", + "1213934 | \n", + "4.118840e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "afr | \n", + "5.120000e-06 | \n", + "5.760 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "0.460 | \n", + "NaN | \n", + "NaN | \n", + "2 | \n", + "64844 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "32038 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20754 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "38430 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54662 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3758 | \n", + "0 | \n", + "0 | \n", + "3 | \n", + "893334 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57524 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "47678 | \n", + "0 | \n", + "0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
11083 | \n", + "6-65495478-G-A | \n", + "6 | \n", + "65495478 | \n", + "rs530118054 | \n", + "G | \n", + "A | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.-68C>T | \n", + "NaN | \n", + "c.-68C>T | \n", + "5_prime_UTR_variant | \n", + "Uncertain significance | \n", + "357751.0 | \n", + "NaN | \n", + "142 | \n", + "1536992 | \n", + "9.238825e-05 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "nfe | \n", + "8.983000e-05 | \n", + "5.460 | \n", + "NaN | \n", + "0.00 | \n", + "-0.11 | \n", + "0.424 | \n", + "NaN | \n", + "NaN | \n", + "4 | \n", + "73554 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57632 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28976 | \n", + "0 | \n", + "0 | \n", + "4 | \n", + "43932 | \n", + "0 | \n", + "0 | \n", + "2 | \n", + "49172 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "5926 | \n", + "0 | \n", + "0 | \n", + "119 | \n", + "1128372 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "4 | \n", + "88352 | \n", + "0 | \n", + "0 | \n", + "8 | \n", + "60164 | \n", + "0 | \n", + "0 | \n", + "
11084 | \n", + "6-65495479-G-A | \n", + "6 | \n", + "65495479 | \n", + "rs1766225632 | \n", + "G | \n", + "A | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.-69C>T | \n", + "NaN | \n", + "c.-69C>T | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "2 | \n", + "1535102 | \n", + "1.302845e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "5.920 | \n", + "NaN | \n", + "0.00 | \n", + "-0.07 | \n", + "0.437 | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "73568 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57530 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28954 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "43928 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "49228 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5906 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1126618 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "88316 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "60142 | \n", + "0 | \n", + "0 | \n", + "
11085 | \n", + "6-65495482-A-G | \n", + "6 | \n", + "65495482 | \n", + "rs1766225707 | \n", + "A | \n", + "G | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.-72T>C | \n", + "NaN | \n", + "c.-72T>C | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "4 | \n", + "1521196 | \n", + "2.629510e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "amr | \n", + "2.337000e-05 | \n", + "6.350 | \n", + "NaN | \n", + "0.00 | \n", + "0.01 | \n", + "0.715 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "73166 | \n", + "0 | \n", + "0 | \n", + "4 | \n", + "56984 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28844 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "43718 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "48946 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5886 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1115326 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "87828 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "59586 | \n", + "0 | \n", + "0 | \n", + "
11086 | \n", + "6-65495484-T-G | \n", + "6 | \n", + "65495484 | \n", + "rs1766225807 | \n", + "T | \n", + "G | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.-74A>C | \n", + "NaN | \n", + "c.-74A>C | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "5 | \n", + "1524044 | \n", + "3.280745e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "sas | \n", + "2.164000e-05 | \n", + "5.000 | \n", + "NaN | \n", + "0.01 | \n", + "-0.01 | \n", + "0.725 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "73324 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "56894 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28836 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "43648 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "48884 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5870 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1118110 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "5 | \n", + "87818 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "59748 | \n", + "0 | \n", + "0 | \n", + "
11087 | \n", + "6-65495485-T-C | \n", + "6 | \n", + "65495485 | \n", + "NaN | \n", + "T | \n", + "C | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.-75A>G | \n", + "NaN | \n", + "c.-75A>G | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "1520322 | \n", + "6.577554e-07 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "10.300 | \n", + "NaN | \n", + "0.00 | \n", + "-0.09 | \n", + "1.170 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "73292 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "56694 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28814 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "43554 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "48818 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5848 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1115146 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "87636 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "59608 | \n", + "0 | \n", + "0 | \n", + "
11088 rows × 72 columns
\n", + "\n", + " | Popmax | \n", + "Popmax population | \n", + "Homozygote Count | \n", + "Allele Frequency | \n", + "variant_id | \n", + "
---|---|---|---|---|---|
0 | \n", + "0.000016 | \n", + "African/African American | \n", + "0.0 | \n", + "1.807419e-06 | \n", + "6-63720525-A-G | \n", + "
1 | \n", + "0.000192 | \n", + "East Asian | \n", + "0.0 | \n", + "6.573844e-06 | \n", + "6-63720525-A-T | \n", + "
2 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720525-A-C | \n", + "
3 | \n", + "0.000020 | \n", + "South Asian | \n", + "0.0 | \n", + "1.045299e-06 | \n", + "6-63720526-T-A | \n", + "
4 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720527-G-T | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
14295 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-65495479-G-T | \n", + "
14296 | \n", + "0.000031 | \n", + "African/African American | \n", + "0.0 | \n", + "1.446349e-06 | \n", + "6-65495479-G-A | \n", + "
14297 | \n", + "0.000070 | \n", + "Admixed American | \n", + "0.0 | \n", + "2.629510e-06 | \n", + "6-65495482-A-G | \n", + "
14298 | \n", + "0.000060 | \n", + "South Asian | \n", + "0.0 | \n", + "3.645085e-06 | \n", + "6-65495484-T-G | \n", + "
14299 | \n", + "0.000012 | \n", + "South Asian | \n", + "0.0 | \n", + "7.310070e-07 | \n", + "6-65495485-T-C | \n", + "
14300 rows × 5 columns
\n", + "\n", + " | Popmax | \n", + "Popmax population | \n", + "Homozygote Count | \n", + "Allele Frequency | \n", + "variant_id | \n", + "
---|---|---|---|---|---|
2 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720525-A-C | \n", + "
4 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720527-G-T | \n", + "
5 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720527-G-A | \n", + "
8 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720531-C-A | \n", + "
14 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720535-G-T | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
14279 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495463-G-T | \n", + "
14287 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495471-T-C | \n", + "
14291 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495477-C-A | \n", + "
14292 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495477-C-CACAACTTTACTT | \n", + "
14295 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495479-G-T | \n", + "
3212 rows × 5 columns
\n", + "