diff --git a/api/data/__init__.py b/api/data/__init__.py index e69de29..7bfdfbe 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -0,0 +1,66 @@ +""" +Package for data collection purposes provides both collection and refactoring functionality. + +Data from LOVD, ClinVar and GnomAd databases can be downloaded using this package. GnomAd and +ClinVar are limited with EYS gene, but it is possible to download data for any gene in LOVD. + +All necessary functionality can be imported directly from data without +specifying the module. + +data collection pipeline example is established for project's specific usage. +""" + +# CONSTANTS IMPORT +from .constants import ( + # URLs for LOVD database + LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS, + + # URLs for gnomAD database + GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS, + + # URLs for ClinVar database + CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS, + + # Paths for data storage + DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH, + + # Data types for tables + LOVD_TABLES_DATA_TYPES, + + # Paths for database downloads + DATABASES_DOWNLOAD_PATHS, + + GNOMAD_PATH, +) + +# DATA COLLECTION IMPORT +from .collection import ( + # Custom exceptions + BadResponseException, + DownloadError, + + # Custom utility functions + get_file_from_url, + + # Functions for downloading databases + download_lovd_database_for_eys_gene, + download_genes_lovd, + download_database_for_eys_gene, + download_data_from_gnomad_eys, + + # Functions for storing databases + store_database_for_eys_gene + +) + +# DATA REFACTORING IMPORT +from .refactoring import ( + # Functions for refactoring data + set_lovd_dtypes, + parse_lovd, + from_clinvar_name_to_cdna_position, + save_lovd_as_vcf, + merge_gnomad_lovd, + parse_gnomad, + set_gnomad_dtypes, +) diff --git a/api/data/downloading.py b/api/data/downloading.py index cec13b1..0d43d40 100644 --- a/api/data/downloading.py +++ b/api/data/downloading.py @@ -6,6 +6,7 @@ import time import 
# gnomAD population ids mapped to the display names used in the
# 'Popmax population' column. Insertion order is significant: on a tie the
# population listed first wins.
_GNOMAD_POPULATION_NAMES = {
    'afr': 'African/African American',
    'eas': 'East Asian',
    'asj': 'Ashkenazi Jew',
    'sas': 'South Asian',
    'nfe': 'European (non-Finnish)',
    'fin': 'European (Finnish)',
    'mid': 'Middle Eastern',
    'amr': 'Admixed American',
    'ami': 'Amish',
    'remaining': 'Remaining',
}

# Columns kept in the DataFrame returned by download_data_from_gnomad_eys.
_GNOMAD_OUTPUT_COLUMNS = [
    'Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
    'gnomAD ID', 'HGVS Consequence', 'Protein Consequence',
]

_GNOMAD_API_URL = 'https://gnomad.broadinstitute.org/api'

# GraphQL query for all EYS variants in gnomAD v4 (GRCh38).
_GNOMAD_EYS_QUERY = """
query{
  gene(gene_symbol: "EYS", reference_genome: GRCh38) {
    variants(dataset: gnomad_r4)
    {
      variant_id
      chrom
      pos
      ref
      hgvsc
      hgvsp
      alt
      exome {
        ac
        an
        ac_hom
        populations
        {
          id
          ac
          an
        }
      }
      genome
      {
        ac
        an
        ac_hom
        populations
        {
          id
          ac
          an
        }
      }
    }
  }
}
"""


def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
    """
    Writes the per-population allele counts (ac) and allele numbers (an) of one
    variant into ``df``, as input for the popmax calculation.

    Every population in ``pop_ids`` is first set to 0 so that the
    ``{name}_ac_{id}`` and ``{name}_an_{id}`` cells always hold a number; the
    values found in ``pop_data`` then overwrite these defaults. Populations
    present in ``pop_data`` but not in ``pop_ids`` get columns as well.
    ``df`` is modified in place and nothing is returned.

    :param DataFrame df: DataFrame containing gnomAD data (modified in place)
    :param pop_data: list of population dicts with the keys 'id', 'ac' and 'an';
        any non-list value (e.g. NaN when the exome/genome part is missing)
        leaves the zero defaults untouched
    :param str name: column prefix; this module passes 'exome' or 'genome'
    :param list[str] pop_ids: list of population ids
    :param int index: index label of the variant row in ``df``
    """

    for pop_id in pop_ids:
        df.loc[index, f'{name}_ac_{pop_id}'] = 0
        df.loc[index, f'{name}_an_{pop_id}'] = 0

    if isinstance(pop_data, list):
        for pop in pop_data:
            pop_id = pop['id']
            df.loc[index, f'{name}_ac_{pop_id}'] = pop['ac']
            df.loc[index, f'{name}_an_{pop_id}'] = pop['an']


def _gnomad_variants_to_frame(variants):
    """
    Flattens the variant records of a gnomAD API response into the summary table.

    :param list[dict] variants: records from ``data.gene.variants`` of the response
    :returns: DataFrame restricted to the columns in ``_GNOMAD_OUTPUT_COLUMNS``
    :rtype: DataFrame
    """

    if not variants:
        # Nothing to normalise; keep the column layout stable for callers.
        return pd.DataFrame(columns=_GNOMAD_OUTPUT_COLUMNS)

    df = pd.json_normalize(variants)

    # A variant may be present only in exomes or only in genomes, so the
    # missing half is counted as 0.
    df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0)
    df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0)

    df['HGVS Consequence'] = df['hgvsc'].fillna(0)  # cDNA change
    df['Protein Consequence'] = df['hgvsp'].fillna(0)  # Protein change

    df['Allele Frequency'] = df['total_ac'] / df['total_an']
    df['Homozygote Count'] = (df['exome.ac_hom'].fillna(0)
                              + df['genome.ac_hom'].fillna(0))

    population_ids = list(_GNOMAD_POPULATION_NAMES)

    # Snapshot both columns before the loop starts adding columns to df.
    exome_populations = df['exome.populations'].tolist()
    genome_populations = df['genome.populations'].tolist()

    # json_normalize yields a 0..n-1 index, so the position is the row label.
    for index, (exome_pop, genome_pop) in enumerate(
            zip(exome_populations, genome_populations)):
        prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, index)
        prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, index)

    frequencies = pd.DataFrame({
        pop_id: ((df[f'exome_ac_{pop_id}'].fillna(0)
                  + df[f'genome_ac_{pop_id}'].fillna(0))
                 / (df[f'exome_an_{pop_id}'].fillna(0)
                    + df[f'genome_an_{pop_id}'].fillna(0)))
        for pop_id in population_ids
    }).fillna(0)  # 0/0 (population without any allele number) counts as 0

    df['Popmax'] = frequencies.max(axis=1)
    # idxmax returns the first column holding the maximum; variants without
    # any positive frequency get an empty population name.
    df['Popmax population'] = (
        frequencies.idxmax(axis=1)
        .map(_GNOMAD_POPULATION_NAMES)
        .where(df['Popmax'] > 0, '')
    )

    # rename returns a new DataFrame, so the result has to be re-assigned.
    df = df.rename(columns={'variant_id': 'gnomAD ID'})

    return df.filter(items=_GNOMAD_OUTPUT_COLUMNS, axis='columns')


def download_data_from_gnomad_eys(path, override=False):
    """
    Requests the gnomAD API (dataset gnomad_r4, GRCh38) for the variants of the
    EYS gene and returns a DataFrame containing:
        - gnomAD ID (variant_id of the API)
        - HGVS Consequence (cDNA change)
        - Protein Consequence (protein change)
        - Allele Frequency
        - Homozygote Count
        - Popmax
        - Popmax population

    The request is always sent; ``path`` and ``override`` only control whether
    the result is written to disk.

    :param str path: CSV file the data is saved to
    :param bool override: if True, an existing file at ``path`` is overwritten;
        if False, the file is only written when it does not exist yet
    :returns: DataFrame from gnomAD API
    :rtype: DataFrame
    :raises requests.HTTPError: if the API does not answer with status 200; the
        response body is appended to logs.txt before raising
    """

    response = requests.post(_GNOMAD_API_URL, json={'query': _GNOMAD_EYS_QUERY},
                             timeout=300)  # timeout set to 5 minutes

    if response.status_code != 200:
        # Append mode creates logs.txt when it is missing, so no existence
        # check is needed, and the context manager closes the handle.
        with open('logs.txt', 'a', encoding='utf-8') as log_file:
            log_file.write(response.text)
        raise requests.HTTPError(
            f'gnomAD API request failed with status {response.status_code}',
            response=response)

    variants = response.json()['data']['gene']['variants']
    df = _gnomad_variants_to_frame(variants)

    if override or not os.path.isfile(path):
        df.to_csv(path, index=False)

    return df
- - :param DataFrame df: DataFrame containing gnomAD data - :param dict pop_data: dictionary containing population data - :param str name: name of the population - :param list[str] pop_ids: list of population ids - :param int index: index of the variant - """ - - for pop_id in pop_ids: - df.loc[index, f'{name}_ac_{pop_id}'] = 0 - df.loc[index, f'{name}_an_{pop_id}'] = 0 - if isinstance(pop_data, list): - for pop in pop_data: - variant_id = pop['id'] - df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] - df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] - - -def request_gnomad_api_data(gene_name): - """ - Requests gnomAD API for data about a specific gene containing: - - variant_id - - cDNA change - - protein change - - allele frequency - - homozygote count - - popmax - - popmax population - - :param str gene_name: name of gene - :param bool to_file: if True, saves data to variants.csv - :returns: DataFrame from gnomAD API - :rtype: DataFrame - """ - - url = 'https://gnomad.broadinstitute.org/api' - query = f""" - query{{ - gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{ - variants(dataset: gnomad_r4) - {{ - variant_id - chrom - pos - ref - hgvsc - hgvsp - alt - exome {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - genome - {{ - ac - an - ac_hom - populations - {{ - id - ac - an - }} - }} - }} - }} - }} - """ - - response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes - - if response.status_code != 200: - print('Error:', response.status_code) - - data = response.json()['data']['gene']['variants'] - - df = pd.json_normalize(data) - - df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) - df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) - - df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change - df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change - - df.loc[:, 
'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] - df.loc[:, 'Homozygote Count'] = (df.loc[:, 'exome.ac_hom'].fillna(0) - + df.loc[:, 'genome.ac_hom'].fillna(0)) - exome_populations = df.loc[:, 'exome.populations'] - genome_populations = df.loc[:, 'genome.populations'] - population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] - - for i in range(exome_populations.shape[0]): - exome_pop = exome_populations[i] - prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i) - genome_pop = genome_populations[i] - prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i) - - for population_id in population_ids: - df.loc[:, f'Allele_Frequency_{population_id}'] = ( - (df.loc[:, f'exome_ac_{population_id}'].fillna(0) - + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) - / (df.loc[:, f'exome_an_{population_id}'].fillna(0) - + df.loc[:, f'genome_an_{population_id}'].fillna(0))) - - find_popmax_in_gnomad(df) - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', - 'variant_id', 'cDNA change', 'Protein change'] - - df = df.filter(not_to_drop, axis="columns") - - df.rename(columns={'variant_id': 'gnomAD ID'}) - - return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 3275f3f..23df568 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,6 +7,10 @@ "collapsed": true, "jupyter": { "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.547065Z", + "start_time": "2024-09-14T17:48:44.657414Z" } }, "source": [ @@ -19,7 +23,7 @@ " LOVD_PATH,\n", " set_lovd_dtypes,\n", " set_gnomad_dtypes,\n", - " request_gnomad_api_data,\n", + " download_data_from_gnomad_eys,\n", " merge_gnomad_lovd,\n", " GNOMAD_PATH,\n", " )\n", @@ -29,19 +33,31 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": null + "execution_count": 1 }, { "cell_type": "code", "id": "f49f7691a27aa7b4", "metadata": { - "collapsed": 
false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:45.552787Z", + "start_time": "2024-09-14T17:48:45.549075Z" + } }, "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + } + ], + "execution_count": 2 }, { "cell_type": "code", @@ -50,28 +66,186 @@ "collapsed": false, "jupyter": { "outputs_hidden": false + }, + "ExecuteTime": { + "end_time": "2024-09-14T17:48:53.637253Z", + "start_time": "2024-09-14T17:48:45.553796Z" } }, "source": [ + "download_path = 'C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv'\n", "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" ], "outputs": [], - "execution_count": null + "execution_count": 3 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-14T17:51:24.853647Z", + "start_time": "2024-09-14T17:48:53.638260Z" + } + }, "cell_type": "code", "source": [ - "gnomad_data = request_gnomad_api_data(\"EYS\")\n", + "gnomad_data = download_data_from_gnomad_eys(path=download_path, override=False)\n", "\n", "display(gnomad_data)" ], "id": "64482c033c794fb4", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 
3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + " | Popmax | \n", + "Popmax population | \n", + "Homozygote Count | \n", + "Allele Frequency | \n", + "variant_id | \n", + "
---|---|---|---|---|---|
0 | \n", + "0.000016 | \n", + "African/African American | \n", + "0.0 | \n", + "1.807419e-06 | \n", + "6-63720525-A-G | \n", + "
1 | \n", + "0.000192 | \n", + "East Asian | \n", + "0.0 | \n", + "6.573844e-06 | \n", + "6-63720525-A-T | \n", + "
2 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720525-A-C | \n", + "
3 | \n", + "0.000020 | \n", + "South Asian | \n", + "0.0 | \n", + "1.045299e-06 | \n", + "6-63720526-T-A | \n", + "
4 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720527-G-T | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
14295 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-65495479-G-T | \n", + "
14296 | \n", + "0.000031 | \n", + "African/African American | \n", + "0.0 | \n", + "1.446349e-06 | \n", + "6-65495479-G-A | \n", + "
14297 | \n", + "0.000070 | \n", + "Admixed American | \n", + "0.0 | \n", + "2.629510e-06 | \n", + "6-65495482-A-G | \n", + "
14298 | \n", + "0.000060 | \n", + "South Asian | \n", + "0.0 | \n", + "3.645085e-06 | \n", + "6-65495484-T-G | \n", + "
14299 | \n", + "0.000012 | \n", + "South Asian | \n", + "0.0 | \n", + "7.310070e-07 | \n", + "6-65495485-T-C | \n", + "
14300 rows × 5 columns
\n", + "\n", + " | gnomAD ID | \n", + "Chromosome | \n", + "Position | \n", + "rsIDs | \n", + "Reference | \n", + "Alternate | \n", + "Source | \n", + "Filters - exomes | \n", + "Filters - genomes | \n", + "Transcript | \n", + "HGVS Consequence | \n", + "Protein Consequence | \n", + "Transcript Consequence | \n", + "VEP Annotation | \n", + "ClinVar Clinical Significance | \n", + "ClinVar Variation ID | \n", + "Flags | \n", + "Allele Count | \n", + "Allele Number | \n", + "Allele Frequency | \n", + "Homozygote Count | \n", + "Hemizygote Count | \n", + "Filters - joint | \n", + "GroupMax FAF group | \n", + "GroupMax FAF frequency | \n", + "cadd | \n", + "revel_max | \n", + "spliceai_ds_max | \n", + "pangolin_largest_ds | \n", + "phylop | \n", + "sift_max | \n", + "polyphen_max | \n", + "Allele Count African/African American | \n", + "Allele Number African/African American | \n", + "Homozygote Count African/African American | \n", + "Hemizygote Count African/African American | \n", + "Allele Count Admixed American | \n", + "Allele Number Admixed American | \n", + "Homozygote Count Admixed American | \n", + "Hemizygote Count Admixed American | \n", + "Allele Count Ashkenazi Jewish | \n", + "Allele Number Ashkenazi Jewish | \n", + "Homozygote Count Ashkenazi Jewish | \n", + "Hemizygote Count Ashkenazi Jewish | \n", + "Allele Count East Asian | \n", + "Allele Number East Asian | \n", + "Homozygote Count East Asian | \n", + "Hemizygote Count East Asian | \n", + "Allele Count European (Finnish) | \n", + "Allele Number European (Finnish) | \n", + "Homozygote Count European (Finnish) | \n", + "Hemizygote Count European (Finnish) | \n", + "Allele Count Middle Eastern | \n", + "Allele Number Middle Eastern | \n", + "Homozygote Count Middle Eastern | \n", + "Hemizygote Count Middle Eastern | \n", + "Allele Count European (non-Finnish) | \n", + "Allele Number European (non-Finnish) | \n", + "Homozygote Count European (non-Finnish) | \n", + "Hemizygote Count European 
(non-Finnish) | \n", + "Allele Count Amish | \n", + "Allele Number Amish | \n", + "Homozygote Count Amish | \n", + "Hemizygote Count Amish | \n", + "Allele Count South Asian | \n", + "Allele Number South Asian | \n", + "Homozygote Count South Asian | \n", + "Hemizygote Count South Asian | \n", + "Allele Count Remaining | \n", + "Allele Number Remaining | \n", + "Homozygote Count Remaining | \n", + "Hemizygote Count Remaining | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "6-63720525-A-G | \n", + "6 | \n", + "63720525 | \n", + "rs1768331164 | \n", + "A | \n", + "G | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.*71T>C | \n", + "NaN | \n", + "c.*71T>C | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "2 | \n", + "1106550 | \n", + "1.807419e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "0.238 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "-2.180 | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "62852 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "31732 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20048 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "37994 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54334 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3468 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "795766 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "55362 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "44082 | \n", + "0 | \n", + "0 | \n", + "
1 | \n", + "6-63720525-A-T | \n", + "6 | \n", + "63720525 | \n", + "rs1768331164 | \n", + "A | \n", + "T | \n", + "gnomAD Genomes | \n", + "NaN | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.*71T>A | \n", + "NaN | \n", + "c.*71T>A | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "1106550 | \n", + "9.037097e-07 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "0.193 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "-2.180 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "62852 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "31732 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20048 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "37994 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54334 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3468 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "795766 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "55362 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "44082 | \n", + "0 | \n", + "0 | \n", + "
2 | \n", + "6-63720526-T-A | \n", + "6 | \n", + "63720526 | \n", + "NaN | \n", + "T | \n", + "A | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.*70A>T | \n", + "NaN | \n", + "c.*70A>T | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "1108894 | \n", + "9.017995e-07 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "1.430 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "0.276 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "62994 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "31764 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20076 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "37960 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54376 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3462 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "797828 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "55360 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "44162 | \n", + "0 | \n", + "0 | \n", + "
3 | \n", + "6-63720531-C-CAA | \n", + "6 | \n", + "63720531 | \n", + "NaN | \n", + "C | \n", + "CAA | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.*63_*64dup | \n", + "NaN | \n", + "c.*63_*64dup | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "3 | \n", + "1214050 | \n", + "2.471068e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "nfe | \n", + "3.700000e-07 | \n", + "4.830 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "0.460 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "64966 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "32056 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20754 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "38418 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54662 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3736 | \n", + "0 | \n", + "0 | \n", + "2 | \n", + "893326 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57520 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "47700 | \n", + "0 | \n", + "0 | \n", + "
4 | \n", + "6-63720531-C-G | \n", + "6 | \n", + "63720531 | \n", + "rs927390284 | \n", + "C | \n", + "G | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.*65G>C | \n", + "NaN | \n", + "c.*65G>C | \n", + "3_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "5 | \n", + "1213934 | \n", + "4.118840e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "afr | \n", + "5.120000e-06 | \n", + "5.760 | \n", + "NaN | \n", + "0.00 | \n", + "0.00 | \n", + "0.460 | \n", + "NaN | \n", + "NaN | \n", + "2 | \n", + "64844 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "32038 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "20754 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "38430 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "54662 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "3758 | \n", + "0 | \n", + "0 | \n", + "3 | \n", + "893334 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57524 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "47678 | \n", + "0 | \n", + "0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
11083 | \n", + "6-65495478-G-A | \n", + "6 | \n", + "65495478 | \n", + "rs530118054 | \n", + "G | \n", + "A | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.-68C>T | \n", + "NaN | \n", + "c.-68C>T | \n", + "5_prime_UTR_variant | \n", + "Uncertain significance | \n", + "357751.0 | \n", + "NaN | \n", + "142 | \n", + "1536992 | \n", + "9.238825e-05 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "nfe | \n", + "8.983000e-05 | \n", + "5.460 | \n", + "NaN | \n", + "0.00 | \n", + "-0.11 | \n", + "0.424 | \n", + "NaN | \n", + "NaN | \n", + "4 | \n", + "73554 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57632 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28976 | \n", + "0 | \n", + "0 | \n", + "4 | \n", + "43932 | \n", + "0 | \n", + "0 | \n", + "2 | \n", + "49172 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "5926 | \n", + "0 | \n", + "0 | \n", + "119 | \n", + "1128372 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "4 | \n", + "88352 | \n", + "0 | \n", + "0 | \n", + "8 | \n", + "60164 | \n", + "0 | \n", + "0 | \n", + "
11084 | \n", + "6-65495479-G-A | \n", + "6 | \n", + "65495479 | \n", + "rs1766225632 | \n", + "G | \n", + "A | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.-69C>T | \n", + "NaN | \n", + "c.-69C>T | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "2 | \n", + "1535102 | \n", + "1.302845e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "5.920 | \n", + "NaN | \n", + "0.00 | \n", + "-0.07 | \n", + "0.437 | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "73568 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "57530 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28954 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "43928 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "49228 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5906 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1126618 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "88316 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "60142 | \n", + "0 | \n", + "0 | \n", + "
11085 | \n", + "6-65495482-A-G | \n", + "6 | \n", + "65495482 | \n", + "rs1766225707 | \n", + "A | \n", + "G | \n", + "gnomAD Exomes,gnomAD Genomes | \n", + "PASS | \n", + "PASS | \n", + "ENST00000503581.6 | \n", + "c.-72T>C | \n", + "NaN | \n", + "c.-72T>C | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "4 | \n", + "1521196 | \n", + "2.629510e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "amr | \n", + "2.337000e-05 | \n", + "6.350 | \n", + "NaN | \n", + "0.00 | \n", + "0.01 | \n", + "0.715 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "73166 | \n", + "0 | \n", + "0 | \n", + "4 | \n", + "56984 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28844 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "43718 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "48946 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5886 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1115326 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "87828 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "59586 | \n", + "0 | \n", + "0 | \n", + "
11086 | \n", + "6-65495484-T-G | \n", + "6 | \n", + "65495484 | \n", + "rs1766225807 | \n", + "T | \n", + "G | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.-74A>C | \n", + "NaN | \n", + "c.-74A>C | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "5 | \n", + "1524044 | \n", + "3.280745e-06 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "sas | \n", + "2.164000e-05 | \n", + "5.000 | \n", + "NaN | \n", + "0.01 | \n", + "-0.01 | \n", + "0.725 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "73324 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "56894 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28836 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "43648 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "48884 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5870 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1118110 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "5 | \n", + "87818 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "59748 | \n", + "0 | \n", + "0 | \n", + "
11087 | \n", + "6-65495485-T-C | \n", + "6 | \n", + "65495485 | \n", + "NaN | \n", + "T | \n", + "C | \n", + "gnomAD Exomes | \n", + "PASS | \n", + "NaN | \n", + "ENST00000503581.6 | \n", + "c.-75A>G | \n", + "NaN | \n", + "c.-75A>G | \n", + "5_prime_UTR_variant | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "1520322 | \n", + "6.577554e-07 | \n", + "0 | \n", + "0 | \n", + "PASS | \n", + "NaN | \n", + "NaN | \n", + "10.300 | \n", + "NaN | \n", + "0.00 | \n", + "-0.09 | \n", + "1.170 | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "73292 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "56694 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "28814 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "43554 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "48818 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5848 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1115146 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "912 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "87636 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "59608 | \n", + "0 | \n", + "0 | \n", + "
11088 rows × 72 columns
\n", + "\n", + " | Popmax | \n", + "Popmax population | \n", + "Homozygote Count | \n", + "Allele Frequency | \n", + "variant_id | \n", + "
---|---|---|---|---|---|
0 | \n", + "0.000016 | \n", + "African/African American | \n", + "0.0 | \n", + "1.807419e-06 | \n", + "6-63720525-A-G | \n", + "
1 | \n", + "0.000192 | \n", + "East Asian | \n", + "0.0 | \n", + "6.573844e-06 | \n", + "6-63720525-A-T | \n", + "
2 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720525-A-C | \n", + "
3 | \n", + "0.000020 | \n", + "South Asian | \n", + "0.0 | \n", + "1.045299e-06 | \n", + "6-63720526-T-A | \n", + "
4 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-63720527-G-T | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
14295 | \n", + "0.000000 | \n", + "\n", + " | 0.0 | \n", + "0.000000e+00 | \n", + "6-65495479-G-T | \n", + "
14296 | \n", + "0.000031 | \n", + "African/African American | \n", + "0.0 | \n", + "1.446349e-06 | \n", + "6-65495479-G-A | \n", + "
14297 | \n", + "0.000070 | \n", + "Admixed American | \n", + "0.0 | \n", + "2.629510e-06 | \n", + "6-65495482-A-G | \n", + "
14298 | \n", + "0.000060 | \n", + "South Asian | \n", + "0.0 | \n", + "3.645085e-06 | \n", + "6-65495484-T-G | \n", + "
14299 | \n", + "0.000012 | \n", + "South Asian | \n", + "0.0 | \n", + "7.310070e-07 | \n", + "6-65495485-T-C | \n", + "
14300 rows × 5 columns
\n", + "\n", + " | Popmax | \n", + "Popmax population | \n", + "Homozygote Count | \n", + "Allele Frequency | \n", + "variant_id | \n", + "
---|---|---|---|---|---|
2 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720525-A-C | \n", + "
4 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720527-G-T | \n", + "
5 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720527-G-A | \n", + "
8 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720531-C-A | \n", + "
14 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-63720535-G-T | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
14279 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495463-G-T | \n", + "
14287 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495471-T-C | \n", + "
14291 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495477-C-A | \n", + "
14292 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495477-C-CACAACTTTACTT | \n", + "
14295 | \n", + "0.0 | \n", + "\n", + " | 0.0 | \n", + "0.0 | \n", + "6-65495479-G-T | \n", + "
3212 rows × 5 columns
\n", + "