Merge branch 'refs/heads/KCE/GnomAD_api_requests'

# Conflicts: # api/data/__init__.py # api/data/downloading.py # api/data/refactoring.py
Strexas · Sep 17, 2024 · 7452996 · 7452996
2 parents 3b909a9 + 252a468
commit 7452996
Show file tree

Hide file tree

Showing 4 changed files with 1,689 additions and 157 deletions.
diff --git a/api/data/__init__.py b/api/data/__init__.py
@@ -0,0 +1,66 @@
+"""
+Package for data collection purposes provides both collection and refactoring functionality.
+
+Data from LOVD, ClinVar and GnomAd databases can be downloaded using this package. GnomAd and
+ClinVar are limited with EYS gene, but it is possible to download data for any gene in LOVD.
+
+All necessary functionality can be imported directly from data without
+specifying the module.
+
+data collection pipeline example is established for project's specific usage.
+"""
+
+# CONSTANTS IMPORT
+from .constants import (
+  # URLs for LOVD database
+  LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,
+
+  # URLs for gnomAD database
+  GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,
+
+  # URLs for ClinVar database
+  CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,
+
+  # Paths for data storage
+  DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,
+
+  # Data types for tables
+  LOVD_TABLES_DATA_TYPES,
+
+  # Paths for database downloads
+  DATABASES_DOWNLOAD_PATHS,
+
+  GNOMAD_PATH,
+)
+
+# DATA COLLECTION IMPORT
+from .collection import (
+    # Custom exceptions
+    BadResponseException,
+    DownloadError,
+
+    # Custom utility functions
+    get_file_from_url,
+
+    # Functions for downloading databases
+    download_lovd_database_for_eys_gene,
+    download_genes_lovd,
+    download_database_for_eys_gene,
+    download_data_from_gnomad_eys,
+
+    # Functions for storing databases
+    store_database_for_eys_gene
+
+)
+
+# DATA REFACTORING IMPORT
+from .refactoring import (
+    # Functions for refactoring data
+    set_lovd_dtypes,
+    parse_lovd,
+    from_clinvar_name_to_cdna_position,
+    save_lovd_as_vcf,
+    merge_gnomad_lovd,
+    parse_gnomad,
+    set_gnomad_dtypes,
+)
diff --git a/api/data/downloading.py b/api/data/downloading.py
@@ -6,6 +6,7 @@
 import time
 
 import requests
+import pandas as pd
 from requests import RequestException
 
 from selenium import webdriver
@@ -188,3 +189,154 @@ def store_database_for_eys_gene(database_name, override=False):
         download_lovd_database_for_eys_gene(override)
     else:
         download_database_for_eys_gene(database_name, override)
+
+def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
+    """
+    prepares the calculation of popmax and popmax population for a variant.
+    genome and exome data of ac and an.
+
+    :param DataFrame df: DataFrame containing gnomAD data
+    :param dict pop_data: dictionary containing population data
+    :param str name: name of the population
+    :param list[str] pop_ids: list of population ids
+    :param int index: index of the variant
+    """
+
+    for pop_id in pop_ids:
+        df.loc[index, f'{name}_ac_{pop_id}'] = 0
+        df.loc[index, f'{name}_an_{pop_id}'] = 0
+    if isinstance(pop_data, list):
+        for pop in pop_data:
+            variant_id = pop['id']
+            df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac']
+            df.loc[index, f'{name}_an_{variant_id}'] = pop['an']
+
+
+def download_data_from_gnomad_eys(path, override=False):
+    """
+    Requests gnomAD API for data about a specific gene containing:
+    - variant_id
+    - cDNA change
+    - protein change
+    - allele frequency
+    - homozygote count
+    - popmax
+    - popmax population
+
+    :param str gene_name: name of gene
+    :param bool to_file: if True, saves data to variants.csv
+    :returns: DataFrame from gnomAD API
+    :rtype: DataFrame
+    """
+
+    url = 'https://gnomad.broadinstitute.org/api'
+    query = f"""
+    query{{
+      gene(gene_symbol: "EYS", reference_genome: GRCh38) {{
+        variants(dataset: gnomad_r4)
+        {{
+          variant_id
+          chrom
+          pos
+          ref
+          hgvsc
+          hgvsp
+          alt
+          exome {{
+          ac
+          an
+          ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+          genome
+          {{
+            ac
+            an
+            ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+
+    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes
+
+    if response.status_code != 200:
+        if not os.path.isfile(path):
+            f = open('logs.txt', 'x')
+            f.write(response.text)
+        else:
+            f = open('logs.txt', 'a')
+            f.write(response.text)
+
+    data = response.json()['data']['gene']['variants']
+
+    df = pd.json_normalize(data)
+
+    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
+    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)
+
+    df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0)  # cDNA change
+    df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0)  # Protein change
+
+    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
+    df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
+    exome_populations = df.loc[:, 'exome.populations']
+    genome_populations = df.loc[:, 'genome.populations']
+    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
+
+    for i in range(len(exome_populations)):
+        exome_pop = exome_populations[i]
+        prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
+        genome_pop = genome_populations[i]
+        prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)
+
+    for population_id in population_ids:
+        df.loc[:, f'Allele_Frequency_{population_id}'] = (
+               (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) /
+               (df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
+    population_mapping = {
+            'afr': 'African/African American',
+            'eas': 'East Asian',
+            'asj': 'Ashkenazi Jew',
+            'sas': 'South Asian',
+            'nfe': 'European (non-Finnish)',
+            'fin': 'European (Finnish)',
+            'mid': 'Middle Eastern',
+            'amr': 'Admixed American',
+            'ami': "Amish",
+            'remaining': 'Remaining',
+            '': ''
+        }
+
+    for i in range(df.shape[0]):
+        max_pop = 0
+        max_id = ''
+        for population_id in population_ids:
+            if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
+                max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
+                max_id = population_id
+        df.loc[i, 'Popmax'] = max_pop
+        df.loc[i, 'Popmax population'] = population_mapping[max_id]
+    not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
+                   'variant_id', 'cDNA change', 'Protein change', 'gnomAD ID']
+
+    df.rename(columns={'variant_id': 'gnomAD ID'})
+
+    df = df.filter(not_to_drop, axis="columns")
+
+    if not os.path.isfile(path) or override:
+        df.to_csv(path, index=False)
+
+    return df
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -348,128 +348,3 @@ def find_popmax_in_gnomad(data):
         data.loc[i, 'Popmax'] = max_pop
         data.loc[i, 'Popmax population'] = population_mapping[max_id]
 
-
-def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
-    """
-    prepares the calculation of popmax and popmax population for a variant.
-    genome and exome data of ac and an.
-
-    :param DataFrame df: DataFrame containing gnomAD data
-    :param dict pop_data: dictionary containing population data
-    :param str name: name of the population
-    :param list[str] pop_ids: list of population ids
-    :param int index: index of the variant
-    """
-
-    for pop_id in pop_ids:
-        df.loc[index, f'{name}_ac_{pop_id}'] = 0
-        df.loc[index, f'{name}_an_{pop_id}'] = 0
-    if isinstance(pop_data, list):
-        for pop in pop_data:
-            variant_id = pop['id']
-            df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac']
-            df.loc[index, f'{name}_an_{variant_id}'] = pop['an']
-
-
-def request_gnomad_api_data(gene_name):
-    """
-    Requests gnomAD API for data about a specific gene containing:
-    - variant_id
-    - cDNA change
-    - protein change
-    - allele frequency
-    - homozygote count
-    - popmax
-    - popmax population
-
-    :param str gene_name: name of gene
-    :param bool to_file: if True, saves data to variants.csv
-    :returns: DataFrame from gnomAD API
-    :rtype: DataFrame
-    """
-
-    url = 'https://gnomad.broadinstitute.org/api'
-    query = f"""
-    query{{
-      gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
-        variants(dataset: gnomad_r4)
-        {{
-          variant_id
-          chrom
-          pos
-          ref
-          hgvsc
-          hgvsp
-          alt
-          exome {{
-          ac
-          an
-          ac_hom
-            populations
-            {{
-              id
-              ac
-              an
-            }}
-          }}
-          genome
-          {{
-            ac
-            an
-            ac_hom
-            populations
-            {{
-              id
-              ac
-              an
-            }}
-          }}
-        }}
-      }}
-    }}
-    """
-
-    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes
-
-    if response.status_code != 200:
-        print('Error:', response.status_code)
-
-    data = response.json()['data']['gene']['variants']
-
-    df = pd.json_normalize(data)
-
-    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
-    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)
-
-    df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0)  # cDNA change
-    df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0)  # Protein change
-
-    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
-    df.loc[:, 'Homozygote Count'] = (df.loc[:, 'exome.ac_hom'].fillna(0)
-                                     + df.loc[:, 'genome.ac_hom'].fillna(0))
-    exome_populations = df.loc[:, 'exome.populations']
-    genome_populations = df.loc[:, 'genome.populations']
-    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
-
-    for i in range(exome_populations.shape[0]):
-        exome_pop = exome_populations[i]
-        prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
-        genome_pop = genome_populations[i]
-        prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)
-
-    for population_id in population_ids:
-        df.loc[:, f'Allele_Frequency_{population_id}'] = (
-               (df.loc[:, f'exome_ac_{population_id}'].fillna(0)
-                + df.loc[:, f'genome_ac_{population_id}'].fillna(0))
-               / (df.loc[:, f'exome_an_{population_id}'].fillna(0)
-                  + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
-
-    find_popmax_in_gnomad(df)
-    not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
-                   'variant_id', 'cDNA change', 'Protein change']
-
-    df = df.filter(not_to_drop, axis="columns")
-
-    df.rename(columns={'variant_id': 'gnomAD ID'})
-
-    return df