Moved function to collection.py;

added path parameter with overriding option; implemented one log function; Changed that its only getting EYS
Strexas · Sep 14, 2024 · 252a468 · 252a468
1 parent 522cf70
commit 252a468
Show file tree

Hide file tree

Showing 4 changed files with 1,624 additions and 174 deletions.
diff --git a/api/data/__init__.py b/api/data/__init__.py
@@ -46,9 +46,11 @@
     download_lovd_database_for_eys_gene,
     download_genes_lovd,
     download_database_for_eys_gene,
+    download_data_from_gnomad_eys,
 
     # Functions for storing databases
     store_database_for_eys_gene
+
 )
 
 # DATA REFACTORING IMPORT
@@ -58,7 +60,6 @@
     parse_lovd,
     from_clinvar_name_to_cdna_position,
     save_lovd_as_vcf,
-    request_gnomad_api_data,
     merge_gnomad_lovd,
     parse_gnomad,
     set_gnomad_dtypes,

diff --git a/api/data/collection.py b/api/data/collection.py
@@ -6,6 +6,7 @@
 import time
 
 import requests
+import pandas as pd
 from requests import RequestException
 
 from selenium import webdriver
@@ -189,3 +190,153 @@ def store_database_for_eys_gene(database_name, override=False):
     else:
         download_database_for_eys_gene(database_name, override)
 
+def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
+    """
+    prepares the calculation of popmax and popmax population for a variant.
+    genome and exome data of ac and an.
+
+    :param DataFrame df: DataFrame containing gnomAD data
+    :param dict pop_data: dictionary containing population data
+    :param str name: name of the population
+    :param list[str] pop_ids: list of population ids
+    :param int index: index of the variant
+    """
+
+    for pop_id in pop_ids:
+        df.loc[index, f'{name}_ac_{pop_id}'] = 0
+        df.loc[index, f'{name}_an_{pop_id}'] = 0
+    if isinstance(pop_data, list):
+        for pop in pop_data:
+            variant_id = pop['id']
+            df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac']
+            df.loc[index, f'{name}_an_{variant_id}'] = pop['an']
+
+
+def download_data_from_gnomad_eys(path, override=False):
+    """
+    Requests gnomAD API for data about a specific gene containing:
+    - variant_id
+    - cDNA change
+    - protein change
+    - allele frequency
+    - homozygote count
+    - popmax
+    - popmax population
+
+    :param str gene_name: name of gene
+    :param bool to_file: if True, saves data to variants.csv
+    :returns: DataFrame from gnomAD API
+    :rtype: DataFrame
+    """
+
+    url = 'https://gnomad.broadinstitute.org/api'
+    query = f"""
+    query{{
+      gene(gene_symbol: "EYS", reference_genome: GRCh38) {{
+        variants(dataset: gnomad_r4)
+        {{
+          variant_id
+          chrom
+          pos
+          ref
+          hgvsc
+          hgvsp
+          alt
+          exome {{
+          ac
+          an
+          ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+          genome
+          {{
+            ac
+            an
+            ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+
+    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes
+
+    if response.status_code != 200:
+        if not os.path.isfile(path):
+            f = open('logs.txt', 'x')
+            f.write(response.text)
+        else:
+            f = open('logs.txt', 'a')
+            f.write(response.text)
+
+    data = response.json()['data']['gene']['variants']
+
+    df = pd.json_normalize(data)
+
+    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
+    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)
+
+    df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0)  # cDNA change
+    df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0)  # Protein change
+
+    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
+    df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
+    exome_populations = df.loc[:, 'exome.populations']
+    genome_populations = df.loc[:, 'genome.populations']
+    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
+
+    for i in range(len(exome_populations)):
+        exome_pop = exome_populations[i]
+        prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
+        genome_pop = genome_populations[i]
+        prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)
+
+    for population_id in population_ids:
+        df.loc[:, f'Allele_Frequency_{population_id}'] = (
+               (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) /
+               (df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
+    population_mapping = {
+            'afr': 'African/African American',
+            'eas': 'East Asian',
+            'asj': 'Ashkenazi Jew',
+            'sas': 'South Asian',
+            'nfe': 'European (non-Finnish)',
+            'fin': 'European (Finnish)',
+            'mid': 'Middle Eastern',
+            'amr': 'Admixed American',
+            'ami': "Amish",
+            'remaining': 'Remaining',
+            '': ''
+        }
+
+    for i in range(df.shape[0]):
+        max_pop = 0
+        max_id = ''
+        for population_id in population_ids:
+            if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
+                max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
+                max_id = population_id
+        df.loc[i, 'Popmax'] = max_pop
+        df.loc[i, 'Popmax population'] = population_mapping[max_id]
+    not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
+                   'variant_id', 'cDNA change', 'Protein change', 'gnomAD ID']
+
+    df.rename(columns={'variant_id': 'gnomAD ID'})
+
+    df = df.filter(not_to_drop, axis="columns")
+
+    if not os.path.isfile(path) or override:
+        df.to_csv(path, index=False)
+
+    return df
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -247,145 +247,4 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
             f.write("\n")
 
 
-def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
-    """
-    prepares the calculation of popmax and popmax population for a variant.
-    genome and exome data of ac and an.
-
-    :param DataFrame df: DataFrame containing gnomAD data
-    :param dict pop_data: dictionary containing population data
-    :param str name: name of the population
-    :param list[str] pop_ids: list of population ids
-    :param int index: index of the variant
-    """
-
-    for pop_id in pop_ids:
-        df.loc[index, f'{name}_ac_{pop_id}'] = 0
-        df.loc[index, f'{name}_an_{pop_id}'] = 0
-    if isinstance(pop_data, list):
-        for pop in pop_data:
-            variant_id = pop['id']
-            df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac']
-            df.loc[index, f'{name}_an_{variant_id}'] = pop['an']
-
-
-def request_gnomad_api_data(gene_name):
-    """
-    Requests gnomAD API for data about a specific gene containing:
-    - variant_id
-    - cDNA change
-    - protein change
-    - allele frequency
-    - homozygote count
-    - popmax
-    - popmax population
-
-    :param str gene_name: name of gene
-    :param bool to_file: if True, saves data to variants.csv
-    :returns: DataFrame from gnomAD API
-    :rtype: DataFrame
-    """
-
-    url = 'https://gnomad.broadinstitute.org/api'
-    query = f"""
-    query{{
-      gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
-        variants(dataset: gnomad_r4)
-        {{
-          variant_id
-          chrom
-          pos
-          ref
-          hgvsc
-          hgvsp
-          alt
-          exome {{
-          ac
-          an
-          ac_hom
-            populations
-            {{
-              id
-              ac
-              an
-            }}
-          }}
-          genome
-          {{
-            ac
-            an
-            ac_hom
-            populations
-            {{
-              id
-              ac
-              an
-            }}
-          }}
-        }}
-      }}
-    }}
-    """
 
-    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes
-
-    if response.status_code != 200:
-        print('Error:', response.status_code)
-
-    data = response.json()['data']['gene']['variants']
-
-    df = pd.json_normalize(data)
-
-    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
-    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)
-
-    df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0)  # cDNA change
-    df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0)  # Protein change
-
-    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
-    df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
-    exome_populations = df.loc[:, 'exome.populations']
-    genome_populations = df.loc[:, 'genome.populations']
-    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
-
-    for i in range(len(exome_populations)):
-        exome_pop = exome_populations[i]
-        prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
-        genome_pop = genome_populations[i]
-        prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)
-
-    for population_id in population_ids:
-        df.loc[:, f'Allele_Frequency_{population_id}'] = (
-               (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / (
-                df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
-    population_mapping = {
-            'afr': 'African/African American',
-            'eas': 'East Asian',
-            'asj': 'Ashkenazi Jew',
-            'sas': 'South Asian',
-            'nfe': 'European (non-Finnish)',
-            'fin': 'European (Finnish)',
-            'mid': 'Middle Eastern',
-            'amr': 'Admixed American',
-            'ami': "Amish",
-            'remaining': 'Remaining',
-            '': ''
-        }
-
-    for i in range(df.shape[0]):
-        max_pop = 0
-        max_id = ''
-        for population_id in population_ids:
-            if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
-                max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
-                max_id = population_id
-        df.loc[i, 'Popmax'] = max_pop
-        df.loc[i, 'Popmax population'] = population_mapping[max_id]
-    not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
-                   'variant_id', 'cDNA change', 'Protein change']
-
-    df = df.filter(not_to_drop, axis="columns")
-
-    df.rename(columns={'variant_id': 'gnomAD ID'})
-
-    return df