Strexas · KajusC · Sep 10, 2024 · Jul 31, 2024 · Aug 6, 2024 · Aug 7, 2024
diff --git a/api/__init__.py b/api/__init__.py
@@ -54,5 +54,6 @@
     # Functions for refactoring data
     set_lovd_dtypes,
     parse_lovd,
-    from_clinvar_name_to_cdna_position
+    from_clinvar_name_to_cdna_position,
+    save_lovd_as_vcf,
 )
diff --git a/api/data/__init__.py b/api/data/__init__.py
@@ -0,0 +1,60 @@
+"""
+Package for data collection that provides both collection and refactoring functionality.
+
+Data from LOVD, ClinVar and gnomAD databases can be downloaded using this package. gnomAD and
+ClinVar downloads are limited to the EYS gene, but LOVD data can be downloaded for any gene.
+
+All necessary functionality can be imported directly from data without
+specifying the module.
+
+An example data collection pipeline is established for the project's specific usage.
+"""
+
+# CONSTANTS IMPORT
+from .constants import (
+  # URLs for LOVD database
+  LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,
+
+  # URLs for gnomAD database
+  GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,
+
+  # URLs for ClinVar database
+  CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,
+
+  # Paths for data storage
+  DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,
+
+  # Data types for tables
+  LOVD_TABLES_DATA_TYPES,
+
+  # Paths for database downloads
+  DATABASES_DOWNLOAD_PATHS
+)
+
+# DATA COLLECTION IMPORT
+from .collection import (
+    # Custom exceptions
+    BadResponseException,
+    DownloadError,
+
+    # Custom utility functions
+    get_file_from_url,
+
+    # Functions for downloading databases
+    download_lovd_database_for_eys_gene,
+    download_genes_lovd,
+    download_database_for_eys_gene,
+
+    # Functions for storing databases
+    store_database_for_eys_gene
+)
+
+# DATA REFACTORING IMPORT
+from .refactoring import (
+    # Functions for refactoring data
+    set_lovd_dtypes,
+    parse_lovd,
+    from_clinvar_name_to_cdna_position,
+    save_lovd_as_vcf,
+    request_gnomad_api_data,
+)
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -3,12 +3,15 @@
 import os
 import logging
 
+import requests
+
 import pandas as pd
 from pandas import DataFrame
 
 from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH
 
 
+
 def set_lovd_dtypes(df_dict):
     """
     Convert data from LOVD format table to desired data format based on specified data types.
@@ -242,3 +245,132 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
 
             f.write("\t".join(record))
             f.write("\n")
+
+
+def _sum_exome_and_genome(df, field):
+    """
+    Sums exome and genome values of a field for every variant.
+
+    :param DataFrame df: variants normalized with pd.json_normalize
+    :param str field: name of the field, e.g. 'ac'
+    :returns: sum per variant, missing values are counted as 0
+    :rtype: Series
+    """
+
+    total = pd.Series(0, index=df.index)
+    for source in ('exome', 'genome'):
+        # the flattened column does not exist when no variant has data from this source
+        if f'{source}.{field}' in df:
+            total = total + df[f'{source}.{field}'].fillna(0)
+    return total
+
+
+def _find_popmax(variant, pop_ids):
+    """
+    Finds the population with the highest allele frequency of a variant.
+
+    Allele counts (ac) and allele numbers (an) of exome and genome data are summed
+    per population before the frequency is calculated.
+
+    :param dict variant: variant record from gnomAD API response
+    :param list pop_ids: ids of populations to compare, the first one wins a tie
+    :returns: highest frequency and its population id, (0.0, '') if none is above 0
+    :rtype: tuple
+    """
+
+    ac = dict.fromkeys(pop_ids, 0)
+    an = dict.fromkeys(pop_ids, 0)
+    for source in ('exome', 'genome'):
+        # a variant may have no exome or no genome data at all
+        populations = (variant.get(source) or {}).get('populations')
+        if not isinstance(populations, list):
+            continue
+        for pop in populations:
+            # populations that are not listed in pop_ids do not take part in popmax
+            if pop['id'] in ac:
+                ac[pop['id']] += pop['ac'] or 0
+                an[pop['id']] += pop['an'] or 0
+
+    popmax, popmax_id = 0.0, ''
+    for pop_id in pop_ids:
+        # frequency is undefined when allele number of the population is 0
+        if an[pop_id] and ac[pop_id] / an[pop_id] > popmax:
+            popmax, popmax_id = ac[pop_id] / an[pop_id], pop_id
+    return popmax, popmax_id
+
+
+def request_gnomad_api_data(gene_name, to_file=True):
+    """
+    Requests gnomAD API for data about a specific gene containing:
+    - variant_id
+    - cDNA change
+    - protein change
+    - allele frequency
+    - homozygote count
+    - popmax
+    - popmax population
+
+    If the request fails or returns no variants, the problem is logged and an empty
+    DataFrame with the same columns is returned without writing a file.
+
+    :param str gene_name: name of gene
+    :param bool to_file: if True, saves data to variants.csv
+    :returns: DataFrame from gnomAD API
+    :rtype: DataFrame
+    """
+
+    url = 'https://gnomad.broadinstitute.org/api'
+    query = f"""
+    query{{
+      gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
+        variants(dataset: gnomad_r4)
+        {{
+          variant_id
+          chrom
+          pos
+          ref
+          hgvsc
+          hgvsp
+          alt
+          exome {{
+          ac
+          an
+          ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+          genome
+          {{
+            ac
+            an
+            ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+
+    columns = ['variant_id', 'cDNA change', 'Protein change', 'Allele Frequency',
+               'Homozygote Count', 'Popmax', 'Popmax population']
+    pop_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
+    population_mapping = {
+        'afr': 'African/African American',
+        'eas': 'East Asian',
+        'asj': 'Ashkenazi Jew',
+        'sas': 'South Asian',
+        'nfe': 'European (non-Finnish)',
+        'fin': 'European (Finnish)',
+        'mid': 'Middle Eastern',
+        'amr': 'Admixed American',
+        'ami': "Amish",
+        'remaining': 'Remaining',
+        '': ''
+    }
+
+    response = requests.post(url, json={'query': query})
+    if response.status_code != 200:
+        logging.error("gnomAD API request for gene %s failed with status code %s",
+                      gene_name, response.status_code)
+        return pd.DataFrame(columns=columns)
+
+    # NOTE(review): a 200 response is assumed to possibly carry null data or gene
+    # (e.g. unknown gene symbol), so every level is checked before it is used
+    gene = (response.json().get('data') or {}).get('gene') or {}
+    variants = gene.get('variants') or []
+    if not variants:
+        logging.error("gnomAD API returned no variants for gene %s", gene_name)
+        return pd.DataFrame(columns=columns)
+
+    normalized = pd.json_normalize(variants)
+    popmax = [_find_popmax(variant, pop_ids) for variant in variants]
+
+    df = pd.DataFrame({
+        'variant_id': normalized['variant_id'],
+        # missing HGVS notations are stored as 0
+        'cDNA change': normalized['hgvsc'].fillna(0),
+        'Protein change': normalized['hgvsp'].fillna(0),
+        'Allele Frequency': _sum_exome_and_genome(normalized, 'ac') / _sum_exome_and_genome(normalized, 'an'),
+        'Homozygote Count': _sum_exome_and_genome(normalized, 'ac_hom'),
+        'Popmax': [frequency for frequency, _ in popmax],
+        'Popmax population': [population_mapping[pop_id] for _, pop_id in popmax],
+    })
+
+    if to_file:
+        df.to_csv('variants.csv', index=True)
+
+    return df