Merge branch 'main' into VLE/KBE-25/mut_pos_standartisation

Strexas · Sep 11, 2024 · 555e2e7 · 555e2e7
2 parents 68b8472 + 43db108
commit 555e2e7
Show file tree

Hide file tree

Showing 83 changed files with 7,872 additions and 806 deletions.
diff --git a/api/__init__.py b/api/__init__.py
@@ -54,5 +54,6 @@
     # Functions for refactoring data
     set_lovd_dtypes,
     parse_lovd,
-    from_clinvar_name_to_cdna_position
+    from_clinvar_name_to_cdna_position,
+    save_lovd_as_vcf,
 )
diff --git a/api/data/__init__.py b/api/data/__init__.py
@@ -0,0 +1,65 @@
+"""
+Package for data collection purposes; it provides both collection and refactoring functionality.
+
+Data from the LOVD, ClinVar and gnomAD databases can be downloaded using this package. gnomAD and
+ClinVar downloads are limited to the EYS gene, but LOVD data can be downloaded for any gene.
+
+All necessary functionality can be imported directly from data without
+specifying the module.
+
+The data collection pipeline example is tailored to this project's specific usage.
+"""
+
+# CONSTANTS IMPORT
+from .constants import (
+  # URLs for LOVD database
+  LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,
+
+  # URLs for gnomAD database
+  GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,
+
+  # URLs for ClinVar database
+  CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,
+
+  # Paths for data storage
+  DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,
+
+  # Data types for tables
+  LOVD_TABLES_DATA_TYPES,
+
+  # Paths for database downloads
+  DATABASES_DOWNLOAD_PATHS,
+
+  GNOMAD_PATH,
+)
+
+# DATA COLLECTION IMPORT
+from .collection import (
+    # Custom exceptions
+    BadResponseException,
+    DownloadError,
+
+    # Custom utility functions
+    get_file_from_url,
+
+    # Functions for downloading databases
+    download_lovd_database_for_eys_gene,
+    download_genes_lovd,
+    download_database_for_eys_gene,
+
+    # Functions for storing databases
+    store_database_for_eys_gene
+)
+
+# DATA REFACTORING IMPORT
+from .refactoring import (
+    # Functions for refactoring data
+    set_lovd_dtypes,
+    parse_lovd,
+    from_clinvar_name_to_cdna_position,
+    save_lovd_as_vcf,
+    request_gnomad_api_data,
+    merge_gnomad_lovd,
+    parse_gnomad,
+    set_gnomad_dtypes,
+)
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -4,6 +4,8 @@
 import logging
 import re
 
+import requests
+
 import pandas as pd
 from pandas import DataFrame
 
@@ -12,6 +14,7 @@
 from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH
 
 
+
 def set_lovd_dtypes(df_dict):
     """
     Convert data from LOVD format table to desired data format based on specified data types.
@@ -311,3 +314,147 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
 
             f.write("\t".join(record))
             f.write("\n")
+
+
+def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
+    """
+    Fill the per-population allele count (ac) and allele number (an) cells of one row.
+
+    For every id in ``pop_ids`` the cells ``{name}_ac_{id}`` and ``{name}_an_{id}`` of row
+    ``index`` are first set to 0. If ``pop_data`` is a list, the ``ac`` and ``an`` values of
+    each of its entries are then written into the cells named after that entry's ``id``.
+    ``df`` is modified in place and nothing is returned.
+
+    :param DataFrame df: DataFrame that receives the columns
+    :param pop_data: list of dicts with ``id``, ``ac`` and ``an`` keys; any value that is
+        not a list leaves the row at its zero defaults
+    :param str name: prefix of the column names
+    :param list[str] pop_ids: population ids whose cells are reset to 0
+    :param index: row label of the variant in ``df``
+    """
+
+    count_fields = ('ac', 'an')
+
+    # Reset the expected populations first so that absent data reads as 0.
+    for pop_id in pop_ids:
+        for field in count_fields:
+            df.loc[index, f'{name}_{field}_{pop_id}'] = 0
+
+    # Anything that is not a list carries no per-population counts.
+    if not isinstance(pop_data, list):
+        return
+
+    for entry in pop_data:
+        entry_id = entry['id']
+        for field in count_fields:
+            df.loc[index, f'{name}_{field}_{entry_id}'] = entry[field]
+
+
+def request_gnomad_api_data(gene_name):
+    """
+    Requests gnomAD API (dataset gnomad_r4, reference genome GRCh38) for the variants
+    of a specific gene and condenses the answer into one row per variant.
+
+    The returned DataFrame keeps only these columns:
+    - Popmax (highest per-population allele frequency, 0 if none is above 0)
+    - Popmax population (readable name of that population, '' if none)
+    - Homozygote Count (exome + genome)
+    - Allele Frequency (exome + genome allele count / allele number)
+    - gnomAD ID (the API's variant_id)
+    - HGVS Consequence (cDNA change, API field hgvsc)
+    - Protein Consequence (protein change, API field hgvsp)
+
+    :param str gene_name: name of gene
+    :returns: DataFrame from gnomAD API
+    :rtype: DataFrame
+    :raises requests.HTTPError: if the API answers with a 4xx or 5xx status code
+    """
+
+    url = 'https://gnomad.broadinstitute.org/api'
+    query = f"""
+    query{{
+      gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
+        variants(dataset: gnomad_r4)
+        {{
+          variant_id
+          chrom
+          pos
+          ref
+          hgvsc
+          hgvsp
+          alt
+          exome {{
+          ac
+          an
+          ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+          genome
+          {{
+            ac
+            an
+            ac_hom
+            populations
+            {{
+              id
+              ac
+              an
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+
+    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes
+
+    if response.status_code != 200:
+        logging.error('gnomAD API request failed with status code %s', response.status_code)
+        # Stop on client/server errors instead of failing later while reading the body.
+        response.raise_for_status()
+
+    data = response.json()['data']['gene']['variants']
+
+    df = pd.json_normalize(data)
+
+    # A variant may be missing from exomes or genomes; treat the missing counts as 0.
+    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
+    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)
+
+    df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0)  # cDNA change
+    df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0)  # Protein change
+
+    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
+    df.loc[:, 'Homozygote Count'] = (df.loc[:, 'exome.ac_hom'].fillna(0)
+                                     + df.loc[:, 'genome.ac_hom'].fillna(0))
+
+    exome_populations = df.loc[:, 'exome.populations']
+    genome_populations = df.loc[:, 'genome.populations']
+    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']
+
+    # Spread the nested population lists into flat exome_*/genome_* ac and an columns.
+    for i in range(len(exome_populations)):
+        prepare_popmax_calculation(df, exome_populations[i], 'exome', population_ids, i)
+        prepare_popmax_calculation(df, genome_populations[i], 'genome', population_ids, i)
+
+    for population_id in population_ids:
+        allele_count = (df.loc[:, f'exome_ac_{population_id}'].fillna(0)
+                        + df.loc[:, f'genome_ac_{population_id}'].fillna(0))
+        allele_number = (df.loc[:, f'exome_an_{population_id}'].fillna(0)
+                         + df.loc[:, f'genome_an_{population_id}'].fillna(0))
+        df.loc[:, f'Allele_Frequency_{population_id}'] = allele_count / allele_number
+
+    population_mapping = {
+        'afr': 'African/African American',
+        'eas': 'East Asian',
+        'asj': 'Ashkenazi Jew',
+        'sas': 'South Asian',
+        'nfe': 'European (non-Finnish)',
+        'fin': 'European (Finnish)',
+        'mid': 'Middle Eastern',
+        'amr': 'Admixed American',
+        'ami': "Amish",
+        'remaining': 'Remaining',
+        '': '',
+    }
+
+    for i in range(df.shape[0]):
+        max_pop = 0
+        max_id = ''
+        for population_id in population_ids:
+            frequency = df.loc[i, f'Allele_Frequency_{population_id}']
+            # NaN (0 / 0) never compares greater, so populations without data are skipped.
+            if frequency > max_pop:
+                max_pop = frequency
+                max_id = population_id
+        df.loc[i, 'Popmax'] = max_pop
+        df.loc[i, 'Popmax population'] = population_mapping[max_id]
+
+    # List the column names that were actually created above; filter() silently ignores
+    # labels that do not exist, which would drop the cDNA and protein changes.
+    not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
+                   'variant_id', 'HGVS Consequence', 'Protein Consequence']
+
+    df = df.filter(not_to_drop, axis="columns")
+
+    # rename() returns a new DataFrame, so the result has to be assigned back.
+    df = df.rename(columns={'variant_id': 'gnomAD ID'})
+
+    return df
diff --git a/app/back-end/src/constants.py b/app/back-end/src/constants.py
@@ -28,7 +28,16 @@
 # Routes
 BASE_ROUTE = "/api/v1"
 WORKSPACE_ROUTE = "/workspace"
+WORKSPACE_FILE_ROUTE = "/workspace/file"
+WORKSPACE_CREATE_ROUTE = "/workspace/create"
+WORKSPACE_RENAME_ROUTE = "/workspace/rename"
+WORKSPACE_DELETE_ROUTE = "/workspace/delete"
+WORKSPACE_AGGREGATE_ROUTE = "/workspace/aggregate"
+WORKSPACE_IMPORT_ROUTE = "/workspace/import"
+WORKSPACE_EXPORT_ROUTE = "/workspace/export"
 
 # Events
 CONSOLE_FEEDBACK_EVENT = "console_feedback"
 WORKSPACE_FILE_SAVE_FEEDBACK_EVENT = "workspace_file_save_feedback"
+WORKSPACE_UPDATE_FEEDBACK_EVENT = "workspace_update_feedback"
+WOKRSPACE_EXPORT_FEEDBACK_EVENT = "workspace_export_feedback"
diff --git a/app/back-end/src/events/workspace_event.py b/app/back-end/src/events/workspace_event.py