Skip to content

Commit

Permalink
Merge branch 'main' into VLE/KBE-25/mut_pos_standartisation
Browse files Browse the repository at this point in the history
  • Loading branch information
Akaud authored Sep 11, 2024
2 parents 68b8472 + 43db108 commit 555e2e7
Show file tree
Hide file tree
Showing 83 changed files with 7,872 additions and 806 deletions.
3 changes: 2 additions & 1 deletion api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,6 @@
# Functions for refactoring data
set_lovd_dtypes,
parse_lovd,
from_clinvar_name_to_cdna_position
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
)
65 changes: 65 additions & 0 deletions api/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""
Package for data collection purposes provides both collection and refactoring functionality.
Data from LOVD, ClinVar and GnomAd databases can be downloaded using this package. GnomAd and
ClinVar are limited with EYS gene, but it is possible to download data for any gene in LOVD.
All necessary functionality can be imported directly from data without
specifying the module.
data collection pipeline example is established for project's specific usage.
"""

# CONSTANTS IMPORT
from .constants import (
# URLs for LOVD database
LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,

# URLs for gnomAD database
GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,

# URLs for ClinVar database
CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,

# Paths for data storage
DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,

# Data types for tables
LOVD_TABLES_DATA_TYPES,

# Paths for database downloads
DATABASES_DOWNLOAD_PATHS,

GNOMAD_PATH,
)

# DATA COLLECTION IMPORT
from .collection import (
# Custom exceptions
BadResponseException,
DownloadError,

# Custom utility functions
get_file_from_url,

# Functions for downloading databases
download_lovd_database_for_eys_gene,
download_genes_lovd,
download_database_for_eys_gene,

# Functions for storing databases
store_database_for_eys_gene
)

# DATA REFACTORING IMPORT
from .refactoring import (
# Functions for refactoring data
set_lovd_dtypes,
parse_lovd,
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
request_gnomad_api_data,
merge_gnomad_lovd,
parse_gnomad,
set_gnomad_dtypes,
)
147 changes: 147 additions & 0 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import logging
import re

import requests

import pandas as pd
from pandas import DataFrame

Expand All @@ -12,6 +14,7 @@
from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH



def set_lovd_dtypes(df_dict):
"""
Convert data from LOVD format table to desired data format based on specified data types.
Expand Down Expand Up @@ -311,3 +314,147 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):

f.write("\t".join(record))
f.write("\n")


def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
"""
prepares the calculation of popmax and popmax population for a variant.
genome and exome data of ac and an.
:param DataFrame df: DataFrame containing gnomAD data
:param dict pop_data: dictionary containing population data
:param str name: name of the population
:param list[str] pop_ids: list of population ids
:param int index: index of the variant
"""

for pop_id in pop_ids:
df.loc[index, f'{name}_ac_{pop_id}'] = 0
df.loc[index, f'{name}_an_{pop_id}'] = 0
if isinstance(pop_data, list):
for pop in pop_data:
variant_id = pop['id']
df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac']
df.loc[index, f'{name}_an_{variant_id}'] = pop['an']


def request_gnomad_api_data(gene_name):
"""
Requests gnomAD API for data about a specific gene containing:
- variant_id
- cDNA change
- protein change
- allele frequency
- homozygote count
- popmax
- popmax population
:param str gene_name: name of gene
:param bool to_file: if True, saves data to variants.csv
:returns: DataFrame from gnomAD API
:rtype: DataFrame
"""

url = 'https://gnomad.broadinstitute.org/api'
query = f"""
query{{
gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
variants(dataset: gnomad_r4)
{{
variant_id
chrom
pos
ref
hgvsc
hgvsp
alt
exome {{
ac
an
ac_hom
populations
{{
id
ac
an
}}
}}
genome
{{
ac
an
ac_hom
populations
{{
id
ac
an
}}
}}
}}
}}
}}
"""

response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes

if response.status_code != 200:
print('Error:', response.status_code)

data = response.json()['data']['gene']['variants']

df = pd.json_normalize(data)

df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)

df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change
df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change

df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
exome_populations = df.loc[:, 'exome.populations']
genome_populations = df.loc[:, 'genome.populations']
population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']

for i in range(len(exome_populations)):
exome_pop = exome_populations[i]
prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
genome_pop = genome_populations[i]
prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)

for population_id in population_ids:
df.loc[:, f'Allele_Frequency_{population_id}'] = (
(df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / (
df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
population_mapping = {
'afr': 'African/African American',
'eas': 'East Asian',
'asj': 'Ashkenazi Jew',
'sas': 'South Asian',
'nfe': 'European (non-Finnish)',
'fin': 'European (Finnish)',
'mid': 'Middle Eastern',
'amr': 'Admixed American',
'ami': "Amish",
'remaining': 'Remaining',
'': ''
}

for i in range(df.shape[0]):
max_pop = 0
max_id = ''
for population_id in population_ids:
if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
max_id = population_id
df.loc[i, 'Popmax'] = max_pop
df.loc[i, 'Popmax population'] = population_mapping[max_id]
not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
'variant_id', 'cDNA change', 'Protein change']

df = df.filter(not_to_drop, axis="columns")

df.rename(columns={'variant_id': 'gnomAD ID'})

return df
9 changes: 9 additions & 0 deletions app/back-end/src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@
# Routes
BASE_ROUTE = "/api/v1"
WORKSPACE_ROUTE = "/workspace"
WORKSPACE_FILE_ROUTE = "/workspace/file"
WORKSPACE_CREATE_ROUTE = "/workspace/create"
WORKSPACE_RENAME_ROUTE = "/workspace/rename"
WORKSPACE_DELETE_ROUTE = "/workspace/delete"
WORKSPACE_AGGREGATE_ROUTE = "/workspace/aggregate"
WORKSPACE_IMPORT_ROUTE = "/workspace/import"
WORKSPACE_EXPORT_ROUTE = "/workspace/export"

# Events
CONSOLE_FEEDBACK_EVENT = "console_feedback"
WORKSPACE_FILE_SAVE_FEEDBACK_EVENT = "workspace_file_save_feedback"
WORKSPACE_UPDATE_FEEDBACK_EVENT = "workspace_update_feedback"
WOKRSPACE_EXPORT_FEEDBACK_EVENT = "workspace_export_feedback"
95 changes: 0 additions & 95 deletions app/back-end/src/events/workspace_event.py

This file was deleted.

Loading

0 comments on commit 555e2e7

Please sign in to comment.