Skip to content

Commit

Permalink
Merge branch 'refs/heads/KCE/GnomAD_api_requests'
Browse files Browse the repository at this point in the history
# Conflicts:
#	api/data/__init__.py
#	api/data/downloading.py
#	api/data/refactoring.py
  • Loading branch information
KajusC committed Sep 17, 2024
2 parents 3b909a9 + 252a468 commit 7452996
Show file tree
Hide file tree
Showing 4 changed files with 1,689 additions and 157 deletions.
66 changes: 66 additions & 0 deletions api/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
Package for data collection purposes provides both collection and refactoring functionality.
Data from LOVD, ClinVar and GnomAd databases can be downloaded using this package. GnomAd and
ClinVar are limited with EYS gene, but it is possible to download data for any gene in LOVD.
All necessary functionality can be imported directly from data without
specifying the module.
data collection pipeline example is established for project's specific usage.
"""

# CONSTANTS IMPORT
from .constants import (
# URLs for LOVD database
LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,

# URLs for gnomAD database
GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,

# URLs for ClinVar database
CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,

# Paths for data storage
DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,

# Data types for tables
LOVD_TABLES_DATA_TYPES,

# Paths for database downloads
DATABASES_DOWNLOAD_PATHS,

GNOMAD_PATH,
)

# DATA COLLECTION IMPORT
from .collection import (
# Custom exceptions
BadResponseException,
DownloadError,

# Custom utility functions
get_file_from_url,

# Functions for downloading databases
download_lovd_database_for_eys_gene,
download_genes_lovd,
download_database_for_eys_gene,
download_data_from_gnomad_eys,

# Functions for storing databases
store_database_for_eys_gene

)

# DATA REFACTORING IMPORT
from .refactoring import (
# Functions for refactoring data
set_lovd_dtypes,
parse_lovd,
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
merge_gnomad_lovd,
parse_gnomad,
set_gnomad_dtypes,
)
152 changes: 152 additions & 0 deletions api/data/downloading.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import time

import requests
import pandas as pd
from requests import RequestException

from selenium import webdriver
Expand Down Expand Up @@ -188,3 +189,154 @@ def store_database_for_eys_gene(database_name, override=False):
download_lovd_database_for_eys_gene(override)
else:
download_database_for_eys_gene(database_name, override)

def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
"""
prepares the calculation of popmax and popmax population for a variant.
genome and exome data of ac and an.
:param DataFrame df: DataFrame containing gnomAD data
:param dict pop_data: dictionary containing population data
:param str name: name of the population
:param list[str] pop_ids: list of population ids
:param int index: index of the variant
"""

for pop_id in pop_ids:
df.loc[index, f'{name}_ac_{pop_id}'] = 0
df.loc[index, f'{name}_an_{pop_id}'] = 0
if isinstance(pop_data, list):
for pop in pop_data:
variant_id = pop['id']
df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac']
df.loc[index, f'{name}_an_{variant_id}'] = pop['an']


def download_data_from_gnomad_eys(path, override=False):
"""
Requests gnomAD API for data about a specific gene containing:
- variant_id
- cDNA change
- protein change
- allele frequency
- homozygote count
- popmax
- popmax population
:param str gene_name: name of gene
:param bool to_file: if True, saves data to variants.csv
:returns: DataFrame from gnomAD API
:rtype: DataFrame
"""

url = 'https://gnomad.broadinstitute.org/api'
query = f"""
query{{
gene(gene_symbol: "EYS", reference_genome: GRCh38) {{
variants(dataset: gnomad_r4)
{{
variant_id
chrom
pos
ref
hgvsc
hgvsp
alt
exome {{
ac
an
ac_hom
populations
{{
id
ac
an
}}
}}
genome
{{
ac
an
ac_hom
populations
{{
id
ac
an
}}
}}
}}
}}
}}
"""

response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes

if response.status_code != 200:
if not os.path.isfile(path):
f = open('logs.txt', 'x')
f.write(response.text)
else:
f = open('logs.txt', 'a')
f.write(response.text)

data = response.json()['data']['gene']['variants']

df = pd.json_normalize(data)

df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)

df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change
df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change

df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)
exome_populations = df.loc[:, 'exome.populations']
genome_populations = df.loc[:, 'genome.populations']
population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']

for i in range(len(exome_populations)):
exome_pop = exome_populations[i]
prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
genome_pop = genome_populations[i]
prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)

for population_id in population_ids:
df.loc[:, f'Allele_Frequency_{population_id}'] = (
(df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) /
(df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))
population_mapping = {
'afr': 'African/African American',
'eas': 'East Asian',
'asj': 'Ashkenazi Jew',
'sas': 'South Asian',
'nfe': 'European (non-Finnish)',
'fin': 'European (Finnish)',
'mid': 'Middle Eastern',
'amr': 'Admixed American',
'ami': "Amish",
'remaining': 'Remaining',
'': ''
}

for i in range(df.shape[0]):
max_pop = 0
max_id = ''
for population_id in population_ids:
if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop:
max_pop = df.loc[i, f'Allele_Frequency_{population_id}']
max_id = population_id
df.loc[i, 'Popmax'] = max_pop
df.loc[i, 'Popmax population'] = population_mapping[max_id]
not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
'variant_id', 'cDNA change', 'Protein change', 'gnomAD ID']

df.rename(columns={'variant_id': 'gnomAD ID'})

df = df.filter(not_to_drop, axis="columns")

if not os.path.isfile(path) or override:
df.to_csv(path, index=False)

return df
125 changes: 0 additions & 125 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,128 +348,3 @@ def find_popmax_in_gnomad(data):
data.loc[i, 'Popmax'] = max_pop
data.loc[i, 'Popmax population'] = population_mapping[max_id]


def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
"""
prepares the calculation of popmax and popmax population for a variant.
genome and exome data of ac and an.
:param DataFrame df: DataFrame containing gnomAD data
:param dict pop_data: dictionary containing population data
:param str name: name of the population
:param list[str] pop_ids: list of population ids
:param int index: index of the variant
"""

for pop_id in pop_ids:
df.loc[index, f'{name}_ac_{pop_id}'] = 0
df.loc[index, f'{name}_an_{pop_id}'] = 0
if isinstance(pop_data, list):
for pop in pop_data:
variant_id = pop['id']
df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac']
df.loc[index, f'{name}_an_{variant_id}'] = pop['an']


def request_gnomad_api_data(gene_name):
"""
Requests gnomAD API for data about a specific gene containing:
- variant_id
- cDNA change
- protein change
- allele frequency
- homozygote count
- popmax
- popmax population
:param str gene_name: name of gene
:param bool to_file: if True, saves data to variants.csv
:returns: DataFrame from gnomAD API
:rtype: DataFrame
"""

url = 'https://gnomad.broadinstitute.org/api'
query = f"""
query{{
gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
variants(dataset: gnomad_r4)
{{
variant_id
chrom
pos
ref
hgvsc
hgvsp
alt
exome {{
ac
an
ac_hom
populations
{{
id
ac
an
}}
}}
genome
{{
ac
an
ac_hom
populations
{{
id
ac
an
}}
}}
}}
}}
}}
"""

response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes

if response.status_code != 200:
print('Error:', response.status_code)

data = response.json()['data']['gene']['variants']

df = pd.json_normalize(data)

df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)

df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change
df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change

df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
df.loc[:, 'Homozygote Count'] = (df.loc[:, 'exome.ac_hom'].fillna(0)
+ df.loc[:, 'genome.ac_hom'].fillna(0))
exome_populations = df.loc[:, 'exome.populations']
genome_populations = df.loc[:, 'genome.populations']
population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']

for i in range(exome_populations.shape[0]):
exome_pop = exome_populations[i]
prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
genome_pop = genome_populations[i]
prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)

for population_id in population_ids:
df.loc[:, f'Allele_Frequency_{population_id}'] = (
(df.loc[:, f'exome_ac_{population_id}'].fillna(0)
+ df.loc[:, f'genome_ac_{population_id}'].fillna(0))
/ (df.loc[:, f'exome_an_{population_id}'].fillna(0)
+ df.loc[:, f'genome_an_{population_id}'].fillna(0)))

find_popmax_in_gnomad(df)
not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
'variant_id', 'cDNA change', 'Protein change']

df = df.filter(not_to_drop, axis="columns")

df.rename(columns={'variant_id': 'gnomAD ID'})

return df
Loading

0 comments on commit 7452996

Please sign in to comment.