Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KCE/download_gnomad_from_api #43

Merged
merged 11 commits into from
Sep 10, 2024
3 changes: 2 additions & 1 deletion api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,6 @@
# Functions for refactoring data
set_lovd_dtypes,
parse_lovd,
from_clinvar_name_to_cdna_position
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
)
60 changes: 60 additions & 0 deletions api/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
Package for data collection that provides both collection and refactoring functionality.

Data from the LOVD, ClinVar and gnomAD databases can be downloaded using this package. gnomAD and
ClinVar downloads are limited to the EYS gene, but LOVD data can be downloaded for any gene.

All necessary functionality can be imported directly from data without
specifying the module.

The data collection pipeline example is tailored to this project's specific usage.
"""

# CONSTANTS IMPORT
from .constants import (
# URLs for LOVD database
LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS,

# URLs for gnomAD database
GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS,

# URLs for ClinVar database
CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS,

# Paths for data storage
DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH,

# Data types for tables
LOVD_TABLES_DATA_TYPES,

# Paths for database downloads
DATABASES_DOWNLOAD_PATHS
)

# DATA COLLECTION IMPORT
from .collection import (
# Custom exceptions
BadResponseException,
DownloadError,

# Custom utility functions
get_file_from_url,

# Functions for downloading databases
download_lovd_database_for_eys_gene,
download_genes_lovd,
download_database_for_eys_gene,

# Functions for storing databases
store_database_for_eys_gene
)

# DATA REFACTORING IMPORT
from .refactoring import (
# Functions for refactoring data
set_lovd_dtypes,
parse_lovd,
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
request_gnomad_api_data,
)
132 changes: 132 additions & 0 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
import os
import logging

import requests

import pandas as pd
from pandas import DataFrame

from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH



def set_lovd_dtypes(df_dict):
"""
Convert data from LOVD format table to desired data format based on specified data types.
Expand Down Expand Up @@ -242,3 +245,132 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):

f.write("\t".join(record))
f.write("\n")


# gnomAD population ids considered for Popmax, mapped to their display names.
# Insertion order matters: when several populations share the highest
# frequency, the one listed first wins.
_GNOMAD_POPULATION_NAMES = {
    'afr': 'African/African American',
    'eas': 'East Asian',
    'asj': 'Ashkenazi Jew',
    'sas': 'South Asian',
    'nfe': 'European (non-Finnish)',
    'fin': 'European (Finnish)',
    'mid': 'Middle Eastern',
    'amr': 'Admixed American',
    'ami': 'Amish',
    'remaining': 'Remaining',
}

# Flattened column names (as produced by pd.json_normalize) that the
# refactoring step reads. Any of them can be absent when no variant in the
# response carries the corresponding exome/genome block.
_GNOMAD_REQUIRED_COLUMNS = (
    'variant_id', 'hgvsc', 'hgvsp',
    'exome.ac', 'exome.an', 'exome.ac_hom', 'exome.populations',
    'genome.ac', 'genome.an', 'genome.ac_hom', 'genome.populations',
)


def _population_counts(populations):
    """
    Index one exome/genome ``populations`` list by population id.

    :param populations: list of ``{'id', 'ac', 'an'}`` dicts, or a non-list
        placeholder (e.g. NaN) when the variant has no such data
    :returns: mapping of population id to ``(ac, an)``; missing counts become 0
    :rtype: dict
    """
    if not isinstance(populations, list):
        return {}
    # If an id is repeated, the last entry wins.
    return {pop['id']: (pop['ac'] or 0, pop['an'] or 0) for pop in populations}


def _popmax(exome_populations, genome_populations):
    """
    Find the population with the highest combined exome + genome allele frequency.

    Populations whose combined allele number is 0 have no defined frequency
    and are skipped.

    :param exome_populations: ``populations`` value of the exome block
    :param genome_populations: ``populations`` value of the genome block
    :returns: ``(frequency, population name)``; ``(0.0, '')`` if no population
        has a frequency above 0
    :rtype: tuple
    """
    exome = _population_counts(exome_populations)
    genome = _population_counts(genome_populations)

    best_frequency = 0.0
    best_name = ''
    for pop_id, pop_name in _GNOMAD_POPULATION_NAMES.items():
        exome_ac, exome_an = exome.get(pop_id, (0, 0))
        genome_ac, genome_an = genome.get(pop_id, (0, 0))
        allele_number = exome_an + genome_an
        if not allele_number:
            continue
        frequency = (exome_ac + genome_ac) / allele_number
        # Strict comparison keeps the first population on ties.
        if frequency > best_frequency:
            best_frequency = frequency
            best_name = pop_name
    return best_frequency, best_name


def _gnomad_variants_to_frame(variants):
    """
    Convert the ``variants`` list of a gnomAD API response into the summary table.

    :param list variants: variant records as returned by the gnomAD API
    :returns: DataFrame with columns variant_id, cDNA change, Protein change,
        Allele Frequency, Homozygote Count, Popmax and Popmax population
    :rtype: DataFrame
    """
    df = pd.json_normalize(variants)

    # Guarantee every column read below exists, even for an empty response or
    # when no variant has exome (or genome) data.
    for column in _GNOMAD_REQUIRED_COLUMNS:
        if column not in df.columns:
            df[column] = float('nan')

    def combined(field):
        # Sum of the exome and genome value, treating missing data as 0.
        return df[f'exome.{field}'].fillna(0) + df[f'genome.{field}'].fillna(0)

    popmax = [
        _popmax(exome_pops, genome_pops)
        for exome_pops, genome_pops in zip(df['exome.populations'], df['genome.populations'])
    ]

    return pd.DataFrame({
        'variant_id': df['variant_id'],
        'cDNA change': df['hgvsc'].fillna(0),
        'Protein change': df['hgvsp'].fillna(0),
        'Allele Frequency': combined('ac') / combined('an'),
        'Homozygote Count': combined('ac_hom'),
        'Popmax': pd.Series([value for value, _ in popmax], index=df.index, dtype=float),
        'Popmax population': pd.Series([name for _, name in popmax], index=df.index, dtype=object),
    })


def request_gnomad_api_data(gene_name, to_file=True):
    """
    Requests gnomAD API (dataset gnomad_r4, reference genome GRCh38) for data
    about a specific gene containing:
    - variant_id
    - cDNA change
    - protein change
    - allele frequency
    - homozygote count
    - popmax
    - popmax population

    Exome and genome counts are summed before frequencies are computed.

    :param str gene_name: name of gene
    :param bool to_file: if True, saves data to variants.csv
    :returns: DataFrame from gnomAD API, or None if the request failed or the
        API returned no data for the gene (the failure is logged)
    :rtype: DataFrame
    """

    url = 'https://gnomad.broadinstitute.org/api'
    query = f"""
    query {{
      gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
        variants(dataset: gnomad_r4) {{
          variant_id
          chrom
          pos
          ref
          hgvsc
          hgvsp
          alt
          exome {{
            ac
            an
            ac_hom
            populations {{
              id
              ac
              an
            }}
          }}
          genome {{
            ac
            an
            ac_hom
            populations {{
              id
              ac
              an
            }}
          }}
        }}
      }}
    }}
    """

    response = requests.post(url, json={'query': query})
    if response.status_code != 200:
        # Previously this path fell through to `return df` with `df` unbound.
        logging.error('gnomAD API request for gene %s failed with status code %s',
                      gene_name, response.status_code)
        return None

    payload = response.json()
    # GraphQL reports query-level failures (e.g. unknown gene) in the body of
    # a 200 response, leaving `data` or `gene` null.
    gene = (payload.get('data') or {}).get('gene')
    if gene is None:
        logging.error('gnomAD API returned no data for gene %s: %s',
                      gene_name, payload.get('errors'))
        return None

    df = _gnomad_variants_to_frame(gene.get('variants') or [])

    if to_file:
        df.to_csv('variants.csv', index=True)

    return df
Loading
Loading