Skip to content

Commit

Permalink
Moved function to collection.py;
Browse files Browse the repository at this point in the history
added path parameter with overriding option;
implemented one log function;
Changed it so that it's only getting EYS
  • Loading branch information
KajusC committed Sep 14, 2024
1 parent 522cf70 commit 252a468
Show file tree
Hide file tree
Showing 4 changed files with 1,624 additions and 174 deletions.
3 changes: 2 additions & 1 deletion api/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,11 @@
download_lovd_database_for_eys_gene,
download_genes_lovd,
download_database_for_eys_gene,
download_data_from_gnomad_eys,

# Functions for storing databases
store_database_for_eys_gene

)

# DATA REFACTORING IMPORT
Expand All @@ -58,7 +60,6 @@
parse_lovd,
from_clinvar_name_to_cdna_position,
save_lovd_as_vcf,
request_gnomad_api_data,
merge_gnomad_lovd,
parse_gnomad,
set_gnomad_dtypes,
Expand Down
151 changes: 151 additions & 0 deletions api/data/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import time

import requests
import pandas as pd
from requests import RequestException

from selenium import webdriver
Expand Down Expand Up @@ -189,3 +190,153 @@ def store_database_for_eys_gene(database_name, override=False):
else:
download_database_for_eys_gene(database_name, override)

def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
    """
    Writes per-population allele count (ac) and allele number (an) cells
    for a single row, so popmax can be computed from them afterwards.

    Every population in pop_ids is first set to 0 for the given row; when
    pop_data is a list, each of its entries then overwrites the ac/an cells
    of the population named by the entry's 'id'. The DataFrame is modified
    in place and nothing is returned.

    :param DataFrame df: DataFrame that receives the '{name}_ac_{id}' and
        '{name}_an_{id}' columns
    :param list[dict] pop_data: entries with 'id', 'ac' and 'an' keys;
        any non-list value leaves the row at its zero defaults
    :param str name: column prefix (callers pass 'exome' or 'genome')
    :param list[str] pop_ids: population ids that are initialised to 0
    :param int index: row label of the variant inside df
    """

    # Zero defaults first, so populations absent from pop_data still get a value.
    for population in pop_ids:
        for field in ('ac', 'an'):
            df.loc[index, f'{name}_{field}_{population}'] = 0

    # Anything that is not a list carries no population entries.
    if not isinstance(pop_data, list):
        return

    for entry in pop_data:
        population = entry['id']
        df.loc[index, f'{name}_ac_{population}'] = entry['ac']
        df.loc[index, f'{name}_an_{population}'] = entry['an']


def download_data_from_gnomad_eys(path, override=False):
    """
    Requests the gnomAD API for all variants of the EYS gene, derives
    summary columns from the response and stores the result as a CSV file.

    The returned DataFrame contains the columns:
        - gnomAD ID (the API's variant_id)
        - cDNA change (the API's hgvsc)
        - Protein change (the API's hgvsp)
        - Allele Frequency
        - Homozygote Count
        - Popmax
        - Popmax population

    :param str path: path of the CSV file the data is written to
    :param bool override: if True, an existing file at path is overwritten;
        otherwise the file is only written when it does not exist yet
    :returns: DataFrame built from the gnomAD API response
    :rtype: DataFrame
    :raises requests.RequestException: if the API answers with a status
        other than 200 (the response body is appended to logs.txt first)
    """

    url = 'https://gnomad.broadinstitute.org/api'
    # Plain string: nothing is interpolated, so no f-string brace escaping is needed.
    query = """
    query{
      gene(gene_symbol: "EYS", reference_genome: GRCh38) {
        variants(dataset: gnomad_r4)
        {
          variant_id
          chrom
          pos
          ref
          hgvsc
          hgvsp
          alt
          exome {
            ac
            an
            ac_hom
            populations
            {
              id
              ac
              an
            }
          }
          genome
          {
            ac
            an
            ac_hom
            populations
            {
              id
              ac
              an
            }
          }
        }
      }
    }
    """

    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes

    if response.status_code != 200:
        # Mode 'a' creates logs.txt when it is missing and appends otherwise,
        # so no existence check is needed; the context manager closes the
        # handle.
        with open('logs.txt', 'a', encoding='utf-8') as log_file:
            log_file.write(response.text + '\n')
        # A failed response carries no variant data; stop here instead of
        # failing later with an unrelated KeyError/JSON error.
        raise requests.RequestException(
            f'gnomAD API request failed with status code {response.status_code}')

    data = response.json()['data']['gene']['variants']

    df = pd.json_normalize(data)

    # A variant may be present in only one of exome/genome; treat the missing side as 0.
    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)

    # Named exactly as listed in columns_to_keep below, so they survive the final filter.
    df.loc[:, 'cDNA change'] = df.loc[:, 'hgvsc'].fillna(0)
    df.loc[:, 'Protein change'] = df.loc[:, 'hgvsp'].fillna(0)

    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
    df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)

    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']

    # Snapshot the nested population lists before the helper starts adding columns to df.
    exome_populations = df.loc[:, 'exome.populations'].tolist()
    genome_populations = df.loc[:, 'genome.populations'].tolist()

    # json_normalize yields a 0..n-1 index, so the position doubles as the row label.
    for i, (exome_pop, genome_pop) in enumerate(zip(exome_populations, genome_populations)):
        prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
        prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)

    for population_id in population_ids:
        df.loc[:, f'Allele_Frequency_{population_id}'] = (
            (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) /
            (df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))

    population_mapping = {
        'afr': 'African/African American',
        'eas': 'East Asian',
        'asj': 'Ashkenazi Jew',
        'sas': 'South Asian',
        'nfe': 'European (non-Finnish)',
        'fin': 'European (Finnish)',
        'mid': 'Middle Eastern',
        'amr': 'Admixed American',
        'ami': "Amish",
        'remaining': 'Remaining',
        '': ''  # no population had a frequency above 0
    }

    for i in range(df.shape[0]):
        max_pop = 0
        max_id = ''
        for population_id in population_ids:
            frequency = df.loc[i, f'Allele_Frequency_{population_id}']
            # NaN (0/0) compares False here, so such populations are skipped.
            if frequency > max_pop:
                max_pop = frequency
                max_id = population_id
        df.loc[i, 'Popmax'] = max_pop
        df.loc[i, 'Popmax population'] = population_mapping[max_id]

    # rename() returns a new frame; the result must be assigned to take effect.
    df = df.rename(columns={'variant_id': 'gnomAD ID'})

    columns_to_keep = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
                       'cDNA change', 'Protein change', 'gnomAD ID']
    df = df.filter(columns_to_keep, axis="columns")

    if not os.path.isfile(path) or override:
        df.to_csv(path, index=False)

    return df
141 changes: 0 additions & 141 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,145 +247,4 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"):
f.write("\n")


def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
    """
    Writes per-population allele count (ac) and allele number (an) cells
    for a single row, so popmax can be computed from them afterwards.

    Every population in pop_ids is first set to 0 for the given row; when
    pop_data is a list, each of its entries then overwrites the ac/an cells
    of the population named by the entry's 'id'. The DataFrame is modified
    in place and nothing is returned.

    :param DataFrame df: DataFrame that receives the '{name}_ac_{id}' and
        '{name}_an_{id}' columns
    :param list[dict] pop_data: entries with 'id', 'ac' and 'an' keys;
        any non-list value leaves the row at its zero defaults
    :param str name: column prefix (callers pass 'exome' or 'genome')
    :param list[str] pop_ids: population ids that are initialised to 0
    :param int index: row label of the variant inside df
    """

    # Zero defaults first, so populations absent from pop_data still get a value.
    for population in pop_ids:
        for field in ('ac', 'an'):
            df.loc[index, f'{name}_{field}_{population}'] = 0

    # Anything that is not a list carries no population entries.
    if not isinstance(pop_data, list):
        return

    for entry in pop_data:
        population = entry['id']
        df.loc[index, f'{name}_ac_{population}'] = entry['ac']
        df.loc[index, f'{name}_an_{population}'] = entry['an']


def request_gnomad_api_data(gene_name):
    """
    Requests the gnomAD API for all variants of a gene and derives summary
    columns from the response.

    The returned DataFrame contains the columns:
        - gnomAD ID (the API's variant_id)
        - cDNA change (the API's hgvsc)
        - Protein change (the API's hgvsp)
        - Allele Frequency
        - Homozygote Count
        - Popmax
        - Popmax population

    :param str gene_name: gene symbol inserted verbatim into the query
    :returns: DataFrame built from the gnomAD API response
    :rtype: DataFrame
    :raises requests.RequestException: if the API answers with a status
        other than 200
    """

    url = 'https://gnomad.broadinstitute.org/api'
    # NOTE(review): gene_name is interpolated without escaping; callers are
    # assumed to pass a trusted gene symbol.
    query = f"""
    query{{
      gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{
        variants(dataset: gnomad_r4)
        {{
          variant_id
          chrom
          pos
          ref
          hgvsc
          hgvsp
          alt
          exome {{
            ac
            an
            ac_hom
            populations
            {{
              id
              ac
              an
            }}
          }}
          genome
          {{
            ac
            an
            ac_hom
            populations
            {{
              id
              ac
              an
            }}
          }}
        }}
      }}
    }}
    """

    response = requests.post(url, json={'query': query}, timeout=300)  # timeout set to 5 minutes

    if response.status_code != 200:
        # A failed response carries no variant data; stop here instead of
        # failing later with an unrelated KeyError/JSON error.
        raise requests.RequestException(
            f'gnomAD API request failed with status code {response.status_code}')

    data = response.json()['data']['gene']['variants']

    df = pd.json_normalize(data)

    # A variant may be present in only one of exome/genome; treat the missing side as 0.
    df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0)
    df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0)

    # Named exactly as listed in columns_to_keep below, so they survive the final filter.
    df.loc[:, 'cDNA change'] = df.loc[:, 'hgvsc'].fillna(0)
    df.loc[:, 'Protein change'] = df.loc[:, 'hgvsp'].fillna(0)

    df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an']
    df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0)

    population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining']

    # Snapshot the nested population lists before the helper starts adding columns to df.
    exome_populations = df.loc[:, 'exome.populations'].tolist()
    genome_populations = df.loc[:, 'genome.populations'].tolist()

    # json_normalize yields a 0..n-1 index, so the position doubles as the row label.
    for i, (exome_pop, genome_pop) in enumerate(zip(exome_populations, genome_populations)):
        prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i)
        prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i)

    for population_id in population_ids:
        df.loc[:, f'Allele_Frequency_{population_id}'] = (
            (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) /
            (df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0)))

    population_mapping = {
        'afr': 'African/African American',
        'eas': 'East Asian',
        'asj': 'Ashkenazi Jew',
        'sas': 'South Asian',
        'nfe': 'European (non-Finnish)',
        'fin': 'European (Finnish)',
        'mid': 'Middle Eastern',
        'amr': 'Admixed American',
        'ami': "Amish",
        'remaining': 'Remaining',
        '': ''  # no population had a frequency above 0
    }

    for i in range(df.shape[0]):
        max_pop = 0
        max_id = ''
        for population_id in population_ids:
            frequency = df.loc[i, f'Allele_Frequency_{population_id}']
            # NaN (0/0) compares False here, so such populations are skipped.
            if frequency > max_pop:
                max_pop = frequency
                max_id = population_id
        df.loc[i, 'Popmax'] = max_pop
        df.loc[i, 'Popmax population'] = population_mapping[max_id]

    # rename() returns a new frame; the result must be assigned to take effect.
    df = df.rename(columns={'variant_id': 'gnomAD ID'})

    columns_to_keep = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency',
                       'cDNA change', 'Protein change', 'gnomAD ID']
    df = df.filter(columns_to_keep, axis="columns")

    return df
Loading

0 comments on commit 252a468

Please sign in to comment.