Skip to content

Commit

Permalink
Merge pull request #66 from Strexas/KCE/refactor_lovd_download
Browse files Browse the repository at this point in the history
KCE/KBE-29 KBE-33
  • Loading branch information
Strexas authored Sep 30, 2024
2 parents dd78258 + b1b5595 commit 727955a
Show file tree
Hide file tree
Showing 5 changed files with 4,130 additions and 18 deletions.
2 changes: 1 addition & 1 deletion api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
download_database_for_eys_gene,

# Functions for storing databases
store_database_for_eys_gene
download_selected_database_for_eys_gene
)

# DATA REFACTORING IMPORT
Expand Down
4 changes: 2 additions & 2 deletions api/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
)

# DATA COLLECTION IMPORT
from .collection import (
from .downloading import (
# Custom exceptions
BadResponseException,
DownloadError,
Expand All @@ -49,7 +49,7 @@
download_data_from_gnomad_eys,

# Functions for storing databases
store_database_for_eys_gene
download_selected_database_for_eys_gene

)

Expand Down
1 change: 1 addition & 0 deletions api/data/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/"
LOVD_FILE_URL_EYS = LOVD_FILE_URL + "EYS"
STORE_AS_LOVD = "../data/lovd/lovd_data.txt"
STORE_AS_GNOMAD = "../data/gnomad/gnomad_data.csv"

GNOMAD_URL = "https://gnomad.broadinstitute.org/gene"
GNOMAD_URL_EYS = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
Expand Down
52 changes: 37 additions & 15 deletions api/data/downloading.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
LOVD_PATH,
DATABASES_DOWNLOAD_PATHS,
LOVD_FILE_URL_EYS,
STORE_AS_LOVD)
STORE_AS_LOVD,
STORE_AS_GNOMAD)


# EXCEPTIONS
Expand Down Expand Up @@ -66,15 +67,15 @@ def get_file_from_url(url, save_to, override=False):
f.write(response.content)


def download_lovd_database_for_eys_gene(override=False):
def download_lovd_database_for_eys_gene(save_to=STORE_AS_LOVD, override=False):
"""
Gets file from url and saves it into provided path. Overrides, if override is True.
:param str save_to: path to save (default: 'data/lovd/lovd_eys.txt')
:param bool override: needs override
"""

url = LOVD_FILE_URL_EYS
save_to = STORE_AS_LOVD

# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
Expand Down Expand Up @@ -176,19 +177,37 @@ def download_database_for_eys_gene(database_name, override=False):
os.rename(latest_file, os_path)


def store_database_for_eys_gene(database_name, override=False):
def download_selected_database_for_eys_gene(database_name, save_path="", override=False):
"""
Calls a function to download a database.
:param database_name: the name of the database that should be downloaded
:param save_path: path to save the data
:param override: should be already existing file be overwritten
"""
if not isinstance(database_name, str):
raise TypeError("Database name should be a string")

database_name = database_name.lower()

# if save_path is not provided, save to default location
if database_name == "lovd" and save_path == "":
save_path = STORE_AS_LOVD
elif database_name == "gnomad" and save_path == "":
save_path = STORE_AS_GNOMAD

# check if database_name is supported
if database_name not in DATABASES_DOWNLOAD_PATHS:
raise IndexError(f"Requested {database_name} database is not supported")
raise IndexError(f"Requested for {database_name} database is not supported")

# download the database
if database_name == "lovd":
download_lovd_database_for_eys_gene(override)
download_lovd_database_for_eys_gene(save_path, override)
elif database_name == "gnomad":
download_data_from_gnomad_eys(save_path, override)
else:
download_database_for_eys_gene(database_name, override)
raise IndexError(f"Requested for {database_name} is not yet supported")


def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
"""
Expand All @@ -212,7 +231,7 @@ def prepare_popmax_calculation(df, pop_data, name, pop_ids, index):
df.loc[index, f'{name}_an_{variant_id}'] = pop['an']


def download_data_from_gnomad_eys(path, override=False):
def download_data_from_gnomad_eys(path=STORE_AS_GNOMAD, override=False):
"""
Requests gnomAD API for data about a specific gene containing:
- variant_id
Expand All @@ -223,12 +242,15 @@ def download_data_from_gnomad_eys(path, override=False):
- popmax
- popmax population
:param str gene_name: name of gene
:param bool to_file: if True, saves data to variants.csv
:returns: DataFrame from gnomAD API
:rtype: DataFrame
:param str path: path to save the data (default: 'data/gnomad/gnomad_eys.csv')
:param bool override: should an existing file be overriden with a new one
"""

if os.path.exists(path) and not override:
print(f"The file at {path} already exists.")
logging.info("The file at %s already exists.", path)
return

url = 'https://gnomad.broadinstitute.org/api'
query = f"""
query{{
Expand Down Expand Up @@ -276,9 +298,11 @@ def download_data_from_gnomad_eys(path, override=False):
if not os.path.isfile(path):
f = open('logs.txt', 'x')
f.write(response.text)
logging.error("Error while downloading data from gnomAD API. Check logs.txt for more information.")
else:
f = open('logs.txt', 'a')
f.write(response.text)
logging.error("Error while downloading data from gnomAD API. Check logs.txt for more information.")

data = response.json()['data']['gene']['variants']

Expand Down Expand Up @@ -337,6 +361,4 @@ def download_data_from_gnomad_eys(path, override=False):
df = df.filter(not_to_drop, axis="columns")

if not os.path.isfile(path) or override:
df.to_csv(path, index=False)

return df
df.to_csv(path, index=False)
Loading

0 comments on commit 727955a

Please sign in to comment.