Skip to content

Commit

Permalink
Merge branch 'main' into KCE/LOVD_data_conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
Strexas authored Apr 1, 2024
2 parents 04498e2 + 58ef673 commit 7a0c0e0
Show file tree
Hide file tree
Showing 5 changed files with 230 additions and 15 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/mypy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Static type checking with mypy on every push, across the supported Python versions.
name: MyPy

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12"]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install mypy
        pip install -r requirements.txt
    # Lets mypy fetch the stub packages it reports as missing before the real check.
    - name: Install stubs
      run: |
        mypy --install-types --non-interactive $(git ls-files '*.py')
    # This step runs mypy (not pylint), so it is labelled accordingly in the CI log.
    - name: Analysing the code with mypy
      run: |
        mypy $(git ls-files '*.py')
30 changes: 28 additions & 2 deletions data_collection/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

# files
LOVD_URL = "https://databases.lovd.nl/shared/genes"
LOVD_URL_EYS = "https://databases.lovd.nl/shared/genes/EYS"
LOVD_URL_EYS = LOVD_URL + "/EYS"
LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene"
LOVD_FILE_URL_EYS = "https://databases.lovd.nl/shared/download/all/gene/EYS"
LOVD_FILE_URL_EYS = LOVD_FILE_URL + "/EYS"

GNOMAD_URL = "https://gnomad.broadinstitute.org/gene"
GNOMAD_URL_EYS = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
Expand Down Expand Up @@ -151,3 +151,29 @@
'owned_by': 'Integer',
'Individual/Origin/Geographic': 'String'
}


# Per-database download configuration, keyed by the database name that is passed to
# store_database_for_eys_gene() in tools.py. Keys of each entry:
#   "url"       - page or file URL to open / request.
#   "store_as"  - final file name. The browser-based downloader joins it under
#                 ../data/<database name>/, while the LOVD downloader uses it as the
#                 complete save path (hence the directory prefix in the "lovd" entry).
#   "function"  - name of the tools.py function that performs the download; it is
#                 looked up by name, so it must match the function definition exactly.
#   "button"    - (browser downloads only) JavaScript snippet executed in the page to
#                 trigger the download.
#   "clickable" - (browser downloads only) XPath of the element that must become
#                 clickable before the "button" script is executed.
DATABASES_DOWNLOAD_PATHS = {
"clinvar": {
"button": 'document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.'
'clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()',
"url": CLINVAR_URL_EYS,
"store_as": "clinvar_data.txt",
"clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/"
"div[2]/div[1]/div/div[1]/a[3]",
"function": "download_database_for_eys_gene"
},
"gnomad": {
# NOTE(review): the class names and the [4] index look tied to the current gnomAD
# page build - confirm they still match if this download starts timing out.
"button": "document.getElementsByClassName"
"('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()",
"url": GNOMAD_URL_EYS,
"store_as": "gnomad_data.csv",
"clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]",
"function": "download_database_for_eys_gene"
},
# LOVD is fetched with a plain HTTP request, so it has no "button"/"clickable" keys.
"lovd": {
"url": LOVD_FILE_URL_EYS,
"store_as": "../data/lovd/lovd_data.txt",
"function": "download_lovd_database_for_eys_gene"
}
}
20 changes: 13 additions & 7 deletions data_collection/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
"""Module executes general pipeline for data collection"""
import pandas as pd


from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna, store_database_for_eys_gene
from constants import (LOVD_FILE_URL_EYS,
GNOMAD_FILE_URL_EYS,
CLINVAR_FILE_URL_EYS,
DATA_PATH,
from constants import (DATA_PATH,
LOVD_PATH,
GNOMAD_PATH,
CLINVAR_PATH)


def calculate_max_frequency(row):
"""
Calculating maximum allele frequency in GNOMAD row.
Expand All @@ -29,7 +29,9 @@ def calculate_max_frequency(row):
'European (Finnish)',
'European (non-Finnish)',
'Middle Eastern',
'South Asian']
'South Asian',
'Remaining'
]

max_freq = 0
max_pop = population_groups[0]
Expand All @@ -48,9 +50,13 @@ def calculate_max_frequency(row):

# MAIN
# Download all data
get_file_from_url(LOVD_FILE_URL_EYS, LOVD_PATH + "/lovd_data.txt", override=True)
get_file_from_url(GNOMAD_FILE_URL_EYS, GNOMAD_PATH + "/gnomad_data.csv", override=True)
get_file_from_url(CLINVAR_FILE_URL_EYS, CLINVAR_PATH + "/clinvar_data.txt", override=True)

#get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
#get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
#get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
store_database_for_eys_gene('lovd', True)
store_database_for_eys_gene('gnomad', True)
store_database_for_eys_gene('clinvar', True)

# Read and convert data
lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
Expand Down
165 changes: 160 additions & 5 deletions data_collection/tools.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
"""Module providing a functionality to collect data from various sources."""

import glob
import logging
import os
import requests
from requests.exceptions import RequestException
import time

import pandas as pd
import requests
import selenium.common
from pandas import DataFrame
from constants import LOVD_VARIABLES_DATA_TYPES
from requests import RequestException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from constants import (LOVD_FILE_URL,
LOVD_PATH,
LOVD_VARIABLES_DATA_TYPES,
DATABASES_DOWNLOAD_PATHS)



Expand All @@ -18,7 +31,6 @@ class DownloadError(Exception):
"""Custom exception for download errors."""



def get_file_from_url(url, save_to, override=False):
"""
Gets file from url and saves it into provided path. Overrides, if override is True.
Expand All @@ -28,6 +40,44 @@ def get_file_from_url(url, save_to, override=False):
:param bool override: needs override
"""

# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
if not os.path.exists(save_to_dir):
os.makedirs(save_to_dir)
# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
if not os.path.exists(save_to_dir):
os.makedirs(save_to_dir)

# check if file exist and needs to override
if os.path.exists(save_to) and not override:
print(f"The file at {save_to} already exists.")
return

try:
response = requests.get(url, timeout=10)
except RequestException as e:
raise DownloadError(f"Error while downloading file from {url}") from e

if response.status_code != 200:
raise BadResponseException(f"Bad response from {url}."
f" Status code: {response.status_code}")

with open(save_to, "wb") as f:
f.write(response.content)


def download_lovd_database_for_eys_gene(database_name, override=False):
"""
Gets file from url and saves it into provided path. Overrides, if override is True.
:param str database_name: database to download
:param bool override: needs override
"""

url = DATABASES_DOWNLOAD_PATHS[database_name]["url"]
save_to = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"]

# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
if not os.path.exists(save_to_dir):
Expand Down Expand Up @@ -165,4 +215,109 @@ def from_clinvar_name_to_dna(name):
break

return name[start:end]



def download_gene_lovd(gene_list: list, folder_path=LOVD_PATH, raise_exception=False):
    """
    Downloads LOVD data for every gene in gene_list into "<symbol>.txt" files.

    :param list gene_list: list of gene's symbols
    :param str folder_path: folder to save the data
    :param bool raise_exception: if True, raise ValueError for a symbol that LOVD
        does not know; otherwise log the problem and continue with the next gene
    :raises DownloadError: when the request to LOVD fails
    :raises BadResponseException: when LOVD answers with a non-200 status code
    :raises ValueError: when a symbol does not exist and raise_exception is True
    """

    # The gene symbol is a path segment of the download URL. Insert the separator
    # explicitly so the URL is correct whether or not LOVD_FILE_URL ends with "/".
    base_url = LOVD_FILE_URL.rstrip("/")

    for gene in gene_list:
        file_path = os.path.join(folder_path, gene + ".txt")
        url = f"{base_url}/{gene}"

        # Probe request: used only to find out whether LOVD knows the symbol.
        try:
            response = requests.get(url, timeout=10)
        except RequestException as e:
            raise DownloadError(f"Error while downloading file from {url}") from e

        if response.status_code != 200:
            raise BadResponseException(f"Bad response from {url}."
                                       f" Status code: {response.status_code}")

        # If gene does not exist, the first word of the file will be Error
        valid = 'Error' not in response.text[:6]
        if valid:
            # get_file_from_url creates the folder if needed and keeps an
            # already existing file (override defaults to False).
            get_file_from_url(url, file_path)
        elif raise_exception:
            raise ValueError(f"Symbol: {gene} does not exist in the LOVD database")
        else:
            logging.error("Symbol: %s does not exist in the LOVD database", gene)


def download_database_for_eys_gene(database_name, override=False):
    """
    Downloads the chosen database through a headless Firefox session and
    renames the downloaded (latest) file to the configured name.

    The file is stored in ../data/<database_name>/ relative to the current
    working directory, under the "store_as" name from DATABASES_DOWNLOAD_PATHS.

    :param database_name: the name of the database
    :param override: should an existing file be overridden with a new one
    :raises KeyError: when the database entry lacks "button"/"clickable"
    :raises ValueError: when no downloaded file is found after the browser session
    """

    entry = DATABASES_DOWNLOAD_PATHS[database_name]
    url = entry["url"]
    button_location = entry["button"]
    clickable = entry["clickable"]

    download_dir = os.path.join(os.getcwd(), "..", "data", database_name)
    os_path = os.path.join(download_dir, entry["store_as"])

    # Decide before starting the browser: there is no point in downloading
    # (and leaving a stray file behind) when the result must not be replaced.
    had_target = os.path.exists(os_path)
    if had_target and not override:
        print("File already exists")
        return

    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('--headless')
    # folderList=2 makes Firefox use the custom "browser.download.dir" below.
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
    firefox_options.set_preference("browser.download.dir", download_dir)
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk",
                                   "application/octet-stream")

    driver = webdriver.Firefox(options=firefox_options)
    try:
        driver.get(url)
        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, clickable)))
        driver.execute_script(button_location)

        # Fixed grace period for the browser to finish writing the file.
        time.sleep(30)
    finally:
        # Always close the browser, even when the page never becomes ready.
        driver.quit()

    # Everything in the download folder except the final file is a candidate;
    # the previous copy is only replaced once a new download is confirmed.
    downloaded = [path
                  for path in glob.glob(os.path.join(download_dir, '*'))
                  if path != os_path]
    if not downloaded:
        if not had_target and os.path.exists(os_path):
            # The browser already saved the file under its final name.
            return
        raise ValueError(f"No downloaded file found for {database_name} database")

    latest_file = max(downloaded, key=os.path.getctime)
    # os.replace overwrites an existing target on every platform.
    os.replace(latest_file, os_path)


def store_database_for_eys_gene(database_name, override=False):
    """
    Calls the download function configured for the requested database.

    The function name is read from DATABASES_DOWNLOAD_PATHS[database_name]["function"]
    and resolved among the definitions of this module. Download related errors
    are not propagated: they are printed and the function returns normally.

    :param database_name: the name of the database that should be downloaded
    :param override: should already existing file be overwritten
    :raises NameError: when the configured function is not defined in this module
    """
    try:
        if database_name not in DATABASES_DOWNLOAD_PATHS:
            raise IndexError(f"Requested {database_name} database is not supported")

        # Resolve the handler by name instead of eval(): only an existing
        # module-level callable can be reached, never an arbitrary expression.
        downloader_name = DATABASES_DOWNLOAD_PATHS[database_name]["function"]
        downloader = globals().get(downloader_name)
        if not callable(downloader):
            raise NameError(f"name '{downloader_name}' is not defined")

        downloader(database_name, override)

    # Selenium's InvalidArgumentException and TimeoutException are subclasses
    # of WebDriverException, so one entry covers all browser failures.
    except (TimeoutError, selenium.common.exceptions.WebDriverException) as e:
        print(f"Error: {e}")
    except (ValueError, IndexError, BadResponseException, DownloadError) as e:
        print(f"Error:{e}")
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
requests
pandas
pandas
selenium

0 comments on commit 7a0c0e0

Please sign in to comment.