Skip to content

Commit

Permalink
Merge pull request #5 from Strexas/JVA/web-scraping
Browse files Browse the repository at this point in the history
JVA/database store
  • Loading branch information
N3UR0515 authored Mar 4, 2024
2 parents 9759668 + 02fc60b commit 8a832e6
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 11 deletions.
Empty file added data_collection/.gitkeep
Empty file.
21 changes: 13 additions & 8 deletions data_collection/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
"""Module executes general pipeline for data collection"""
import pandas as pd

from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
from constants import (LOVD_FILE_URL_EYS,
GNOMAD_FILE_URL_EYS,
CLINVAR_FILE_URL_EYS,
from tools import store_database_for_eys_gene, from_lovd_to_pandas, from_clinvar_name_to_dna
from constants import (
DATA_PATH,
LOVD_PATH,
GNOMAD_PATH,
CLINVAR_PATH)


def calculate_max_frequency(row):
"""
Calculating maximum allele frequency in GNOMAD row.
Expand All @@ -28,7 +27,9 @@ def calculate_max_frequency(row):
'European (Finnish)',
'European (non-Finnish)',
'Middle Eastern',
'South Asian']
'South Asian',
'Remaining'
]

max_freq = 0
max_pop = population_groups[0]
Expand All @@ -47,9 +48,13 @@ def calculate_max_frequency(row):

# MAIN
# Download all data
get_file_from_url(LOVD_FILE_URL_EYS, LOVD_PATH + "/lovd_data.txt", override=True)
get_file_from_url(GNOMAD_FILE_URL_EYS, GNOMAD_PATH + "/gnomad_data.csv", override=True)
get_file_from_url(CLINVAR_FILE_URL_EYS, CLINVAR_PATH + "/clinvar_data.txt", override=True)

#get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
#get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
#get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
store_database_for_eys_gene('lovd', True)
store_database_for_eys_gene('gnomad', True)
store_database_for_eys_gene('clinvar', True)

# Read and convert data
lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
Expand Down
144 changes: 142 additions & 2 deletions data_collection/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,20 @@

import os
import requests
from requests.exceptions import RequestException
import pandas as pd
import selenium.common
from pandas import DataFrame
from requests import RequestException
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import glob
from constants import LOVD_VARIABLES_DATA_TYPES
from constants import (LOVD_FILE_URL_EYS,
GNOMAD_URL_EYS,
CLINVAR_URL_EYS)


# EXCEPTIONS
Expand All @@ -17,7 +27,6 @@ class DownloadError(Exception):
"""Custom exception for download errors."""



def get_file_from_url(url, save_to, override=False):
"""
Gets file from url and saves it into provided path. Overrides, if override is True.
Expand All @@ -27,6 +36,44 @@ def get_file_from_url(url, save_to, override=False):
:param bool override: needs override
"""

# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
if not os.path.exists(save_to_dir):
os.makedirs(save_to_dir)
# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
if not os.path.exists(save_to_dir):
os.makedirs(save_to_dir)

# check if file exist and needs to override
if os.path.exists(save_to) and not override:
print(f"The file at {save_to} already exists.")
return

try:
response = requests.get(url, timeout=10)
except RequestException as e:
raise DownloadError(f"Error while downloading file from {url}") from e

if response.status_code != 200:
raise BadResponseException(f"Bad response from {url}."
f" Status code: {response.status_code}")

with open(save_to, "wb") as f:
f.write(response.content)


def download_lovd_database_for_eys_gene(database_name, override=False):
"""
Gets file from url and saves it into provided path. Overrides, if override is True.
:param str database_name: database to download
:param bool override: needs override
"""

url = DATABASES_DOWNLOAD_PATHS[database_name]["url"]
save_to = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"]

# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
if not os.path.exists(save_to_dir):
Expand Down Expand Up @@ -156,3 +203,96 @@ def from_clinvar_name_to_dna(name):
break

return name[start:end]


def download_database_for_eys_gene(database_name, override=False):
    """
    Download the chosen database export with a headless Firefox session and
    store it under the file name configured in DATABASES_DOWNLOAD_PATHS.

    The page at the configured url is opened, the function waits until the
    configured "clickable" XPath element can be clicked, and then runs the
    configured "button" JavaScript snippet to trigger the download. The file
    that appears in ``../data/<database_name>`` (relative to the current
    working directory) is renamed to the configured "store_as" name.

    :param str database_name: key of the database in DATABASES_DOWNLOAD_PATHS
    :param bool override: should an existing file be overridden with a new one
    :raises KeyError: if the entry lacks "url", "button", "clickable"
        or "store_as"
    :raises ValueError: if no finished download shows up in the directory
    """

    settings = DATABASES_DOWNLOAD_PATHS[database_name]
    url = settings["url"]
    button_location = settings["button"]
    clickable = settings["clickable"]
    save_as = settings["store_as"]

    # Build the target directory once; Firefox is given the normalised
    # absolute form of the path.
    download_dir = os.path.abspath(
        os.path.join(os.getcwd(), "..", "data", database_name))
    os_path = os.path.join(download_dir, save_as)

    # Decide BEFORE launching the browser: without override there is nothing
    # to do, and downloading anyway would leave a stray, unrenamed file.
    if os.path.exists(os_path) and not override:
        print("File already exists")
        return

    os.makedirs(download_dir, exist_ok=True)

    # Snapshot of the directory, so that the downloaded file can be told
    # apart from files that were already there.
    files_before = set(glob.glob(os.path.join(download_dir, '*')))

    firefox_options = webdriver.FirefoxOptions()
    # The command line flag alone enables headless mode; the `headless`
    # property is deprecated in Selenium 4 and was redundant here.
    firefox_options.add_argument('--headless')
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
    firefox_options.set_preference("browser.download.dir", download_dir)
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk",
                                   "application/octet-stream")

    driver = webdriver.Firefox(options=firefox_options)
    try:
        driver.get(url)
        WebDriverWait(driver, 30).until(
            EC.element_to_be_clickable((By.XPATH, clickable)))
        driver.execute_script(button_location)

        # Fixed grace period for the browser to finish writing the file.
        time.sleep(30)
    finally:
        # Always shut the browser down, also when the wait above times out.
        driver.quit()

    created = set(glob.glob(os.path.join(download_dir, '*'))) - files_before

    # Firefox keeps unfinished downloads next to the target as "*.part".
    if any(path.endswith(".part") for path in created):
        raise ValueError(f"Download of {database_name} database did not finish")
    if not created:
        raise ValueError(f"No file was downloaded for {database_name} database")

    latest_file = max(created, key=os.path.getctime)
    # os.replace overwrites an existing target on every platform, so the old
    # file is only lost once its replacement is really available.
    os.replace(latest_file, os_path)


def store_database_for_eys_gene(database_name, override=False):
    """
    Run the download routine registered for a database and report failures.

    The routine is taken from the "function" entry of
    DATABASES_DOWNLOAD_PATHS[database_name] and is called with
    (database_name, override). The listed download errors are not raised
    to the caller; they are printed to standard output instead.

    :param str database_name: the name of the database that should be downloaded
    :param bool override: should already existing file be overwritten
    """
    try:
        # An unknown name is reported through the same handlers as any other
        # failure, hence the raise inside the try block.
        if database_name not in DATABASES_DOWNLOAD_PATHS:
            raise IndexError(f"Requested {database_name} database is not supported")

        download = DATABASES_DOWNLOAD_PATHS[database_name]["function"]
        download(database_name, override)

    except TimeoutError as error:
        print(f"Error: {error}")
    except (selenium.common.InvalidArgumentException,
            selenium.common.exceptions.WebDriverException) as error:
        # browser / webdriver related problems
        print(f"Error: {error}")
    except (ValueError, IndexError, BadResponseException, DownloadError) as error:
        # bad input, unsupported database or failed file download
        print(f"Error:{error}")


# Download settings per supported database:
#   "url"       - page or file address the download starts from
#   "store_as"  - name (lovd: relative path) the downloaded file is stored under
#   "function"  - routine called as function(database_name, override)
#   "clickable" - XPath of the element that has to become clickable first
#   "button"    - JavaScript snippet executed in the page to start the download
DATABASES_DOWNLOAD_PATHS = {
    "clinvar": {
        "button": (
            'document.getElementsByName('
            '"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit"'
            ')[0].click()'
        ),
        "url": CLINVAR_URL_EYS,
        "store_as": "clinvar_data.txt",
        "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]",
        "function": download_database_for_eys_gene,
    },
    "gnomad": {
        "button": (
            "document.getElementsByClassName("
            "'Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT'"
            ")[4].click()"
        ),
        "url": GNOMAD_URL_EYS,
        "store_as": "gnomad_data.csv",
        "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]",
        "function": download_database_for_eys_gene,
    },
    "lovd": {
        "url": LOVD_FILE_URL_EYS,
        "store_as": "../data/lovd/lovd_data.txt",
        "function": download_lovd_database_for_eys_gene,
    },
}
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
requests
pandas
pandas
glob2
selenium

0 comments on commit 8a832e6

Please sign in to comment.