From 76777165d12a7f552411a254901a248d0ee3eba1 Mon Sep 17 00:00:00 2001 From: OGJunius Date: Thu, 15 Feb 2024 11:03:10 +0200 Subject: [PATCH 01/10] Added encoding utf-8 parameter to reading lovd file --- data_collection/tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index fa00df4..4233787 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -234,7 +234,7 @@ def from_lovd_to_pandas(path): d = dict() - with open(path) as f: + with open(path, encoding='utf-8') as f: # skip header [f.readline() for _ in range(4)] @@ -265,9 +265,9 @@ def from_lovd_to_pandas(path): convert_lovd_data_types(frame, table_name) d[table_name] = (frame, notes) + # skip inter tables lines [f.readline() for _ in range(1)] - return d except FileNotFoundError as e: print(f"Error: {e}") From 97c8ebfd3e44587d0ad84eadd29713529cd5ffcc Mon Sep 17 00:00:00 2001 From: OGJunius Date: Thu, 15 Feb 2024 11:07:54 +0200 Subject: [PATCH 02/10] Removed duplicates and typos from LOVD_VARIABLES_DATA_TYPES --- data_collection/tools.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index 4233787..7209623 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -61,7 +61,6 @@ class DownloadError(Exception): 'VariantOnTranscript/Exon': 'String', 'symbol': 'String', 'inheritance': 'String', - 'id_omin': 'Integer', 'tissues': 'String', 'features': 'String', 'remarks': 'String', @@ -116,7 +115,6 @@ class DownloadError(Exception): 'Individual/Origin/Population': 'String', 'Individual/Individual_ID': 'String', 'allele': 'Integer', - 'chromosome': 'Integer', 'position_g_start': 'Integer', 'position_g_end': 'Integer', 'type': 'String', From d1d68b5a4c5ab4ef083adacac11c8f07d2fa83c6 Mon Sep 17 00:00:00 2001 From: OGJunius Date: Thu, 15 Feb 2024 11:09:56 +0200 Subject: [PATCH 03/10] Added the 'Remaining' population to the population_groups --- data_collection/pipeline.py | 4 +++- data_collection/tools.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 0b12237..0a823d2 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -38,7 +38,9 @@ def calculate_max_frequency(row): 'European (Finnish)', 'European (non-Finnish)', 'Middle Eastern', - 'South Asian'] + 'South Asian', + 'Remaining' + ] max_freq = 0 max_pop = population_groups[0] diff --git a/data_collection/tools.py b/data_collection/tools.py index 7209623..978e078 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -317,7 +317,9 @@ def calculate_max_frequency(row): 'European (Finnish)', 'European (non-Finnish)', 'Middle Eastern', - 'South Asian'] + 'South Asian', + 'Remaining' + ] max_freq = 0 max_pop = population_groups[0] From 3c7a2a57f3b5d0cb5979554a1deaaebf832ddfe5 Mon Sep 17 00:00:00 2001 From: OGJunius Date: Mon, 19 Feb 2024 14:31:02 +0200 Subject: [PATCH 04/10] Implemented the feature to download data from gnomad and clinvar databases --- data_collection/pipeline.py | 9 ++-- data_collection/tools.py | 88 +++++++++++++++++++++++++++++++++++++ requirements.txt | 3 +- 3 files changed, 95 insertions(+), 5 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 0a823d2..294ae3f 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,6 +1,6 @@ import pandas as pd from pandas import DataFrame, Series -from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA +from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, download_database # CONSTANTS # files @@ -60,8 +60,10 @@ def calculate_max_frequency(row): # MAIN # Download all data get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True) -get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True) -get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True) +#get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True) +#get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True) +download_database('gnomad', 'gnomad_data.csv', GNOMAD_URL, True) +download_database('clinvar', 'clinvar_data.txt', CLINVAR_URL, True) # Read and convert data lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt") @@ -99,7 +101,6 @@ def calculate_max_frequency(row): max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1) lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values - # Leaving necessary columns lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id', diff --git a/data_collection/tools.py b/data_collection/tools.py index 978e078..755d106 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -2,6 +2,14 @@ import os import pandas as pd from pandas import DataFrame +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +import time +import glob + # EXCEPTIONS @@ -333,3 +341,83 @@ def calculate_max_frequency(row): max_pop = group return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population']) + + +def download_gnomad_database(url, path): + """ + scrapes the gnomad database + :param url: the url of the database website + :param path: path where the file is saved + """ + firefox_options = webdriver.FirefoxOptions() + firefox_options.headless = True + firefox_options.add_argument('--headless') + firefox_options.set_preference('browser.download.folderList', 2) + firefox_options.set_preference("browser.download.manager.showWhenStarting", False) + firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path)) + firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream") + + driver = webdriver.Firefox(options=firefox_options) + driver.get(url) + + wait = WebDriverWait(driver, 30) + + export_button = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]'))) + export_button.click() + + + time.sleep(10) + driver.quit() + + +def download_clinvar_database(url, path): + """ + scrapes the clinvar database + :param url: the url of the database website + :param path: path where the file is saved + """ + firefox_options = webdriver.FirefoxOptions() + firefox_options.headless = True + firefox_options.add_argument('--headless') + firefox_options.set_preference("browser.download.folderList", 2) + firefox_options.set_preference("browser.download.manager.showWhenStarting", False) + firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path)) + firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream") + + driver = webdriver.Firefox(options=firefox_options) + + driver.get(url) + + driver.execute_script("document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()") + time.sleep(30) + driver.quit() + + +def download_database(database_name, save_as, url, override=False): + """ + calls a function to download a database + and handles where it should be saved + :param database_name: the name of the database that should be downloaded + :param save_as: the name by which the database file should be saved + :param url: the url of the database website + :param override: should already existing file be overwritten + """ + ospath = os.path.join(os.getcwd(), "..", "data", database_name, save_as) + if os.path.exists(ospath): + if override: + os.remove(ospath) + else: + print("File is already downloaded") + return + match database_name: + case 'gnomad': + download_gnomad_database(url, database_name) + case 'clinvar': + download_clinvar_database(url, database_name) + case _: + print('This database is not supported') + + list_of_files = glob.glob(os.path.join(os.getcwd(), "..", "data", database_name, '*')) + latest_file = max(list_of_files, key=os.path.getctime) + os.rename(latest_file, ospath) + diff --git a/requirements.txt b/requirements.txt index a94cf69..22570db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests -pandas \ No newline at end of file +pandas +glob2 \ No newline at end of file From 9d852ff870ade983d8edcd93d8722dcd53eff988 Mon Sep 17 00:00:00 2001 From: OGJunius Date: Tue, 20 Feb 2024 12:10:17 +0200 Subject: [PATCH 05/10] Provided changes --- data_collection/pipeline.py | 7 +-- data_collection/tools.py | 109 ++++++++++++++++++------------------ requirements.txt | 3 +- 3 files changed, 58 insertions(+), 61 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 294ae3f..7f56051 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,6 +1,5 @@ import pandas as pd -from pandas import DataFrame, Series -from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, download_database +from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, store_database # CONSTANTS # files @@ -62,8 +61,8 @@ def calculate_max_frequency(row): get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True) #get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True) #get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True) -download_database('gnomad', 'gnomad_data.csv', GNOMAD_URL, True) -download_database('clinvar', 'clinvar_data.txt', CLINVAR_URL, True) +store_database('gnomad', True) +store_database('clinvar', True) # Read and convert data lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt") diff --git a/data_collection/tools.py b/data_collection/tools.py index 755d106..44509f0 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -3,7 +3,6 @@ import pandas as pd from pandas import DataFrame from selenium import webdriver -from selenium.webdriver.firefox.options import Options from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By @@ -11,6 +10,22 @@ import glob +DATABASES_DOWNLOAD_PATHS = { + "clinvar": { + "button": 'document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()', + "url": "https://www.ncbi.nlm.nih.gov/clinvar/?term=EYS%5Bgene%5D&redir=gene", + "store_as": "clinvar_data.txt", + "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]" + }, + "gnomad": { + "button":"document.getElementsByClassName('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()", + "url": "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4", + "store_as": "gnomad_data.csv", + "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]" + } +} + + # EXCEPTIONS class BadResponseException(Exception): @@ -343,81 +358,63 @@ def calculate_max_frequency(row): return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population']) -def download_gnomad_database(url, path): +def download_database(url, database_name, button_location, clickable): """ - scrapes the gnomad database - :param url: the url of the database website - :param path: path where the file is saved - """ - firefox_options = webdriver.FirefoxOptions() - firefox_options.headless = True - firefox_options.add_argument('--headless') - firefox_options.set_preference('browser.download.folderList', 2) - firefox_options.set_preference("browser.download.manager.showWhenStarting", False) - firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path)) - firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream") - - driver = webdriver.Firefox(options=firefox_options) - driver.get(url) - - wait = WebDriverWait(driver, 30) - - export_button = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]'))) - export_button.click() - - - time.sleep(10) - driver.quit() - - -def download_clinvar_database(url, path): - """ - scrapes the clinvar database - :param url: the url of the database website - :param path: path where the file is saved + downloads chosen database + :param url: the url of the database's website + :param database_name: the name of the database + :param button_location: button which should be clicked on page for download + :param clickable: an element in a webpage indicating that the download can start """ firefox_options = webdriver.FirefoxOptions() firefox_options.headless = True firefox_options.add_argument('--headless') firefox_options.set_preference("browser.download.folderList", 2) firefox_options.set_preference("browser.download.manager.showWhenStarting", False) - firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path)) + firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", database_name)) firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream") driver = webdriver.Firefox(options=firefox_options) - driver.get(url) - driver.execute_script("document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()") - time.sleep(30) - driver.quit() + try: + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, clickable))) + driver.execute_script(button_location) + + time.sleep(30) + except TimeoutError as e: + print(f"Error: {e}") + finally: + driver.quit() -def download_database(database_name, save_as, url, override=False): +def store_database(database_name, override=False): """ calls a function to download a database - and handles where it should be saved + and handles where it should be saved, + renames the downloaded (latest) file to appropriate name :param database_name: the name of the database that should be downloaded - :param save_as: the name by which the database file should be saved - :param url: the url of the database website :param override: should already existing file be overwritten """ - ospath = os.path.join(os.getcwd(), "..", "data", database_name, save_as) - if os.path.exists(ospath): - if override: - os.remove(ospath) - else: - print("File is already downloaded") - return - match database_name: - case 'gnomad': - download_gnomad_database(url, database_name) - case 'clinvar': - download_clinvar_database(url, database_name) - case _: - print('This database is not supported') + if database_name not in DATABASES_DOWNLOAD_PATHS.keys(): + print("Requested database is not supported") + return + save_as = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"] + os_path = os.path.join(os.getcwd(), "..", "data", database_name, save_as) + + if os.path.exists(os_path) and override: + os.remove(os_path) + elif os.path.exists(os_path) and not override: + print("File already exits") + return + + url = DATABASES_DOWNLOAD_PATHS[database_name]["url"] + button = DATABASES_DOWNLOAD_PATHS[database_name]["button"] + clickable = DATABASES_DOWNLOAD_PATHS[database_name]["clickable"] + + download_database(url, database_name, button, clickable) list_of_files = glob.glob(os.path.join(os.getcwd(), "..", "data", database_name, '*')) latest_file = max(list_of_files, key=os.path.getctime) - os.rename(latest_file, ospath) + os.rename(latest_file, os_path) diff --git a/requirements.txt b/requirements.txt index 22570db..ca3ef2d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ requests pandas -glob2 \ No newline at end of file +glob2 +selenium \ No newline at end of file From 338de02f42e96fd03502987bbe797d34417daa1b Mon Sep 17 00:00:00 2001 From: OGJunius Date: Tue, 20 Feb 2024 12:11:14 +0200 Subject: [PATCH 06/10] typo --- data_collection/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index 44509f0..5bce558 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -405,7 +405,7 @@ def store_database(database_name, override=False): if os.path.exists(os_path) and override: os.remove(os_path) elif os.path.exists(os_path) and not override: - print("File already exits") + print("File already exists") return url = DATABASES_DOWNLOAD_PATHS[database_name]["url"] From 478f6797bd1e495fa56c917d5e1447ead7c0d79b Mon Sep 17 00:00:00 2001 From: Junius Date: Sat, 2 Mar 2024 14:07:28 +0200 Subject: [PATCH 07/10] implemented the storing of databases functionality. Added lovd to the downloadable databases. --- data_collection/pipeline.py | 14 ++--- data_collection/tools.py | 116 ++++++++++++++++++++---------------- 2 files changed, 69 insertions(+), 61 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 7f56051..f066e56 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,16 +1,11 @@ import pandas as pd -from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, store_database +from tools import from_lovd_to_pandas, from_clinvar_name_to_DNA, store_database_for_eys_gene # CONSTANTS # files LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS" LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS" -GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" -GNOMAD_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y&export=download" - -CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" -CLINVAR_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u&export=download" # path DATA_PATH = "../data" @@ -58,11 +53,12 @@ def calculate_max_frequency(row): # MAIN # Download all data -get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True) +store_database_for_eys_gene('lovd', True) +#get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True) #get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True) #get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True) -store_database('gnomad', True) -store_database('clinvar', True) +store_database_for_eys_gene('gnomad', True) +store_database_for_eys_gene('clinvar', True) # Read and convert data lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt") diff --git a/data_collection/tools.py b/data_collection/tools.py index 5bce558..c934361 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -1,6 +1,7 @@ import requests import os import pandas as pd +import selenium.common from pandas import DataFrame from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait @@ -10,23 +11,6 @@ import glob -DATABASES_DOWNLOAD_PATHS = { - "clinvar": { - "button": 'document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()', - "url": "https://www.ncbi.nlm.nih.gov/clinvar/?term=EYS%5Bgene%5D&redir=gene", - "store_as": "clinvar_data.txt", - "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]" - }, - "gnomad": { - "button":"document.getElementsByClassName('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()", - "url": "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4", - "store_as": "gnomad_data.csv", - "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]" - } -} - - - # EXCEPTIONS class BadResponseException(Exception): pass @@ -166,15 +150,16 @@ class DownloadError(Exception): } -def get_file_from_url(url, save_to, override=False): +def get_file_from_url(database_name, override=False): """ Gets file from url and saves it into provided path. Overrides, if override is True. - :param str url: link with file - :param str save_to: path to save + :param str database_name: link with file :param bool override: needs override """ + url = DATABASES_DOWNLOAD_PATHS[database_name]["url"] + save_to = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"] try: # check if directory exists, if not - create save_to_dir = os.path.dirname(save_to) @@ -358,14 +343,19 @@ def calculate_max_frequency(row): return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population']) -def download_database(url, database_name, button_location, clickable): +def download_database_for_eys_gene(database_name, override=False): """ downloads chosen database - :param url: the url of the database's website + and handles where it should be saved, + renames the downloaded (latest) file to appropriate name :param database_name: the name of the database - :param button_location: button which should be clicked on page for download - :param clickable: an element in a webpage indicating that the download can start + :param override: should an existing file be overriden with a new one """ + + url = DATABASES_DOWNLOAD_PATHS[database_name]["url"] + button_location = DATABASES_DOWNLOAD_PATHS[database_name]["button"] + clickable = DATABASES_DOWNLOAD_PATHS[database_name]["clickable"] + firefox_options = webdriver.FirefoxOptions() firefox_options.headless = True firefox_options.add_argument('--headless') @@ -376,29 +366,12 @@ def download_database(url, database_name, button_location, clickable): driver = webdriver.Firefox(options=firefox_options) driver.get(url) + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, clickable))) + driver.execute_script(button_location) - try: - WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, clickable))) - driver.execute_script(button_location) - - time.sleep(30) - except TimeoutError as e: - print(f"Error: {e}") - finally: - driver.quit() + time.sleep(30) + driver.quit() - -def store_database(database_name, override=False): - """ - calls a function to download a database - and handles where it should be saved, - renames the downloaded (latest) file to appropriate name - :param database_name: the name of the database that should be downloaded - :param override: should already existing file be overwritten - """ - if database_name not in DATABASES_DOWNLOAD_PATHS.keys(): - print("Requested database is not supported") - return save_as = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"] os_path = os.path.join(os.getcwd(), "..", "data", database_name, save_as) @@ -407,14 +380,53 @@ def store_database(database_name, override=False): elif os.path.exists(os_path) and not override: print("File already exists") return - - url = DATABASES_DOWNLOAD_PATHS[database_name]["url"] - button = DATABASES_DOWNLOAD_PATHS[database_name]["button"] - clickable = DATABASES_DOWNLOAD_PATHS[database_name]["clickable"] - - download_database(url, database_name, button, clickable) - list_of_files = glob.glob(os.path.join(os.getcwd(), "..", "data", database_name, '*')) latest_file = max(list_of_files, key=os.path.getctime) os.rename(latest_file, os_path) + +def store_database_for_eys_gene(database_name, override=False): + """ + calls a function to download a database + :param database_name: the name of the database that should be downloaded + :param override: should already existing file be overwritten + """ + try: + if database_name not in DATABASES_DOWNLOAD_PATHS: + raise IndexError(f"Requested {database_name} database is not supported") + + DATABASES_DOWNLOAD_PATHS[database_name]["function"](database_name, override) + + except TimeoutError as e: + print(f"Error: {e}") + except selenium.common.InvalidArgumentException as e: + print(f"Error: {e}") + except selenium.common.exceptions.WebDriverException as e: + print(f"Error: {e}") + except ValueError as e: + print(f"Error:{e}") + except IndexError as e: + print(f"Error:{e}") + + +DATABASES_DOWNLOAD_PATHS = { + "clinvar": { + "button": 'document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()', + "url": "https://www.ncbi.nlm.nih.gov/clinvar/?term=EYS%5Bgene%5D&redir=gene", + "store_as": "clinvar_data.txt", + "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]", + "function": download_database_for_eys_gene + }, + "gnomad": { + "button":"document.getElementsByClassName('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()", + "url": "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4", + "store_as": "gnomad_data.csv", + "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]", + "function": download_database_for_eys_gene + }, + "lovd": { + "url": "https://databases.lovd.nl/shared/download/all/gene/EYS", + "store_as": "../data/lovd/lovd_data.txt", + "function": get_file_from_url + } +} \ No newline at end of file From a70b0ae9dce9c0fb9eddbde414397dc26f3a06f1 Mon Sep 17 00:00:00 2001 From: Junius Date: Sat, 2 Mar 2024 14:10:20 +0200 Subject: [PATCH 08/10] implemented the storing of databases functionality. Added lovd to the downloadable databases. --- data_collection/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index c934361..ac30162 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -154,7 +154,7 @@ def get_file_from_url(database_name, override=False): """ Gets file from url and saves it into provided path. Overrides, if override is True. - :param str database_name: link with file + :param str database_name: database to download :param bool override: needs override """ From 06587ec2f10f5481d29fd9b8672cdb728efa7c56 Mon Sep 17 00:00:00 2001 From: Junius Date: Mon, 4 Mar 2024 21:42:51 +0200 Subject: [PATCH 09/10] restored get_file_from_url function and added download_lovd_database. --- data_collection/tools.py | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index ac30162..b793f55 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -3,6 +3,7 @@ import pandas as pd import selenium.common from pandas import DataFrame +from requests import RequestException from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC @@ -150,7 +151,39 @@ class DownloadError(Exception): } -def get_file_from_url(database_name, override=False): +def get_file_from_url(url, save_to, override=False): + """ + Gets file from url and saves it into provided path. Overrides, if override is True. + + :param str url: link with file + :param str save_to: path to save + :param bool override: needs override + """ + + # check if directory exists, if not - create + save_to_dir = os.path.dirname(save_to) + if not os.path.exists(save_to_dir): + os.makedirs(save_to_dir) + + # check if file exist and needs to override + if os.path.exists(save_to) and not override: + print(f"The file at {save_to} already exists.") + return + + try: + response = requests.get(url, timeout=10) + except RequestException as e: + raise DownloadError(f"Error while downloading file from {url}") from e + + if response.status_code != 200: + raise BadResponseException(f"Bad response from {url}." + f" Status code: {response.status_code}") + + with open(save_to, "wb") as f: + f.write(response.content) + + +def download_lovd_database(database_name, override=False): """ Gets file from url and saves it into provided path. Overrides, if override is True. @@ -427,6 +460,6 @@ def store_database_for_eys_gene(database_name, override=False): "lovd": { "url": "https://databases.lovd.nl/shared/download/all/gene/EYS", "store_as": "../data/lovd/lovd_data.txt", - "function": get_file_from_url + "function": download_lovd_database } -} \ No newline at end of file +} From 02fc60b29a0ed03fb1ad7b304105962ebd06c36b Mon Sep 17 00:00:00 2001 From: Junius Date: Mon, 4 Mar 2024 23:13:40 +0200 Subject: [PATCH 10/10] added back .github\workflows file. --- .github/workflows/pylint.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 0000000..89c0945 --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,24 @@ +name: Pylint + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + pip install -r requirements.txt + - name: Analysing the code with pylint + run: | + pylint $(git ls-files '*.py')