From 76777165d12a7f552411a254901a248d0ee3eba1 Mon Sep 17 00:00:00 2001
From: OGJunius <junius.vaitkus1@gmail.com>
Date: Thu, 15 Feb 2024 11:03:10 +0200
Subject: [PATCH 01/10] Added encoding utf-8 parameter to reading lovd file

---
 data_collection/tools.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index fa00df4..4233787 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -234,7 +234,7 @@ def from_lovd_to_pandas(path):
 
         d = dict()
 
-        with open(path) as f:
+        with open(path, encoding='utf-8') as f:
             # skip header
             [f.readline() for _ in range(4)]
 
@@ -265,9 +265,9 @@ def from_lovd_to_pandas(path):
                 convert_lovd_data_types(frame, table_name)
 
                 d[table_name] = (frame, notes)
+
                 # skip inter tables lines
                 [f.readline() for _ in range(1)]
-
         return d
     except FileNotFoundError as e:
         print(f"Error: {e}")

From 97c8ebfd3e44587d0ad84eadd29713529cd5ffcc Mon Sep 17 00:00:00 2001
From: OGJunius <junius.vaitkus1@gmail.com>
Date: Thu, 15 Feb 2024 11:07:54 +0200
Subject: [PATCH 02/10] Removed duplicates and typos from
 LOVD_VARIABLES_DATA_TYPES

---
 data_collection/tools.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index 4233787..7209623 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -61,7 +61,6 @@ class DownloadError(Exception):
     'VariantOnTranscript/Exon': 'String',
     'symbol': 'String',
     'inheritance': 'String',
-    'id_omin': 'Integer',
     'tissues': 'String',
     'features': 'String',
     'remarks': 'String',
@@ -116,7 +115,6 @@ class DownloadError(Exception):
     'Individual/Origin/Population': 'String',
     'Individual/Individual_ID': 'String',
     'allele': 'Integer',
-    'chromosome': 'Integer',
     'position_g_start': 'Integer',
     'position_g_end': 'Integer',
     'type': 'String',

From d1d68b5a4c5ab4ef083adacac11c8f07d2fa83c6 Mon Sep 17 00:00:00 2001
From: OGJunius <junius.vaitkus1@gmail.com>
Date: Thu, 15 Feb 2024 11:09:56 +0200
Subject: [PATCH 03/10] Added the 'Remaining' population to the
 population_groups

---
 data_collection/pipeline.py | 4 +++-
 data_collection/tools.py    | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index 0b12237..0a823d2 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -38,7 +38,9 @@ def calculate_max_frequency(row):
         'European (Finnish)',
         'European (non-Finnish)',
         'Middle Eastern',
-        'South Asian']
+        'South Asian',
+        'Remaining'
+    ]
 
     max_freq = 0
     max_pop = population_groups[0]
diff --git a/data_collection/tools.py b/data_collection/tools.py
index 7209623..978e078 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -317,7 +317,9 @@ def calculate_max_frequency(row):
         'European (Finnish)',
         'European (non-Finnish)',
         'Middle Eastern',
-        'South Asian']
+        'South Asian',
+        'Remaining'
+    ]
 
     max_freq = 0
     max_pop = population_groups[0]

From 3c7a2a57f3b5d0cb5979554a1deaaebf832ddfe5 Mon Sep 17 00:00:00 2001
From: OGJunius <junius.vaitkus1@gmail.com>
Date: Mon, 19 Feb 2024 14:31:02 +0200
Subject: [PATCH 04/10] Implemented the feature to download data from gnomad
 and clinvar databases

---
 data_collection/pipeline.py |  9 ++--
 data_collection/tools.py    | 88 +++++++++++++++++++++++++++++++++++++
 requirements.txt            |  3 +-
 3 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index 0a823d2..294ae3f 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from pandas import DataFrame, Series
-from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA
+from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, download_database
 
 # CONSTANTS
 # files
@@ -60,8 +60,10 @@ def calculate_max_frequency(row):
 # MAIN
 # Download all data
 get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
-get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
-get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
+#get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
+#get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
+download_database('gnomad', 'gnomad_data.csv', GNOMAD_URL, True)
+download_database('clinvar', 'clinvar_data.txt', CLINVAR_URL, True)
 
 # Read and convert data
 lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
@@ -99,7 +101,6 @@ def calculate_max_frequency(row):
 max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
 lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values
 
-
 # Leaving necessary columns
 
 lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',
diff --git a/data_collection/tools.py b/data_collection/tools.py
index 978e078..755d106 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -2,6 +2,14 @@
 import os
 import pandas as pd
 from pandas import DataFrame
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+import time
+import glob
+
 
 
 # EXCEPTIONS
@@ -333,3 +341,83 @@ def calculate_max_frequency(row):
             max_pop = group
 
     return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population'])
+
+
+def download_gnomad_database(url, path):
+    """
+    scrapes the gnomad database
+    :param url: the url of the database website
+    :param path: path where the file is saved
+    """
+    firefox_options = webdriver.FirefoxOptions()
+    firefox_options.headless = True
+    firefox_options.add_argument('--headless')
+    firefox_options.set_preference('browser.download.folderList', 2)
+    firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
+    firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path))
+    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
+
+    driver = webdriver.Firefox(options=firefox_options)
+    driver.get(url)
+
+    wait = WebDriverWait(driver, 30)
+
+    export_button = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]')))
+    export_button.click()
+
+
+    time.sleep(10)
+    driver.quit()
+
+
+def download_clinvar_database(url, path):
+    """
+    scrapes the clinvar database
+    :param url: the url of the database website
+    :param path: path where the file is saved
+    """
+    firefox_options = webdriver.FirefoxOptions()
+    firefox_options.headless = True
+    firefox_options.add_argument('--headless')
+    firefox_options.set_preference("browser.download.folderList", 2)
+    firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
+    firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path))
+    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
+
+    driver = webdriver.Firefox(options=firefox_options)
+
+    driver.get(url)
+
+    driver.execute_script("document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()")
+    time.sleep(30)
+    driver.quit()
+
+
+def download_database(database_name, save_as, url, override=False):
+    """
+    calls a function to download a database
+    and handles where it should be saved
+    :param database_name: the name of the database that should be downloaded
+    :param save_as: the name by which the database file should be saved
+    :param url: the url of the database website
+    :param override: should already existing file be overwritten
+    """
+    ospath = os.path.join(os.getcwd(), "..", "data", database_name, save_as)
+    if os.path.exists(ospath):
+        if override:
+            os.remove(ospath)
+        else:
+            print("File is already downloaded")
+            return
+    match database_name:
+        case 'gnomad':
+            download_gnomad_database(url, database_name)
+        case 'clinvar':
+            download_clinvar_database(url, database_name)
+        case _:
+            print('This database is not supported')
+
+    list_of_files = glob.glob(os.path.join(os.getcwd(), "..", "data", database_name, '*'))
+    latest_file = max(list_of_files, key=os.path.getctime)
+    os.rename(latest_file, ospath)
+
diff --git a/requirements.txt b/requirements.txt
index a94cf69..22570db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 requests
-pandas
\ No newline at end of file
+pandas
+glob2
\ No newline at end of file

From 9d852ff870ade983d8edcd93d8722dcd53eff988 Mon Sep 17 00:00:00 2001
From: OGJunius <junius.vaitkus1@gmail.com>
Date: Tue, 20 Feb 2024 12:10:17 +0200
Subject: [PATCH 05/10] Unified gnomad and clinvar downloads via DATABASES_DOWNLOAD_PATHS

---
 data_collection/pipeline.py |   7 +--
 data_collection/tools.py    | 109 ++++++++++++++++++------------------
 requirements.txt            |   3 +-
 3 files changed, 58 insertions(+), 61 deletions(-)

diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index 294ae3f..7f56051 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -1,6 +1,5 @@
 import pandas as pd
-from pandas import DataFrame, Series
-from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, download_database
+from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, store_database
 
 # CONSTANTS
 # files
@@ -62,8 +61,8 @@ def calculate_max_frequency(row):
 get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
 #get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
 #get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
-download_database('gnomad', 'gnomad_data.csv', GNOMAD_URL, True)
-download_database('clinvar', 'clinvar_data.txt', CLINVAR_URL, True)
+store_database('gnomad', True)
+store_database('clinvar', True)
 
 # Read and convert data
 lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
diff --git a/data_collection/tools.py b/data_collection/tools.py
index 755d106..44509f0 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -3,7 +3,6 @@
 import pandas as pd
 from pandas import DataFrame
 from selenium import webdriver
-from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
@@ -11,6 +10,22 @@
 import glob
 
 
+DATABASES_DOWNLOAD_PATHS = {
+    "clinvar": {
+        "button": 'document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()',
+        "url": "https://www.ncbi.nlm.nih.gov/clinvar/?term=EYS%5Bgene%5D&redir=gene",
+        "store_as": "clinvar_data.txt",
+        "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]"
+    },
+    "gnomad": {
+        "button":"document.getElementsByClassName('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()",
+        "url": "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4",
+        "store_as": "gnomad_data.csv",
+        "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]"
+    }
+}
+
+
 
 # EXCEPTIONS
 class BadResponseException(Exception):
@@ -343,81 +358,63 @@ def calculate_max_frequency(row):
     return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population'])
 
 
-def download_gnomad_database(url, path):
+def download_database(url, database_name, button_location, clickable):
     """
-    scrapes the gnomad database
-    :param url: the url of the database website
-    :param path: path where the file is saved
-    """
-    firefox_options = webdriver.FirefoxOptions()
-    firefox_options.headless = True
-    firefox_options.add_argument('--headless')
-    firefox_options.set_preference('browser.download.folderList', 2)
-    firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
-    firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path))
-    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
-
-    driver = webdriver.Firefox(options=firefox_options)
-    driver.get(url)
-
-    wait = WebDriverWait(driver, 30)
-
-    export_button = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]')))
-    export_button.click()
-
-
-    time.sleep(10)
-    driver.quit()
-
-
-def download_clinvar_database(url, path):
-    """
-    scrapes the clinvar database
-    :param url: the url of the database website
-    :param path: path where the file is saved
+    downloads chosen database
+    :param url: the url of the database's website
+    :param database_name: the name of the database
+    :param button_location: button which should be clicked on page for download
+    :param clickable: an element in a webpage indicating that the download can start
     """
     firefox_options = webdriver.FirefoxOptions()
     firefox_options.headless = True
     firefox_options.add_argument('--headless')
     firefox_options.set_preference("browser.download.folderList", 2)
     firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
-    firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", path))
+    firefox_options.set_preference("browser.download.dir", os.path.join(os.getcwd(), "..", "data", database_name))
     firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
 
     driver = webdriver.Firefox(options=firefox_options)
-
     driver.get(url)
 
-    driver.execute_script("document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()")
-    time.sleep(30)
-    driver.quit()
+    try:
+        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, clickable)))
+        driver.execute_script(button_location)
+
+        time.sleep(30)
+    except TimeoutError as e:
+        print(f"Error: {e}")
+    finally:
+        driver.quit()
 
 
-def download_database(database_name, save_as, url, override=False):
+def store_database(database_name, override=False):
     """
     calls a function to download a database
-    and handles where it should be saved
+    and handles where it should be saved,
+    renames the downloaded (latest) file to appropriate name
     :param database_name: the name of the database that should be downloaded
-    :param save_as: the name by which the database file should be saved
-    :param url: the url of the database website
     :param override: should already existing file be overwritten
     """
-    ospath = os.path.join(os.getcwd(), "..", "data", database_name, save_as)
-    if os.path.exists(ospath):
-        if override:
-            os.remove(ospath)
-        else:
-            print("File is already downloaded")
-            return
-    match database_name:
-        case 'gnomad':
-            download_gnomad_database(url, database_name)
-        case 'clinvar':
-            download_clinvar_database(url, database_name)
-        case _:
-            print('This database is not supported')
+    if database_name not in DATABASES_DOWNLOAD_PATHS.keys():
+        print("Requested database is not supported")
+        return
+    save_as = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"]
+    os_path = os.path.join(os.getcwd(), "..", "data", database_name, save_as)
+
+    if os.path.exists(os_path) and override:
+        os.remove(os_path)
+    elif os.path.exists(os_path) and not override:
+        print("File already exits")
+        return
+
+    url = DATABASES_DOWNLOAD_PATHS[database_name]["url"]
+    button = DATABASES_DOWNLOAD_PATHS[database_name]["button"]
+    clickable = DATABASES_DOWNLOAD_PATHS[database_name]["clickable"]
+
+    download_database(url, database_name, button, clickable)
 
     list_of_files = glob.glob(os.path.join(os.getcwd(), "..", "data", database_name, '*'))
     latest_file = max(list_of_files, key=os.path.getctime)
-    os.rename(latest_file, ospath)
+    os.rename(latest_file, os_path)
 
diff --git a/requirements.txt b/requirements.txt
index 22570db..ca3ef2d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 requests
 pandas
-glob2
\ No newline at end of file
+glob2
+selenium
\ No newline at end of file

From 338de02f42e96fd03502987bbe797d34417daa1b Mon Sep 17 00:00:00 2001
From: OGJunius <junius.vaitkus1@gmail.com>
Date: Tue, 20 Feb 2024 12:11:14 +0200
Subject: [PATCH 06/10] typo

---
 data_collection/tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index 44509f0..5bce558 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -405,7 +405,7 @@ def store_database(database_name, override=False):
     if os.path.exists(os_path) and override:
         os.remove(os_path)
     elif os.path.exists(os_path) and not override:
-        print("File already exits")
+        print("File already exists")
         return
 
     url = DATABASES_DOWNLOAD_PATHS[database_name]["url"]

From 478f6797bd1e495fa56c917d5e1447ead7c0d79b Mon Sep 17 00:00:00 2001
From: Junius <junius.vaitkus1@gmail.com>
Date: Sat, 2 Mar 2024 14:07:28 +0200
Subject: [PATCH 07/10] implemented the storing of databases functionality.
 Added lovd to the downloadable databases.

---
 data_collection/pipeline.py |  14 ++---
 data_collection/tools.py    | 116 ++++++++++++++++++++----------------
 2 files changed, 69 insertions(+), 61 deletions(-)

diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index 7f56051..f066e56 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -1,16 +1,11 @@
 import pandas as pd
-from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, store_database
+from tools import from_lovd_to_pandas, from_clinvar_name_to_DNA, store_database_for_eys_gene
 
 # CONSTANTS
 # files
 LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS"
 LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS"
 
-GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
-GNOMAD_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y&export=download"
-
-CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
-CLINVAR_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u&export=download"
 
 # path
 DATA_PATH = "../data"
@@ -58,11 +53,12 @@ def calculate_max_frequency(row):
 
 # MAIN
 # Download all data
-get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
+store_database_for_eys_gene('lovd', True)
+#get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
 #get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
 #get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
-store_database('gnomad', True)
-store_database('clinvar', True)
+store_database_for_eys_gene('gnomad', True)
+store_database_for_eys_gene('clinvar', True)
 
 # Read and convert data
 lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
diff --git a/data_collection/tools.py b/data_collection/tools.py
index 5bce558..c934361 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -1,6 +1,7 @@
 import requests
 import os
 import pandas as pd
+import selenium.common
 from pandas import DataFrame
 from selenium import webdriver
 from selenium.webdriver.support.ui import WebDriverWait
@@ -10,23 +11,6 @@
 import glob
 
 
-DATABASES_DOWNLOAD_PATHS = {
-    "clinvar": {
-        "button": 'document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()',
-        "url": "https://www.ncbi.nlm.nih.gov/clinvar/?term=EYS%5Bgene%5D&redir=gene",
-        "store_as": "clinvar_data.txt",
-        "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]"
-    },
-    "gnomad": {
-        "button":"document.getElementsByClassName('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()",
-        "url": "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4",
-        "store_as": "gnomad_data.csv",
-        "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]"
-    }
-}
-
-
-
 # EXCEPTIONS
 class BadResponseException(Exception):
     pass
@@ -166,15 +150,16 @@ class DownloadError(Exception):
 }
 
 
-def get_file_from_url(url, save_to, override=False):
+def get_file_from_url(database_name, override=False):
     """
     Gets file from url and saves it into provided path. Overrides, if override is True.
 
-    :param str url: link with file
-    :param str save_to: path to save
+    :param str database_name: link with file
     :param bool override: needs override
     """
 
+    url = DATABASES_DOWNLOAD_PATHS[database_name]["url"]
+    save_to = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"]
     try:
         # check if directory exists, if not - create
         save_to_dir = os.path.dirname(save_to)
@@ -358,14 +343,19 @@ def calculate_max_frequency(row):
     return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population'])
 
 
-def download_database(url, database_name, button_location, clickable):
+def download_database_for_eys_gene(database_name, override=False):
     """
     downloads chosen database
-    :param url: the url of the database's website
+    and handles where it should be saved,
+    renames the downloaded (latest) file to appropriate name
     :param database_name: the name of the database
-    :param button_location: button which should be clicked on page for download
-    :param clickable: an element in a webpage indicating that the download can start
+    :param override: should an existing file be overridden with a new one
     """
+
+    url = DATABASES_DOWNLOAD_PATHS[database_name]["url"]
+    button_location = DATABASES_DOWNLOAD_PATHS[database_name]["button"]
+    clickable = DATABASES_DOWNLOAD_PATHS[database_name]["clickable"]
+
     firefox_options = webdriver.FirefoxOptions()
     firefox_options.headless = True
     firefox_options.add_argument('--headless')
@@ -376,29 +366,12 @@ def download_database(url, database_name, button_location, clickable):
 
     driver = webdriver.Firefox(options=firefox_options)
     driver.get(url)
+    WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, clickable)))
+    driver.execute_script(button_location)
 
-    try:
-        WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.XPATH, clickable)))
-        driver.execute_script(button_location)
-
-        time.sleep(30)
-    except TimeoutError as e:
-        print(f"Error: {e}")
-    finally:
-        driver.quit()
+    time.sleep(30)
+    driver.quit()
 
-
-def store_database(database_name, override=False):
-    """
-    calls a function to download a database
-    and handles where it should be saved,
-    renames the downloaded (latest) file to appropriate name
-    :param database_name: the name of the database that should be downloaded
-    :param override: should already existing file be overwritten
-    """
-    if database_name not in DATABASES_DOWNLOAD_PATHS.keys():
-        print("Requested database is not supported")
-        return
     save_as = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"]
     os_path = os.path.join(os.getcwd(), "..", "data", database_name, save_as)
 
@@ -407,14 +380,53 @@ def store_database(database_name, override=False):
     elif os.path.exists(os_path) and not override:
         print("File already exists")
         return
-
-    url = DATABASES_DOWNLOAD_PATHS[database_name]["url"]
-    button = DATABASES_DOWNLOAD_PATHS[database_name]["button"]
-    clickable = DATABASES_DOWNLOAD_PATHS[database_name]["clickable"]
-
-    download_database(url, database_name, button, clickable)
-
     list_of_files = glob.glob(os.path.join(os.getcwd(), "..", "data", database_name, '*'))
     latest_file = max(list_of_files, key=os.path.getctime)
     os.rename(latest_file, os_path)
 
+
+def store_database_for_eys_gene(database_name, override=False):
+    """
+    Look up the downloader registered for a database and run it.
+
+    The exception types listed below are printed rather than propagated.
+    :param database_name: the name of the database that should be downloaded
+    :param override: should already existing file be overwritten
+    """
+    try:
+        if database_name not in DATABASES_DOWNLOAD_PATHS:
+            raise IndexError(f"Requested {database_name} database is not supported")
+        downloader = DATABASES_DOWNLOAD_PATHS[database_name]["function"]
+        downloader(database_name, override)
+    except (
+        TimeoutError,
+        selenium.common.InvalidArgumentException,
+        selenium.common.exceptions.WebDriverException,
+    ) as e:
+        # timeout and browser-driver failures share one message format
+        print(f"Error: {e}")
+    except (ValueError, IndexError) as e:
+        print(f"Error:{e}")
+
+
+DATABASES_DOWNLOAD_PATHS = {
+    "clinvar": {
+        "button": 'document.getElementsByName(\"EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit\")[0].click()',
+        "url": "https://www.ncbi.nlm.nih.gov/clinvar/?term=EYS%5Bgene%5D&redir=gene",
+        "store_as": "clinvar_data.txt",
+        "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]",
+        "function": download_database_for_eys_gene
+    },
+    "gnomad": {
+        "button":"document.getElementsByClassName('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()",
+        "url": "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4",
+        "store_as": "gnomad_data.csv",
+        "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]",
+        "function": download_database_for_eys_gene
+    },
+    "lovd": {
+        "url": "https://databases.lovd.nl/shared/download/all/gene/EYS",
+        "store_as": "../data/lovd/lovd_data.txt",
+        "function": get_file_from_url
+    }
+}
\ No newline at end of file

From a70b0ae9dce9c0fb9eddbde414397dc26f3a06f1 Mon Sep 17 00:00:00 2001
From: Junius <junius.vaitkus1@gmail.com>
Date: Sat, 2 Mar 2024 14:10:20 +0200
Subject: [PATCH 08/10] Corrected the database_name parameter description in
 the get_file_from_url docstring

---
 data_collection/tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index c934361..ac30162 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -154,7 +154,7 @@ def get_file_from_url(database_name, override=False):
     """
     Gets file from url and saves it into provided path. Overrides, if override is True.
 
-    :param str database_name: link with file
+    :param str database_name: database to download
     :param bool override: needs override
     """
 

From 06587ec2f10f5481d29fd9b8672cdb728efa7c56 Mon Sep 17 00:00:00 2001
From: Junius <junius.vaitkus1@gmail.com>
Date: Mon, 4 Mar 2024 21:42:51 +0200
Subject: [PATCH 09/10] restored get_file_from_url function and added
 download_lovd_database.

---
 data_collection/tools.py | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index ac30162..b793f55 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import selenium.common
 from pandas import DataFrame
+from requests import RequestException
 from selenium import webdriver
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
@@ -150,7 +151,39 @@ class DownloadError(Exception):
 }
 
 
-def get_file_from_url(database_name, override=False):
+def get_file_from_url(url, save_to, override=False):
+    """
+    Download the file at ``url`` and write it to ``save_to``.
+
+    :param str url: link with file
+    :param str save_to: path to save
+    :param bool override: overwrite ``save_to`` when it already exists
+    :raises DownloadError: the HTTP request itself failed
+    :raises BadResponseException: the server answered with a non-200 status
+    """
+    # A bare file name has no directory part; os.makedirs("") would raise.
+    save_to_dir = os.path.dirname(save_to)
+    if save_to_dir:
+        # exist_ok tolerates the directory being created concurrently
+        os.makedirs(save_to_dir, exist_ok=True)
+
+    # keep an existing file unless the caller asked to replace it
+    if os.path.exists(save_to) and not override:
+        print(f"The file at {save_to} already exists.")
+        return
+
+    try:
+        response = requests.get(url, timeout=10)
+    except RequestException as e:
+        raise DownloadError(f"Error while downloading file from {url}") from e
+    if response.status_code != 200:
+        raise BadResponseException(f"Bad response from {url}."
+                                   f" Status code: {response.status_code}")
+    with open(save_to, "wb") as f:
+        f.write(response.content)
+
+
+def download_lovd_database(database_name, override=False):
     """
     Gets file from url and saves it into provided path. Overrides, if override is True.
 
@@ -427,6 +460,6 @@ def store_database_for_eys_gene(database_name, override=False):
     "lovd": {
         "url": "https://databases.lovd.nl/shared/download/all/gene/EYS",
         "store_as": "../data/lovd/lovd_data.txt",
-        "function": get_file_from_url
+        "function": download_lovd_database
     }
-}
\ No newline at end of file
+}

From 02fc60b29a0ed03fb1ad7b304105962ebd06c36b Mon Sep 17 00:00:00 2001
From: Junius <junius.vaitkus1@gmail.com>
Date: Mon, 4 Mar 2024 23:13:40 +0200
Subject: [PATCH 10/10] added back .github\workflows file.

---
 .github/workflows/pylint.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..89c0945
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,24 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint
+        pip install -r requirements.txt
+    - name: Analysing the code with pylint
+      run: |
+        pylint $(git ls-files '*.py')