From 8b4a7d2c237f9aa1e7564ffa15629c076f628f2a Mon Sep 17 00:00:00 2001
From: Dainius Kirsnauskas <75167873+Strexas@users.noreply.github.com>
Date: Mon, 26 Feb 2024 19:53:58 +0200
Subject: [PATCH 01/15] pylint linter workflow

Added GitHub workflow "Pylint" to run the static analysis tool on each push
---
 .github/workflows/pylint.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 .github/workflows/pylint.yml

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..a3f5d43
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,23 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint
+    - name: Analysing the code with pylint
+      run: |
+        pylint $(git ls-files '*.py')

From 7a2c2fc4f21108211ca0a465a6c5fa8cd266eaab Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Mon, 26 Feb 2024 20:01:39 +0200
Subject: [PATCH 02/15] add install of requirements.txt

---
 .github/workflows/pylint.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index a3f5d43..78da3fc 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -18,6 +18,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install pylint
+        pip install -r requirements.txt
     - name: Analysing the code with pylint
       run: |
         pylint $(git ls-files '*.py')

From d25b4a375c2ff2557d7658df78b9019ee9ceb645 Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 10:48:35 +0200
Subject: [PATCH 03/15] changes according to pylint

---
 data_collection/pipeline.py | 35 +++++++++++++++++++----------------
 data_collection/tools.py    | 37 ++-----------------------------------
 2 files changed, 21 insertions(+), 51 deletions(-)

diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index 0b12237..b5e952e 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -1,6 +1,6 @@
 import pandas as pd
-from pandas import DataFrame, Series
-from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA
+
+from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
 
 # CONSTANTS
 # files
@@ -8,10 +8,12 @@
 LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS"
 
 GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
-GNOMAD_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y&export=download"
+GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28"
+                   "-T_3y&export=download")
 
 CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
-CLINVAR_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u&export=download"
+CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF"
+                    "-H2U6u&export=download")
 
 # path
 DATA_PATH = "../data"
@@ -57,9 +59,9 @@ def calculate_max_frequency(row):
 
 # MAIN
 # Download all data
-get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
-get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
-get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
+get_file_from_url(LOVD_FILE_URL, LOVD_PATH + "/lovd_data.txt", override=True)
+get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + "/gnomad_data.csv", override=True)
+get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + "/clinvar_data.txt", override=True)
 
 # Read and convert data
 lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
@@ -75,8 +77,10 @@ def calculate_max_frequency(row):
 notes = lovd_data["Variants_On_Transcripts"][1][::]
 
 # Merging Clinvar
-clinvar = clinvar_data.copy()[["Name(clinvar)", "Germline classification(clinvar)", "Accession(clinvar)"]]
-clinvar["VariantOnTranscript/DNA"] = clinvar["Name(clinvar)"].apply(from_clinvar_name_to_DNA)
+clinvar = clinvar_data.copy()[["Name(clinvar)",
+                               "Germline classification(clinvar)",
+                               "Accession(clinvar)"]]
+clinvar["VariantOnTranscript/DNA"] = clinvar["Name(clinvar)"].apply(from_clinvar_name_to_dna)
 
 main_frame = pd.merge(main_frame,
                       clinvar,
@@ -84,12 +88,12 @@ def calculate_max_frequency(row):
                       on=["VariantOnTranscript/DNA"]).drop("Name(clinvar)", axis=1)
 
 # MERGING GnomAd
-main_frame = pd.merge(main_frame,
-                      gnomad_data,
-                      how="left",
-                      left_on="VariantOnTranscript/DNA",
-                      right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)", axis=1)
-
+main_frame = (pd.merge(main_frame,
+                       gnomad_data,
+                       how="left",
+                       left_on="VariantOnTranscript/DNA",
+                       right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)",
+                                                                 axis=1))
 
 # Calculating frequencies
 lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"])
@@ -97,7 +101,6 @@ def calculate_max_frequency(row):
 max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
 lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values
 
-
 # Leaving necessary columns
 
 lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',
diff --git a/data_collection/tools.py b/data_collection/tools.py
index 77b2001..2f946bb 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -3,6 +3,7 @@
 import pandas as pd
 from pandas import DataFrame
 
+
 # EXCEPTIONS
 class BadResponseException(Exception):
     pass
@@ -275,7 +276,7 @@ def from_lovd_to_pandas(path):
         print(f"Error: {e}")
 
 
-def from_clinvar_name_to_DNA(name):
+def from_clinvar_name_to_dna(name):
     """
     Custom cleaner to extract DNA from Clinvar name variable.
 
@@ -298,37 +299,3 @@ def from_clinvar_name_to_DNA(name):
             break
 
     return name[start:end]
-
-
-def calculate_max_frequency(row):
-    """
-    Calculating maximum allele frequency in GNOMAD row.
-
-    :param row: row in dataframe
-    :returns: panda series with 'PopMax', 'PopMax population' fields
-    :rtype: pd.Series
-    """
-
-    population_groups = [
-        'Admixed American',
-        'African/African American',
-        'Amish',
-        'Ashkenazi Jewish',
-        'East Asian',
-        'European (Finnish)',
-        'European (non-Finnish)',
-        'Middle Eastern',
-        'South Asian']
-
-    max_freq = 0
-    max_pop = population_groups[0]
-
-    for group in population_groups:
-        count_column = f'Allele Count {group}(gnomad)'
-        number_column = f'Allele Number {group}(gnomad)'
-        freq = row[count_column] / row[number_column]
-        if (freq > max_freq):
-            max_freq = freq
-            max_pop = group
-
-    return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population'])
\ No newline at end of file

From 120a59fd081ae27161fcdccab6a4901b87cce4c0 Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 11:18:15 +0200
Subject: [PATCH 04/15] fix

---
 data_collection/tools.py | 50 ++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 30 deletions(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index 2f946bb..9259e3f 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -1,5 +1,6 @@
-import requests
 import os
+import requests
+from requests.exceptions import RequestException
 import pandas as pd
 from pandas import DataFrame
 
@@ -116,7 +117,6 @@ class DownloadError(Exception):
     'Individual/Origin/Population': 'String',
     'Individual/Individual_ID': 'String',
     'allele': 'Integer',
-    'chromosome': 'Integer',
     'position_g_start': 'Integer',
     'position_g_end': 'Integer',
     'type': 'String',
@@ -154,37 +154,27 @@ def get_file_from_url(url, save_to, override=False):
     :param bool override: needs override
     """
 
-    try:
-        # check if directory exists, if not - create
-        save_to_dir = os.path.dirname(save_to)
-        if not os.path.exists(save_to_dir):
-            os.makedirs(save_to_dir)
-
-        # check if file exist and needs to override
-        if os.path.exists(save_to) and not override:
-            print(f"The file at {save_to} already exists.")
-            return
-
-        try:
-            response = requests.get(url)
-        except requests.exceptions.RequestException as e:
-            raise DownloadError(f"Error downloading file from {url}: {e}")
+    # check if directory exists, if not - create
+    save_to_dir = os.path.dirname(save_to)
+    if not os.path.exists(save_to_dir):
+        os.makedirs(save_to_dir)
 
-        if response.status_code != 200:
-            raise BadResponseException(f"Bad response from {url}. Status code: {response.status_code}")
+    # check if file exist and needs to override
+    if os.path.exists(save_to) and not override:
+        print(f"The file at {save_to} already exists.")
+        return
 
-        with open(save_to, "wb") as f:
-            f.write(response.content)
-
-    # check request exceptions
-    except BadResponseException as e:
-        print(f"Error: {e}")
+    try:
+        response = requests.get(url, timeout=10)
+    except RequestException as e:
+        raise DownloadError(f"Error while downloading file from {url}") from e
 
-    except DownloadError as e:
-        print(f"Error: {e}")
+    if response.status_code != 200:
+        raise BadResponseException(f"Bad response from {url}."
+                                   f" Status code: {response.status_code}")
 
-    except Exception as e:
-        print(f"Error: {e}")
+    with open(save_to, "wb") as f:
+        f.write(response.content)
 
 
 def convert_lovd_data_types(frame, table_name):
@@ -232,7 +222,7 @@ def from_lovd_to_pandas(path):
         if not os.path.exists(path):
             raise FileNotFoundError(f"The file at {path} does not exist.")
 
-        d = dict()
+        d = {}
 
         with open(path) as f:
             # skip header

From 5415a5a060eff5df0345ba7e380ad89c12f6ae79 Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 11:20:56 +0200
Subject: [PATCH 05/15] remove try

---
 data_collection/tools.py | 63 ++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 34 deletions(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index 9259e3f..867c721 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -217,53 +217,48 @@ def from_lovd_to_pandas(path):
     :rtype: dict[str, tuple[DataFrame, list[str]]]
     """
 
-    try:
-        # Check if the file exists
-        if not os.path.exists(path):
-            raise FileNotFoundError(f"The file at {path} does not exist.")
+    # Check if the file exists
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"The file at {path} does not exist.")
 
-        d = {}
+    d = {}
 
-        with open(path) as f:
-            # skip header
-            [f.readline() for _ in range(4)]
+    with open(path) as f:
+        # skip header
+        [f.readline() for _ in range(4)]
 
-            while True:
-                line = f.readline()
+        while True:
+            line = f.readline()
 
-                if line == '':
-                    break
+            if line == '':
+                break
 
-                table_name = line.split("##")[1].strip()
+            table_name = line.split("##")[1].strip()
 
-                notes = []
+            notes = []
+            line = f.readline()
+            while line.startswith("##"):
+                notes.append(line[2:-1])
                 line = f.readline()
-                while line.startswith("##"):
-                    notes.append(line[2:-1])
-                    line = f.readline()
 
-                table_header = [column[3:-3] for column in line[:-1].split('\t')]
-                frame = DataFrame([], columns=table_header)
+            table_header = [column[3:-3] for column in line[:-1].split('\t')]
+            frame = DataFrame([], columns=table_header)
+            line = f.readline()
+            while line != '\n':
+                variables = [variable[1:-1] for variable in line[:-1].split('\t')]
+                observation = DataFrame([variables], columns=table_header)
+                frame = pd.concat([frame, observation], ignore_index=True)
                 line = f.readline()
-                while line != '\n':
-                    variables = [variable[1:-1] for variable in line[:-1].split('\t')]
-                    observation = DataFrame([variables], columns=table_header)
-                    frame = pd.concat([frame, observation], ignore_index=True)
-                    line = f.readline()
 
-                # formats the frame
-                convert_lovd_data_types(frame, table_name)
+            # formats the frame
+            convert_lovd_data_types(frame, table_name)
 
-                d[table_name] = (frame, notes)
-                # skip inter tables lines
-                [f.readline() for _ in range(1)]
+            d[table_name] = (frame, notes)
+            # skip inter tables lines
+            [f.readline() for _ in range(1)]
 
-        return d
-    except FileNotFoundError as e:
-        print(f"Error: {e}")
+    return d
 
-    except Exception as e:
-        print(f"Error: {e}")
 
 
 def from_clinvar_name_to_dna(name):

From 874aebc5510ebacd09220c54456577ffac4bb31b Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 11:26:21 +0200
Subject: [PATCH 06/15] removing .gitkeep

---
 data_collection/.gitkeep | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 data_collection/.gitkeep

diff --git a/data_collection/.gitkeep b/data_collection/.gitkeep
deleted file mode 100644
index e69de29..0000000

From e7364a7c37d95aa01dbeb7ba24d399a8647ed278 Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 11:26:40 +0200
Subject: [PATCH 07/15] docstrings for module

---
 data_collection/pipeline.py | 1 +
 data_collection/tools.py    | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index b5e952e..b910f41 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -1,3 +1,4 @@
+"""Module executes general pipeline for data collection"""
 import pandas as pd
 
 from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
diff --git a/data_collection/tools.py b/data_collection/tools.py
index 867c721..66191ff 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -1,3 +1,5 @@
+"""Module providing a functionality to collect data from various sources."""
+
 import os
 import requests
 from requests.exceptions import RequestException

From 3fb9b6e152d48fa64d0459848cec2b6591136a0b Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 11:32:30 +0200
Subject: [PATCH 08/15] suppress expression-not-assigned warnings, set file encoding

---
 data_collection/tools.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index 66191ff..952ef9c 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -225,9 +225,9 @@ def from_lovd_to_pandas(path):
 
     d = {}
 
-    with open(path) as f:
+    with open(path, encoding="UTF-8") as f:
         # skip header
-        [f.readline() for _ in range(4)]
+        [f.readline() for _ in range(4)]  # pylint: disable=expression-not-assigned
 
         while True:
             line = f.readline()
@@ -257,7 +257,7 @@ def from_lovd_to_pandas(path):
 
             d[table_name] = (frame, notes)
             # skip inter tables lines
-            [f.readline() for _ in range(1)]
+            [f.readline() for _ in range(1)]  # pylint: disable=expression-not-assigned
 
     return d
 

From 3c42ca3744bbd1b52ad57183f7bc85799947f968 Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 11:34:28 +0200
Subject: [PATCH 09/15] exceptions docstrings

---
 data_collection/tools.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index 952ef9c..57728c6 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -9,10 +9,12 @@
 
 # EXCEPTIONS
 class BadResponseException(Exception):
+    """Custom exception for bad responses."""
     pass
 
 
 class DownloadError(Exception):
+    """Custom exception for download errors."""
     pass
 
 

From 906829c6aa9a838fe9446b03b1e3ac8b5003de29 Mon Sep 17 00:00:00 2001
From: Dainius <kirsnauskas1@gmail.com>
Date: Tue, 27 Feb 2024 11:36:38 +0200
Subject: [PATCH 10/15] exceptions pass

---
 data_collection/tools.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/data_collection/tools.py b/data_collection/tools.py
index 57728c6..0a8868b 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -10,12 +10,10 @@
 # EXCEPTIONS
 class BadResponseException(Exception):
     """Custom exception for bad responses."""
-    pass
 
 
 class DownloadError(Exception):
     """Custom exception for download errors."""
-    pass
 
 
 # CONSTANTS
@@ -264,7 +262,6 @@ def from_lovd_to_pandas(path):
     return d
 
 
-
 def from_clinvar_name_to_dna(name):
     """
     Custom cleaner to extract DNA from Clinvar name variable.

From 1f71f580b9aa7bfd736b6e0d8fe9860debf70b01 Mon Sep 17 00:00:00 2001
From: Dainius Kirsnauskas <75167873+Strexas@users.noreply.github.com>
Date: Tue, 27 Feb 2024 11:40:22 +0200
Subject: [PATCH 11/15] Removing unsupported versions

Python 3.7, 3.8 and 3.9 don't support match/case statements, so they were removed from the matrix
---
 .github/workflows/pylint.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index 78da3fc..89c0945 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}

From 03315c14681cd70007d368d2a7a736e0f6990bed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= <preiksap@gmail.com>
Date: Sat, 2 Mar 2024 11:30:23 +0200
Subject: [PATCH 12/15] feat: moved constants to a separate file

---
 data_collection/constants.py | 149 +++++++++++++++++++++++++++++++++++
 data_collection/pipeline.py  |  20 +----
 data_collection/tools.py     | 131 +-----------------------------
 3 files changed, 151 insertions(+), 149 deletions(-)
 create mode 100644 data_collection/constants.py

diff --git a/data_collection/constants.py b/data_collection/constants.py
new file mode 100644
index 0000000..ccb12ec
--- /dev/null
+++ b/data_collection/constants.py
@@ -0,0 +1,149 @@
+"""Module for constants used in data collection."""
+
+# files
+LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS"
+LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS"
+
+GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
+GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28"
+                   "-T_3y&export=download")
+
+CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
+CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF"
+                    "-H2U6u&export=download")
+
+# paths
+DATA_PATH = "../data"
+LOVD_PATH = DATA_PATH + "/lovd"
+GNOMAD_PATH = DATA_PATH + "/gnomad"
+CLINVAR_PATH = DATA_PATH + "/clinvar"
+
+# variable data types
+LOVD_VARIABLES_DATA_TYPES = {
+    'id': 'String',
+    'name': 'String',
+    'chromosome': 'Integer',
+    'chrom_band': 'String',
+    'imprinting': 'String',
+    'refseq_genomic': 'String',
+    'refseq_UD': 'String',
+    'reference': 'String',
+    'url_homepage': 'String',
+    'url_external': 'String',
+    'allow_download': 'Boolean',
+    'id_hgnc': 'Integer',
+    'id_entrez': 'Integer',
+    'id_omim': 'Integer',
+    'show_hgmd': 'Boolean',
+    'show_genecards': 'Boolean',
+    'show_genetests': 'Boolean',
+    'show_orphanet': 'Boolean',
+    'note_index': 'String',
+    'note_listing': 'String',
+    'refseq': 'String',
+    'refseq_url': 'String',
+    'disclaimer': 'Boolean',
+    'disclaimer_text': 'String',
+    'header': 'String',
+    'header_align': 'Integer',
+    'footer': 'String',
+    'footer_align': 'Integer',
+    'created_by': 'Integer',
+    'created_date': 'Date',
+    'edited_by': 'Integer',
+    'edited_date': 'Date',
+    'updated_by': 'Integer',
+    'updated_date': 'Date',
+    'transcriptid': 'Integer',
+    'effectid': 'Integer',
+    'position_c_start': 'Integer',
+    'position_c_start_intron': 'Integer',
+    'position_c_end': 'Integer',
+    'position_c_end_intron': 'Integer',
+    'VariantOnTranscript/DNA': 'String',
+    'VariantOnTranscript/RNA': 'String',
+    'VariantOnTranscript/Protein': 'String',
+    'VariantOnTranscript/Exon': 'String',
+    'symbol': 'String',
+    'inheritance': 'String',
+    'id_omin': 'Integer',
+    'tissues': 'String',
+    'features': 'String',
+    'remarks': 'String',
+    'geneid': 'String',
+    'id_mutalyzer': 'Integer',
+    'id_ncbi': 'String',
+    'id_ensembl': 'String',
+    'id_protein_ncbi': 'String',
+    'id_protein_ensembl': 'String',
+    'id_protein_uniprot': 'String',
+    'position_c_mrna_start': 'Integer',
+    'position_c_mrna_end': 'Integer',
+    'position_c_cds_end': 'Integer',
+    'position_g_mrna_start': 'Integer',
+    'position_g_mrna_end': 'Integer',
+    'diseaseid': 'Integer',
+    'individualid': 'Integer',
+    'Phenotype/Inheritance': 'String',
+    'Phenotype/Age': 'String',
+    'Phenotype/Additional': 'String',
+    'Phenotype/Biochem_param': 'String',
+    'Phenotype/Age/Onset': 'String',
+    'Phenotype/Age/Diagnosis': 'String',
+    'Phenotype/Severity_score': 'String',
+    'Phenotype/Onset': 'String',
+    'Phenotype/Protein': 'String',
+    'Phenotype/Tumor/MSI': 'String',
+    'Phenotype/Enzyme/CPK': 'String',
+    'Phenotype/Heart/Myocardium': 'String',
+    'Phenotype/Lung': 'String',
+    'Phenotype/Diagnosis/Definite': 'String',
+    'Phenotype/Diagnosis/Initial': 'String',
+    'Phenotype/Diagnosis/Criteria': 'String',
+    'variants_found': 'Integer',
+    'Screening/Technique': 'String',
+    'Screening/Template': 'String',
+    'Screening/Tissue': 'String',
+    'Screening/Remarks': 'String',
+    'fatherid': 'String',
+    'motherid': 'String',
+    'panelid': 'Integer',
+    'panel_size': 'Integer',
+    'license': 'String',
+    'Individual/Reference': 'String',
+    'Individual/Remarks': 'String',
+    'Individual/Gender': 'String',
+    'Individual/Consanguinity': 'String',
+    'Individual/Age_of_death': 'String',
+    'Individual/VIP': 'String',
+    'Individual/Data_av': 'String',
+    'Individual/Treatment': 'String',
+    'Individual/Origin/Population': 'String',
+    'Individual/Individual_ID': 'String',
+    'allele': 'Integer',
+    'position_g_start': 'Integer',
+    'position_g_end': 'Integer',
+    'type': 'String',
+    'average_frequency': 'Double',
+    'VariantOnGenome/DBID': 'String',
+    'VariantOnGenome/DNA': 'String',
+    'VariantOnGenome/Frequency': 'String',
+    'VariantOnGenome/Reference': 'String',
+    'VariantOnGenome/Restriction_site': 'String',
+    'VariantOnGenome/Published_as': 'String',
+    'VariantOnGenome/Remarks': 'String',
+    'VariantOnGenome/Genetic_origin': 'String',
+    'VariantOnGenome/Segregation': 'String',
+    'VariantOnGenome/dbSNP': 'String',
+    'VariantOnGenome/VIP': 'String',
+    'VariantOnGenome/Methylation': 'String',
+    'VariantOnGenome/ISCN': 'String',
+    'VariantOnGenome/DNA/hg38': 'String',
+    'VariantOnGenome/ClinVar': 'String',
+    'VariantOnGenome/ClinicalClassification': 'String',
+    'VariantOnGenome/ClinicalClassification/Method': 'String',
+    'screeningid': 'Integer',
+    'variantid': 'Integer',
+    'owned_by': 'Integer',
+    'Individual/Origin/Geographic': 'String'
+}
\ No newline at end of file
diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index b910f41..26a0ac0 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -2,25 +2,7 @@
 import pandas as pd
 
 from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
-
-# CONSTANTS
-# files
-LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS"
-LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS"
-
-GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
-GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28"
-                   "-T_3y&export=download")
-
-CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
-CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF"
-                    "-H2U6u&export=download")
-
-# path
-DATA_PATH = "../data"
-LOVD_PATH = DATA_PATH + "/lovd"
-GNOMAD_PATH = DATA_PATH + "/gnomad"
-CLINVAR_PATH = DATA_PATH + "/clinvar"
+from constants import LOVD_FILE_URL, GNOMAD_FILE_URL, CLINVAR_FILE_URL, DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH
 
 
 def calculate_max_frequency(row):
diff --git a/data_collection/tools.py b/data_collection/tools.py
index 0a8868b..586fbe5 100644
--- a/data_collection/tools.py
+++ b/data_collection/tools.py
@@ -5,6 +5,7 @@
 from requests.exceptions import RequestException
 import pandas as pd
 from pandas import DataFrame
+from constants import LOVD_VARIABLES_DATA_TYPES
 
 
 # EXCEPTIONS
@@ -16,136 +17,6 @@ class DownloadError(Exception):
     """Custom exception for download errors."""
 
 
-# CONSTANTS
-LOVD_VARIABLES_DATA_TYPES = {
-    'id': 'String',
-    'name': 'String',
-    'chromosome': 'Integer',
-    'chrom_band': 'String',
-    'imprinting': 'String',
-    'refseq_genomic': 'String',
-    'refseq_UD': 'String',
-    'reference': 'String',
-    'url_homepage': 'String',
-    'url_external': 'String',
-    'allow_download': 'Boolean',
-    'id_hgnc': 'Integer',
-    'id_entrez': 'Integer',
-    'id_omim': 'Integer',
-    'show_hgmd': 'Boolean',
-    'show_genecards': 'Boolean',
-    'show_genetests': 'Boolean',
-    'show_orphanet': 'Boolean',
-    'note_index': 'String',
-    'note_listing': 'String',
-    'refseq': 'String',
-    'refseq_url': 'String',
-    'disclaimer': 'Boolean',
-    'disclaimer_text': 'String',
-    'header': 'String',
-    'header_align': 'Integer',
-    'footer': 'String',
-    'footer_align': 'Integer',
-    'created_by': 'Integer',
-    'created_date': 'Date',
-    'edited_by': 'Integer',
-    'edited_date': 'Date',
-    'updated_by': 'Integer',
-    'updated_date': 'Date',
-    'transcriptid': 'Integer',
-    'effectid': 'Integer',
-    'position_c_start': 'Integer',
-    'position_c_start_intron': 'Integer',
-    'position_c_end': 'Integer',
-    'position_c_end_intron': 'Integer',
-    'VariantOnTranscript/DNA': 'String',
-    'VariantOnTranscript/RNA': 'String',
-    'VariantOnTranscript/Protein': 'String',
-    'VariantOnTranscript/Exon': 'String',
-    'symbol': 'String',
-    'inheritance': 'String',
-    'id_omin': 'Integer',
-    'tissues': 'String',
-    'features': 'String',
-    'remarks': 'String',
-    'geneid': 'String',
-    'id_mutalyzer': 'Integer',
-    'id_ncbi': 'String',
-    'id_ensembl': 'String',
-    'id_protein_ncbi': 'String',
-    'id_protein_ensembl': 'String',
-    'id_protein_uniprot': 'String',
-    'position_c_mrna_start': 'Integer',
-    'position_c_mrna_end': 'Integer',
-    'position_c_cds_end': 'Integer',
-    'position_g_mrna_start': 'Integer',
-    'position_g_mrna_end': 'Integer',
-    'diseaseid': 'Integer',
-    'individualid': 'Integer',
-    'Phenotype/Inheritance': 'String',
-    'Phenotype/Age': 'String',
-    'Phenotype/Additional': 'String',
-    'Phenotype/Biochem_param': 'String',
-    'Phenotype/Age/Onset': 'String',
-    'Phenotype/Age/Diagnosis': 'String',
-    'Phenotype/Severity_score': 'String',
-    'Phenotype/Onset': 'String',
-    'Phenotype/Protein': 'String',
-    'Phenotype/Tumor/MSI': 'String',
-    'Phenotype/Enzyme/CPK': 'String',
-    'Phenotype/Heart/Myocardium': 'String',
-    'Phenotype/Lung': 'String',
-    'Phenotype/Diagnosis/Definite': 'String',
-    'Phenotype/Diagnosis/Initial': 'String',
-    'Phenotype/Diagnosis/Criteria': 'String',
-    'variants_found': 'Integer',
-    'Screening/Technique': 'String',
-    'Screening/Template': 'String',
-    'Screening/Tissue': 'String',
-    'Screening/Remarks': 'String',
-    'fatherid': 'String',
-    'motherid': 'String',
-    'panelid': 'Integer',
-    'panel_size': 'Integer',
-    'license': 'String',
-    'Individual/Reference': 'String',
-    'Individual/Remarks': 'String',
-    'Individual/Gender': 'String',
-    'Individual/Consanguinity': 'String',
-    'Individual/Age_of_death': 'String',
-    'Individual/VIP': 'String',
-    'Individual/Data_av': 'String',
-    'Individual/Treatment': 'String',
-    'Individual/Origin/Population': 'String',
-    'Individual/Individual_ID': 'String',
-    'allele': 'Integer',
-    'position_g_start': 'Integer',
-    'position_g_end': 'Integer',
-    'type': 'String',
-    'average_frequency': 'Double',
-    'VariantOnGenome/DBID': 'String',
-    'VariantOnGenome/DNA': 'String',
-    'VariantOnGenome/Frequency': 'String',
-    'VariantOnGenome/Reference': 'String',
-    'VariantOnGenome/Restriction_site': 'String',
-    'VariantOnGenome/Published_as': 'String',
-    'VariantOnGenome/Remarks': 'String',
-    'VariantOnGenome/Genetic_origin': 'String',
-    'VariantOnGenome/Segregation': 'String',
-    'VariantOnGenome/dbSNP': 'String',
-    'VariantOnGenome/VIP': 'String',
-    'VariantOnGenome/Methylation': 'String',
-    'VariantOnGenome/ISCN': 'String',
-    'VariantOnGenome/DNA/hg38': 'String',
-    'VariantOnGenome/ClinVar': 'String',
-    'VariantOnGenome/ClinicalClassification': 'String',
-    'VariantOnGenome/ClinicalClassification/Method': 'String',
-    'screeningid': 'Integer',
-    'variantid': 'Integer',
-    'owned_by': 'Integer',
-    'Individual/Origin/Geographic': 'String'
-}
-
 
 def get_file_from_url(url, save_to, override=False):
     """

From c4201e2bae4c3d502f4bdf1b37b91d9c3d8f02bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= <preiksap@gmail.com>
Date: Sat, 2 Mar 2024 11:37:24 +0200
Subject: [PATCH 13/15] fix: linter errors

---
 data_collection/constants.py | 2 +-
 data_collection/pipeline.py  | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/data_collection/constants.py b/data_collection/constants.py
index ccb12ec..41defe5 100644
--- a/data_collection/constants.py
+++ b/data_collection/constants.py
@@ -146,4 +146,4 @@
     'variantid': 'Integer',
     'owned_by': 'Integer',
     'Individual/Origin/Geographic': 'String'
-}
\ No newline at end of file
+}
diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index 26a0ac0..9800e7b 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -2,8 +2,13 @@
 import pandas as pd
 
 from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
-from constants import LOVD_FILE_URL, GNOMAD_FILE_URL, CLINVAR_FILE_URL, DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH
-
+from constants import (LOVD_FILE_URL, 
+                       GNOMAD_FILE_URL, 
+                       CLINVAR_FILE_URL, 
+                       DATA_PATH, 
+                       LOVD_PATH, 
+                       GNOMAD_PATH, 
+                       CLINVAR_PATH)
 
 def calculate_max_frequency(row):
     """

From 2e535a47046b1b997bb2c00c15663485ba7f89da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= <preiksap@gmail.com>
Date: Sat, 2 Mar 2024 11:39:31 +0200
Subject: [PATCH 14/15] fix: removed trailing whitespace

---
 data_collection/pipeline.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index 9800e7b..ddc3978 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -2,12 +2,12 @@
 import pandas as pd
 
 from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
-from constants import (LOVD_FILE_URL, 
-                       GNOMAD_FILE_URL, 
-                       CLINVAR_FILE_URL, 
-                       DATA_PATH, 
-                       LOVD_PATH, 
-                       GNOMAD_PATH, 
+from constants import (LOVD_FILE_URL,
+                       GNOMAD_FILE_URL,
+                       CLINVAR_FILE_URL,
+                       DATA_PATH,
+                       LOVD_PATH,
+                       GNOMAD_PATH,
                        CLINVAR_PATH)
 
 def calculate_max_frequency(row):

From e613bcbc092ccbb58488a4c691f1ee7c5ecf4bc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= <preiksap@gmail.com>
Date: Mon, 4 Mar 2024 09:49:54 +0200
Subject: [PATCH 15/15] feat: added base constants

---
 data_collection/constants.py | 16 ++++++++++------
 data_collection/pipeline.py  | 12 ++++++------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/data_collection/constants.py b/data_collection/constants.py
index 41defe5..fa710ce 100644
--- a/data_collection/constants.py
+++ b/data_collection/constants.py
@@ -1,15 +1,19 @@
 """Module for constants used in data collection."""
 
 # files
-LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS"
-LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS"
+LOVD_URL = "https://databases.lovd.nl/shared/genes"
+LOVD_URL_EYS = "https://databases.lovd.nl/shared/genes/EYS"
+LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene"
+LOVD_FILE_URL_EYS = "https://databases.lovd.nl/shared/download/all/gene/EYS"
 
-GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
-GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28"
+GNOMAD_URL = "https://gnomad.broadinstitute.org/gene"
+GNOMAD_URL_EYS = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
+GNOMAD_FILE_URL_EYS = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28"
                    "-T_3y&export=download")
 
-CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
-CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF"
+CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar"
+CLINVAR_URL_EYS = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
+CLINVAR_FILE_URL_EYS = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF"
                     "-H2U6u&export=download")
 
 # paths
diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
index ddc3978..80d3a94 100644
--- a/data_collection/pipeline.py
+++ b/data_collection/pipeline.py
@@ -2,9 +2,9 @@
 import pandas as pd
 
 from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
-from constants import (LOVD_FILE_URL,
-                       GNOMAD_FILE_URL,
-                       CLINVAR_FILE_URL,
+from constants import (LOVD_FILE_URL_EYS,
+                       GNOMAD_FILE_URL_EYS,
+                       CLINVAR_FILE_URL_EYS,
                        DATA_PATH,
                        LOVD_PATH,
                        GNOMAD_PATH,
@@ -47,9 +47,9 @@ def calculate_max_frequency(row):
 
 # MAIN
 # Download all data
-get_file_from_url(LOVD_FILE_URL, LOVD_PATH + "/lovd_data.txt", override=True)
-get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + "/gnomad_data.csv", override=True)
-get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + "/clinvar_data.txt", override=True)
+get_file_from_url(LOVD_FILE_URL_EYS, LOVD_PATH + "/lovd_data.txt", override=True)
+get_file_from_url(GNOMAD_FILE_URL_EYS, GNOMAD_PATH + "/gnomad_data.csv", override=True)
+get_file_from_url(CLINVAR_FILE_URL_EYS, CLINVAR_PATH + "/clinvar_data.txt", override=True)
 
 # Read and convert data
 lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")