From 8b4a7d2c237f9aa1e7564ffa15629c076f628f2a Mon Sep 17 00:00:00 2001 From: Dainius Kirsnauskas <75167873+Strexas@users.noreply.github.com> Date: Mon, 26 Feb 2024 19:53:58 +0200 Subject: [PATCH 01/15] pylint linter workflow Added github workflow "Pylinter" to run static analysis tool on each push --- .github/workflows/pylint.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 0000000..a3f5d43 --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,23 @@ +name: Pylint + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + run: | + pylint $(git ls-files '*.py') From 7a2c2fc4f21108211ca0a465a6c5fa8cd266eaab Mon Sep 17 00:00:00 2001 From: Dainius Date: Mon, 26 Feb 2024 20:01:39 +0200 Subject: [PATCH 02/15] add install of requirements.txt --- .github/workflows/pylint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index a3f5d43..78da3fc 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -18,6 +18,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pylint + pip install -r requirements.txt - name: Analysing the code with pylint run: | pylint $(git ls-files '*.py') From d25b4a375c2ff2557d7658df78b9019ee9ceb645 Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 10:48:35 +0200 Subject: [PATCH 03/15] changes according to pylint --- data_collection/pipeline.py | 35 
+++++++++++++++++++---------------- data_collection/tools.py | 37 ++----------------------------------- 2 files changed, 21 insertions(+), 51 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 0b12237..b5e952e 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,6 +1,6 @@ import pandas as pd -from pandas import DataFrame, Series -from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA + +from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna # CONSTANTS # files @@ -8,10 +8,12 @@ LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS" GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" -GNOMAD_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y&export=download" +GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28" + "-T_3y&export=download") CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" -CLINVAR_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u&export=download" +CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF" + "-H2U6u&export=download") # path DATA_PATH = "../data" @@ -57,9 +59,9 @@ def calculate_max_frequency(row): # MAIN # Download all data -get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True) -get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True) -get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True) +get_file_from_url(LOVD_FILE_URL, LOVD_PATH + "/lovd_data.txt", override=True) +get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + "/gnomad_data.csv", override=True) +get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + "/clinvar_data.txt", override=True) # Read and convert data lovd_data = 
from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt") @@ -75,8 +77,10 @@ def calculate_max_frequency(row): notes = lovd_data["Variants_On_Transcripts"][1][::] # Merging Clinvar -clinvar = clinvar_data.copy()[["Name(clinvar)", "Germline classification(clinvar)", "Accession(clinvar)"]] -clinvar["VariantOnTranscript/DNA"] = clinvar["Name(clinvar)"].apply(from_clinvar_name_to_DNA) +clinvar = clinvar_data.copy()[["Name(clinvar)", + "Germline classification(clinvar)", + "Accession(clinvar)"]] +clinvar["VariantOnTranscript/DNA"] = clinvar["Name(clinvar)"].apply(from_clinvar_name_to_dna) main_frame = pd.merge(main_frame, clinvar, @@ -84,12 +88,12 @@ def calculate_max_frequency(row): on=["VariantOnTranscript/DNA"]).drop("Name(clinvar)", axis=1) # MERGING GnomAd -main_frame = pd.merge(main_frame, - gnomad_data, - how="left", - left_on="VariantOnTranscript/DNA", - right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)", axis=1) - +main_frame = (pd.merge(main_frame, + gnomad_data, + how="left", + left_on="VariantOnTranscript/DNA", + right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)", + axis=1)) # Calculating frequencies lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"]) @@ -97,7 +101,6 @@ def calculate_max_frequency(row): max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1) lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values - # Leaving necessary columns lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id', diff --git a/data_collection/tools.py b/data_collection/tools.py index 77b2001..2f946bb 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -3,6 +3,7 @@ import pandas as pd from pandas import DataFrame + # EXCEPTIONS class BadResponseException(Exception): pass @@ -275,7 +276,7 @@ def from_lovd_to_pandas(path): print(f"Error: {e}") -def from_clinvar_name_to_DNA(name): +def from_clinvar_name_to_dna(name): """ Custom cleaner to extract DNA from 
Clinvar name variable. @@ -298,37 +299,3 @@ def from_clinvar_name_to_DNA(name): break return name[start:end] - - -def calculate_max_frequency(row): - """ - Calculating maximum allele frequency in GNOMAD row. - - :param row: row in dataframe - :returns: panda series with 'PopMax', 'PopMax population' fields - :rtype: pd.Series - """ - - population_groups = [ - 'Admixed American', - 'African/African American', - 'Amish', - 'Ashkenazi Jewish', - 'East Asian', - 'European (Finnish)', - 'European (non-Finnish)', - 'Middle Eastern', - 'South Asian'] - - max_freq = 0 - max_pop = population_groups[0] - - for group in population_groups: - count_column = f'Allele Count {group}(gnomad)' - number_column = f'Allele Number {group}(gnomad)' - freq = row[count_column] / row[number_column] - if (freq > max_freq): - max_freq = freq - max_pop = group - - return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population']) \ No newline at end of file From 120a59fd081ae27161fcdccab6a4901b87cce4c0 Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 11:18:15 +0200 Subject: [PATCH 04/15] fix --- data_collection/tools.py | 50 ++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index 2f946bb..9259e3f 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -1,5 +1,6 @@ -import requests import os +import requests +from requests.exceptions import RequestException import pandas as pd from pandas import DataFrame @@ -116,7 +117,6 @@ class DownloadError(Exception): 'Individual/Origin/Population': 'String', 'Individual/Individual_ID': 'String', 'allele': 'Integer', - 'chromosome': 'Integer', 'position_g_start': 'Integer', 'position_g_end': 'Integer', 'type': 'String', @@ -154,37 +154,27 @@ def get_file_from_url(url, save_to, override=False): :param bool override: needs override """ - try: - # check if directory exists, if not - create - save_to_dir = 
os.path.dirname(save_to) - if not os.path.exists(save_to_dir): - os.makedirs(save_to_dir) - - # check if file exist and needs to override - if os.path.exists(save_to) and not override: - print(f"The file at {save_to} already exists.") - return - - try: - response = requests.get(url) - except requests.exceptions.RequestException as e: - raise DownloadError(f"Error downloading file from {url}: {e}") + # check if directory exists, if not - create + save_to_dir = os.path.dirname(save_to) + if not os.path.exists(save_to_dir): + os.makedirs(save_to_dir) - if response.status_code != 200: - raise BadResponseException(f"Bad response from {url}. Status code: {response.status_code}") + # check if file exist and needs to override + if os.path.exists(save_to) and not override: + print(f"The file at {save_to} already exists.") + return - with open(save_to, "wb") as f: - f.write(response.content) - - # check request exceptions - except BadResponseException as e: - print(f"Error: {e}") + try: + response = requests.get(url, timeout=10) + except RequestException as e: + raise DownloadError(f"Error while downloading file from {url}") from e - except DownloadError as e: - print(f"Error: {e}") + if response.status_code != 200: + raise BadResponseException(f"Bad response from {url}." 
+ f" Status code: {response.status_code}") - except Exception as e: - print(f"Error: {e}") + with open(save_to, "wb") as f: + f.write(response.content) def convert_lovd_data_types(frame, table_name): @@ -232,7 +222,7 @@ def from_lovd_to_pandas(path): if not os.path.exists(path): raise FileNotFoundError(f"The file at {path} does not exist.") - d = dict() + d = {} with open(path) as f: # skip header From 5415a5a060eff5df0345ba7e380ad89c12f6ae79 Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 11:20:56 +0200 Subject: [PATCH 05/15] remove try --- data_collection/tools.py | 63 ++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index 9259e3f..867c721 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -217,53 +217,48 @@ def from_lovd_to_pandas(path): :rtype: dict[str, tuple[DataFrame, list[str]]] """ - try: - # Check if the file exists - if not os.path.exists(path): - raise FileNotFoundError(f"The file at {path} does not exist.") + # Check if the file exists + if not os.path.exists(path): + raise FileNotFoundError(f"The file at {path} does not exist.") - d = {} + d = {} - with open(path) as f: - # skip header - [f.readline() for _ in range(4)] + with open(path) as f: + # skip header + [f.readline() for _ in range(4)] - while True: - line = f.readline() + while True: + line = f.readline() - if line == '': - break + if line == '': + break - table_name = line.split("##")[1].strip() + table_name = line.split("##")[1].strip() - notes = [] + notes = [] + line = f.readline() + while line.startswith("##"): + notes.append(line[2:-1]) line = f.readline() - while line.startswith("##"): - notes.append(line[2:-1]) - line = f.readline() - table_header = [column[3:-3] for column in line[:-1].split('\t')] - frame = DataFrame([], columns=table_header) + table_header = [column[3:-3] for column in line[:-1].split('\t')] + frame = DataFrame([], 
columns=table_header) + line = f.readline() + while line != '\n': + variables = [variable[1:-1] for variable in line[:-1].split('\t')] + observation = DataFrame([variables], columns=table_header) + frame = pd.concat([frame, observation], ignore_index=True) line = f.readline() - while line != '\n': - variables = [variable[1:-1] for variable in line[:-1].split('\t')] - observation = DataFrame([variables], columns=table_header) - frame = pd.concat([frame, observation], ignore_index=True) - line = f.readline() - # formats the frame - convert_lovd_data_types(frame, table_name) + # formats the frame + convert_lovd_data_types(frame, table_name) - d[table_name] = (frame, notes) - # skip inter tables lines - [f.readline() for _ in range(1)] + d[table_name] = (frame, notes) + # skip inter tables lines + [f.readline() for _ in range(1)] - return d - except FileNotFoundError as e: - print(f"Error: {e}") + return d - except Exception as e: - print(f"Error: {e}") def from_clinvar_name_to_dna(name): From 874aebc5510ebacd09220c54456577ffac4bb31b Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 11:26:21 +0200 Subject: [PATCH 06/15] removing .gitkeep --- data_collection/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 data_collection/.gitkeep diff --git a/data_collection/.gitkeep b/data_collection/.gitkeep deleted file mode 100644 index e69de29..0000000 From e7364a7c37d95aa01dbeb7ba24d399a8647ed278 Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 11:26:40 +0200 Subject: [PATCH 07/15] docstrings for module --- data_collection/pipeline.py | 1 + data_collection/tools.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index b5e952e..b910f41 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,3 +1,4 @@ +"""Module executes general pipeline for data collection""" import pandas as pd from tools import get_file_from_url, from_lovd_to_pandas, 
from_clinvar_name_to_dna diff --git a/data_collection/tools.py b/data_collection/tools.py index 867c721..66191ff 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -1,3 +1,5 @@ +"""Module providing a functionality to collect data from various sources.""" + import os import requests from requests.exceptions import RequestException From 3fb9b6e152d48fa64d0459848cec2b6591136a0b Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 11:32:30 +0200 Subject: [PATCH 08/15] no assigned suppress --- data_collection/tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index 66191ff..952ef9c 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -225,9 +225,9 @@ def from_lovd_to_pandas(path): d = {} - with open(path) as f: + with open(path, encoding="UTF-8") as f: # skip header - [f.readline() for _ in range(4)] + [f.readline() for _ in range(4)] # pylint: disable=expression-not-assigned while True: line = f.readline() @@ -257,7 +257,7 @@ def from_lovd_to_pandas(path): d[table_name] = (frame, notes) # skip inter tables lines - [f.readline() for _ in range(1)] + [f.readline() for _ in range(1)] # pylint: disable=expression-not-assigned return d From 3c42ca3744bbd1b52ad57183f7bc85799947f968 Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 11:34:28 +0200 Subject: [PATCH 09/15] exceptions docstrings --- data_collection/tools.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data_collection/tools.py b/data_collection/tools.py index 952ef9c..57728c6 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -9,10 +9,12 @@ # EXCEPTIONS class BadResponseException(Exception): + """Custom exception for bad responses.""" pass class DownloadError(Exception): + """Custom exception for download errors.""" pass From 906829c6aa9a838fe9446b03b1e3ac8b5003de29 Mon Sep 17 00:00:00 2001 From: Dainius Date: Tue, 27 Feb 2024 11:36:38 +0200 
Subject: [PATCH 10/15] exceptions pass --- data_collection/tools.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/data_collection/tools.py b/data_collection/tools.py index 57728c6..0a8868b 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -10,12 +10,10 @@ # EXCEPTIONS class BadResponseException(Exception): """Custom exception for bad responses.""" - pass class DownloadError(Exception): """Custom exception for download errors.""" - pass # CONSTANTS @@ -264,7 +262,6 @@ def from_lovd_to_pandas(path): return d - def from_clinvar_name_to_dna(name): """ Custom cleaner to extract DNA from Clinvar name variable. From 1f71f580b9aa7bfd736b6e0d8fe9860debf70b01 Mon Sep 17 00:00:00 2001 From: Dainius Kirsnauskas <75167873+Strexas@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:40:22 +0200 Subject: [PATCH 11/15] Removing unsupported versions 3.7, 3.8 and 3.9 don't support match/case statements and were therefore removed --- .github/workflows/pylint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 78da3fc..89c0945 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} From 03315c14681cd70007d368d2a7a736e0f6990bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= Date: Sat, 2 Mar 2024 11:30:23 +0200 Subject: [PATCH 12/15] feat: moved constants to separate file --- data_collection/constants.py | 149 +++++++++++++++++++++++++++++++++++ data_collection/pipeline.py | 20 +---- data_collection/tools.py | 131 +----------------------------- 3 files changed, 151 insertions(+), 149 deletions(-) create mode 100644 data_collection/constants.py diff --git a/data_collection/constants.py
b/data_collection/constants.py new file mode 100644 index 0000000..ccb12ec --- /dev/null +++ b/data_collection/constants.py @@ -0,0 +1,149 @@ +"""Module for constants used in data collection.""" + +# files +LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS" +LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS" + +GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" +GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28" + "-T_3y&export=download") + +CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" +CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF" + "-H2U6u&export=download") + +# paths +DATA_PATH = "../data" +LOVD_PATH = DATA_PATH + "/lovd" +GNOMAD_PATH = DATA_PATH + "/gnomad" +CLINVAR_PATH = DATA_PATH + "/clinvar" + +# variable data types +LOVD_VARIABLES_DATA_TYPES = { + 'id': 'String', + 'name': 'String', + 'chromosome': 'Integer', + 'chrom_band': 'String', + 'imprinting': 'String', + 'refseq_genomic': 'String', + 'refseq_UD': 'String', + 'reference': 'String', + 'url_homepage': 'String', + 'url_external': 'String', + 'allow_download': 'Boolean', + 'id_hgnc': 'Integer', + 'id_entrez': 'Integer', + 'id_omim': 'Integer', + 'show_hgmd': 'Boolean', + 'show_genecards': 'Boolean', + 'show_genetests': 'Boolean', + 'show_orphanet': 'Boolean', + 'note_index': 'String', + 'note_listing': 'String', + 'refseq': 'String', + 'refseq_url': 'String', + 'disclaimer': 'Boolean', + 'disclaimer_text': 'String', + 'header': 'String', + 'header_align': 'Integer', + 'footer': 'String', + 'footer_align': 'Integer', + 'created_by': 'Integer', + 'created_date': 'Date', + 'edited_by': 'Integer', + 'edited_date': 'Date', + 'updated_by': 'Integer', + 'updated_date': 'Date', + 'transcriptid': 'Integer', + 'effectid': 'Integer', + 'position_c_start': 'Integer', + 'position_c_start_intron': 'Integer', + 
'position_c_end': 'Integer', + 'position_c_end_intron': 'Integer', + 'VariantOnTranscript/DNA': 'String', + 'VariantOnTranscript/RNA': 'String', + 'VariantOnTranscript/Protein': 'String', + 'VariantOnTranscript/Exon': 'String', + 'symbol': 'String', + 'inheritance': 'String', + 'id_omin': 'Integer', + 'tissues': 'String', + 'features': 'String', + 'remarks': 'String', + 'geneid': 'String', + 'id_mutalyzer': 'Integer', + 'id_ncbi': 'String', + 'id_ensembl': 'String', + 'id_protein_ncbi': 'String', + 'id_protein_ensembl': 'String', + 'id_protein_uniprot': 'String', + 'position_c_mrna_start': 'Integer', + 'position_c_mrna_end': 'Integer', + 'position_c_cds_end': 'Integer', + 'position_g_mrna_start': 'Integer', + 'position_g_mrna_end': 'Integer', + 'diseaseid': 'Integer', + 'individualid': 'Integer', + 'Phenotype/Inheritance': 'String', + 'Phenotype/Age': 'String', + 'Phenotype/Additional': 'String', + 'Phenotype/Biochem_param': 'String', + 'Phenotype/Age/Onset': 'String', + 'Phenotype/Age/Diagnosis': 'String', + 'Phenotype/Severity_score': 'String', + 'Phenotype/Onset': 'String', + 'Phenotype/Protein': 'String', + 'Phenotype/Tumor/MSI': 'String', + 'Phenotype/Enzyme/CPK': 'String', + 'Phenotype/Heart/Myocardium': 'String', + 'Phenotype/Lung': 'String', + 'Phenotype/Diagnosis/Definite': 'String', + 'Phenotype/Diagnosis/Initial': 'String', + 'Phenotype/Diagnosis/Criteria': 'String', + 'variants_found': 'Integer', + 'Screening/Technique': 'String', + 'Screening/Template': 'String', + 'Screening/Tissue': 'String', + 'Screening/Remarks': 'String', + 'fatherid': 'String', + 'motherid': 'String', + 'panelid': 'Integer', + 'panel_size': 'Integer', + 'license': 'String', + 'Individual/Reference': 'String', + 'Individual/Remarks': 'String', + 'Individual/Gender': 'String', + 'Individual/Consanguinity': 'String', + 'Individual/Age_of_death': 'String', + 'Individual/VIP': 'String', + 'Individual/Data_av': 'String', + 'Individual/Treatment': 'String', + 
'Individual/Origin/Population': 'String', + 'Individual/Individual_ID': 'String', + 'allele': 'Integer', + 'position_g_start': 'Integer', + 'position_g_end': 'Integer', + 'type': 'String', + 'average_frequency': 'Double', + 'VariantOnGenome/DBID': 'String', + 'VariantOnGenome/DNA': 'String', + 'VariantOnGenome/Frequency': 'String', + 'VariantOnGenome/Reference': 'String', + 'VariantOnGenome/Restriction_site': 'String', + 'VariantOnGenome/Published_as': 'String', + 'VariantOnGenome/Remarks': 'String', + 'VariantOnGenome/Genetic_origin': 'String', + 'VariantOnGenome/Segregation': 'String', + 'VariantOnGenome/dbSNP': 'String', + 'VariantOnGenome/VIP': 'String', + 'VariantOnGenome/Methylation': 'String', + 'VariantOnGenome/ISCN': 'String', + 'VariantOnGenome/DNA/hg38': 'String', + 'VariantOnGenome/ClinVar': 'String', + 'VariantOnGenome/ClinicalClassification': 'String', + 'VariantOnGenome/ClinicalClassification/Method': 'String', + 'screeningid': 'Integer', + 'variantid': 'Integer', + 'owned_by': 'Integer', + 'Individual/Origin/Geographic': 'String' +} \ No newline at end of file diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index b910f41..26a0ac0 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -2,25 +2,7 @@ import pandas as pd from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna - -# CONSTANTS -# files -LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS" -LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS" - -GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" -GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28" - "-T_3y&export=download") - -CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" -CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF" - "-H2U6u&export=download") - -# path -DATA_PATH = "../data" 
-LOVD_PATH = DATA_PATH + "/lovd" -GNOMAD_PATH = DATA_PATH + "/gnomad" -CLINVAR_PATH = DATA_PATH + "/clinvar" +from constants import LOVD_FILE_URL, GNOMAD_FILE_URL, CLINVAR_FILE_URL, DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH def calculate_max_frequency(row): diff --git a/data_collection/tools.py b/data_collection/tools.py index 0a8868b..586fbe5 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -5,6 +5,7 @@ from requests.exceptions import RequestException import pandas as pd from pandas import DataFrame +from constants import LOVD_VARIABLES_DATA_TYPES # EXCEPTIONS @@ -16,136 +17,6 @@ class DownloadError(Exception): """Custom exception for download errors.""" -# CONSTANTS -LOVD_VARIABLES_DATA_TYPES = { - 'id': 'String', - 'name': 'String', - 'chromosome': 'Integer', - 'chrom_band': 'String', - 'imprinting': 'String', - 'refseq_genomic': 'String', - 'refseq_UD': 'String', - 'reference': 'String', - 'url_homepage': 'String', - 'url_external': 'String', - 'allow_download': 'Boolean', - 'id_hgnc': 'Integer', - 'id_entrez': 'Integer', - 'id_omim': 'Integer', - 'show_hgmd': 'Boolean', - 'show_genecards': 'Boolean', - 'show_genetests': 'Boolean', - 'show_orphanet': 'Boolean', - 'note_index': 'String', - 'note_listing': 'String', - 'refseq': 'String', - 'refseq_url': 'String', - 'disclaimer': 'Boolean', - 'disclaimer_text': 'String', - 'header': 'String', - 'header_align': 'Integer', - 'footer': 'String', - 'footer_align': 'Integer', - 'created_by': 'Integer', - 'created_date': 'Date', - 'edited_by': 'Integer', - 'edited_date': 'Date', - 'updated_by': 'Integer', - 'updated_date': 'Date', - 'transcriptid': 'Integer', - 'effectid': 'Integer', - 'position_c_start': 'Integer', - 'position_c_start_intron': 'Integer', - 'position_c_end': 'Integer', - 'position_c_end_intron': 'Integer', - 'VariantOnTranscript/DNA': 'String', - 'VariantOnTranscript/RNA': 'String', - 'VariantOnTranscript/Protein': 'String', - 'VariantOnTranscript/Exon': 'String', - 
'symbol': 'String', - 'inheritance': 'String', - 'id_omin': 'Integer', - 'tissues': 'String', - 'features': 'String', - 'remarks': 'String', - 'geneid': 'String', - 'id_mutalyzer': 'Integer', - 'id_ncbi': 'String', - 'id_ensembl': 'String', - 'id_protein_ncbi': 'String', - 'id_protein_ensembl': 'String', - 'id_protein_uniprot': 'String', - 'position_c_mrna_start': 'Integer', - 'position_c_mrna_end': 'Integer', - 'position_c_cds_end': 'Integer', - 'position_g_mrna_start': 'Integer', - 'position_g_mrna_end': 'Integer', - 'diseaseid': 'Integer', - 'individualid': 'Integer', - 'Phenotype/Inheritance': 'String', - 'Phenotype/Age': 'String', - 'Phenotype/Additional': 'String', - 'Phenotype/Biochem_param': 'String', - 'Phenotype/Age/Onset': 'String', - 'Phenotype/Age/Diagnosis': 'String', - 'Phenotype/Severity_score': 'String', - 'Phenotype/Onset': 'String', - 'Phenotype/Protein': 'String', - 'Phenotype/Tumor/MSI': 'String', - 'Phenotype/Enzyme/CPK': 'String', - 'Phenotype/Heart/Myocardium': 'String', - 'Phenotype/Lung': 'String', - 'Phenotype/Diagnosis/Definite': 'String', - 'Phenotype/Diagnosis/Initial': 'String', - 'Phenotype/Diagnosis/Criteria': 'String', - 'variants_found': 'Integer', - 'Screening/Technique': 'String', - 'Screening/Template': 'String', - 'Screening/Tissue': 'String', - 'Screening/Remarks': 'String', - 'fatherid': 'String', - 'motherid': 'String', - 'panelid': 'Integer', - 'panel_size': 'Integer', - 'license': 'String', - 'Individual/Reference': 'String', - 'Individual/Remarks': 'String', - 'Individual/Gender': 'String', - 'Individual/Consanguinity': 'String', - 'Individual/Age_of_death': 'String', - 'Individual/VIP': 'String', - 'Individual/Data_av': 'String', - 'Individual/Treatment': 'String', - 'Individual/Origin/Population': 'String', - 'Individual/Individual_ID': 'String', - 'allele': 'Integer', - 'position_g_start': 'Integer', - 'position_g_end': 'Integer', - 'type': 'String', - 'average_frequency': 'Double', - 'VariantOnGenome/DBID': 'String', 
- 'VariantOnGenome/DNA': 'String', - 'VariantOnGenome/Frequency': 'String', - 'VariantOnGenome/Reference': 'String', - 'VariantOnGenome/Restriction_site': 'String', - 'VariantOnGenome/Published_as': 'String', - 'VariantOnGenome/Remarks': 'String', - 'VariantOnGenome/Genetic_origin': 'String', - 'VariantOnGenome/Segregation': 'String', - 'VariantOnGenome/dbSNP': 'String', - 'VariantOnGenome/VIP': 'String', - 'VariantOnGenome/Methylation': 'String', - 'VariantOnGenome/ISCN': 'String', - 'VariantOnGenome/DNA/hg38': 'String', - 'VariantOnGenome/ClinVar': 'String', - 'VariantOnGenome/ClinicalClassification': 'String', - 'VariantOnGenome/ClinicalClassification/Method': 'String', - 'screeningid': 'Integer', - 'variantid': 'Integer', - 'owned_by': 'Integer', - 'Individual/Origin/Geographic': 'String' -} - def get_file_from_url(url, save_to, override=False): """ From c4201e2bae4c3d502f4bdf1b37b91d9c3d8f02bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= Date: Sat, 2 Mar 2024 11:37:24 +0200 Subject: [PATCH 13/15] fix: linter errors --- data_collection/constants.py | 2 +- data_collection/pipeline.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/data_collection/constants.py b/data_collection/constants.py index ccb12ec..41defe5 100644 --- a/data_collection/constants.py +++ b/data_collection/constants.py @@ -146,4 +146,4 @@ 'variantid': 'Integer', 'owned_by': 'Integer', 'Individual/Origin/Geographic': 'String' -} \ No newline at end of file +} diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 26a0ac0..9800e7b 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -2,8 +2,13 @@ import pandas as pd from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna -from constants import LOVD_FILE_URL, GNOMAD_FILE_URL, CLINVAR_FILE_URL, DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH - +from constants import (LOVD_FILE_URL, + GNOMAD_FILE_URL, + CLINVAR_FILE_URL, +
DATA_PATH, + LOVD_PATH, + GNOMAD_PATH, + CLINVAR_PATH) def calculate_max_frequency(row): """ From 2e535a47046b1b997bb2c00c15663485ba7f89da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= Date: Sat, 2 Mar 2024 11:39:31 +0200 Subject: [PATCH 14/15] fix: removed trailing whitespace --- data_collection/pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 9800e7b..ddc3978 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -2,12 +2,12 @@ import pandas as pd from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna -from constants import (LOVD_FILE_URL, - GNOMAD_FILE_URL, - CLINVAR_FILE_URL, - DATA_PATH, - LOVD_PATH, - GNOMAD_PATH, +from constants import (LOVD_FILE_URL, + GNOMAD_FILE_URL, + CLINVAR_FILE_URL, + DATA_PATH, + LOVD_PATH, + GNOMAD_PATH, CLINVAR_PATH) def calculate_max_frequency(row): From e613bcbc092ccbb58488a4c691f1ee7c5ecf4bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paulius=20Preik=C5=A1a?= Date: Mon, 4 Mar 2024 09:49:54 +0200 Subject: [PATCH 15/15] feat: added base constants --- data_collection/constants.py | 16 ++++++++++------ data_collection/pipeline.py | 12 ++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/data_collection/constants.py b/data_collection/constants.py index 41defe5..fa710ce 100644 --- a/data_collection/constants.py +++ b/data_collection/constants.py @@ -1,15 +1,19 @@ """Module for constants used in data collection.""" # files -LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS" -LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS" +LOVD_URL = "https://databases.lovd.nl/shared/genes" +LOVD_URL_EYS = "https://databases.lovd.nl/shared/genes/EYS" +LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene" +LOVD_FILE_URL_EYS = "https://databases.lovd.nl/shared/download/all/gene/EYS" -GNOMAD_URL = 
"https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" -GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28" +GNOMAD_URL = "https://gnomad.broadinstitute.org/gene" +GNOMAD_URL_EYS = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" +GNOMAD_FILE_URL_EYS = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28" "-T_3y&export=download") -CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" -CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF" +CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar" +CLINVAR_URL_EYS = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" +CLINVAR_FILE_URL_EYS = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF" "-H2U6u&export=download") # paths diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index ddc3978..80d3a94 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -2,9 +2,9 @@ import pandas as pd from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna -from constants import (LOVD_FILE_URL, - GNOMAD_FILE_URL, - CLINVAR_FILE_URL, +from constants import (LOVD_FILE_URL_EYS, + GNOMAD_FILE_URL_EYS, + CLINVAR_FILE_URL_EYS, DATA_PATH, LOVD_PATH, GNOMAD_PATH, @@ -47,9 +47,9 @@ def calculate_max_frequency(row): # MAIN # Download all data -get_file_from_url(LOVD_FILE_URL, LOVD_PATH + "/lovd_data.txt", override=True) -get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + "/gnomad_data.csv", override=True) -get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + "/clinvar_data.txt", override=True) +get_file_from_url(LOVD_FILE_URL_EYS, LOVD_PATH + "/lovd_data.txt", override=True) +get_file_from_url(GNOMAD_FILE_URL_EYS, GNOMAD_PATH + "/gnomad_data.csv", override=True) +get_file_from_url(CLINVAR_FILE_URL_EYS, CLINVAR_PATH + "/clinvar_data.txt", override=True) # 
Read and convert data lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")