diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml new file mode 100644 index 0000000..8cd53b3 --- /dev/null +++ b/.github/workflows/mypy.yml @@ -0,0 +1,27 @@ +name: MyPy + +on: [push] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install mypy + pip install -r requirements.txt + - name: Install stubs + run: | + mypy --install-types --non-interactive $(git ls-files '*.py') + - name: Analysing the code with mypy + run: | + mypy $(git ls-files '*.py') diff --git a/data_collection/.gitkeep b/data_collection/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data_collection/constants.py b/data_collection/constants.py new file mode 100644 index 0000000..fa710ce --- /dev/null +++ b/data_collection/constants.py @@ -0,0 +1,153 @@ +"""Module for constants used in data collection.""" + +# files +LOVD_URL = "https://databases.lovd.nl/shared/genes" +LOVD_URL_EYS = "https://databases.lovd.nl/shared/genes/EYS" +LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene" +LOVD_FILE_URL_EYS = "https://databases.lovd.nl/shared/download/all/gene/EYS" + +GNOMAD_URL = "https://gnomad.broadinstitute.org/gene" +GNOMAD_URL_EYS = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" +GNOMAD_FILE_URL_EYS = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28" + "-T_3y&export=download") + +CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar" +CLINVAR_URL_EYS = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" +CLINVAR_FILE_URL_EYS = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF" + "-H2U6u&export=download") + +# paths +DATA_PATH = 
"../data" +LOVD_PATH = DATA_PATH + "/lovd" +GNOMAD_PATH = DATA_PATH + "/gnomad" +CLINVAR_PATH = DATA_PATH + "/clinvar" + +# variable data types +LOVD_VARIABLES_DATA_TYPES = { + 'id': 'String', + 'name': 'String', + 'chromosome': 'Integer', + 'chrom_band': 'String', + 'imprinting': 'String', + 'refseq_genomic': 'String', + 'refseq_UD': 'String', + 'reference': 'String', + 'url_homepage': 'String', + 'url_external': 'String', + 'allow_download': 'Boolean', + 'id_hgnc': 'Integer', + 'id_entrez': 'Integer', + 'id_omim': 'Integer', + 'show_hgmd': 'Boolean', + 'show_genecards': 'Boolean', + 'show_genetests': 'Boolean', + 'show_orphanet': 'Boolean', + 'note_index': 'String', + 'note_listing': 'String', + 'refseq': 'String', + 'refseq_url': 'String', + 'disclaimer': 'Boolean', + 'disclaimer_text': 'String', + 'header': 'String', + 'header_align': 'Integer', + 'footer': 'String', + 'footer_align': 'Integer', + 'created_by': 'Integer', + 'created_date': 'Date', + 'edited_by': 'Integer', + 'edited_date': 'Date', + 'updated_by': 'Integer', + 'updated_date': 'Date', + 'transcriptid': 'Integer', + 'effectid': 'Integer', + 'position_c_start': 'Integer', + 'position_c_start_intron': 'Integer', + 'position_c_end': 'Integer', + 'position_c_end_intron': 'Integer', + 'VariantOnTranscript/DNA': 'String', + 'VariantOnTranscript/RNA': 'String', + 'VariantOnTranscript/Protein': 'String', + 'VariantOnTranscript/Exon': 'String', + 'symbol': 'String', + 'inheritance': 'String', + 'id_omin': 'Integer', + 'tissues': 'String', + 'features': 'String', + 'remarks': 'String', + 'geneid': 'String', + 'id_mutalyzer': 'Integer', + 'id_ncbi': 'String', + 'id_ensembl': 'String', + 'id_protein_ncbi': 'String', + 'id_protein_ensembl': 'String', + 'id_protein_uniprot': 'String', + 'position_c_mrna_start': 'Integer', + 'position_c_mrna_end': 'Integer', + 'position_c_cds_end': 'Integer', + 'position_g_mrna_start': 'Integer', + 'position_g_mrna_end': 'Integer', + 'diseaseid': 'Integer', + 'individualid': 
'Integer', + 'Phenotype/Inheritance': 'String', + 'Phenotype/Age': 'String', + 'Phenotype/Additional': 'String', + 'Phenotype/Biochem_param': 'String', + 'Phenotype/Age/Onset': 'String', + 'Phenotype/Age/Diagnosis': 'String', + 'Phenotype/Severity_score': 'String', + 'Phenotype/Onset': 'String', + 'Phenotype/Protein': 'String', + 'Phenotype/Tumor/MSI': 'String', + 'Phenotype/Enzyme/CPK': 'String', + 'Phenotype/Heart/Myocardium': 'String', + 'Phenotype/Lung': 'String', + 'Phenotype/Diagnosis/Definite': 'String', + 'Phenotype/Diagnosis/Initial': 'String', + 'Phenotype/Diagnosis/Criteria': 'String', + 'variants_found': 'Integer', + 'Screening/Technique': 'String', + 'Screening/Template': 'String', + 'Screening/Tissue': 'String', + 'Screening/Remarks': 'String', + 'fatherid': 'String', + 'motherid': 'String', + 'panelid': 'Integer', + 'panel_size': 'Integer', + 'license': 'String', + 'Individual/Reference': 'String', + 'Individual/Remarks': 'String', + 'Individual/Gender': 'String', + 'Individual/Consanguinity': 'String', + 'Individual/Age_of_death': 'String', + 'Individual/VIP': 'String', + 'Individual/Data_av': 'String', + 'Individual/Treatment': 'String', + 'Individual/Origin/Population': 'String', + 'Individual/Individual_ID': 'String', + 'allele': 'Integer', + 'position_g_start': 'Integer', + 'position_g_end': 'Integer', + 'type': 'String', + 'average_frequency': 'Double', + 'VariantOnGenome/DBID': 'String', + 'VariantOnGenome/DNA': 'String', + 'VariantOnGenome/Frequency': 'String', + 'VariantOnGenome/Reference': 'String', + 'VariantOnGenome/Restriction_site': 'String', + 'VariantOnGenome/Published_as': 'String', + 'VariantOnGenome/Remarks': 'String', + 'VariantOnGenome/Genetic_origin': 'String', + 'VariantOnGenome/Segregation': 'String', + 'VariantOnGenome/dbSNP': 'String', + 'VariantOnGenome/VIP': 'String', + 'VariantOnGenome/Methylation': 'String', + 'VariantOnGenome/ISCN': 'String', + 'VariantOnGenome/DNA/hg38': 'String', + 'VariantOnGenome/ClinVar': 'String', 
+ 'VariantOnGenome/ClinicalClassification': 'String', + 'VariantOnGenome/ClinicalClassification/Method': 'String', + 'screeningid': 'Integer', + 'variantid': 'Integer', + 'owned_by': 'Integer', + 'Individual/Origin/Geographic': 'String' +} diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index b910f41..dab9b06 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,26 +1,12 @@ """Module executes general pipeline for data collection""" import pandas as pd -from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna - -# CONSTANTS -# files -LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS" -LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS" - -GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4" -GNOMAD_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28" - "-T_3y&export=download") - -CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene" -CLINVAR_FILE_URL = ("https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF" - "-H2U6u&export=download") - -# path -DATA_PATH = "../data" -LOVD_PATH = DATA_PATH + "/lovd" -GNOMAD_PATH = DATA_PATH + "/gnomad" -CLINVAR_PATH = DATA_PATH + "/clinvar" +from tools import store_database_for_eys_gene, from_lovd_to_pandas, from_clinvar_name_to_dna +from constants import ( + DATA_PATH, + LOVD_PATH, + GNOMAD_PATH, + CLINVAR_PATH) def calculate_max_frequency(row): @@ -41,7 +27,9 @@ def calculate_max_frequency(row): 'European (Finnish)', 'European (non-Finnish)', 'Middle Eastern', - 'South Asian'] + 'South Asian', + 'Remaining' + ] max_freq = 0 max_pop = population_groups[0] @@ -60,9 +48,13 @@ def calculate_max_frequency(row): # MAIN # Download all data -get_file_from_url(LOVD_FILE_URL, LOVD_PATH + "/lovd_data.txt", override=True) -get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + "/gnomad_data.csv", override=True) 
-get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + "/clinvar_data.txt", override=True) + +#get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True) +#get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True) +#get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True) +store_database_for_eys_gene('lovd', True) +store_database_for_eys_gene('gnomad', True) +store_database_for_eys_gene('clinvar', True) # Read and convert data lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt") diff --git a/data_collection/tools.py b/data_collection/tools.py index c52f548..16904a6 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -3,9 +3,20 @@ import os import logging import requests -from requests.exceptions import RequestException import pandas as pd +import selenium.common from pandas import DataFrame +from requests import RequestException +from selenium import webdriver +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +import time +import glob +from constants import LOVD_VARIABLES_DATA_TYPES +from constants import (LOVD_FILE_URL_EYS, + GNOMAD_URL_EYS, + CLINVAR_URL_EYS) # EXCEPTIONS @@ -16,135 +27,7 @@ class BadResponseException(Exception): class DownloadError(Exception): """Custom exception for download errors.""" -# CONSTANTS -LOVD_VARIABLES_DATA_TYPES = { - 'id': 'String', - 'name': 'String', - 'chromosome': 'Integer', - 'chrom_band': 'String', - 'imprinting': 'String', - 'refseq_genomic': 'String', - 'refseq_UD': 'String', - 'reference': 'String', - 'url_homepage': 'String', - 'url_external': 'String', - 'allow_download': 'Boolean', - 'id_hgnc': 'Integer', - 'id_entrez': 'Integer', - 'id_omim': 'Integer', - 'show_hgmd': 'Boolean', - 'show_genecards': 'Boolean', - 'show_genetests': 'Boolean', - 'show_orphanet': 'Boolean', - 'note_index': 'String', - 
'note_listing': 'String', - 'refseq': 'String', - 'refseq_url': 'String', - 'disclaimer': 'Boolean', - 'disclaimer_text': 'String', - 'header': 'String', - 'header_align': 'Integer', - 'footer': 'String', - 'footer_align': 'Integer', - 'created_by': 'Integer', - 'created_date': 'Date', - 'edited_by': 'Integer', - 'edited_date': 'Date', - 'updated_by': 'Integer', - 'updated_date': 'Date', - 'transcriptid': 'Integer', - 'effectid': 'Integer', - 'position_c_start': 'Integer', - 'position_c_start_intron': 'Integer', - 'position_c_end': 'Integer', - 'position_c_end_intron': 'Integer', - 'VariantOnTranscript/DNA': 'String', - 'VariantOnTranscript/RNA': 'String', - 'VariantOnTranscript/Protein': 'String', - 'VariantOnTranscript/Exon': 'String', - 'symbol': 'String', - 'inheritance': 'String', - 'id_omin': 'Integer', - 'tissues': 'String', - 'features': 'String', - 'remarks': 'String', - 'geneid': 'String', - 'id_mutalyzer': 'Integer', - 'id_ncbi': 'String', - 'id_ensembl': 'String', - 'id_protein_ncbi': 'String', - 'id_protein_ensembl': 'String', - 'id_protein_uniprot': 'String', - 'position_c_mrna_start': 'Integer', - 'position_c_mrna_end': 'Integer', - 'position_c_cds_end': 'Integer', - 'position_g_mrna_start': 'Integer', - 'position_g_mrna_end': 'Integer', - 'diseaseid': 'Integer', - 'individualid': 'Integer', - 'Phenotype/Inheritance': 'String', - 'Phenotype/Age': 'String', - 'Phenotype/Additional': 'String', - 'Phenotype/Biochem_param': 'String', - 'Phenotype/Age/Onset': 'String', - 'Phenotype/Age/Diagnosis': 'String', - 'Phenotype/Severity_score': 'String', - 'Phenotype/Onset': 'String', - 'Phenotype/Protein': 'String', - 'Phenotype/Tumor/MSI': 'String', - 'Phenotype/Enzyme/CPK': 'String', - 'Phenotype/Heart/Myocardium': 'String', - 'Phenotype/Lung': 'String', - 'Phenotype/Diagnosis/Definite': 'String', - 'Phenotype/Diagnosis/Initial': 'String', - 'Phenotype/Diagnosis/Criteria': 'String', - 'variants_found': 'Integer', - 'Screening/Technique': 'String', - 
'Screening/Template': 'String', - 'Screening/Tissue': 'String', - 'Screening/Remarks': 'String', - 'fatherid': 'String', - 'motherid': 'String', - 'panelid': 'Integer', - 'panel_size': 'Integer', - 'license': 'String', - 'Individual/Reference': 'String', - 'Individual/Remarks': 'String', - 'Individual/Gender': 'String', - 'Individual/Consanguinity': 'String', - 'Individual/Age_of_death': 'String', - 'Individual/VIP': 'String', - 'Individual/Data_av': 'String', - 'Individual/Treatment': 'String', - 'Individual/Origin/Population': 'String', - 'Individual/Individual_ID': 'String', - 'allele': 'Integer', - 'position_g_start': 'Integer', - 'position_g_end': 'Integer', - 'type': 'String', - 'average_frequency': 'Double', - 'VariantOnGenome/DBID': 'String', - 'VariantOnGenome/DNA': 'String', - 'VariantOnGenome/Frequency': 'String', - 'VariantOnGenome/Reference': 'String', - 'VariantOnGenome/Restriction_site': 'String', - 'VariantOnGenome/Published_as': 'String', - 'VariantOnGenome/Remarks': 'String', - 'VariantOnGenome/Genetic_origin': 'String', - 'VariantOnGenome/Segregation': 'String', - 'VariantOnGenome/dbSNP': 'String', - 'VariantOnGenome/VIP': 'String', - 'VariantOnGenome/Methylation': 'String', - 'VariantOnGenome/ISCN': 'String', - 'VariantOnGenome/DNA/hg38': 'String', - 'VariantOnGenome/ClinVar': 'String', - 'VariantOnGenome/ClinicalClassification': 'String', - 'VariantOnGenome/ClinicalClassification/Method': 'String', - 'screeningid': 'Integer', - 'variantid': 'Integer', - 'owned_by': 'Integer', - 'Individual/Origin/Geographic': 'String' -} + def get_file_from_url(url, save_to, override=False): """ @@ -155,6 +38,44 @@ def get_file_from_url(url, save_to, override=False): :param bool override: needs override """ + # check if directory exists, if not - create + save_to_dir = os.path.dirname(save_to) + if not os.path.exists(save_to_dir): + os.makedirs(save_to_dir) + # check if directory exists, if not - create + save_to_dir = os.path.dirname(save_to) + if not 
os.path.exists(save_to_dir): + os.makedirs(save_to_dir) + + # check if file exist and needs to override + if os.path.exists(save_to) and not override: + print(f"The file at {save_to} already exists.") + return + + try: + response = requests.get(url, timeout=10) + except RequestException as e: + raise DownloadError(f"Error while downloading file from {url}") from e + + if response.status_code != 200: + raise BadResponseException(f"Bad response from {url}." + f" Status code: {response.status_code}") + + with open(save_to, "wb") as f: + f.write(response.content) + + +def download_lovd_database_for_eys_gene(database_name, override=False): + """ + Gets file from url and saves it into provided path. Overrides, if override is True. + + :param str database_name: database to download + :param bool override: needs override + """ + + url = DATABASES_DOWNLOAD_PATHS[database_name]["url"] + save_to = DATABASES_DOWNLOAD_PATHS[database_name]["store_as"] + # check if directory exists, if not - create save_to_dir = os.path.dirname(save_to) if not os.path.exists(save_to_dir): @@ -285,6 +206,7 @@ def from_clinvar_name_to_dna(name): return name[start:end] + def download_gene_lovd(gene_list:list,folder_path,raise_exception = False): """ Downloads data into txt files from gene_list. 
def _wait_for_new_download(directory, known_files, timeout=30.0, poll_interval=0.5):
    """
    Polls a directory until a finished, previously unseen file shows up.

    :param str directory: directory the browser downloads into
    :param set known_files: paths that existed before the download was triggered
    :param float timeout: maximum number of seconds to wait
    :param float poll_interval: pause between two directory scans, in seconds
    :returns: path of the newest finished new file, or None if the timeout expired
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        new_files = set(glob.glob(os.path.join(directory, '*'))) - known_files

        # NOTE(review): assumes Firefox writes "<name>.part" (plus possibly an empty
        # placeholder) while a download is in flight -- confirm for the Firefox
        # version used in CI.
        partial = [path for path in new_files if path.endswith('.part')]
        finished = []
        for path in new_files:
            if path.endswith('.part'):
                continue
            try:
                if os.path.getsize(path) > 0:
                    finished.append(path)
            except OSError:
                # the file disappeared between the scan and the stat call; rescan
                continue

        if finished and not partial:
            return max(finished, key=os.path.getctime)
        time.sleep(poll_interval)
    return None


def download_database_for_eys_gene(database_name, override=False):
    """
    Downloads the chosen database through a headless Firefox session and
    renames the freshly downloaded file to the configured name.

    The existing-file check is done before the browser is started, so nothing
    is downloaded when the target exists and override is False. Only files that
    appeared during this call are considered as the download result, so an
    unrelated older file is never renamed over the target.

    :param str database_name: key of the database in DATABASES_DOWNLOAD_PATHS
    :param bool override: should an existing file be overridden with a new one
    :raises DownloadError: if no new file appeared in the download directory
    """

    entry = DATABASES_DOWNLOAD_PATHS[database_name]
    download_dir = os.path.join(os.getcwd(), "..", "data", database_name)
    os_path = os.path.join(download_dir, entry["store_as"])

    # decide before paying for a browser start and a download
    if os.path.exists(os_path) and not override:
        print("File already exists")
        return

    os.makedirs(download_dir, exist_ok=True)
    known_files = set(glob.glob(os.path.join(download_dir, '*')))

    firefox_options = webdriver.FirefoxOptions()
    # the '--headless' argument alone is sufficient; the `headless` attribute
    # setter is deprecated in recent Selenium releases
    firefox_options.add_argument('--headless')
    firefox_options.set_preference("browser.download.folderList", 2)
    firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
    firefox_options.set_preference("browser.download.dir", download_dir)
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk",
                                   "application/octet-stream")

    driver = webdriver.Firefox(options=firefox_options)
    try:
        driver.get(entry["url"])
        WebDriverWait(driver, 30).until(
            EC.element_to_be_clickable((By.XPATH, entry["clickable"])))
        driver.execute_script(entry["button"])
        # the browser has to stay alive until the file is completely written
        latest_file = _wait_for_new_download(download_dir, known_files, timeout=30.0)
    finally:
        # always release the browser process, even when the page interaction failed
        driver.quit()

    if latest_file is None:
        raise DownloadError(f"No file was downloaded for {database_name} database")

    # os.replace overwrites an existing target, which covers the override case
    os.replace(latest_file, os_path)


def store_database_for_eys_gene(database_name, override=False):
    """
    Calls the download function registered for a database and reports
    failures on stdout instead of propagating them.

    :param str database_name: the name of the database that should be downloaded
    :param bool override: should an already existing file be overwritten
    """
    if database_name not in DATABASES_DOWNLOAD_PATHS:
        print(f"Error:Requested {database_name} database is not supported")
        return

    try:
        DATABASES_DOWNLOAD_PATHS[database_name]["function"](database_name, override)
    # InvalidArgumentException is a WebDriverException, so one clause covers both
    except (TimeoutError, selenium.common.exceptions.WebDriverException) as e:
        print(f"Error: {e}")
    except (ValueError, IndexError, BadResponseException, DownloadError) as e:
        print(f"Error:{e}")


# Per-database download configuration.
#   "url":       page (or file) to open
#   "store_as":  name the downloaded file is stored under
#   "clickable": XPath of the element that must be clickable before triggering the download
#   "button":    JavaScript snippet that clicks the download button
#   "function":  callable(database_name, override) performing the download
DATABASES_DOWNLOAD_PATHS = {
    "clinvar": {
        "button": 'document.getElementsByName("EntrezSystem2.PEntrez.clinVar.clinVar_Entrez_ResultsPanel.Entrez_DisplayBar.SendToSubmit")[0].click()',
        "url": CLINVAR_URL_EYS,
        "store_as": "clinvar_data.txt",
        "clickable": "/html/body/div[1]/div[1]/form/div[1]/div[5]/div/div[2]/div[2]/div[1]/div/div[1]/a[3]",
        "function": download_database_for_eys_gene
    },
    "gnomad": {
        "button": "document.getElementsByClassName('Button__BaseButton-sc-1eobygi-0 Button-sc-1eobygi-1 indcWT')[4].click()",
        "url": GNOMAD_URL_EYS,
        "store_as": "gnomad_data.csv",
        "clickable": "/html/body/div[1]/div[3]/div[2]/div/div[7]/div[4]/div[2]/button[1]",
        "function": download_database_for_eys_gene
    },
    "lovd": {
        "url": LOVD_FILE_URL_EYS,
        "store_as": "../data/lovd/lovd_data.txt",
        "function": download_lovd_database_for_eys_gene
    }
}