Skip to content

Commit

Permalink
PPR/move_constants
Browse files Browse the repository at this point in the history
feat: moved constants to separate file
  • Loading branch information
Strexas authored Mar 4, 2024
2 parents 03386ac + e613bcb commit 9759668
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 153 deletions.
153 changes: 153 additions & 0 deletions data_collection/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""Module for constants used in data collection."""

# files
# LOVD: generic gene page / download endpoints, and the EYS-specific URLs
# derived from them.
LOVD_URL = "https://databases.lovd.nl/shared/genes"
LOVD_URL_EYS = f"{LOVD_URL}/EYS"
LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene"
LOVD_FILE_URL_EYS = f"{LOVD_FILE_URL}/EYS"

# gnomAD: generic gene page, the EYS page, and the EYS data file
# (served from a Google Drive link).
GNOMAD_URL = "https://gnomad.broadinstitute.org/gene"
GNOMAD_URL_EYS = f"{GNOMAD_URL}/ENSG00000188107?dataset=gnomad_r4"
GNOMAD_FILE_URL_EYS = (
    "https://drive.usercontent.google.com/u/0/uc"
    "?id=1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y&export=download"
)

# ClinVar: landing page, the EYS gene search, and the EYS data file
# (served from a Google Drive link).
CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar"
CLINVAR_URL_EYS = f"{CLINVAR_URL}/?term=eys%5Bgene%5D&redir=gene"
CLINVAR_FILE_URL_EYS = (
    "https://drive.usercontent.google.com/u/0/uc"
    "?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u&export=download"
)

# paths
# Kept as plain strings: callers append file names with "+".
DATA_PATH = "../data"
LOVD_PATH = f"{DATA_PATH}/lovd"
GNOMAD_PATH = f"{DATA_PATH}/gnomad"
CLINVAR_PATH = f"{DATA_PATH}/clinvar"

# variable data types
# Type label for every LOVD variable name. The labels used are
# "String", "Integer", "Boolean", "Date" and "Double".
# NOTE(review): "id_omin" looks like a misspelling of "id_omim" (which is
# also present) -- confirm against the LOVD column names before removing it.
LOVD_VARIABLES_DATA_TYPES = {
    "id": "String",
    "name": "String",
    "chromosome": "Integer",
    "chrom_band": "String",
    "imprinting": "String",
    "refseq_genomic": "String",
    "refseq_UD": "String",
    "reference": "String",
    "url_homepage": "String",
    "url_external": "String",
    "allow_download": "Boolean",
    "id_hgnc": "Integer",
    "id_entrez": "Integer",
    "id_omim": "Integer",
    "show_hgmd": "Boolean",
    "show_genecards": "Boolean",
    "show_genetests": "Boolean",
    "show_orphanet": "Boolean",
    "note_index": "String",
    "note_listing": "String",
    "refseq": "String",
    "refseq_url": "String",
    "disclaimer": "Boolean",
    "disclaimer_text": "String",
    "header": "String",
    "header_align": "Integer",
    "footer": "String",
    "footer_align": "Integer",
    "created_by": "Integer",
    "created_date": "Date",
    "edited_by": "Integer",
    "edited_date": "Date",
    "updated_by": "Integer",
    "updated_date": "Date",
    "transcriptid": "Integer",
    "effectid": "Integer",
    "position_c_start": "Integer",
    "position_c_start_intron": "Integer",
    "position_c_end": "Integer",
    "position_c_end_intron": "Integer",
    "VariantOnTranscript/DNA": "String",
    "VariantOnTranscript/RNA": "String",
    "VariantOnTranscript/Protein": "String",
    "VariantOnTranscript/Exon": "String",
    "symbol": "String",
    "inheritance": "String",
    "id_omin": "Integer",
    "tissues": "String",
    "features": "String",
    "remarks": "String",
    "geneid": "String",
    "id_mutalyzer": "Integer",
    "id_ncbi": "String",
    "id_ensembl": "String",
    "id_protein_ncbi": "String",
    "id_protein_ensembl": "String",
    "id_protein_uniprot": "String",
    "position_c_mrna_start": "Integer",
    "position_c_mrna_end": "Integer",
    "position_c_cds_end": "Integer",
    "position_g_mrna_start": "Integer",
    "position_g_mrna_end": "Integer",
    "diseaseid": "Integer",
    "individualid": "Integer",
    "Phenotype/Inheritance": "String",
    "Phenotype/Age": "String",
    "Phenotype/Additional": "String",
    "Phenotype/Biochem_param": "String",
    "Phenotype/Age/Onset": "String",
    "Phenotype/Age/Diagnosis": "String",
    "Phenotype/Severity_score": "String",
    "Phenotype/Onset": "String",
    "Phenotype/Protein": "String",
    "Phenotype/Tumor/MSI": "String",
    "Phenotype/Enzyme/CPK": "String",
    "Phenotype/Heart/Myocardium": "String",
    "Phenotype/Lung": "String",
    "Phenotype/Diagnosis/Definite": "String",
    "Phenotype/Diagnosis/Initial": "String",
    "Phenotype/Diagnosis/Criteria": "String",
    "variants_found": "Integer",
    "Screening/Technique": "String",
    "Screening/Template": "String",
    "Screening/Tissue": "String",
    "Screening/Remarks": "String",
    "fatherid": "String",
    "motherid": "String",
    "panelid": "Integer",
    "panel_size": "Integer",
    "license": "String",
    "Individual/Reference": "String",
    "Individual/Remarks": "String",
    "Individual/Gender": "String",
    "Individual/Consanguinity": "String",
    "Individual/Age_of_death": "String",
    "Individual/VIP": "String",
    "Individual/Data_av": "String",
    "Individual/Treatment": "String",
    "Individual/Origin/Population": "String",
    "Individual/Individual_ID": "String",
    "allele": "Integer",
    "position_g_start": "Integer",
    "position_g_end": "Integer",
    "type": "String",
    "average_frequency": "Double",
    "VariantOnGenome/DBID": "String",
    "VariantOnGenome/DNA": "String",
    "VariantOnGenome/Frequency": "String",
    "VariantOnGenome/Reference": "String",
    "VariantOnGenome/Restriction_site": "String",
    "VariantOnGenome/Published_as": "String",
    "VariantOnGenome/Remarks": "String",
    "VariantOnGenome/Genetic_origin": "String",
    "VariantOnGenome/Segregation": "String",
    "VariantOnGenome/dbSNP": "String",
    "VariantOnGenome/VIP": "String",
    "VariantOnGenome/Methylation": "String",
    "VariantOnGenome/ISCN": "String",
    "VariantOnGenome/DNA/hg38": "String",
    "VariantOnGenome/ClinVar": "String",
    "VariantOnGenome/ClinicalClassification": "String",
    "VariantOnGenome/ClinicalClassification/Method": "String",
    "screeningid": "Integer",
    "variantid": "Integer",
    "owned_by": "Integer",
    "Individual/Origin/Geographic": "String",
}
33 changes: 10 additions & 23 deletions data_collection/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,13 @@
import pandas as pd

from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna

# CONSTANTS
# files
# LOVD: EYS gene page and its download endpoint.
LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS"
LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS"

# gnomAD: EYS gene page and the data file (served from a Google Drive link).
GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
GNOMAD_FILE_URL = (
    "https://drive.usercontent.google.com/u/0/uc"
    "?id=1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y&export=download"
)

# ClinVar: EYS gene search and the data file (served from a Google Drive link).
CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
CLINVAR_FILE_URL = (
    "https://drive.usercontent.google.com/u/0/uc"
    "?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u&export=download"
)

# path
# Kept as plain strings: file names are appended to these with "+".
DATA_PATH = "../data"
LOVD_PATH = f"{DATA_PATH}/lovd"
GNOMAD_PATH = f"{DATA_PATH}/gnomad"
CLINVAR_PATH = f"{DATA_PATH}/clinvar"

from constants import (LOVD_FILE_URL_EYS,
GNOMAD_FILE_URL_EYS,
CLINVAR_FILE_URL_EYS,
DATA_PATH,
LOVD_PATH,
GNOMAD_PATH,
CLINVAR_PATH)

def calculate_max_frequency(row):
"""
Expand Down Expand Up @@ -60,9 +47,9 @@ def calculate_max_frequency(row):

# MAIN
# Download all data
# Each call passes a source URL, a destination file under the per-source data
# directory, and override=True (presumably: replace an existing file -- confirm
# in get_file_from_url).
# NOTE(review): every source is fetched twice below, first through the
# un-suffixed *_FILE_URL names and then through the *_FILE_URL_EYS names, both
# writing to the same destination. Confirm whether the first three calls are
# leftovers that should be dropped.
get_file_from_url(LOVD_FILE_URL, LOVD_PATH + "/lovd_data.txt", override=True)
get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + "/gnomad_data.csv", override=True)
get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + "/clinvar_data.txt", override=True)
get_file_from_url(LOVD_FILE_URL_EYS, LOVD_PATH + "/lovd_data.txt", override=True)
get_file_from_url(GNOMAD_FILE_URL_EYS, GNOMAD_PATH + "/gnomad_data.csv", override=True)
get_file_from_url(CLINVAR_FILE_URL_EYS, CLINVAR_PATH + "/clinvar_data.txt", override=True)

# Read and convert data
# Load the LOVD file downloaded above via from_lovd_to_pandas.
lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
Expand Down
131 changes: 1 addition & 130 deletions data_collection/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from requests.exceptions import RequestException
import pandas as pd
from pandas import DataFrame
from constants import LOVD_VARIABLES_DATA_TYPES


# EXCEPTIONS
Expand All @@ -16,136 +17,6 @@ class DownloadError(Exception):
"""Custom exception for download errors."""


# CONSTANTS
# Type label for every LOVD variable name. The labels used are
# "String", "Integer", "Boolean", "Date" and "Double".
# NOTE(review): this name is also imported from `constants` at the top of the
# module; this local assignment rebinds it -- confirm which copy should stay.
# NOTE(review): "id_omin" looks like a misspelling of "id_omim" (which is
# also present) -- confirm against the LOVD column names before removing it.
LOVD_VARIABLES_DATA_TYPES = {
    "id": "String",
    "name": "String",
    "chromosome": "Integer",
    "chrom_band": "String",
    "imprinting": "String",
    "refseq_genomic": "String",
    "refseq_UD": "String",
    "reference": "String",
    "url_homepage": "String",
    "url_external": "String",
    "allow_download": "Boolean",
    "id_hgnc": "Integer",
    "id_entrez": "Integer",
    "id_omim": "Integer",
    "show_hgmd": "Boolean",
    "show_genecards": "Boolean",
    "show_genetests": "Boolean",
    "show_orphanet": "Boolean",
    "note_index": "String",
    "note_listing": "String",
    "refseq": "String",
    "refseq_url": "String",
    "disclaimer": "Boolean",
    "disclaimer_text": "String",
    "header": "String",
    "header_align": "Integer",
    "footer": "String",
    "footer_align": "Integer",
    "created_by": "Integer",
    "created_date": "Date",
    "edited_by": "Integer",
    "edited_date": "Date",
    "updated_by": "Integer",
    "updated_date": "Date",
    "transcriptid": "Integer",
    "effectid": "Integer",
    "position_c_start": "Integer",
    "position_c_start_intron": "Integer",
    "position_c_end": "Integer",
    "position_c_end_intron": "Integer",
    "VariantOnTranscript/DNA": "String",
    "VariantOnTranscript/RNA": "String",
    "VariantOnTranscript/Protein": "String",
    "VariantOnTranscript/Exon": "String",
    "symbol": "String",
    "inheritance": "String",
    "id_omin": "Integer",
    "tissues": "String",
    "features": "String",
    "remarks": "String",
    "geneid": "String",
    "id_mutalyzer": "Integer",
    "id_ncbi": "String",
    "id_ensembl": "String",
    "id_protein_ncbi": "String",
    "id_protein_ensembl": "String",
    "id_protein_uniprot": "String",
    "position_c_mrna_start": "Integer",
    "position_c_mrna_end": "Integer",
    "position_c_cds_end": "Integer",
    "position_g_mrna_start": "Integer",
    "position_g_mrna_end": "Integer",
    "diseaseid": "Integer",
    "individualid": "Integer",
    "Phenotype/Inheritance": "String",
    "Phenotype/Age": "String",
    "Phenotype/Additional": "String",
    "Phenotype/Biochem_param": "String",
    "Phenotype/Age/Onset": "String",
    "Phenotype/Age/Diagnosis": "String",
    "Phenotype/Severity_score": "String",
    "Phenotype/Onset": "String",
    "Phenotype/Protein": "String",
    "Phenotype/Tumor/MSI": "String",
    "Phenotype/Enzyme/CPK": "String",
    "Phenotype/Heart/Myocardium": "String",
    "Phenotype/Lung": "String",
    "Phenotype/Diagnosis/Definite": "String",
    "Phenotype/Diagnosis/Initial": "String",
    "Phenotype/Diagnosis/Criteria": "String",
    "variants_found": "Integer",
    "Screening/Technique": "String",
    "Screening/Template": "String",
    "Screening/Tissue": "String",
    "Screening/Remarks": "String",
    "fatherid": "String",
    "motherid": "String",
    "panelid": "Integer",
    "panel_size": "Integer",
    "license": "String",
    "Individual/Reference": "String",
    "Individual/Remarks": "String",
    "Individual/Gender": "String",
    "Individual/Consanguinity": "String",
    "Individual/Age_of_death": "String",
    "Individual/VIP": "String",
    "Individual/Data_av": "String",
    "Individual/Treatment": "String",
    "Individual/Origin/Population": "String",
    "Individual/Individual_ID": "String",
    "allele": "Integer",
    "position_g_start": "Integer",
    "position_g_end": "Integer",
    "type": "String",
    "average_frequency": "Double",
    "VariantOnGenome/DBID": "String",
    "VariantOnGenome/DNA": "String",
    "VariantOnGenome/Frequency": "String",
    "VariantOnGenome/Reference": "String",
    "VariantOnGenome/Restriction_site": "String",
    "VariantOnGenome/Published_as": "String",
    "VariantOnGenome/Remarks": "String",
    "VariantOnGenome/Genetic_origin": "String",
    "VariantOnGenome/Segregation": "String",
    "VariantOnGenome/dbSNP": "String",
    "VariantOnGenome/VIP": "String",
    "VariantOnGenome/Methylation": "String",
    "VariantOnGenome/ISCN": "String",
    "VariantOnGenome/DNA/hg38": "String",
    "VariantOnGenome/ClinVar": "String",
    "VariantOnGenome/ClinicalClassification": "String",
    "VariantOnGenome/ClinicalClassification/Method": "String",
    "screeningid": "Integer",
    "variantid": "Integer",
    "owned_by": "Integer",
    "Individual/Origin/Geographic": "String",
}


def get_file_from_url(url, save_to, override=False):
"""
Expand Down

0 comments on commit 9759668

Please sign in to comment.