Skip to content

Commit

Permalink
Merge branch 'main' into KCE/LOVD_data_conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
Strexas authored Mar 4, 2024
2 parents 08e6f81 + 9759668 commit 04498e2
Show file tree
Hide file tree
Showing 5 changed files with 256 additions and 296 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Runs pylint over every tracked Python file on each push, once per
# Python version in the matrix.
name: Pylint

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted so YAML keeps "3.10" as a string instead of
        # parsing it as the float 3.1.
        python-version: ["3.10", "3.11", "3.12"]
    steps:
      # v4/v5 replace the v3 majors, which run on the deprecated Node 16
      # action runtime.
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pylint
          pip install -r requirements.txt
      - name: Analysing the code with pylint
        run: |
          pylint $(git ls-files '*.py')
Empty file removed data_collection/.gitkeep
Empty file.
153 changes: 153 additions & 0 deletions data_collection/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""Module for constants used in data collection."""

# files
# Every *_EYS address is its base URL plus an EYS-specific suffix, so the
# suffix is spelled out once and the base is reused.
LOVD_URL = "https://databases.lovd.nl/shared/genes"
LOVD_URL_EYS = f"{LOVD_URL}/EYS"
LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene"
LOVD_FILE_URL_EYS = f"{LOVD_FILE_URL}/EYS"

# The gnomAD and ClinVar data files are fetched from Google Drive download
# links that differ only in the file id.
_GDRIVE_DOWNLOAD = "https://drive.usercontent.google.com/u/0/uc?id={file_id}&export=download"

GNOMAD_URL = "https://gnomad.broadinstitute.org/gene"
GNOMAD_URL_EYS = f"{GNOMAD_URL}/ENSG00000188107?dataset=gnomad_r4"
GNOMAD_FILE_URL_EYS = _GDRIVE_DOWNLOAD.format(file_id="1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y")

CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar"
CLINVAR_URL_EYS = f"{CLINVAR_URL}/?term=eys%5Bgene%5D&redir=gene"
CLINVAR_FILE_URL_EYS = _GDRIVE_DOWNLOAD.format(file_id="1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u")

# paths
# Relative directories: one shared data root and one sub-directory per source.
DATA_PATH = "../data"
LOVD_PATH = f"{DATA_PATH}/lovd"
GNOMAD_PATH = f"{DATA_PATH}/gnomad"
CLINVAR_PATH = f"{DATA_PATH}/clinvar"

# variable data types
# Maps a LOVD column name to the label of the type its values should be
# converted to. Labels used: 'String', 'Integer', 'Boolean', 'Date', 'Double'.
# NOTE(review): presumably consumed by the LOVD type-conversion helper in
# tools.py - confirm against that module.
LOVD_VARIABLES_DATA_TYPES = {
    'id': 'String',
    'name': 'String',
    'chromosome': 'Integer',
    'chrom_band': 'String',
    'imprinting': 'String',
    'refseq_genomic': 'String',
    'refseq_UD': 'String',
    'reference': 'String',
    'url_homepage': 'String',
    'url_external': 'String',
    'allow_download': 'Boolean',
    'id_hgnc': 'Integer',
    'id_entrez': 'Integer',
    'id_omim': 'Integer',
    'show_hgmd': 'Boolean',
    'show_genecards': 'Boolean',
    'show_genetests': 'Boolean',
    'show_orphanet': 'Boolean',
    'note_index': 'String',
    'note_listing': 'String',
    'refseq': 'String',
    'refseq_url': 'String',
    'disclaimer': 'Boolean',
    'disclaimer_text': 'String',
    'header': 'String',
    'header_align': 'Integer',
    'footer': 'String',
    'footer_align': 'Integer',
    'created_by': 'Integer',
    'created_date': 'Date',
    'edited_by': 'Integer',
    'edited_date': 'Date',
    'updated_by': 'Integer',
    'updated_date': 'Date',
    'transcriptid': 'Integer',
    'effectid': 'Integer',
    'position_c_start': 'Integer',
    'position_c_start_intron': 'Integer',
    'position_c_end': 'Integer',
    'position_c_end_intron': 'Integer',
    'VariantOnTranscript/DNA': 'String',
    'VariantOnTranscript/RNA': 'String',
    'VariantOnTranscript/Protein': 'String',
    'VariantOnTranscript/Exon': 'String',
    'symbol': 'String',
    'inheritance': 'String',
    'tissues': 'String',
    'features': 'String',
    'remarks': 'String',
    'geneid': 'String',
    'id_mutalyzer': 'Integer',
    'id_ncbi': 'String',
    'id_ensembl': 'String',
    'id_protein_ncbi': 'String',
    'id_protein_ensembl': 'String',
    'id_protein_uniprot': 'String',
    'position_c_mrna_start': 'Integer',
    'position_c_mrna_end': 'Integer',
    'position_c_cds_end': 'Integer',
    'position_g_mrna_start': 'Integer',
    'position_g_mrna_end': 'Integer',
    'diseaseid': 'Integer',
    'individualid': 'Integer',
    'Phenotype/Inheritance': 'String',
    'Phenotype/Age': 'String',
    'Phenotype/Additional': 'String',
    'Phenotype/Biochem_param': 'String',
    'Phenotype/Age/Onset': 'String',
    'Phenotype/Age/Diagnosis': 'String',
    'Phenotype/Severity_score': 'String',
    'Phenotype/Onset': 'String',
    'Phenotype/Protein': 'String',
    'Phenotype/Tumor/MSI': 'String',
    'Phenotype/Enzyme/CPK': 'String',
    'Phenotype/Heart/Myocardium': 'String',
    'Phenotype/Lung': 'String',
    'Phenotype/Diagnosis/Definite': 'String',
    'Phenotype/Diagnosis/Initial': 'String',
    'Phenotype/Diagnosis/Criteria': 'String',
    'variants_found': 'Integer',
    'Screening/Technique': 'String',
    'Screening/Template': 'String',
    'Screening/Tissue': 'String',
    'Screening/Remarks': 'String',
    'fatherid': 'String',
    'motherid': 'String',
    'panelid': 'Integer',
    'panel_size': 'Integer',
    'license': 'String',
    'Individual/Reference': 'String',
    'Individual/Remarks': 'String',
    'Individual/Gender': 'String',
    'Individual/Consanguinity': 'String',
    'Individual/Age_of_death': 'String',
    'Individual/VIP': 'String',
    'Individual/Data_av': 'String',
    'Individual/Treatment': 'String',
    'Individual/Origin/Population': 'String',
    'Individual/Individual_ID': 'String',
    'allele': 'Integer',
    'position_g_start': 'Integer',
    'position_g_end': 'Integer',
    'type': 'String',
    'average_frequency': 'Double',
    'VariantOnGenome/DBID': 'String',
    'VariantOnGenome/DNA': 'String',
    'VariantOnGenome/Frequency': 'String',
    'VariantOnGenome/Reference': 'String',
    'VariantOnGenome/Restriction_site': 'String',
    'VariantOnGenome/Published_as': 'String',
    'VariantOnGenome/Remarks': 'String',
    'VariantOnGenome/Genetic_origin': 'String',
    'VariantOnGenome/Segregation': 'String',
    'VariantOnGenome/dbSNP': 'String',
    'VariantOnGenome/VIP': 'String',
    'VariantOnGenome/Methylation': 'String',
    'VariantOnGenome/ISCN': 'String',
    'VariantOnGenome/DNA/hg38': 'String',
    'VariantOnGenome/ClinVar': 'String',
    'VariantOnGenome/ClinicalClassification': 'String',
    'VariantOnGenome/ClinicalClassification/Method': 'String',
    'screeningid': 'Integer',
    'variantid': 'Integer',
    'owned_by': 'Integer',
    'Individual/Origin/Geographic': 'String',
}
52 changes: 22 additions & 30 deletions data_collection/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,15 @@
"""Module executes general pipeline for data collection"""
import pandas as pd
from pandas import DataFrame, Series
from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatype

# CONSTANTS
# files
LOVD_URL = "https://databases.lovd.nl/shared/genes/EYS"
LOVD_FILE_URL = "https://databases.lovd.nl/shared/download/all/gene/EYS"

GNOMAD_URL = "https://gnomad.broadinstitute.org/gene/ENSG00000188107?dataset=gnomad_r4"
GNOMAD_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1crkDCVcC0PSnv0JPGj3FpemBs28-T_3y&export=download"

CLINVAR_URL = "https://www.ncbi.nlm.nih.gov/clinvar/?term=eys%5Bgene%5D&redir=gene"
CLINVAR_FILE_URL = "https://drive.usercontent.google.com/u/0/uc?id=1RK5XBK3k5h0K6f-qfwJSQj7tlF-H2U6u&export=download"

# path
DATA_PATH = "../data"
LOVD_PATH = DATA_PATH + "/lovd"
GNOMAD_PATH = DATA_PATH + "/gnomad"
CLINVAR_PATH = DATA_PATH + "/clinvar"

from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_dna
from constants import (LOVD_FILE_URL_EYS,
GNOMAD_FILE_URL_EYS,
CLINVAR_FILE_URL_EYS,
DATA_PATH,
LOVD_PATH,
GNOMAD_PATH,
CLINVAR_PATH)

def calculate_max_frequency(row):
"""
Expand Down Expand Up @@ -57,9 +48,9 @@ def calculate_max_frequency(row):

# MAIN
# Download all data
get_file_from_url(LOVD_FILE_URL, LOVD_PATH + f"/lovd_data.txt", override=True)
get_file_from_url(GNOMAD_FILE_URL, GNOMAD_PATH + f"/gnomad_data.csv", override=True)
get_file_from_url(CLINVAR_FILE_URL, CLINVAR_PATH + f"/clinvar_data.txt", override=True)
get_file_from_url(LOVD_FILE_URL_EYS, LOVD_PATH + "/lovd_data.txt", override=True)
get_file_from_url(GNOMAD_FILE_URL_EYS, GNOMAD_PATH + "/gnomad_data.csv", override=True)
get_file_from_url(CLINVAR_FILE_URL_EYS, CLINVAR_PATH + "/clinvar_data.txt", override=True)

# Read and convert data
lovd_data = from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")
Expand All @@ -77,29 +68,30 @@ def calculate_max_frequency(row):
notes = lovd_data["Variants_On_Transcripts"][1][::]

# Merging Clinvar
clinvar = clinvar_data.copy()[["Name(clinvar)", "Germline classification(clinvar)", "Accession(clinvar)"]]
clinvar["VariantOnTranscript/DNA"] = clinvar["Name(clinvar)"].apply(from_clinvar_name_to_DNA)
clinvar = clinvar_data.copy()[["Name(clinvar)",
"Germline classification(clinvar)",
"Accession(clinvar)"]]
clinvar["VariantOnTranscript/DNA"] = clinvar["Name(clinvar)"].apply(from_clinvar_name_to_dna)

main_frame = pd.merge(main_frame,
clinvar,
how="outer",
on=["VariantOnTranscript/DNA"]).drop("Name(clinvar)", axis=1)

# MERGING GnomAd
main_frame = pd.merge(main_frame,
gnomad_data,
how="left",
left_on="VariantOnTranscript/DNA",
right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)", axis=1)

main_frame = (pd.merge(main_frame,
gnomad_data,
how="left",
left_on="VariantOnTranscript/DNA",
right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)",
axis=1))

# Calculating frequencies
# Boolean mask of rows that have no value in this gnomAD-sourced column
# (presumably the rows the left merge could not match to gnomAD - confirm).
lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"])
# Keep the complementary rows as an independent copy, so the column
# assignment below does not write into a slice of main_frame.
lovd_with_gnomad = main_frame[~lovd_without_association_in_gnomad].copy()
# calculate_max_frequency is applied once per row (axis=1); its result is
# stored in the two PopMax columns.
max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values


# Leaving necessary columns

lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',
Expand Down
Loading

0 comments on commit 04498e2

Please sign in to comment.