import re

# Matches a ClinVar "Name" of the shape "<transcript>(EYS):c.<pos><ref>><alt>",
# optionally followed by a parenthesised suffix, and captures the c. notation.
# NOTE(review): the position class [A-Za-z0-9_] excludes '+', '-' and '*', so
# names such as "c.1056+1G>A" never match - confirm that this is intended.
_EYS_SUBSTITUTION_RE = re.compile(
    r'^.*\(EYS\):(c\.[A-Za-z0-9_]+>[A-Za-z])(?:\s*\(.*\))?'
)

# Captured notations containing any of these fragments are rejected.
_NON_SUBSTITUTION_MARKERS = ('del', 'delins', 'dup', 'ins', 'inv', 'subst')


def filter_eys_genes(clinvar_data):
    """
    Extracts the cDNA (c.) substitution notations of EYS variants from
    ClinVar data.

    Only entries of the "Name" column that contain "(EYS)" and match the
    substitution pattern are kept; non-string entries (e.g. a missing value)
    are skipped instead of raising.

    :param DataFrame clinvar_data: ClinVar data with a "Name" column
    :returns: c. notations in the order they appear (duplicates preserved)
    :rtype: list[str]
    """
    filtered_data = []
    for item in clinvar_data["Name"]:
        # `"(EYS)" in item` raises TypeError for non-strings such as NaN/None.
        if not isinstance(item, str) or "(EYS)" not in item:
            continue

        match = _EYS_SUBSTITUTION_RE.match(item)
        if match is None:
            continue

        notation = match.group(1)
        if not any(marker in notation for marker in _NON_SUBSTITUTION_MARKERS):
            filtered_data.append(notation)

    return filtered_data


def lovd_gnomad_merge(lovd, clinvar):
    """
    Retrieves the hg38 genomic (g.) notations of the LOVD variants whose
    transcript (c.) notation also occurs among the ClinVar EYS substitutions.

    :param dict[str, DataFrame] lovd: LOVD tables; "Variants_On_Transcripts"
        must provide "VariantOnTranscript/DNA" and "Variants_On_Genome" must
        provide "VariantOnGenome/DNA/hg38"
    :param DataFrame clinvar: ClinVar data with a "Name" column
    :returns: non-empty g. notations, in "Variants_On_Genome" order
    :rtype: list[str]
    """
    # A set makes every membership test below O(1) instead of a list scan.
    clinvar_notations = set(filter_eys_genes(clinvar))

    # NOTE(review): the two tables are paired by row label (the key yielded
    # by .items()), not by the LOVD "id" column - this presumes both tables
    # share the same row labelling; confirm against parse_lovd.
    transcript_dna = lovd["Variants_On_Transcripts"]["VariantOnTranscript/DNA"]
    matching_keys = {
        key
        for key, value in transcript_dna.items()
        # Row label 0 is a legitimate match, so the label itself is never
        # tested for truthiness.
        if isinstance(value, str) and value in clinvar_notations
    }

    genome_dna = lovd["Variants_On_Genome"]["VariantOnGenome/DNA/hg38"]
    return [
        value
        for key, value in genome_dna.items()
        # Skip rows without an hg38 notation: empty strings and NaN.
        if key in matching_keys and isinstance(value, str) and value
    ]
"cf5c45c0f7b9de0f", - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Genes\n" + ] + }, + { + "data": { + "text/plain": [ + " id name chromosome chrom_band imprinting \\\n", + "0 EYS eyes shut homolog (Drosophila) 6 q12 unknown \n", + "\n", + " refseq_genomic refseq_UD reference url_homepage \\\n", + "0 NG_023443.2 UD_132085377375 http://www.LOVD.nl/EYS \n", + "\n", + " url_external ... header \\\n", + "0 ... This database is one o... \n", + "\n", + " header_align footer footer_align created_by created_date edited_by \\\n", + "0 -1 -1 00001 2012-02-13 00:00:00 00006 \n", + "\n", + " edited_date updated_by updated_date \n", + "0 2023-08-30 13:08:19 00000 2024-04-19 20:27:30 \n", + "\n", + "[1 rows x 34 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamechromosomechrom_bandimprintingrefseq_genomicrefseq_UDreferenceurl_homepageurl_external...headerheader_alignfooterfooter_aligncreated_bycreated_dateedited_byedited_dateupdated_byupdated_date
0EYSeyes shut homolog (Drosophila)6q12unknownNG_023443.2UD_132085377375http://www.LOVD.nl/EYS...<font color=\\\"#FF0000\\\">This database is one o...-1-1000012012-02-13 00:00:00000062023-08-30 13:08:19000002024-04-19 20:27:30
\n", + "

1 rows × 34 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transcripts\n" + ] + }, + { + "data": { + "text/plain": [ + " id geneid name id_mutalyzer id_ncbi \\\n", + "0 00007329 EYS transcript variant 1 001 NM_001142800.1 \n", + "\n", + " id_ensembl id_protein_ncbi id_protein_ensembl id_protein_uniprot remarks \\\n", + "0 NP_001136272.1 \n", + "\n", + " position_c_mrna_start position_c_mrna_end position_c_cds_end \\\n", + "0 -538 10051 9435 \n", + "\n", + " position_g_mrna_start position_g_mrna_end created_by created_date \\\n", + "0 66417118 64429876 0000-00-00 00:00:00 \n", + "\n", + " edited_by edited_date \n", + "0 " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgeneidnameid_mutalyzerid_ncbiid_ensemblid_protein_ncbiid_protein_ensemblid_protein_uniprotremarksposition_c_mrna_startposition_c_mrna_endposition_c_cds_endposition_g_mrna_startposition_g_mrna_endcreated_bycreated_dateedited_byedited_date
000007329EYStranscript variant 1001NM_001142800.1NP_001136272.1-53810051943566417118644298760000-00-00 00:00:00
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diseases\n" + ] + }, + { + "data": { + "text/plain": [ + " id symbol \\\n", + "0 00012 PSORS \n", + "1 00058 CORD \n", + "2 00112 RP \n", + "3 00139 ID \n", + "4 00173 SLOS \n", + "5 00198 ? \n", + "6 02156 - \n", + "7 02440 RP25 \n", + "8 04211 RPar \n", + "9 04214 - \n", + "10 04249 macular dystrophy \n", + "11 05086 HL \n", + "12 05415 USH \n", + "13 05468 uveitis \n", + "14 06906 DEE \n", + "\n", + " name inheritance id_omim \\\n", + "0 psoriasis, pustular, generalized (PSORS) \n", + "1 dystrophy, cone-rod (CORD) \n", + "2 retinitis pigmentosa (RP) 268000 \n", + "3 intellectual disability (ID) \n", + "4 Smith-Lemli-Opitz syndrome (SLOS) AR 270400 \n", + "5 unclassified / mixed \n", + "6 retinitis pigmentosa, X-linked, and sinorespir... 300455 \n", + "7 retinitis pigmentosa, type 25 (RP25) AR 602772 \n", + "8 retinitis pigmentosa, autosomal recessive (RPar) \n", + "9 retinal disease \n", + "10 dystrophy, macular \n", + "11 hearing loss (HL) \n", + "12 Usher syndrome (USH) \n", + "13 uveitis \n", + "14 encephalopathy, developmental and epileptic \n", + "\n", + " tissues features remarks created_by created_date edited_by \\\n", + "0 00006 2012-07-06 21:50:32 00006 \n", + "1 00006 2012-09-22 11:31:25 00006 \n", + "2 00001 2013-02-21 17:12:36 00006 \n", + "3 00084 2013-06-04 18:18:07 00006 \n", + "4 00006 2013-08-01 11:16:14 00006 \n", + "5 00006 2013-09-13 14:21:47 00006 \n", + "6 00006 2014-09-25 23:29:40 00006 \n", + "7 00006 2014-09-25 23:29:40 00006 \n", + "8 00006 2015-02-27 18:58:57 \n", + "9 00006 2015-02-27 19:48:07 00001 \n", + "10 00006 2015-05-04 22:10:58 00006 \n", + "11 00006 2015-10-23 11:41:05 00006 \n", + "12 00006 2018-04-02 16:40:44 \n", + "13 00006 2018-08-22 09:47:04 \n", + "14 00006 2022-04-07 09:24:23 \n", + "\n", + " edited_date \n", + "0 2019-08-12 13:38:21 \n", + "1 2020-08-30 09:43:59 \n", + "2 2021-01-18 
09:53:26 \n", + "3 2015-02-09 10:02:49 \n", + "4 2021-12-10 21:51:32 \n", + "5 2016-10-22 17:54:40 \n", + "6 2021-12-10 21:51:32 \n", + "7 2021-12-10 21:51:32 \n", + "8 \n", + "9 2023-03-09 14:26:26 \n", + "10 2024-02-15 21:18:39 \n", + "11 2015-10-23 11:43:00 \n", + "12 \n", + "13 \n", + "14 " + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsymbolnameinheritanceid_omimtissuesfeaturesremarkscreated_bycreated_dateedited_byedited_date
000012PSORSpsoriasis, pustular, generalized (PSORS)000062012-07-06 21:50:32000062019-08-12 13:38:21
100058CORDdystrophy, cone-rod (CORD)000062012-09-22 11:31:25000062020-08-30 09:43:59
200112RPretinitis pigmentosa (RP)268000000012013-02-21 17:12:36000062021-01-18 09:53:26
300139IDintellectual disability (ID)000842013-06-04 18:18:07000062015-02-09 10:02:49
400173SLOSSmith-Lemli-Opitz syndrome (SLOS)AR270400000062013-08-01 11:16:14000062021-12-10 21:51:32
500198?unclassified / mixed000062013-09-13 14:21:47000062016-10-22 17:54:40
602156-retinitis pigmentosa, X-linked, and sinorespir...300455000062014-09-25 23:29:40000062021-12-10 21:51:32
702440RP25retinitis pigmentosa, type 25 (RP25)AR602772000062014-09-25 23:29:40000062021-12-10 21:51:32
804211RParretinitis pigmentosa, autosomal recessive (RPar)000062015-02-27 18:58:57
904214-retinal disease000062015-02-27 19:48:07000012023-03-09 14:26:26
1004249macular dystrophydystrophy, macular000062015-05-04 22:10:58000062024-02-15 21:18:39
1105086HLhearing loss (HL)000062015-10-23 11:41:05000062015-10-23 11:43:00
1205415USHUsher syndrome (USH)000062018-04-02 16:40:44
1305468uveitisuveitis000062018-08-22 09:47:04
1406906DEEencephalopathy, developmental and epileptic000062022-04-07 09:24:23
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Genes_To_Diseases\n" + ] + }, + { + "data": { + "text/plain": [ + " geneid diseaseid\n", + "0 EYS 00112\n", + "1 EYS 02440" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
geneiddiseaseid
0EYS00112
1EYS02440
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Individuals\n" + ] + }, + { + "data": { + "text/plain": [ + " id fatherid motherid panelid panel_size license owned_by \\\n", + "0 00000135 3 00006 \n", + "1 00000210 1 00039 \n", + "2 00001962 1 00025 \n", + "3 00016605 1 00552 \n", + "4 00033096 1 00229 \n", + "... ... ... ... ... ... ... ... \n", + "1445 00447702 1 00006 \n", + "1446 00447707 1 00006 \n", + "1447 00447716 1 00006 \n", + "1448 00447718 1 00006 \n", + "1449 00447720 1 00006 \n", + "\n", + " Individual/Reference Individual/Remarks \\\n", + "0 {PMID:Marrakchi 2011:21848462} 5-generation family, 3 affecteds (M) \n", + "1 {PMID:Abu-Safieh-2013:23105016} \n", + "2 \n", + "3 \n", + "4 {PMID:Neveling 2012:22334370} \n", + "... ... ... \n", + "1445 {PMID:Weisschuh 2024:37734845} patient, no family history \n", + "1446 {PMID:Weisschuh 2024:37734845} patient, no family history \n", + "1447 {PMID:Weisschuh 2024:37734845} patient, no family history \n", + "1448 {PMID:Weisschuh 2024:37734845} patient, no family history \n", + "1449 {PMID:Weisschuh 2024:37734845} patient \n", + "\n", + " Individual/Gender Individual/Consanguinity Individual/Origin/Geographic \\\n", + "0 M yes Tunisia \n", + "1 (Saudi Arabia) \n", + "2 M ? Germany \n", + "3 \n", + "4 M no \n", + "... ... ... ... \n", + "1445 F Germany \n", + "1446 M Germany \n", + "1447 F Germany \n", + "1448 M Germany \n", + "1449 M Germany \n", + "\n", + " Individual/Age_of_death Individual/VIP Individual/Data_av \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 0 \n", + "4 0 \n", + "... ... ... ... \n", + "1445 0 \n", + "1446 0 \n", + "1447 0 \n", + "1448 0 \n", + "1449 0 \n", + "\n", + " Individual/Treatment Individual/Origin/Population \\\n", + "0 \n", + "1 \n", + "2 white \n", + "3 \n", + "4 \n", + "... ... ... 
\n", + "1445 \n", + "1446 \n", + "1447 \n", + "1448 \n", + "1449 \n", + "\n", + " Individual/Individual_ID \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "... ... \n", + "1445 SRP-1105 \n", + "1446 SRP-1167 \n", + "1447 SRP-1249 \n", + "1448 SRP-1274 \n", + "1449 SRP-1299 \n", + "\n", + "[1450 rows x 18 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfatheridmotheridpanelidpanel_sizelicenseowned_byIndividual/ReferenceIndividual/RemarksIndividual/GenderIndividual/ConsanguinityIndividual/Origin/GeographicIndividual/Age_of_deathIndividual/VIPIndividual/Data_avIndividual/TreatmentIndividual/Origin/PopulationIndividual/Individual_ID
000000135300006{PMID:Marrakchi 2011:21848462}5-generation family, 3 affecteds (M)MyesTunisia
100000210100039{PMID:Abu-Safieh-2013:23105016}(Saudi Arabia)
200001962100025M?Germanywhite
3000166051005520
400033096100229{PMID:Neveling 2012:22334370}Mno0
.........................................................
144500447702100006{PMID:Weisschuh 2024:37734845}patient, no family historyFGermany0SRP-1105
144600447707100006{PMID:Weisschuh 2024:37734845}patient, no family historyMGermany0SRP-1167
144700447716100006{PMID:Weisschuh 2024:37734845}patient, no family historyFGermany0SRP-1249
144800447718100006{PMID:Weisschuh 2024:37734845}patient, no family historyMGermany0SRP-1274
144900447720100006{PMID:Weisschuh 2024:37734845}patientMGermany0SRP-1299
\n", + "

1450 rows × 18 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Individuals_To_Diseases\n" + ] + }, + { + "data": { + "text/plain": [ + " individualid diseaseid\n", + "0 00000135 00012\n", + "1 00000210 00058\n", + "2 00001962 00173\n", + "3 00033096 04214\n", + "4 00033109 04214\n", + "... ... ...\n", + "1444 00447702 00198\n", + "1445 00447707 00198\n", + "1446 00447716 00198\n", + "1447 00447718 00198\n", + "1448 00447720 00198\n", + "\n", + "[1449 rows x 2 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
individualiddiseaseid
00000013500012
10000021000058
20000196200173
30003309604214
40003310904214
.........
14440044770200198
14450044770700198
14460044771600198
14470044771800198
14480044772000198
\n", + "

1449 rows × 2 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Phenotypes\n" + ] + }, + { + "data": { + "text/plain": [ + " id diseaseid individualid owned_by \\\n", + "0 0000000008 00012 00000135 00006 \n", + "1 0000000026 00058 00000210 00039 \n", + "2 0000000941 00173 00001962 00025 \n", + "3 0000026525 04214 00033096 00229 \n", + "4 0000026538 04214 00033109 00229 \n", + "... ... ... ... ... \n", + "1266 0000336901 00198 00447702 00006 \n", + "1267 0000336906 00198 00447707 00006 \n", + "1268 0000336915 00198 00447716 00006 \n", + "1269 0000336917 00198 00447718 00006 \n", + "1270 0000336919 00198 00447720 00006 \n", + "\n", + " Phenotype/Inheritance Phenotype/Age Phenotype/Additional \\\n", + "0 Familial, autosomal recessive \n", + "1 Familial, autosomal recessive \n", + "2 Familial 2-3 toe syndactyly \n", + "3 Unknown \n", + "4 Unknown \n", + "... ... ... ... \n", + "1266 Unknown \n", + "1267 Unknown \n", + "1268 Unknown \n", + "1269 Unknown \n", + "1270 Unknown \n", + "\n", + " Phenotype/Biochem_param Phenotype/Age/Onset Phenotype/Age/Diagnosis \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "... ... ... ... \n", + "1266 \n", + "1267 \n", + "1268 \n", + "1269 \n", + "1270 \n", + "\n", + " Phenotype/Severity_score Phenotype/Onset Phenotype/Protein \\\n", + "0 \n", + "1 \n", + "2 5 \n", + "3 \n", + "4 \n", + "... ... ... ... \n", + "1266 \n", + "1267 \n", + "1268 \n", + "1269 \n", + "1270 \n", + "\n", + " Phenotype/Tumor/MSI Phenotype/Enzyme/CPK Phenotype/Heart/Myocardium \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "... ... ... ... \n", + "1266 \n", + "1267 \n", + "1268 \n", + "1269 \n", + "1270 \n", + "\n", + " Phenotype/Lung Phenotype/Diagnosis/Definite \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "... ... ... 
\n", + "1266 \n", + "1267 \n", + "1268 \n", + "1269 \n", + "1270 \n", + "\n", + " Phenotype/Diagnosis/Initial Phenotype/Diagnosis/Criteria \n", + "0 \n", + "1 \n", + "2 \n", + "3 retinitis pigmentosa \n", + "4 retinitis pigmentosa \n", + "... ... ... \n", + "1266 retinitis pigmentosa, simplex \n", + "1267 retinitis pigmentosa, simplex \n", + "1268 retinitis pigmentosa, simplex \n", + "1269 retinitis pigmentosa, simplex \n", + "1270 retinitis pigmentosa, simplex \n", + "\n", + "[1271 rows x 20 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddiseaseidindividualidowned_byPhenotype/InheritancePhenotype/AgePhenotype/AdditionalPhenotype/Biochem_paramPhenotype/Age/OnsetPhenotype/Age/DiagnosisPhenotype/Severity_scorePhenotype/OnsetPhenotype/ProteinPhenotype/Tumor/MSIPhenotype/Enzyme/CPKPhenotype/Heart/MyocardiumPhenotype/LungPhenotype/Diagnosis/DefinitePhenotype/Diagnosis/InitialPhenotype/Diagnosis/Criteria
00000000008000120000013500006Familial, autosomal recessive
10000000026000580000021000039Familial, autosomal recessive
20000000941001730000196200025Familial2-3 toe syndactyly5
30000026525042140003309600229Unknownretinitis pigmentosa
40000026538042140003310900229Unknownretinitis pigmentosa
...............................................................
12660000336901001980044770200006Unknownretinitis pigmentosa, simplex
12670000336906001980044770700006Unknownretinitis pigmentosa, simplex
12680000336915001980044771600006Unknownretinitis pigmentosa, simplex
12690000336917001980044771800006Unknownretinitis pigmentosa, simplex
12700000336919001980044772000006Unknownretinitis pigmentosa, simplex
\n", + "

1271 rows × 20 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings\n" + ] + }, + { + "data": { + "text/plain": [ + " id individualid variants_found owned_by created_by \\\n", + "0 0000000126 00000135 1 00006 00006 \n", + "1 0000000211 00000210 1 00039 00006 \n", + "2 0000001640 00001962 1 00025 00006 \n", + "3 0000016557 00016605 1 00552 00552 \n", + "4 0000033164 00033096 1 00229 00229 \n", + "... ... ... ... ... ... \n", + "1445 0000449279 00447702 1 00006 00006 \n", + "1446 0000449284 00447707 1 00006 00006 \n", + "1447 0000449293 00447716 1 00006 00006 \n", + "1448 0000449295 00447718 1 00006 00006 \n", + "1449 0000449297 00447720 1 00006 00006 \n", + "\n", + " created_date edited_by edited_date Screening/Technique \\\n", + "0 2012-07-07 19:04:19 00006 2012-07-07 19:12:08 RT-PCR;SEQ \n", + "1 2012-09-22 11:36:24 SEQ \n", + "2 2010-03-11 16:36:41 00025 2012-04-13 15:18:00 SEQ \n", + "3 2014-05-23 13:12:43 SEQ-NG-I \n", + "4 2012-02-04 15:20:01 00006 2012-05-18 13:59:33 SEQ;SEQ-NG-S \n", + "... ... ... ... ... \n", + "1445 2024-01-26 10:23:59 SEQ-NG \n", + "1446 2024-01-26 10:23:59 SEQ-NG \n", + "1447 2024-01-26 10:23:59 SEQ-NG \n", + "1448 2024-01-26 10:23:59 SEQ-NG \n", + "1449 2024-01-26 10:23:59 SEQ-NG \n", + "\n", + " Screening/Template Screening/Tissue Screening/Remarks \n", + "0 DNA;RNA \n", + "1 DNA \n", + "2 DNA \n", + "3 DNA \n", + "4 DNA \n", + "... ... ... ... \n", + "1445 DNA WGS \n", + "1446 DNA WGS \n", + "1447 DNA WGS \n", + "1448 DNA WGS \n", + "1449 DNA WGS \n", + "\n", + "[1450 rows x 12 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idindividualidvariants_foundowned_bycreated_bycreated_dateedited_byedited_dateScreening/TechniqueScreening/TemplateScreening/TissueScreening/Remarks
0000000012600000135100006000062012-07-07 19:04:19000062012-07-07 19:12:08RT-PCR;SEQDNA;RNA
1000000021100000210100039000062012-09-22 11:36:24SEQDNA
2000000164000001962100025000062010-03-11 16:36:41000252012-04-13 15:18:00SEQDNA
3000001655700016605100552005522014-05-23 13:12:43SEQ-NG-IDNA
4000003316400033096100229002292012-02-04 15:20:01000062012-05-18 13:59:33SEQ;SEQ-NG-SDNA
.......................................
1445000044927900447702100006000062024-01-26 10:23:59SEQ-NGDNAWGS
1446000044928400447707100006000062024-01-26 10:23:59SEQ-NGDNAWGS
1447000044929300447716100006000062024-01-26 10:23:59SEQ-NGDNAWGS
1448000044929500447718100006000062024-01-26 10:23:59SEQ-NGDNAWGS
1449000044929700447720100006000062024-01-26 10:23:59SEQ-NGDNAWGS
\n", + "

1450 rows × 12 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings_To_Genes\n" + ] + }, + { + "data": { + "text/plain": [ + " screeningid geneid\n", + "0 0000000126 IL36RN\n", + "1 0000000211 MKS1\n", + "2 0000001640 DHCR7\n", + "3 0000033164 AHI1\n", + "4 0000033164 EYS\n", + "... ... ...\n", + "1311 0000437646 EYS\n", + "1312 0000437902 EYS\n", + "1313 0000437922 EYS\n", + "1314 0000443144 EYS\n", + "1315 0000443145 EYS\n", + "\n", + "[1316 rows x 2 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screeningidgeneid
00000000126IL36RN
10000000211MKS1
20000001640DHCR7
30000033164AHI1
40000033164EYS
.........
13110000437646EYS
13120000437902EYS
13130000437922EYS
13140000443144EYS
13150000443145EYS
\n", + "

1316 rows × 2 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants_On_Genome\n" + ] + }, + { + "data": { + "text/plain": [ + " id allele effectid chromosome position_g_start position_g_end \\\n", + "0 0000036426 3 50 6 64498971 64498971 \n", + "1 0000059881 3 55 6 65655758 65655758 \n", + "2 0000059883 1 11 6 65336143 65336143 \n", + "3 0000059884 1 15 6 65300869 65300869 \n", + "4 0000059885 1 11 6 65016998 65016999 \n", + "... ... ... ... ... ... ... \n", + "2536 0000964211 0 30 6 65767634 65767634 \n", + "2537 0000964212 0 30 6 65767643 65767643 \n", + "2538 0000964215 0 50 6 66005927 66005927 \n", + "2539 0000964216 0 50 6 66044874 66044874 \n", + "2540 0000977314 0 90 6 64430943 64430943 \n", + "\n", + " type average_frequency owned_by VariantOnGenome/DBID ... \\\n", + "0 subst 0.000742922 00552 EYS_000007 ... \n", + "1 subst 0.00115297 00229 EYS_000001 ... \n", + "2 subst 0.224189 00229 EYS_000002 ... \n", + "3 subst 0.000837928 00229 EYS_000003 ... \n", + "4 del 0 00229 EYS_000004 ... \n", + "... ... ... ... ... ... \n", + "2536 subst 0.243022 02330 EYS_000248 ... \n", + "2537 del 0 02330 EYS_000926 ... \n", + "2538 subst 0.000112112 02327 EYS_000253 ... \n", + "2539 subst 0.0000818974 02327 EYS_000256 ... \n", + "2540 subst 0.00000659822 01804 EYS_000060 ... \n", + "\n", + " VariantOnGenome/Genetic_origin VariantOnGenome/Segregation \\\n", + "0 Germline \n", + "1 Germline yes \n", + "2 Germline no \n", + "3 Germline \n", + "4 Germline yes \n", + "... ... ... \n", + "2536 CLASSIFICATION record \n", + "2537 CLASSIFICATION record \n", + "2538 CLASSIFICATION record \n", + "2539 CLASSIFICATION record \n", + "2540 CLASSIFICATION record \n", + "\n", + " VariantOnGenome/dbSNP VariantOnGenome/VIP VariantOnGenome/Methylation \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... ... ... 
\n", + "2536 \n", + "2537 \n", + "2538 \n", + "2539 \n", + "2540 \n", + "\n", + " VariantOnGenome/ISCN VariantOnGenome/DNA/hg38 VariantOnGenome/ClinVar \\\n", + "0 g.63789078A>G \n", + "1 g.64945865T>G \n", + "2 g.64626250G>A \n", + "3 g.64590976G>A \n", + "4 g.64307105_64307106del \n", + "... ... ... ... \n", + "2536 \n", + "2537 \n", + "2538 \n", + "2539 \n", + "2540 \n", + "\n", + " VariantOnGenome/ClinicalClassification \\\n", + "0 VUS \n", + "1 VUS \n", + "2 benign \n", + "3 benign \n", + "4 benign \n", + "... ... \n", + "2536 likely benign \n", + "2537 likely benign \n", + "2538 VUS \n", + "2539 VUS \n", + "2540 pathogenic \n", + "\n", + " VariantOnGenome/ClinicalClassification/Method \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "... ... \n", + "2536 \n", + "2537 \n", + "2538 \n", + "2539 \n", + "2540 \n", + "\n", + "[2541 rows x 26 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idalleleeffectidchromosomeposition_g_startposition_g_endtypeaverage_frequencyowned_byVariantOnGenome/DBID...VariantOnGenome/Genetic_originVariantOnGenome/SegregationVariantOnGenome/dbSNPVariantOnGenome/VIPVariantOnGenome/MethylationVariantOnGenome/ISCNVariantOnGenome/DNA/hg38VariantOnGenome/ClinVarVariantOnGenome/ClinicalClassificationVariantOnGenome/ClinicalClassification/Method
0000003642635066449897164498971subst0.00074292200552EYS_000007...Germline0g.63789078A>GVUS
1000005988135566565575865655758subst0.0011529700229EYS_000001...Germlineyes0g.64945865T>GVUS
2000005988311166533614365336143subst0.22418900229EYS_000002...Germlineno0g.64626250G>Abenign
3000005988411566530086965300869subst0.00083792800229EYS_000003...Germline0g.64590976G>Abenign
4000005988511166501699865016999del000229EYS_000004...Germlineyes0g.64307105_64307106delbenign
..................................................................
2536000096421103066576763465767634subst0.24302202330EYS_000248...CLASSIFICATION recordlikely benign
2537000096421203066576764365767643del002330EYS_000926...CLASSIFICATION recordlikely benign
2538000096421505066600592766005927subst0.00011211202327EYS_000253...CLASSIFICATION recordVUS
2539000096421605066604487466044874subst0.000081897402327EYS_000256...CLASSIFICATION recordVUS
2540000097731409066443094364430943subst0.0000065982201804EYS_000060...CLASSIFICATION recordpathogenic
\n", + "

2541 rows × 26 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants_On_Transcripts\n" + ] + }, + { + "data": { + "text/plain": [ + " id transcriptid effectid position_c_start \\\n", + "0 0000036426 00007329 50 7558 \n", + "1 0000059881 00007329 55 2309 \n", + "2 0000059883 00007329 11 3444 \n", + "3 0000059884 00007329 15 4891 \n", + "4 0000059885 00007329 11 6079 \n", + "... ... ... ... ... \n", + "2536 0000964211 00007329 30 2024 \n", + "2537 0000964212 00007329 30 2024 \n", + "2538 0000964215 00007329 50 1852 \n", + "2539 0000964216 00007329 50 1765 \n", + "2540 0000977314 00007329 90 8984 \n", + "\n", + " position_c_start_intron position_c_end position_c_end_intron \\\n", + "0 0 7558 0 \n", + "1 0 2309 0 \n", + "2 -5 3444 -5 \n", + "3 0 4891 0 \n", + "4 -4 6079 -3 \n", + "... ... ... ... \n", + "2536 -14 2024 -14 \n", + "2537 -15 2024 -15 \n", + "2538 0 1852 0 \n", + "2539 0 1765 0 \n", + "2540 0 8984 0 \n", + "\n", + " VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n", + "0 c.7558T>C r.(?) \n", + "1 c.2309A>C r.(?) \n", + "2 c.3444-5C>T r.(?) \n", + "3 c.4891C>T r.(?) \n", + "4 c.6079-4_6079-3del r.(?) \n", + "... ... ... \n", + "2536 c.2024-14C>T r.(=) \n", + "2537 c.2024-15del r.(=) \n", + "2538 c.1852G>A r.(?) \n", + "2539 c.1765A>G r.(?) \n", + "2540 c.8984T>A r.(?) \n", + "\n", + " VariantOnTranscript/Protein VariantOnTranscript/Exon \n", + "0 p.(Phe2520Leu) 38 \n", + "1 p.(Gln770Pro) 15 \n", + "2 p.(=) 22i \n", + "3 p.(Pro1631Ser) 26 \n", + "4 p.(=) 29i \n", + "... ... ... \n", + "2536 p.(=) \n", + "2537 p.(=) \n", + "2538 p.(Gly618Ser) \n", + "2539 p.(Arg589Gly) \n", + "2540 p.(Ile2995Asn) \n", + "\n", + "[2541 rows x 11 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/Exon
0000003642600007329507558075580c.7558T>Cr.(?)p.(Phe2520Leu)38
1000005988100007329552309023090c.2309A>Cr.(?)p.(Gln770Pro)15
2000005988300007329113444-53444-5c.3444-5C>Tr.(?)p.(=)22i
3000005988400007329154891048910c.4891C>Tr.(?)p.(Pro1631Ser)26
4000005988500007329116079-46079-3c.6079-4_6079-3delr.(?)p.(=)29i
....................................
2536000096421100007329302024-142024-14c.2024-14C>Tr.(=)p.(=)
2537000096421200007329302024-152024-15c.2024-15delr.(=)p.(=)
2538000096421500007329501852018520c.1852G>Ar.(?)p.(Gly618Ser)
2539000096421600007329501765017650c.1765A>Gr.(?)p.(Arg589Gly)
2540000097731400007329908984089840c.8984T>Ar.(?)p.(Ile2995Asn)
\n", + "

2541 rows × 11 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings_To_Variants\n" + ] + }, + { + "data": { + "text/plain": [ + " screeningid variantid\n", + "0 0000000126 0000783293\n", + "1 0000000211 0000790459\n", + "2 0000001640 0000235838\n", + "3 0000016557 0000036426\n", + "4 0000033164 0000059884\n", + "... ... ...\n", + "2144 0000449279 0000959046\n", + "2145 0000449284 0000959051\n", + "2146 0000449293 0000959060\n", + "2147 0000449295 0000959474\n", + "2148 0000449297 0000959064\n", + "\n", + "[2149 rows x 2 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screeningidvariantid
000000001260000783293
100000002110000790459
200000016400000235838
300000165570000036426
400000331640000059884
.........
214400004492790000959046
214500004492840000959051
214600004492930000959060
214700004492950000959474
214800004492970000959064
\n", + "

2149 rows × 2 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 4 }, { "cell_type": "code", - "outputs": [], "source": [ "set_lovd_dtypes(data)\n", "for i in data:\n", @@ -52,9 +2718,1032 @@ " display(data[i].info())" ], "metadata": { - "collapsed": false + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-04-29T17:33:35.331899Z", + "start_time": "2024-04-29T17:33:35.174198Z" + } }, "id": "ef07740b2fa63e42", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Genes\n", + "\n", + "RangeIndex: 1 entries, 0 to 0\n", + "Data columns (total 34 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 1 non-null string \n", + " 1 name 1 non-null string \n", + " 2 chromosome 1 non-null Int64 \n", + " 3 chrom_band 1 non-null string \n", + " 4 imprinting 1 non-null string \n", + " 5 refseq_genomic 1 non-null string \n", + " 6 refseq_UD 1 non-null string \n", + " 7 reference 1 non-null string \n", + " 8 url_homepage 1 non-null string \n", + " 9 url_external 1 non-null string \n", + " 10 allow_download 1 non-null bool \n", + " 11 id_hgnc 1 non-null Int64 \n", + " 12 id_entrez 1 non-null Int64 \n", + " 13 id_omim 1 non-null Int64 \n", + " 14 show_hgmd 1 non-null bool \n", + " 15 show_genecards 1 non-null bool \n", + " 16 show_genetests 1 non-null bool \n", + " 17 show_orphanet 1 non-null bool \n", + " 18 note_index 1 non-null string \n", + " 19 note_listing 1 non-null string \n", + " 20 refseq 1 non-null string \n", + " 21 refseq_url 1 non-null string \n", + " 22 disclaimer 1 non-null bool \n", + " 23 disclaimer_text 1 non-null string \n", + " 24 header 1 non-null string \n", + " 25 header_align 1 non-null Int64 \n", + " 26 footer 1 non-null string \n", + " 27 footer_align 1 non-null Int64 \n", + " 28 created_by 1 non-null Int64 \n", + " 29 created_date 1 non-null datetime64[ns]\n", + " 30 edited_by 1 non-null Int64 \n", + " 31 edited_date 1 non-null datetime64[ns]\n", 
+ " 32 updated_by 1 non-null Int64 \n", + " 33 updated_date 1 non-null datetime64[ns]\n", + "dtypes: Int64(9), bool(6), datetime64[ns](3), string(16)\n", + "memory usage: 371.0 bytes\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transcripts\n", + "\n", + "RangeIndex: 1 entries, 0 to 0\n", + "Data columns (total 19 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 1 non-null Int64 \n", + " 1 geneid 1 non-null string \n", + " 2 name 1 non-null string \n", + " 3 id_mutalyzer 1 non-null Int64 \n", + " 4 id_ncbi 1 non-null string \n", + " 5 id_ensembl 1 non-null string \n", + " 6 id_protein_ncbi 1 non-null string \n", + " 7 id_protein_ensembl 1 non-null string \n", + " 8 id_protein_uniprot 1 non-null string \n", + " 9 remarks 1 non-null string \n", + " 10 position_c_mrna_start 1 non-null Int64 \n", + " 11 position_c_mrna_end 1 non-null Int64 \n", + " 12 position_c_cds_end 1 non-null Int64 \n", + " 13 position_g_mrna_start 1 non-null Int64 \n", + " 14 position_g_mrna_end 1 non-null Int64 \n", + " 15 created_by 0 non-null Int64 \n", + " 16 created_date 0 non-null datetime64[ns]\n", + " 17 edited_by 0 non-null Int64 \n", + " 18 edited_date 0 non-null datetime64[ns]\n", + "dtypes: Int64(9), datetime64[ns](2), string(8)\n", + "memory usage: 293.0 bytes\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Diseases\n", + "\n", + "RangeIndex: 15 entries, 0 to 14\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 15 non-null Int64 \n", + " 1 symbol 15 non-null string \n", + " 2 name 15 non-null string \n", + " 3 inheritance 15 non-null string \n", + " 4 id_omim 4 non-null Int64 \n", 
+ " 5 tissues 15 non-null string \n", + " 6 features 15 non-null string \n", + " 7 remarks 15 non-null string \n", + " 8 created_by 15 non-null Int64 \n", + " 9 created_date 15 non-null datetime64[ns]\n", + " 10 edited_by 11 non-null Int64 \n", + " 11 edited_date 11 non-null datetime64[ns]\n", + "dtypes: Int64(4), datetime64[ns](2), string(6)\n", + "memory usage: 1.6 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Genes_To_Diseases\n", + "\n", + "RangeIndex: 2 entries, 0 to 1\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 geneid 2 non-null string\n", + " 1 diseaseid 2 non-null Int64 \n", + "dtypes: Int64(1), string(1)\n", + "memory usage: 166.0 bytes\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Individuals\n", + "\n", + "RangeIndex: 1450 entries, 0 to 1449\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 1450 non-null Int64 \n", + " 1 fatherid 1450 non-null string\n", + " 2 motherid 1450 non-null string\n", + " 3 panelid 6 non-null Int64 \n", + " 4 panel_size 1450 non-null Int64 \n", + " 5 license 1450 non-null string\n", + " 6 owned_by 1450 non-null Int64 \n", + " 7 Individual/Reference 1450 non-null string\n", + " 8 Individual/Remarks 1450 non-null string\n", + " 9 Individual/Gender 1450 non-null string\n", + " 10 Individual/Consanguinity 1450 non-null string\n", + " 11 Individual/Origin/Geographic 1450 non-null string\n", + " 12 Individual/Age_of_death 1450 non-null string\n", + " 13 Individual/VIP 1450 non-null string\n", + " 14 Individual/Data_av 1450 non-null string\n", + " 15 Individual/Treatment 1450 non-null string\n", + 
" 16 Individual/Origin/Population 1450 non-null string\n", + " 17 Individual/Individual_ID 1450 non-null string\n", + "dtypes: Int64(4), string(14)\n", + "memory usage: 209.7 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Individuals_To_Diseases\n", + "\n", + "RangeIndex: 1449 entries, 0 to 1448\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 individualid 1449 non-null Int64\n", + " 1 diseaseid 1449 non-null Int64\n", + "dtypes: Int64(2)\n", + "memory usage: 25.6 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Phenotypes\n", + "\n", + "RangeIndex: 1271 entries, 0 to 1270\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 1271 non-null Int64 \n", + " 1 diseaseid 1271 non-null Int64 \n", + " 2 individualid 1271 non-null Int64 \n", + " 3 owned_by 1271 non-null Int64 \n", + " 4 Phenotype/Inheritance 1271 non-null string\n", + " 5 Phenotype/Age 1271 non-null string\n", + " 6 Phenotype/Additional 1271 non-null string\n", + " 7 Phenotype/Biochem_param 1271 non-null string\n", + " 8 Phenotype/Age/Onset 1271 non-null string\n", + " 9 Phenotype/Age/Diagnosis 1271 non-null string\n", + " 10 Phenotype/Severity_score 1271 non-null string\n", + " 11 Phenotype/Onset 1271 non-null string\n", + " 12 Phenotype/Protein 1271 non-null string\n", + " 13 Phenotype/Tumor/MSI 1271 non-null string\n", + " 14 Phenotype/Enzyme/CPK 1271 non-null string\n", + " 15 Phenotype/Heart/Myocardium 1271 non-null string\n", + " 16 Phenotype/Lung 1271 non-null string\n", + " 17 Phenotype/Diagnosis/Definite 1271 non-null string\n", + " 18 Phenotype/Diagnosis/Initial 1271 
non-null string\n", + " 19 Phenotype/Diagnosis/Criteria 1271 non-null string\n", + "dtypes: Int64(4), string(16)\n", + "memory usage: 203.7 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings\n", + "\n", + "RangeIndex: 1450 entries, 0 to 1449\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 1450 non-null Int64 \n", + " 1 individualid 1450 non-null Int64 \n", + " 2 variants_found 1450 non-null Int64 \n", + " 3 owned_by 1450 non-null Int64 \n", + " 4 created_by 1450 non-null Int64 \n", + " 5 created_date 1450 non-null datetime64[ns]\n", + " 6 edited_by 15 non-null Int64 \n", + " 7 edited_date 15 non-null datetime64[ns]\n", + " 8 Screening/Technique 1450 non-null string \n", + " 9 Screening/Template 1450 non-null string \n", + " 10 Screening/Tissue 1450 non-null string \n", + " 11 Screening/Remarks 1450 non-null string \n", + "dtypes: Int64(6), datetime64[ns](2), string(4)\n", + "memory usage: 144.6 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings_To_Genes\n", + "\n", + "RangeIndex: 1316 entries, 0 to 1315\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 screeningid 1316 non-null Int64 \n", + " 1 geneid 1316 non-null string\n", + "dtypes: Int64(1), string(1)\n", + "memory usage: 22.0 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants_On_Genome\n", + "\n", + "RangeIndex: 2541 entries, 0 to 2540\n", + "Data columns (total 26 columns):\n", + " # Column Non-Null Count Dtype \n", 
+ "--- ------ -------------- ----- \n", + " 0 id 2541 non-null Int64 \n", + " 1 allele 2541 non-null Int64 \n", + " 2 effectid 2541 non-null Int64 \n", + " 3 chromosome 2541 non-null Int64 \n", + " 4 position_g_start 2540 non-null Int64 \n", + " 5 position_g_end 2540 non-null Int64 \n", + " 6 type 2541 non-null string \n", + " 7 average_frequency 2540 non-null float64\n", + " 8 owned_by 2541 non-null Int64 \n", + " 9 VariantOnGenome/DBID 2541 non-null string \n", + " 10 VariantOnGenome/DNA 2541 non-null string \n", + " 11 VariantOnGenome/Frequency 2541 non-null string \n", + " 12 VariantOnGenome/Reference 2541 non-null string \n", + " 13 VariantOnGenome/Restriction_site 2541 non-null string \n", + " 14 VariantOnGenome/Published_as 2541 non-null string \n", + " 15 VariantOnGenome/Remarks 2541 non-null string \n", + " 16 VariantOnGenome/Genetic_origin 2541 non-null string \n", + " 17 VariantOnGenome/Segregation 2541 non-null string \n", + " 18 VariantOnGenome/dbSNP 2541 non-null string \n", + " 19 VariantOnGenome/VIP 2541 non-null string \n", + " 20 VariantOnGenome/Methylation 2541 non-null string \n", + " 21 VariantOnGenome/ISCN 2541 non-null string \n", + " 22 VariantOnGenome/DNA/hg38 2541 non-null string \n", + " 23 VariantOnGenome/ClinVar 2541 non-null string \n", + " 24 VariantOnGenome/ClinicalClassification 2541 non-null string \n", + " 25 VariantOnGenome/ClinicalClassification/Method 2541 non-null string \n", + "dtypes: Int64(7), float64(1), string(18)\n", + "memory usage: 533.6 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants_On_Transcripts\n", + "\n", + "RangeIndex: 2541 entries, 0 to 2540\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 2541 non-null Int64 \n", + " 1 transcriptid 2541 non-null Int64 \n", + " 2 effectid 2541 
non-null Int64 \n", + " 3 position_c_start 2540 non-null Int64 \n", + " 4 position_c_start_intron 2541 non-null Int64 \n", + " 5 position_c_end 2540 non-null Int64 \n", + " 6 position_c_end_intron 2541 non-null Int64 \n", + " 7 VariantOnTranscript/DNA 2541 non-null string\n", + " 8 VariantOnTranscript/RNA 2541 non-null string\n", + " 9 VariantOnTranscript/Protein 2541 non-null string\n", + " 10 VariantOnTranscript/Exon 2541 non-null string\n", + "dtypes: Int64(7), string(4)\n", + "memory usage: 235.9 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings_To_Variants\n", + "\n", + "RangeIndex: 2149 entries, 0 to 2148\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 screeningid 2149 non-null Int64\n", + " 1 variantid 2149 non-null Int64\n", + "dtypes: Int64(2)\n", + "memory usage: 37.9 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-29T18:13:06.847273Z", + "start_time": "2024-04-29T18:12:03.410953Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from data_collection import CLINVAR_PATH\n", + "from data_collection import store_database_for_eys_gene\n", + "store_database_for_eys_gene(\"clinvar\", override=False)\n", + "\n", + "\n", + "clinvar_data = pd.read_csv(CLINVAR_PATH + \"/clinvar_data.txt\", sep='\\t')" + ], + "id": "fca037a0adeb3c1a", + "outputs": [], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-29T18:13:06.874140Z", + "start_time": "2024-04-29T18:13:06.849279Z" + } + }, + "cell_type": "code", + "source": "clinvar_data", + "id": "d29fe8266acc229d", + "outputs": [ + { + "data": { + "text/plain": [ + " Name 
\\\n", + "0 GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3 \n", + "1 GRCh38/hg38 6p11.2-q12(chr6:57466921-68712228)x3 \n", + "2 GRCh38/hg38 6q12(chr6:63719407-64994324)x3 \n", + "3 NM_001370348.2(PHF3):c.*6334T>C \n", + "4 NM_001370348.2(PHF3):c.*6340C>T \n", + "... ... \n", + "4657 NM_001142800.1(EYS):c.(6078+1_6079-1)_(6191+1_... \n", + "4658 NM_001142800.1(EYS):c.(2641+1_2642-1)_(2846+1_... \n", + "4659 NM_001142800.1:c.(2137+1_2138-1)_(2259+1_2260-... \n", + "4660 NM_001142800.1:c.(2023+1_2024-1)_(3443+1_3444-... \n", + "4661 NM_001142800.1(EYS):c.(2259+1_2260-1)_(2381+1_... \n", + "\n", + " Gene(s) Protein change \\\n", + "0 BAG2|BEND6|BMP5|COL21A1|DST|DST-AS1|ERVH-3|EYS... NaN \n", + "1 ADGRB3|ADGRB3-DT|ERVH-3|EYS|FKBP1C|KHDRBS2|LGS... NaN \n", + "2 EYS|LOC113175011|LOC113175012|LOC123744835|LOC... NaN \n", + "3 EYS|PHF3 NaN \n", + "4 EYS|PHF3 NaN \n", + "... ... ... \n", + "4657 EYS NaN \n", + "4658 EYS NaN \n", + "4659 EYS NaN \n", + "4660 EYS NaN \n", + "4661 EYS NaN \n", + "\n", + " Condition(s) Accession GRCh37Chromosome \\\n", + "0 See cases VCV000148002 6 \n", + "1 See cases VCV000148991 6 \n", + "2 See cases VCV000148291 6 \n", + "3 Retinitis pigmentosa VCV000910562 6 \n", + "4 Retinitis pigmentosa VCV000910563 6 \n", + "... ... ... ... \n", + "4657 Retinitis pigmentosa VCV000636135 NaN \n", + "4658 Retinitis pigmentosa VCV000636134 NaN \n", + "4659 Retinitis pigmentosa VCV000636133 NaN \n", + "4660 Retinal dystrophy VCV000636132 NaN \n", + "4661 Retinal dystrophy VCV000375230 NaN \n", + "\n", + " GRCh37Location GRCh38Chromosome GRCh38Location VariationID \\\n", + "0 53796341 - 68859642 6 53931543 - 68149750 148002 \n", + "1 57329882 - 69422120 6 57466921 - 68712228 148991 \n", + "2 64429303 - 65704217 6 63719407 - 64994324 148291 \n", + "3 64429938 6 63720042 910562 \n", + "4 64429944 6 63720048 910563 \n", + "... ... ... ... ... 
\n", + "4657 NaN NaN NaN 636135 \n", + "4658 NaN NaN NaN 636134 \n", + "4659 NaN NaN NaN 636133 \n", + "4660 NaN NaN NaN 636132 \n", + "4661 NaN NaN NaN 375230 \n", + "\n", + " ... Germline classification Germline date last evaluated \\\n", + "0 ... Pathogenic Jan 1, 2010 \n", + "1 ... Pathogenic Jan 1, 2012 \n", + "2 ... Likely benign Jan 1, 2011 \n", + "3 ... Uncertain significance Jan 1, 2018 \n", + "4 ... Uncertain significance Jan 1, 2018 \n", + "... ... ... ... \n", + "4657 ... Pathogenic Jan 1, 2018 \n", + "4658 ... Likely pathogenic Jan 1, 2018 \n", + "4659 ... Likely pathogenic Jan 1, 2018 \n", + "4660 ... Pathogenic Jan 1, 2018 \n", + "4661 ... Likely pathogenic NaN \n", + "\n", + " Germline review status Somatic clinical impact \\\n", + "0 no assertion criteria provided NaN \n", + "1 no assertion criteria provided NaN \n", + "2 no assertion criteria provided NaN \n", + "3 criteria provided, single submitter NaN \n", + "4 criteria provided, single submitter NaN \n", + "... ... ... \n", + "4657 no assertion criteria provided NaN \n", + "4658 no assertion criteria provided NaN \n", + "4659 no assertion criteria provided NaN \n", + "4660 no assertion criteria provided NaN \n", + "4661 no assertion criteria provided NaN \n", + "\n", + " Somatic clinical impact date last evaluated \\\n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... \n", + "4657 NaN \n", + "4658 NaN \n", + "4659 NaN \n", + "4660 NaN \n", + "4661 NaN \n", + "\n", + " Somatic clinical impact review status Oncogenicity classification \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "4657 NaN NaN \n", + "4658 NaN NaN \n", + "4659 NaN NaN \n", + "4660 NaN NaN \n", + "4661 NaN NaN \n", + "\n", + " Oncogenicity date last evaluated Oncogenicity review status Unnamed: 24 \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "... ... ... 
... \n", + "4657 NaN NaN NaN \n", + "4658 NaN NaN NaN \n", + "4659 NaN NaN NaN \n", + "4660 NaN NaN NaN \n", + "4661 NaN NaN NaN \n", + "\n", + "[4662 rows x 25 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameGene(s)Protein changeCondition(s)AccessionGRCh37ChromosomeGRCh37LocationGRCh38ChromosomeGRCh38LocationVariationID...Germline classificationGermline date last evaluatedGermline review statusSomatic clinical impactSomatic clinical impact date last evaluatedSomatic clinical impact review statusOncogenicity classificationOncogenicity date last evaluatedOncogenicity review statusUnnamed: 24
0GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3BAG2|BEND6|BMP5|COL21A1|DST|DST-AS1|ERVH-3|EYS...NaNSee casesVCV000148002653796341 - 68859642653931543 - 68149750148002...PathogenicJan 1, 2010no assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
1GRCh38/hg38 6p11.2-q12(chr6:57466921-68712228)x3ADGRB3|ADGRB3-DT|ERVH-3|EYS|FKBP1C|KHDRBS2|LGS...NaNSee casesVCV000148991657329882 - 69422120657466921 - 68712228148991...PathogenicJan 1, 2012no assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
2GRCh38/hg38 6q12(chr6:63719407-64994324)x3EYS|LOC113175011|LOC113175012|LOC123744835|LOC...NaNSee casesVCV000148291664429303 - 65704217663719407 - 64994324148291...Likely benignJan 1, 2011no assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
3NM_001370348.2(PHF3):c.*6334T>CEYS|PHF3NaNRetinitis pigmentosaVCV000910562664429938663720042910562...Uncertain significanceJan 1, 2018criteria provided, single submitterNaNNaNNaNNaNNaNNaNNaN
4NM_001370348.2(PHF3):c.*6340C>TEYS|PHF3NaNRetinitis pigmentosaVCV000910563664429944663720048910563...Uncertain significanceJan 1, 2018criteria provided, single submitterNaNNaNNaNNaNNaNNaNNaN
..................................................................
4657NM_001142800.1(EYS):c.(6078+1_6079-1)_(6191+1_...EYSNaNRetinitis pigmentosaVCV000636135NaNNaNNaNNaN636135...PathogenicJan 1, 2018no assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
4658NM_001142800.1(EYS):c.(2641+1_2642-1)_(2846+1_...EYSNaNRetinitis pigmentosaVCV000636134NaNNaNNaNNaN636134...Likely pathogenicJan 1, 2018no assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
4659NM_001142800.1:c.(2137+1_2138-1)_(2259+1_2260-...EYSNaNRetinitis pigmentosaVCV000636133NaNNaNNaNNaN636133...Likely pathogenicJan 1, 2018no assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
4660NM_001142800.1:c.(2023+1_2024-1)_(3443+1_3444-...EYSNaNRetinal dystrophyVCV000636132NaNNaNNaNNaN636132...PathogenicJan 1, 2018no assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
4661NM_001142800.1(EYS):c.(2259+1_2260-1)_(2381+1_...EYSNaNRetinal dystrophyVCV000375230NaNNaNNaNNaN375230...Likely pathogenicNaNno assertion criteria providedNaNNaNNaNNaNNaNNaNNaN
\n", + "

4662 rows × 25 columns

\n", + "
" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "#region_EYS_extraction\n", + "import pandas as pd\n", + "import re\n", + "from data_collection import store_database_for_eys_gene\n", + "#store_database_for_eys_gene(\"clinvar\", override=False)\n", + "filtered_data = []\n", + "ends = {'del', 'delins', 'dup', 'ins', 'inv', 'subst'}\n", + "clinvar_data = pd.read_csv(CLINVAR_PATH + \"/clinvar_data.txt\", sep='\\t')\n", + "for item in clinvar_data[\"Name\"]:\n", + " if \"(EYS)\" in item:\n", + " match = re.match(r'^.*\\(EYS\\):(c\\.[A-Za-z0-9_]+>[A-Za-z])(?:\\s*\\(.*\\))?', item)\n", + " if match and not any(end in match.group(1) for end in ends):\n", + " filtered_data.append(match.group(1))\n", + "#region_EYS_extraction_end\n", + "\n", + "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "\n", + "gene_ids = []\n", + "\n", + "for key, value in data[\"Variants_On_Transcripts\"][\"VariantOnTranscript/DNA\"].items():\n", + " if value in filtered_data:\n", + " gene_id = key\n", + " if gene_id:\n", + " gene_ids.append(key)\n", + "\n", + "final_dna = []\n", + "for key, value in data[\"Variants_On_Genome\"][\"VariantOnGenome/DNA/hg38\"].items():\n", + " if key in gene_ids:\n", + " gene = value\n", + " if gene:\n", + " final_dna.append(gene)\n", + " \n", + "print(final_dna)\n", + " " + ], + "id": "579b2416a16a1080", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "The section below is a **complete copy** of the following code (which also appears in the cell above):\n", + "```python\n", + "#region_EYS_extraction\n", + "import pandas as pd\n", + "from data_collection import store_database_for_eys_gene\n", + "#store_database_for_eys_gene(\"clinvar\", override=False)\n", + "filtered_data = []\n", + "ends = {'del', 'delins', 'dup', 'ins', 'inv', 'subst'}\n", + "clinvar_data = pd.read_csv(CLINVAR_PATH + 
\"/clinvar_data.txt\", sep='\\t')\n", + "for item in clinvar_data[\"Name\"]:\n", + " if \"(EYS)\" in item:\n", + " match = re.match(r'^.*\\(EYS\\):(c\\.[A-Za-z0-9_]+>[A-Za-z])(?:\\s*\\(.*\\))?', item)\n", + " if match and not any(end in match.group(1) for end in ends):\n", + " filtered_data.append(match.group(1))\n", + "#region_EYS_extraction_end\n", + "\n", + "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "\n", + "gene_ids = []\n", + "\n", + "for key, value in data[\"Variants_On_Transcripts\"][\"VariantOnTranscript/DNA\"].items():\n", + " if value in filtered_data:\n", + " gene_id = key\n", + " if gene_id:\n", + " gene_ids.append(key)\n", + "\n", + "final_dna = []\n", + "for key, value in data[\"Variants_On_Genome\"][\"VariantOnGenome/DNA/hg38\"].items():\n", + " if key in gene_ids:\n", + " gene = value\n", + " if gene:\n", + " final_dna.append(gene)\n", + " \n", + "print(final_dna)\n", + "```\n", + "although it doesn't work as intended." + ], + "id": "569f36782bae62ef" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "from data_collection.refactoring import lovd_gnomad_merge\n", + "import pandas as pd\n", + "\n", + "#store_database_for_eys_gene(\"lovd\", override=False)\n", + "#store_database_for_eys_gene(\"clinvar\", override=False)\n", + "\n", + "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "clinvar_data = pd.read_csv(CLINVAR_PATH + \"/clinvar_data.txt\", sep='\\t')\n", + "\n", + "filtered_data = lovd_gnomad_merge(lovd_data, clinvar_data)\n", + "\n", + "print(filtered_data)\n" + ], + "id": "10f1d40efdecf9bf", + "outputs": [], "execution_count": null } ],