From 929d1376b0f69c50d1e315c83cb2db1d860db409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kajus=20=C4=8Cerniauskas?= Date: Wed, 31 Jul 2024 17:47:02 +0300 Subject: [PATCH 01/10] implemented a method request gene information by its id, forms it to dataframe. --- api/__init__.py | 4 +- api/data/__init__.py | 60 + api/data/refactoring.py | 86 ++ tests/pipeline.ipynb | 3245 ++++++++++++++------------------------- 4 files changed, 1338 insertions(+), 2057 deletions(-) diff --git a/api/__init__.py b/api/__init__.py index 940f6f3..f54edfc 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -54,5 +54,7 @@ # Functions for refactoring data set_lovd_dtypes, parse_lovd, - from_clinvar_name_to_cdna_position + from_clinvar_name_to_cdna_position, + save_lovd_as_vcf, + request_clinvar_api_data ) diff --git a/api/data/__init__.py b/api/data/__init__.py index e69de29..001190d 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -0,0 +1,60 @@ +""" +Package for data collection purposes provides both collection and refactoring functionality. + +Data from LOVD, ClinVar and GnomAd databases can be downloaded using this package. GnomAd and +ClinVar are limited with EYS gene, but it is possible to download data for any gene in LOVD. + +All necessary functionality can be imported directly from data without +specifying the module. + +data collection pipeline example is established for project's specific usage. +""" + +# CONSTANTS IMPORT +from .constants import ( + # URLs for LOVD database + LOVD_URL, LOVD_URL_EYS, LOVD_FILE_URL, LOVD_FILE_URL_EYS, + + # URLs for gnomAD database + GNOMAD_URL, GNOMAD_URL_EYS, GNOMAD_FILE_URL_EYS, + + # URLs for ClinVar database + CLINVAR_URL, CLINVAR_URL_EYS, CLINVAR_FILE_URL_EYS, + + # Paths for data storage + DATA_PATH, LOVD_PATH, GNOMAD_PATH, CLINVAR_PATH, + + # Data types for tables + LOVD_TABLES_DATA_TYPES, + + # Paths for database downloads + DATABASES_DOWNLOAD_PATHS +) + +# DATA COLLECTION IMPORT +from .collection import ( + # Custom exceptions + BadResponseException, + DownloadError, + + # Custom utility functions + get_file_from_url, + + # Functions for downloading databases + download_lovd_database_for_eys_gene, + download_genes_lovd, + download_database_for_eys_gene, + + # Functions for storing databases + store_database_for_eys_gene +) + +# DATA REFACTORING IMPORT +from .refactoring import ( + # Functions for refactoring data + set_lovd_dtypes, + parse_lovd, + from_clinvar_name_to_cdna_position, + save_lovd_as_vcf, + request_clinvar_api_data, +) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index efcf1f1..f7142a3 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -3,11 +3,15 @@ import os import logging +import pandas +import requests + import pandas as pd from pandas import DataFrame from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH + def set_lovd_dtypes(df_dict): """ Convert data from LOVD format table to desired data format based on specified data types. @@ -154,3 +158,85 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\t".join(record)) f.write("\n") + + +def request_clinvar_api_data(gene_id: str): + """ + Requests ClinVar API for data about variant with given id.\n + Converts it to pandas dataframe. 
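+    The summary is requested from the NCBI E-utilities esummary endpoint
+    (db=clinvar) and the nested JSON response is flattened with
+    pandas.json_normalize before being returned.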
+ + :param str gene_id: id of variant (may be multiple) + :returns: dataframe from ClinVar API + :rtype: dataframe + """ + + path = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json" + + request = requests.get(path) + + if request.status_code != 200: + raise ValueError(f"Request failed with status code {request.status_code}") + + data = request.json() + + # Extract the 'result' part of the JSON + results = data['result'] + + # Extract the 'uids' part of the JSON + flattened_data = [] + + for uid in results['uids']: + entry = results[uid] + + # Using pd.json_normalize to flatten the JSON data + flattened_entry = pd.json_normalize(entry, sep='_') + + flattened_variation_set = pd.json_normalize(flattened_entry['variation_set'][0], sep='_') + flattened_variation_xrefs = pd.json_normalize(flattened_variation_set['variation_xrefs'][0], sep='_') + flattened_variation_loc0 = pd.json_normalize(flattened_variation_set['variation_loc'][0][0], + sep='_') # 1/2 frames + flattened_variation_loc0 = flattened_variation_loc0.add_prefix('0_') + flattened_variation_loc1 = pd.json_normalize(flattened_variation_set['variation_loc'][0][1], + sep='_') # 2/2 frames + flattened_variation_loc1 = flattened_variation_loc1.add_prefix('1_') + flattened_allele_freq_set = pd.json_normalize(flattened_variation_set['allele_freq_set'][0], sep='_') + + flattened_genes0 = pd.json_normalize(flattened_entry['genes'][0][0], sep='_') # 1/2 frames + flattened_genes0 = flattened_genes0.add_prefix('0_') + flattened_genes1 = pd.json_normalize(flattened_entry['genes'][0][1], sep='_') # 2/2 frames + flattened_genes1 = flattened_genes1.add_prefix('1_') + + flattened_germline_classification_trait_set = pd.json_normalize( + flattened_entry['germline_classification_trait_set'][0], sep='_') + flattened_trait_xrefs = pd.json_normalize(flattened_germline_classification_trait_set['trait_xrefs'][0], + sep='_') + + # dropping extracted nests + flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set'], + axis=1) + flattened_variation_set = flattened_variation_set.drop( + columns=['variation_xrefs', 'variation_loc', 'allele_freq_set'], axis=1) + flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.drop( + columns=['trait_xrefs'], axis=1) + + # adding extracted nests to the frames + flattened_germline_classification_trait_set = pd.concat( + [flattened_germline_classification_trait_set, flattened_trait_xrefs], axis=1) + + flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_xrefs], axis=1) + flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc0], axis=1) # might break + flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc1], axis=1) # might break + flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1) + + flattened_entry = pd.concat([flattened_entry, flattened_variation_set], axis=1) + flattened_entry = pd.concat([flattened_entry, flattened_genes0], axis=1) + flattened_entry = pd.concat([flattened_entry, flattened_genes1], axis=1) + flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1) + + # Append the flattened entry to the list + flattened_data.append(flattened_entry) + + # Concatenate all flattened entries into a single DataFrame + df = pd.concat(flattened_data, ignore_index=True) + + return df diff --git a/tests/pipeline.ipynb 
b/tests/pipeline.ipynb index 67814a7..210a5fe 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -2,44 +2,47 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, "id": "initial_id", "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:38:18.029744Z", - "start_time": "2024-05-13T15:38:17.807980Z" - }, "collapsed": true, "jupyter": { "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-07-31T14:38:44.492053Z", + "start_time": "2024-07-31T14:38:44.487236Z" } }, - "outputs": [], "source": [ "import pandas as pd\n", "\n", "from api.data import (store_database_for_eys_gene,\n", " parse_lovd,\n", " LOVD_PATH,\n", - " set_lovd_dtypes)\n", + " set_lovd_dtypes,\n", + " )\n", "from api.data import save_lovd_as_vcf\n", "pd.options.display.max_columns = 0" - ] + ], + "outputs": [], + "execution_count": 41 }, { "cell_type": "code", - "execution_count": 2, "id": "f49f7691a27aa7b4", "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:08:35.710520Z", - "start_time": "2024-05-13T15:08:35.708264Z" - }, "collapsed": false, "jupyter": { "outputs_hidden": false + }, + "ExecuteTime": { + "end_time": "2024-07-31T14:38:44.501624Z", + "start_time": "2024-07-31T14:38:44.494128Z" } }, + "source": [ + "store_database_for_eys_gene(\"lovd\", override=False)" + ], "outputs": [ { "name": "stdout", @@ -49,39 +52,41 @@ ] } ], - "source": [ - "store_database_for_eys_gene(\"lovd\", override=False)" - ] + "execution_count": 42 }, { "cell_type": "code", - "execution_count": 2, "id": "cf5c45c0f7b9de0f", "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:38:24.591752Z", - "start_time": "2024-05-13T15:38:19.498594Z" - }, "collapsed": false, "jupyter": { "outputs_hidden": false + }, + "ExecuteTime": { + "end_time": "2024-07-31T14:38:51.917606Z", + "start_time": "2024-07-31T14:38:44.502630Z" } }, - "outputs": [], "source": [ "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" - ] + ], + "outputs": [], + "execution_count": 43 }, { "cell_type": "code", - "execution_count": 8, "id": "8a089e29bfc8c119", "metadata": { "ExecuteTime": { - "end_time": "2024-05-13T15:12:07.510712Z", - "start_time": "2024-05-13T15:12:07.366319Z" + "end_time": "2024-07-31T14:38:52.101863Z", + "start_time": "2024-07-31T14:38:51.918626Z" } }, + "source": [ + "for i in data:\n", + " print(i)\n", + " display(data[i])" + ], "outputs": [ { "name": "stdout", @@ -92,6 +97,12 @@ }, { "data": { + "text/plain": [ + " id name ... updated_by updated_date\n", + "0 EYS eyes shut homolog (Drosophila) ... 00006 2024-05-31 11:42:04\n", + "\n", + "[1 rows x 34 columns]" + ], "text/html": [ "
[HTML rendering of the REVEL scores table, same data as the text/plain output below: rows 28756127-28756129; columns chr, hg19_pos, grch38_pos, ref, alt, aaref, aaalt, REVEL, Ensembl_transcriptid]\n",
-       "
" - ], "text/plain": [ - " chr hg19_pos grch38_pos ref alt aaref aaalt REVEL \\\n", - "28756127 6 65655758 64945865 T A Q L 0.188 \n", - "28756128 6 65655758 64945865 T C Q R 0.111 \n", - "28756129 6 65655758 64945865 T G Q P 0.344 \n", - "\n", - " Ensembl_transcriptid \n", - "28756127 ENST00000503581;ENST00000370621;ENST00000370616 \n", - "28756128 ENST00000503581;ENST00000370621;ENST00000370616 \n", - "28756129 ENST00000503581;ENST00000370621;ENST00000370616 " + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings\n", + "\n", + "RangeIndex: 1465 entries, 0 to 1464\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 1465 non-null Int64 \n", + " 1 individualid 1465 non-null Int64 \n", + " 2 variants_found 1465 non-null Int64 \n", + " 3 owned_by 1465 non-null Int64 \n", + " 4 created_by 1465 non-null Int64 \n", + " 5 created_date 1465 non-null datetime64[ns]\n", + " 6 edited_by 15 non-null Int64 \n", + " 7 edited_date 15 non-null datetime64[ns]\n", + " 8 Screening/Technique 1465 non-null string \n", + " 9 Screening/Template 1465 non-null string \n", + " 10 Screening/Tissue 1465 non-null string \n", + " 11 Screening/Remarks 1465 non-null string \n", + "dtypes: Int64(6), datetime64[ns](2), string(4)\n", + "memory usage: 146.1 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings_To_Genes\n", + "\n", + "RangeIndex: 1316 entries, 0 to 1315\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 screeningid 1316 non-null Int64 \n", + " 1 geneid 1316 non-null string\n", + "dtypes: Int64(1), string(1)\n", + "memory usage: 22.0 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants_On_Genome\n", + "\n", + "RangeIndex: 2560 entries, 0 to 2559\n", + "Data columns (total 26 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 2560 non-null Int64 \n", + " 1 allele 2560 non-null Int64 \n", + " 2 effectid 2560 non-null Int64 \n", + " 3 chromosome 2560 non-null Int64 \n", + " 4 position_g_start 2559 non-null Int64 \n", + " 5 position_g_end 2559 non-null Int64 \n", + " 6 type 2560 non-null string \n", + " 7 average_frequency 2559 non-null float64\n", + " 8 owned_by 2560 non-null Int64 \n", + " 9 VariantOnGenome/DBID 2560 non-null string \n", + " 10 VariantOnGenome/DNA 2560 non-null string \n", + " 11 VariantOnGenome/Frequency 2560 non-null string \n", + " 12 VariantOnGenome/Reference 2560 non-null string \n", + " 13 VariantOnGenome/Restriction_site 2560 non-null string \n", + " 14 VariantOnGenome/Published_as 2560 non-null string \n", + " 15 VariantOnGenome/Remarks 2560 non-null string \n", + " 16 VariantOnGenome/Genetic_origin 2560 non-null string \n", + " 17 VariantOnGenome/Segregation 2560 non-null string \n", + " 18 VariantOnGenome/dbSNP 2560 non-null string \n", + " 19 VariantOnGenome/VIP 2560 non-null string \n", + " 20 VariantOnGenome/Methylation 2560 non-null string \n", + " 21 VariantOnGenome/ISCN 2560 non-null string \n", + " 22 VariantOnGenome/DNA/hg38 2560 non-null string \n", + " 23 VariantOnGenome/ClinVar 2560 non-null 
string \n", + " 24 VariantOnGenome/ClinicalClassification 2560 non-null string \n", + " 25 VariantOnGenome/ClinicalClassification/Method 2560 non-null string \n", + "dtypes: Int64(7), float64(1), string(18)\n", + "memory usage: 537.6 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variants_On_Transcripts\n", + "\n", + "RangeIndex: 2560 entries, 0 to 2559\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 2560 non-null Int64 \n", + " 1 transcriptid 2560 non-null Int64 \n", + " 2 effectid 2560 non-null Int64 \n", + " 3 position_c_start 2559 non-null Int64 \n", + " 4 position_c_start_intron 2560 non-null Int64 \n", + " 5 position_c_end 2559 non-null Int64 \n", + " 6 position_c_end_intron 2560 non-null Int64 \n", + " 7 VariantOnTranscript/DNA 2560 non-null string\n", + " 8 VariantOnTranscript/RNA 2560 non-null string\n", + " 9 VariantOnTranscript/Protein 2560 non-null string\n", + " 10 VariantOnTranscript/Exon 2560 non-null string\n", + "dtypes: Int64(7), string(4)\n", + "memory usage: 237.6 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenings_To_Variants\n", + "\n", + "RangeIndex: 2168 entries, 0 to 2167\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 screeningid 2168 non-null Int64\n", + " 1 variantid 2168 non-null Int64\n", + "dtypes: Int64(2)\n", + "memory usage: 38.2 KB\n" + ] + }, + { + "data": { + "text/plain": [ + "None" ] }, "metadata": {}, "output_type": "display_data" } ], + "execution_count": 45 + }, + { + "cell_type": "code", + "id": "c968af1617be40db", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T14:38:52.601560Z", + "start_time": "2024-07-31T14:38:52.346875Z" + } + }, + "source": [ + "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")" + ], + "outputs": [ + { + "ename": "KeyError", + "evalue": "'Variants_On_Genome'", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", + "File \u001B[1;32m~\\PycharmProjects\\kath\\venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3805\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3804\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 3805\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3806\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n", + "File \u001B[1;32mindex.pyx:167\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n", + "File \u001B[1;32mindex.pyx:196\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n", + "File \u001B[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7081\u001B[0m, in 
\u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n", + "File \u001B[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7089\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n", + "\u001B[1;31mKeyError\u001B[0m: 'Variants_On_Genome'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[1;32mIn[46], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43msave_lovd_as_vcf\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mVariants_On_Genome\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m./lovd.vcf\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n", + "File \u001B[1;32m~\\PycharmProjects\\kath\\api\\data\\refactoring.py:140\u001B[0m, in \u001B[0;36msave_lovd_as_vcf\u001B[1;34m(data, save_to)\u001B[0m\n\u001B[0;32m 134\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21msave_lovd_as_vcf\u001B[39m(data, save_to\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m./lovd.vcf\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n\u001B[0;32m 135\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 136\u001B[0m \u001B[38;5;124;03m Gets hg38 variants from LOVD and saves as VCF file.\u001B[39;00m\n\u001B[0;32m 137\u001B[0m \u001B[38;5;124;03m :param DataFrame data: LOVD DataFrame with data\u001B[39;00m\n\u001B[0;32m 138\u001B[0m \u001B[38;5;124;03m :param str save_to: path where to save VCF file.\u001B[39;00m\n\u001B[0;32m 139\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m--> 140\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mVariants_On_Genome\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\n\u001B[0;32m 141\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mVariantOnGenome/DNA/hg38\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m df\u001B[38;5;241m.\u001B[39mcolumns:\n\u001B[0;32m 142\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mVariantOnGenome/DNA/hg38 is not in the LOVD DataFrame.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", + "File \u001B[1;32m~\\PycharmProjects\\kath\\venv\\Lib\\site-packages\\pandas\\core\\frame.py:4102\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 4100\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m 4101\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[1;32m-> 4102\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 4103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[0;32m 4104\u001B[0m 
indexer \u001B[38;5;241m=\u001B[39m [indexer]\n", + "File \u001B[1;32m~\\PycharmProjects\\kath\\venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3812\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3807\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[0;32m 3808\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[0;32m 3809\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[0;32m 3810\u001B[0m ):\n\u001B[0;32m 3811\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[1;32m-> 3812\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[0;32m 3813\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[0;32m 3814\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[0;32m 3815\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[0;32m 3816\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[0;32m 3817\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n", + "\u001B[1;31mKeyError\u001B[0m: 'Variants_On_Genome'" + ] + } + ], + "execution_count": 46 + }, + { + "cell_type": "code", + "id": "c7ff16903e0c52bd", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T14:38:52.603569Z", + "start_time": "2024-07-31T14:38:52.602570Z" + } + }, + "source": [ + "from subprocess import Popen\n", + "\n", + "\n", + "process = Popen(\"spliceai -I ./lovd.vcf -O ./lovd_output.vcf -R ../tools/spliceai/hg38.fa -A grch38\".split())\n", + "process.wait()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T14:38:52.605570Z", + "start_time": "2024-07-31T14:38:52.604570Z" + } + }, "source": [ - "from tools import get_revel_scores\n", + "from api.tools import get_revel_scores\n", "\n", "chromosome = 6\n", "position = 65655758\n", @@ -3922,7 +3007,55 @@ "results = get_revel_scores(chromosome, position)\n", "\n", "display(results)" - ] + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "from api.data import request_clinvar_api_data\n", + "\n", + "some_id = 1519786\n", + "try:\n", + " frame = request_clinvar_api_data(some_id)\n", + " display(frame)\n", + "except Exception as e:\n", + " print(e)\n" + ], + "id": "576b841842a7ab61", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "import requests\n", + "from api.data import request_clinvar_api_data\n", + "\n", + "gene_id = '1519786'\n", + "# with gene_id = '1519787' error is raised\n", + "\n", + "#TODO inside request_clinvar_api_data\n", + "# 1. dinamically expand genes to dataframe (might be one, might be more)\n", + "# 2. 
dinamically expand variation_loc to dataframe (might be one, might be more)\n", + "frames = request_clinvar_api_data(gene_id)\n", + "\n", + "display(frames)" + ], + "id": "b21c3487476b684f", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "", + "id": "a97fbf604bd8977b", + "outputs": [], + "execution_count": null } ], "metadata": { From ea192ee7589e0069e5fcc4e1d7a34ed122eb7eeb Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Tue, 6 Aug 2024 18:02:36 +0300 Subject: [PATCH 02/10] dynamically adds variation properties --- api/data/refactoring.py | 67 ++++---- tests/pipeline.ipynb | 364 +++++++++++++++++++++++++++++++++++----- 2 files changed, 363 insertions(+), 68 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index f7142a3..c6e82ea 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -193,50 +193,61 @@ def request_clinvar_api_data(gene_id: str): flattened_variation_set = pd.json_normalize(flattened_entry['variation_set'][0], sep='_') flattened_variation_xrefs = pd.json_normalize(flattened_variation_set['variation_xrefs'][0], sep='_') - flattened_variation_loc0 = pd.json_normalize(flattened_variation_set['variation_loc'][0][0], - sep='_') # 1/2 frames - flattened_variation_loc0 = flattened_variation_loc0.add_prefix('0_') - flattened_variation_loc1 = pd.json_normalize(flattened_variation_set['variation_loc'][0][1], - sep='_') # 2/2 frames - flattened_variation_loc1 = flattened_variation_loc1.add_prefix('1_') - flattened_allele_freq_set = pd.json_normalize(flattened_variation_set['allele_freq_set'][0], sep='_') - - flattened_genes0 = pd.json_normalize(flattened_entry['genes'][0][0], sep='_') # 1/2 frames - flattened_genes0 = flattened_genes0.add_prefix('0_') - flattened_genes1 = pd.json_normalize(flattened_entry['genes'][0][1], sep='_') # 2/2 frames - flattened_genes1 = flattened_genes1.add_prefix('1_') - - flattened_germline_classification_trait_set = pd.json_normalize( - flattened_entry['germline_classification_trait_set'][0], sep='_') - flattened_trait_xrefs = pd.json_normalize(flattened_germline_classification_trait_set['trait_xrefs'][0], - sep='_') + + variation_loc_size = len(flattened_variation_set['variation_loc'][0]) + for i in range(variation_loc_size): + flattened_variation_loc = pd.json_normalize(flattened_variation_set['variation_loc'][0][i], sep='_') + flattened_variation_loc = flattened_variation_loc.add_prefix(f'{i}_') + flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc], axis=1) + + allele_freq_set_size = len(flattened_variation_set['allele_freq_set'][0]) + for i in range(allele_freq_set_size): + flattened_allele_freq_set = pd.json_normalize(flattened_variation_set['allele_freq_set'][0][i], sep='_') + flattened_allele_freq_set = flattened_allele_freq_set.add_prefix(f'{i}_') + flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1) + + gene_size = len(flattened_entry['genes'][0]) + for i in range(gene_size): + flattened_genes = pd.json_normalize(flattened_entry['genes'][0][i], sep='_') + flattened_genes = flattened_genes.add_prefix(f'{i}_') + flattened_entry = pd.concat([flattened_entry, flattened_genes], axis=1) + + gremline_classification_trait_set_size = len(flattened_entry['germline_classification_trait_set'][0]) + for i in range(gremline_classification_trait_set_size): + flattened_germline_classification_trait_set = pd.json_normalize( + 
flattened_entry['germline_classification_trait_set'][0][i], sep='_') + flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.add_prefix( + f'{i}_') + + trait_xrefs_size = len(flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0]) + for j in range(trait_xrefs_size): + flattened_trait_xrefs = pd.json_normalize( + flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0][j], sep='_') + flattened_trait_xrefs = flattened_trait_xrefs.add_prefix(f'{j}_') + + flattened_germline_classification_trait_set = pd.concat( + [flattened_germline_classification_trait_set, flattened_trait_xrefs], axis=1) + + flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.drop( + columns=[f'{i}_trait_xrefs'], axis=1) + flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1) # dropping extracted nests flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set'], axis=1) flattened_variation_set = flattened_variation_set.drop( columns=['variation_xrefs', 'variation_loc', 'allele_freq_set'], axis=1) - flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.drop( - columns=['trait_xrefs'], axis=1) - - # adding extracted nests to the frames - flattened_germline_classification_trait_set = pd.concat( - [flattened_germline_classification_trait_set, flattened_trait_xrefs], axis=1) flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_xrefs], axis=1) - flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc0], axis=1) # might break - flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc1], axis=1) # might break flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1) flattened_entry = pd.concat([flattened_entry, flattened_variation_set], axis=1) - flattened_entry = pd.concat([flattened_entry, flattened_genes0], axis=1) - flattened_entry = pd.concat([flattened_entry, flattened_genes1], axis=1) flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1) # Append the flattened entry to the list flattened_data.append(flattened_entry) - # Concatenate all flattened entries into a single DataFrame + # Concatenate all flattened entries into a single DataFrame df = pd.concat(flattened_data, ignore_index=True) return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 210a5fe..12bb5a4 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -9,8 +9,8 @@ "outputs_hidden": true }, "ExecuteTime": { - "end_time": "2024-07-31T14:38:44.492053Z", - "start_time": "2024-07-31T14:38:44.487236Z" + "end_time": "2024-08-06T14:19:28.871130Z", + "start_time": "2024-08-06T14:19:28.169833Z" } }, "source": [ @@ -25,7 +25,7 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": 41 + "execution_count": 3 }, { "cell_type": "code", @@ -36,23 +36,15 @@ "outputs_hidden": false }, "ExecuteTime": { - "end_time": "2024-07-31T14:38:44.501624Z", - "start_time": "2024-07-31T14:38:44.494128Z" + "end_time": "2024-08-06T14:19:29.382744Z", + "start_time": "2024-08-06T14:19:28.873646Z" } }, "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The file at ../data/lovd/lovd_data.txt already exists.\n" - ] - } - ], - "execution_count": 42 + 
"outputs": [], + "execution_count": 4 }, { "cell_type": "code", @@ -63,23 +55,23 @@ "outputs_hidden": false }, "ExecuteTime": { - "end_time": "2024-07-31T14:38:51.917606Z", - "start_time": "2024-07-31T14:38:44.502630Z" + "end_time": "2024-08-06T14:19:35.787015Z", + "start_time": "2024-08-06T14:19:29.383754Z" } }, "source": [ "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" ], "outputs": [], - "execution_count": 43 + "execution_count": 5 }, { "cell_type": "code", "id": "8a089e29bfc8c119", "metadata": { "ExecuteTime": { - "end_time": "2024-07-31T14:38:52.101863Z", - "start_time": "2024-07-31T14:38:51.918626Z" + "end_time": "2024-08-06T14:19:35.968177Z", + "start_time": "2024-08-06T14:19:35.788025Z" } }, "source": [ @@ -2471,7 +2463,7 @@ "output_type": "display_data" } ], - "execution_count": 44 + "execution_count": 6 }, { "cell_type": "code", @@ -2482,8 +2474,8 @@ "outputs_hidden": false }, "ExecuteTime": { - "end_time": "2024-07-31T14:38:52.345798Z", - "start_time": "2024-07-31T14:38:52.102870Z" + "end_time": "2024-08-06T14:19:36.151167Z", + "start_time": "2024-08-06T14:19:35.969184Z" } }, "source": [ @@ -2930,15 +2922,15 @@ "output_type": "display_data" } ], - "execution_count": 45 + "execution_count": 7 }, { "cell_type": "code", "id": "c968af1617be40db", "metadata": { "ExecuteTime": { - "end_time": "2024-07-31T14:38:52.601560Z", - "start_time": "2024-07-31T14:38:52.346875Z" + "end_time": "2024-08-06T14:19:36.920697Z", + "start_time": "2024-08-06T14:19:36.151683Z" } }, "source": [ @@ -2952,7 +2944,7 @@ "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "File \u001B[1;32m~\\PycharmProjects\\kath\\venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3805\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3804\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 3805\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3806\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n", + "File \u001B[1;32m~\\PycharmProjects\\KathChatGPT\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3805\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3804\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 3805\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3806\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n", "File \u001B[1;32mindex.pyx:167\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n", "File \u001B[1;32mindex.pyx:196\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n", "File \u001B[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7081\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n", @@ -2960,23 +2952,23 @@ 
"\u001B[1;31mKeyError\u001B[0m: 'Variants_On_Genome'", "\nThe above exception was the direct cause of the following exception:\n", "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[46], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43msave_lovd_as_vcf\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mVariants_On_Genome\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m./lovd.vcf\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n", + "Cell \u001B[1;32mIn[8], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43msave_lovd_as_vcf\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mVariants_On_Genome\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m./lovd.vcf\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\PycharmProjects\\kath\\api\\data\\refactoring.py:140\u001B[0m, in \u001B[0;36msave_lovd_as_vcf\u001B[1;34m(data, save_to)\u001B[0m\n\u001B[0;32m 134\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21msave_lovd_as_vcf\u001B[39m(data, save_to\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m./lovd.vcf\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n\u001B[0;32m 135\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 136\u001B[0m \u001B[38;5;124;03m Gets hg38 variants from LOVD and saves as VCF file.\u001B[39;00m\n\u001B[0;32m 137\u001B[0m \u001B[38;5;124;03m :param DataFrame data: LOVD DataFrame with data\u001B[39;00m\n\u001B[0;32m 138\u001B[0m \u001B[38;5;124;03m :param str save_to: path where to save VCF file.\u001B[39;00m\n\u001B[0;32m 139\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m--> 140\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mVariants_On_Genome\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\n\u001B[0;32m 141\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mVariantOnGenome/DNA/hg38\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m df\u001B[38;5;241m.\u001B[39mcolumns:\n\u001B[0;32m 142\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mVariantOnGenome/DNA/hg38 is not in the LOVD DataFrame.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", - "File \u001B[1;32m~\\PycharmProjects\\kath\\venv\\Lib\\site-packages\\pandas\\core\\frame.py:4102\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 4100\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m 4101\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[1;32m-> 4102\u001B[0m indexer \u001B[38;5;241m=\u001B[39m 
\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 4103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[0;32m 4104\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n", - "File \u001B[1;32m~\\PycharmProjects\\kath\\venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3812\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3807\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[0;32m 3808\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[0;32m 3809\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[0;32m 3810\u001B[0m ):\n\u001B[0;32m 3811\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[1;32m-> 3812\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[0;32m 3813\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[0;32m 3814\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[0;32m 3815\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[0;32m 3816\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[0;32m 3817\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n", + "File \u001B[1;32m~\\PycharmProjects\\KathChatGPT\\.venv\\Lib\\site-packages\\pandas\\core\\frame.py:4102\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 4100\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m 4101\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[1;32m-> 4102\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 4103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[0;32m 4104\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n", + "File \u001B[1;32m~\\PycharmProjects\\KathChatGPT\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3812\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3807\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[0;32m 3808\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[0;32m 3809\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m 
\u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[0;32m 3810\u001B[0m ):\n\u001B[0;32m 3811\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[1;32m-> 3812\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[0;32m 3813\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[0;32m 3814\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[0;32m 3815\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[0;32m 3816\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[0;32m 3817\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n", "\u001B[1;31mKeyError\u001B[0m: 'Variants_On_Genome'" ] } ], - "execution_count": 46 + "execution_count": 8 }, { "cell_type": "code", "id": "c7ff16903e0c52bd", "metadata": { "ExecuteTime": { - "end_time": "2024-07-31T14:38:52.603569Z", - "start_time": "2024-07-31T14:38:52.602570Z" + "end_time": "2024-08-06T14:19:36.921706Z", + "start_time": "2024-08-06T14:19:36.921706Z" } }, "source": [ @@ -2992,12 +2984,7 @@ { "cell_type": "code", "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T14:38:52.605570Z", - "start_time": "2024-07-31T14:38:52.604570Z" - } - }, + "metadata": {}, "source": [ "from api.tools import get_revel_scores\n", "\n", @@ -3049,11 +3036,308 @@ "outputs": [], "execution_count": null }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-06T14:58:14.898227Z", + "start_time": "2024-08-06T14:58:14.228473Z" + } + }, + "cell_type": "code", + "source": [ + "import requests\n", + "\n", + "gene_id = '1519785'\n", + "\n", + "path = f\"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json\"\n", + "\n", + "request = requests.get(path)\n", + "\n", + "if request.status_code != 200:\n", + " raise ValueError(f\"Request failed with status code {request.status_code}\")\n", + "\n", + "data = request.json()\n", + "\n", + " # Extract the 'result' part of the JSON\n", + "results = data['result']\n", + "\n", + "# Extract the 'uids' part of the JSON\n", + "flattened_data = []\n", + "\n", + "for uid in results['uids']:\n", + " entry = results[uid]\n", + "\n", + " # Using pd.json_normalize to flatten the JSON data\n", + " flattened_entry = pd.json_normalize(entry, sep='_')\n", + "\n", + " flattened_variation_set = pd.json_normalize(flattened_entry['variation_set'][0], sep='_')\n", + " flattened_variation_xrefs = pd.json_normalize(flattened_variation_set['variation_xrefs'][0], sep='_')\n", + " \n", + " variation_loc_size = len(flattened_variation_set['variation_loc'][0]) \n", + " for i in range(variation_loc_size):\n", + " flattened_variation_loc = pd.json_normalize(flattened_variation_set['variation_loc'][0][i], sep='_')\n", + " flattened_variation_loc = flattened_variation_loc.add_prefix(f'{i}_')\n", + " flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc], axis=1)\n", + " \n", + " allele_freq_set_size = len(flattened_variation_set['allele_freq_set'][0])\n", + " for i in range(allele_freq_set_size):\n", + " flattened_allele_freq_set = 
pd.json_normalize(flattened_variation_set['allele_freq_set'][0][i], sep='_')\n", + " flattened_allele_freq_set = flattened_allele_freq_set.add_prefix(f'{i}_')\n", + " flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1)\n", + " \n", + " gene_size = len(flattened_entry['genes'][0])\n", + " for i in range(gene_size):\n", + " flattened_genes = pd.json_normalize(flattened_entry['genes'][0][i], sep='_')\n", + " flattened_genes = flattened_genes.add_prefix(f'{i}_')\n", + " flattened_entry = pd.concat([flattened_entry, flattened_genes], axis=1)\n", + "\n", + " gremline_classification_trait_set_size = len(flattened_entry['germline_classification_trait_set'][0])\n", + " for i in range(gremline_classification_trait_set_size):\n", + " flattened_germline_classification_trait_set = pd.json_normalize(flattened_entry['germline_classification_trait_set'][0][i], sep='_')\n", + " flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.add_prefix(f'{i}_')\n", + " \n", + " trait_xrefs_size = len(flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0])\n", + " for j in range(trait_xrefs_size):\n", + " flattened_trait_xrefs = pd.json_normalize(flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0][j], sep='_')\n", + " flattened_trait_xrefs = flattened_trait_xrefs.add_prefix(f'{j}_')\n", + "\n", + " flattened_germline_classification_trait_set = pd.concat([flattened_germline_classification_trait_set, flattened_trait_xrefs], axis=1)\n", + " \n", + " flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.drop(columns=[f'{i}_trait_xrefs'], axis=1)\n", + " flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1)\n", + " \n", + " # dropping extracted nests\n", + " flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set'],axis=1)\n", + " flattened_variation_set = flattened_variation_set.drop(columns=['variation_xrefs', 'variation_loc', 'allele_freq_set'], axis=1)\n", + "\n", + " flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_xrefs], axis=1)\n", + " flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1)\n", + "\n", + " flattened_entry = pd.concat([flattened_entry, flattened_variation_set], axis=1)\n", + " flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1)\n", + "\n", + " # Append the flattened entry to the list\n", + " flattened_data.append(flattened_entry)\n", + "\n", + " # Concatenate all flattened entries into a single DataFrame\n", + "df = pd.concat(flattened_data, ignore_index=True)\n", + "\n", + "display(df)\n" + ], + "id": "7e9ca83a40035c14", + "outputs": [ + { + "data": { + "text/plain": [ + " uid obj_type ... 3_db_source 3_db_id\n", + "0 1519785 single nucleotide variant ... OMIM 614702\n", + "\n", + "[1 rows x 110 columns]" + ], + "text/html": [ + "
[HTML rendering of the flattened ClinVar record for uid 1519785, NM_012123.4(MTO1):c.1465+4A>T, same data as the text/plain output above: 1 row x 110 columns]\n",
+       "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 35 + }, { "metadata": {}, "cell_type": "code", "source": "", - "id": "a97fbf604bd8977b", + "id": "7df7d0cb3b874157", "outputs": [], "execution_count": null } From 6dba6fe6b5398256812ec0c8ca11e297e378a488 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:03:04 +0300 Subject: [PATCH 03/10] Implemented a function which extracts variation ids from gene name, those ids are formatted to dataframe --- api/__init__.py | 3 +- api/data/__init__.py | 1 + api/data/refactoring.py | 148 +- tests/pipeline.ipynb | 4293 +++++++++++---------------------------- 4 files changed, 1262 insertions(+), 3183 deletions(-) diff --git a/api/__init__.py b/api/__init__.py index f54edfc..fb618dd 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -56,5 +56,6 @@ parse_lovd, from_clinvar_name_to_cdna_position, save_lovd_as_vcf, - request_clinvar_api_data + request_clinvar_api_data, + get_variant_ids_from_clinvar_name_api, ) diff --git a/api/data/__init__.py b/api/data/__init__.py index 001190d..34caf42 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -57,4 +57,5 @@ from_clinvar_name_to_cdna_position, save_lovd_as_vcf, request_clinvar_api_data, + get_variant_ids_from_clinvar_name_api, ) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index c6e82ea..79cbe78 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -160,94 +160,116 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") +def get_variant_ids_from_clinvar_name_api(name: str, count: int = 100): + """ + Extracts variant ids from ClinVar `name` variable. /n + key of dictionary is the size of the list of ids. + + :param str name: name of variant + :param int count: number of ids to extract + :returns: ids of variants + :rtype: str + """ + + result = {} + + separator = "," + clinvar_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=clinvar&term={name}&retmode=json&retmax={count}" + + request = requests.get(clinvar_url) + + if request.status_code != 200: + raise ValueError(f"Request failed with status code {request.status_code}") + + data = request.json() + + ids = data['esearchresult']['idlist'] + + result['idlist'] = ids + result['count'] = data['esearchresult']['count'] + + return result + + def request_clinvar_api_data(gene_id: str): """ - Requests ClinVar API for data about variant with given id.\n + Requests ClinVar API for data about variant with given id. Converts it to pandas dataframe. 
:param str gene_id: id of variant (may be multiple) :returns: dataframe from ClinVar API :rtype: dataframe """ + clinvar_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json" - path = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json" - - request = requests.get(path) + request = requests.get(clinvar_url) if request.status_code != 200: raise ValueError(f"Request failed with status code {request.status_code}") data = request.json() - # Extract the 'result' part of the JSON results = data['result'] - # Extract the 'uids' part of the JSON flattened_data = [] for uid in results['uids']: entry = results[uid] - # Using pd.json_normalize to flatten the JSON data flattened_entry = pd.json_normalize(entry, sep='_') - flattened_variation_set = pd.json_normalize(flattened_entry['variation_set'][0], sep='_') - flattened_variation_xrefs = pd.json_normalize(flattened_variation_set['variation_xrefs'][0], sep='_') - - variation_loc_size = len(flattened_variation_set['variation_loc'][0]) - for i in range(variation_loc_size): - flattened_variation_loc = pd.json_normalize(flattened_variation_set['variation_loc'][0][i], sep='_') - flattened_variation_loc = flattened_variation_loc.add_prefix(f'{i}_') - flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc], axis=1) - - allele_freq_set_size = len(flattened_variation_set['allele_freq_set'][0]) - for i in range(allele_freq_set_size): - flattened_allele_freq_set = pd.json_normalize(flattened_variation_set['allele_freq_set'][0][i], sep='_') - flattened_allele_freq_set = flattened_allele_freq_set.add_prefix(f'{i}_') - flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1) - - gene_size = len(flattened_entry['genes'][0]) - for i in range(gene_size): - flattened_genes = pd.json_normalize(flattened_entry['genes'][0][i], sep='_') - flattened_genes = flattened_genes.add_prefix(f'{i}_') - flattened_entry = pd.concat([flattened_entry, flattened_genes], axis=1) - - gremline_classification_trait_set_size = len(flattened_entry['germline_classification_trait_set'][0]) - for i in range(gremline_classification_trait_set_size): - flattened_germline_classification_trait_set = pd.json_normalize( - flattened_entry['germline_classification_trait_set'][0][i], sep='_') - flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.add_prefix( - f'{i}_') - - trait_xrefs_size = len(flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0]) - for j in range(trait_xrefs_size): - flattened_trait_xrefs = pd.json_normalize( - flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0][j], sep='_') - flattened_trait_xrefs = flattened_trait_xrefs.add_prefix(f'{j}_') - - flattened_germline_classification_trait_set = pd.concat( - [flattened_germline_classification_trait_set, flattened_trait_xrefs], axis=1) - - flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.drop( - columns=[f'{i}_trait_xrefs'], axis=1) - flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1) - - # dropping extracted nests - flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set'], - axis=1) - flattened_variation_set = flattened_variation_set.drop( - columns=['variation_xrefs', 'variation_loc', 'allele_freq_set'], axis=1) - - flattened_variation_set = 
pd.concat([flattened_variation_set, flattened_variation_xrefs], axis=1) - flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1) - - flattened_entry = pd.concat([flattened_entry, flattened_variation_set], axis=1) - flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1) - - # Append the flattened entry to the list + variation_set = flattened_entry.at[0, 'variation_set'] + for idx, var_set in enumerate(variation_set): + flat_var_set = pd.json_normalize(var_set, sep='_') + flat_var_set = flat_var_set.add_prefix(f'variation_set_{idx}_') + + variation_loc = var_set.get('variation_loc', []) + for loc_idx, loc in enumerate(variation_loc): + flat_loc = pd.json_normalize(loc, sep='_') + flat_loc = flat_loc.add_prefix(f'variation_set_{idx}_loc_{loc_idx}_') + flat_var_set = flat_var_set.join(flat_loc, rsuffix=f'_{idx}_{loc_idx}_vl') + + var_xrefs = var_set.get('variation_xrefs', []) + for var_xrefs_idx, var_xref in enumerate(var_xrefs): + flat_var_xrefs = pd.json_normalize(var_xref, sep='_') + flat_var_xrefs = flat_var_xrefs.add_prefix(f'variation_set_{idx}_var_xrefs_{var_xrefs_idx}_') + flat_var_set = flat_var_set.join(flat_var_xrefs, rsuffix=f'_{idx}_{var_xrefs_idx}_vx') + + allele_freq = var_set.get('allele_freq_set', []) + for allele_freq_idx, allele in enumerate(allele_freq): + flat_allele = pd.json_normalize(allele, sep='_') + flat_allele = flat_allele.add_prefix(f'variation_set_{idx}_allele_freq_{allele_freq_idx}_') + flat_var_set = flat_var_set.join(flat_allele, rsuffix=f'_{idx}_{allele_freq_idx}_af') + + flat_var_set = flat_var_set.drop( + columns=[f'variation_set_{idx}_variation_loc', f'variation_set_{idx}_variation_xrefs', + f'variation_set_{idx}_allele_freq_set']) + flattened_entry = flattened_entry.join(flat_var_set, rsuffix=f'_{idx}_vs') + + genes = flattened_entry.at[0, 'genes'] + for idx, gene in enumerate(genes): + flat_genes = pd.json_normalize(gene, sep='_') + flat_genes = flat_genes.add_prefix(f'gene_{idx}_') + flattened_entry = flattened_entry.join(flat_genes, rsuffix=f'_{idx}_g') + + germline_classification_trait_set = flattened_entry.at[0, 'germline_classification_trait_set'] + for idx, germline_set in enumerate(germline_classification_trait_set): + flat_germline_set = pd.json_normalize(germline_set, sep='_') + flat_germline_set = flat_germline_set.add_prefix(f'germline_set_{idx}_') + + trait_xrefs = flat_germline_set.at[0, f'germline_set_{idx}_trait_xrefs'] + for jdx, trait_xref in enumerate(trait_xrefs): + flat_trait_xrefs = pd.json_normalize(trait_xref, sep='_') + flat_trait_xrefs = flat_trait_xrefs.add_prefix(f'trait_xref_{jdx}_') + flat_germline_set = flat_germline_set.join(flat_trait_xrefs, rsuffix=f'_{idx}_{jdx}_tx') + + flat_germline_set = flat_germline_set.drop(columns=[f'germline_set_{idx}_trait_xrefs']) + flattened_entry = flattened_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls') + + flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set']) + flattened_data.append(flattened_entry) - # Concatenate all flattened entries into a single DataFrame df = pd.concat(flattened_data, ignore_index=True) - return df + return df \ No newline at end of file diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 12bb5a4..0fb9cfb 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -9,23 +9,27 @@ "outputs_hidden": true }, "ExecuteTime": { - "end_time": "2024-08-06T14:19:28.871130Z", - "start_time": 
"2024-08-06T14:19:28.169833Z" + "end_time": "2024-08-07T12:32:22.837138Z", + "start_time": "2024-08-07T12:32:21.979038Z" } }, "source": [ "import pandas as pd\n", + "import requests\n", "\n", "from api.data import (store_database_for_eys_gene,\n", " parse_lovd,\n", " LOVD_PATH,\n", " set_lovd_dtypes,\n", + " request_clinvar_api_data,\n", + " get_variant_ids_from_clinvar_name_api,\n", " )\n", "from api.data import save_lovd_as_vcf\n", + "\n", "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": 3 + "execution_count": 1 }, { "cell_type": "code", @@ -34,17 +38,13 @@ "collapsed": false, "jupyter": { "outputs_hidden": false - }, - "ExecuteTime": { - "end_time": "2024-08-06T14:19:29.382744Z", - "start_time": "2024-08-06T14:19:28.873646Z" } }, "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], "outputs": [], - "execution_count": 4 + "execution_count": null }, { "cell_type": "code", @@ -53,163 +53,309 @@ "collapsed": false, "jupyter": { "outputs_hidden": false - }, - "ExecuteTime": { - "end_time": "2024-08-06T14:19:35.787015Z", - "start_time": "2024-08-06T14:19:29.383754Z" } }, "source": [ "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" ], "outputs": [], - "execution_count": 5 + "execution_count": null }, { "cell_type": "code", "id": "8a089e29bfc8c119", + "metadata": {}, + "source": [ + "for i in data:\n", + " print(i)\n", + " display(data[i])" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "ef07740b2fa63e42", "metadata": { - "ExecuteTime": { - "end_time": "2024-08-06T14:19:35.968177Z", - "start_time": "2024-08-06T14:19:35.788025Z" + "collapsed": false, + "jupyter": { + "outputs_hidden": false } }, "source": [ + "set_lovd_dtypes(data)\n", "for i in data:\n", " print(i)\n", - " display(data[i])" + " display(data[i].info())" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "c968af1617be40db", + "metadata": {}, + "source": [ + "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "c7ff16903e0c52bd", + "metadata": {}, + "source": [ + "from subprocess import Popen\n", + "\n", + "process = Popen(\"spliceai -I ./lovd.vcf -O ./lovd_output.vcf -R ../tools/spliceai/hg38.fa -A grch38\".split())\n", + "process.wait()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", + "metadata": {}, + "source": [ + "from api.tools import get_revel_scores\n", + "\n", + "chromosome = 6\n", + "position = 65655758\n", + "\n", + "results = get_revel_scores(chromosome, position)\n", + "\n", + "display(results)" + ], + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "import requests\n", + "from api.data import request_clinvar_api_data\n", + "\n", + "gene_id = '1519785,1519786'\n", + "\n", + "frames = request_clinvar_api_data(gene_id)\n", + "\n", + "display(frames)" + ], + "id": "b21c3487476b684f", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "gene_id = '1519785'\n", + "\n", + "\n", + "clinvar_url = f\"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json\"\n", + "\n", + "request = requests.get(clinvar_url)\n", + "\n", + "if request.status_code != 200:\n", + " raise ValueError(f\"Request failed with status code {request.status_code}\")\n", + "\n", + "data = 
request.json()\n", + "\n", + "# Extract the 'result' part of the JSON\n", + "results = data['result']\n", + "\n", + "# Extract the 'uids' part of the JSON\n", + "flattened_data = []\n", + "\n", + "for uid in results['uids']:\n", + " entry = results[uid]\n", + "\n", + " # Using pd.json_normalize to flatten the JSON data\n", + " flattened_entry = pd.json_normalize(entry, sep='_')\n", + "\n", + " # Process variation_set\n", + " variation_set = flattened_entry.at[0, 'variation_set']\n", + " for idx, var_set in enumerate(variation_set):\n", + " flat_var_set = pd.json_normalize(var_set, sep='_')\n", + " flat_var_set = flat_var_set.add_prefix(f'variation_set_{idx}_')\n", + "\n", + " # Process variation_loc within variation_set\n", + " variation_loc = var_set.get('variation_loc', [])\n", + " for loc_idx, loc in enumerate(variation_loc):\n", + " flat_loc = pd.json_normalize(loc, sep='_')\n", + " flat_loc = flat_loc.add_prefix(f'variation_set_{idx}_loc_{loc_idx}_')\n", + " flat_var_set = flat_var_set.join(flat_loc, rsuffix=f'_{idx}_{loc_idx}_vl')\n", + " \n", + " var_xrefs = var_set.get('variation_xrefs', [])\n", + " for var_xrefs_idx, var_xref in enumerate(var_xrefs):\n", + " flat_var_xrefs = pd.json_normalize(var_xref, sep='_')\n", + " flat_var_xrefs = flat_var_xrefs.add_prefix(f'variation_set_{idx}_var_xrefs_{var_xrefs_idx}_')\n", + " flat_var_set = flat_var_set.join(flat_var_xrefs, rsuffix=f'_{idx}_{var_xrefs_idx}_vx')\n", + "\n", + "\n", + " allele_freq = var_set.get('allele_freq_set', [])\n", + " for allele_freq_idx, allele in enumerate(allele_freq):\n", + " flat_allele = pd.json_normalize(allele, sep='_')\n", + " flat_allele = flat_allele.add_prefix(f'variation_set_{idx}_allele_freq_{allele_freq_idx}_')\n", + " flat_var_set = flat_var_set.join(flat_allele, rsuffix=f'_{idx}_{allele_freq_idx}_af')\n", + " \n", + " # drop original nested lists columns\n", + " flat_var_set = flat_var_set.drop(columns=[f'variation_set_{idx}_variation_loc', f'variation_set_{idx}_variation_xrefs', f'variation_set_{idx}_allele_freq_set'])\n", + " \n", + " flattened_entry = flattened_entry.join(flat_var_set, rsuffix=f'_{idx}_vs')\n", + "\n", + " # Process genes\n", + " genes = flattened_entry.at[0, 'genes']\n", + " for idx, gene in enumerate(genes):\n", + " flat_genes = pd.json_normalize(gene, sep='_')\n", + " flat_genes = flat_genes.add_prefix(f'gene_{idx}_')\n", + " flattened_entry = flattened_entry.join(flat_genes, rsuffix=f'_{idx}_g')\n", + " # Process germline_classification_trait_set\n", + " germline_classification_trait_set = flattened_entry.at[0, 'germline_classification_trait_set']\n", + " for idx, germline_set in enumerate(germline_classification_trait_set):\n", + " flat_germline_set = pd.json_normalize(germline_set, sep='_')\n", + " flat_germline_set = flat_germline_set.add_prefix(f'germline_set_{idx}_')\n", + "\n", + " trait_xrefs = flat_germline_set.at[0, f'germline_set_{idx}_trait_xrefs']\n", + " for jdx, trait_xref in enumerate(trait_xrefs):\n", + " flat_trait_xrefs = pd.json_normalize(trait_xref, sep='_')\n", + " flat_trait_xrefs = flat_trait_xrefs.add_prefix(f'trait_xref_{jdx}_')\n", + " flat_germline_set = flat_germline_set.join(flat_trait_xrefs, rsuffix=f'_{idx}_{jdx}_tx')\n", + "\n", + " flat_germline_set = flat_germline_set.drop(columns=[f'germline_set_{idx}_trait_xrefs'])\n", + " flattened_entry = flattened_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls')\n", + "\n", + " # Dropping original nested lists columns\n", + " flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 
'germline_classification_trait_set'])\n", + "\n", + " # Append the flattened entry to the list\n", + " flattened_data.append(flattened_entry)\n", + "\n", + "# Concatenate all flattened entries into a single DataFrame\n", + "df = pd.concat(flattened_data, ignore_index=True)\n", + "\n", + "display(df)" + ], + "id": "3b9b8bdad8bdb55d", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "Explanation of whats happening in the code below:\n", + "\n", + "Function to get all the ids from a gene name:\n", + "```python\n", + "get_variant_ids_from_clinvar_name_api(name: str, count: int)\n", + "```\n", + "\n", + "function gets the ids from the clinvar api, the name is the gene name and the count is the maximum number of ids to get (api's limit is 500)\n", + "\n", + "function returns a dictionary with the count and the list of ids:\n", + "\n", + "```json\n", + "{\n", + " 'count': int,\n", + " 'idlist': List[str]\n", + "}\n", + "```\n", + "\n", + "if the count is greater than the api's limit, the function will split the list of ids into smaller lists of 500 and then request the data from the api in chunks of 500 ids:\n", + "\n", + "```python\n", + "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", + "```\n", + "\n", + "then the function will request the data from the api and concatenate the dataframes into a single dataframe:\n", + "\n", + "```python\n", + "frames = request_clinvar_api_data(join)\n", + "variations = pd.concat([variations, frames], ignore_index=True)\n", + "```\n", + "\n", + "The variant extraction function contains a lot of nested lists and dictionaries, so the function will flatten the data and then concatenate the dataframes into a single dataframe\n", + "\n", + "**NOTE**\n", + "\n", + "> joining function may have been implemented wrong due to the waiting time of the api.\n" + ], + "id": "655a935b2874c218" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-07T12:57:28.089588Z", + "start_time": "2024-08-07T12:55:09.972813Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "variations = pd.DataFrame()\n", + "\n", + "max = 500\n", + "name = \"EYS\"\n", + "count = 2147483647\n", + "\n", + "id_array = get_variant_ids_from_clinvar_name_api(name, count)\n", + "size = int(id_array['count'])\n", + "id_list = id_array['idlist']\n", + "\n", + "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", + "\n", + "track = 0\n", + "for lists in id_lists:\n", + " join = \",\".join(lists)\n", + " frame = request_clinvar_api_data(join)\n", + " \n", + " variations = pd.concat([variations, frame], ignore_index=True)\n", + " \n", + " print(f\"{track + 1}/{len(id_lists)}\")\n", + " track += 1\n", + "\n", + "display(variations)\n" ], + "id": "129175e3a2e568be", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Genes\n" - ] - }, - { - "data": { - "text/plain": [ - " id name ... updated_by updated_date\n", - "0 EYS eyes shut homolog (Drosophila) ... 00006 2024-05-31 11:42:04\n", - "\n", - "[1 rows x 34 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamechromosomechrom_bandimprintingrefseq_genomicrefseq_UDreferenceurl_homepageurl_externalallow_downloadid_hgncid_entrezid_omimshow_hgmdshow_genecardsshow_genetestsshow_orphanetnote_indexnote_listingrefseqrefseq_urldisclaimerdisclaimer_textheaderheader_alignfooterfooter_aligncreated_bycreated_dateedited_byedited_dateupdated_byupdated_date
0EYSeyes shut homolog (Drosophila)6q12unknownNG_023443.2UD_132085377375http://www.LOVD.nl/EYS1215553460076124241111<font color=\\\"#FF0000\\\">This database is one o...ghttp://databases.lovd.nl/shared/refseq/EYS_NM_...1<font color=\\\"#FF0000\\\">This database is one o...-1-1000012012-02-13 00:00:00000062023-08-30 13:08:19000062024-05-31 11:42:04
\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Transcripts\n" + "1/10\n", + "2/10\n", + "3/10\n", + "4/10\n", + "5/10\n", + "6/10\n", + "7/10\n", + "8/10\n", + "9/10\n", + "10/10\n" ] }, { "data": { "text/plain": [ - " id geneid ... edited_by edited_date\n", - "0 00007329 EYS ... \n", + " uid obj_type ... gene_1029_strand gene_1029_source\n", + "0 3251429 single nucleotide variant ... NaN NaN\n", + "1 3246148 Deletion ... NaN NaN\n", + "2 3246147 Deletion ... NaN NaN\n", + "3 3246146 Deletion ... NaN NaN\n", + "4 3246145 Deletion ... NaN NaN\n", + "... ... ... ... ... ...\n", + "4778 538 single nucleotide variant ... NaN NaN\n", + "4779 537 single nucleotide variant ... NaN NaN\n", + "4780 536 Deletion ... NaN NaN\n", + "4781 535 Deletion ... NaN NaN\n", + "4782 534 Deletion ... NaN NaN\n", "\n", - "[1 rows x 19 columns]" + "[4783 rows x 4428 columns]" ], "text/html": [ "
\n", @@ -230,3034 +376,951 @@ " \n", " \n", " \n", - " id\n", - " geneid\n", - " name\n", - " id_mutalyzer\n", - " id_ncbi\n", - " id_ensembl\n", - " id_protein_ncbi\n", - " id_protein_ensembl\n", - " id_protein_uniprot\n", - " remarks\n", - " position_c_mrna_start\n", - " position_c_mrna_end\n", - " position_c_cds_end\n", - " position_g_mrna_start\n", - " position_g_mrna_end\n", - " created_by\n", - " created_date\n", - " edited_by\n", - " edited_date\n", + " uid\n", + " obj_type\n", + " accession\n", + " accession_version\n", + " title\n", + " record_status\n", + " gene_sort\n", + " chr_sort\n", + " location_sort\n", + " variation_set_name\n", + " variation_set_id\n", + " molecular_consequence_list\n", + " protein_change\n", + " fda_recognized_database\n", + " supporting_submissions_scv\n", + " supporting_submissions_rcv\n", + " germline_classification_description\n", + " germline_classification_last_evaluated\n", + " germline_classification_review_status\n", + " germline_classification_fda_recognized_database\n", + " clinical_impact_classification_description\n", + " clinical_impact_classification_last_evaluated\n", + " clinical_impact_classification_review_status\n", + " clinical_impact_classification_fda_recognized_database\n", + " clinical_impact_classification_trait_set\n", + " oncogenicity_classification_description\n", + " oncogenicity_classification_last_evaluated\n", + " oncogenicity_classification_review_status\n", + " oncogenicity_classification_fda_recognized_database\n", + " oncogenicity_classification_trait_set\n", + " variation_set_0_measure_id\n", + " variation_set_0_variation_name\n", + " variation_set_0_cdna_change\n", + " variation_set_0_aliases\n", + " variation_set_0_variant_type\n", + " variation_set_0_canonical_spdi\n", + " variation_set_0_loc_0_status\n", + " variation_set_0_loc_0_assembly_name\n", + " variation_set_0_loc_0_chr\n", + " variation_set_0_loc_0_band\n", + " ...\n", + " gene_1020_symbol\n", + " gene_1020_geneid\n", + " gene_1020_strand\n", + " gene_1020_source\n", + " gene_1021_symbol\n", + " gene_1021_geneid\n", + " gene_1021_strand\n", + " gene_1021_source\n", + " gene_1022_symbol\n", + " gene_1022_geneid\n", + " gene_1022_strand\n", + " gene_1022_source\n", + " gene_1023_symbol\n", + " gene_1023_geneid\n", + " gene_1023_strand\n", + " gene_1023_source\n", + " gene_1024_symbol\n", + " gene_1024_geneid\n", + " gene_1024_strand\n", + " gene_1024_source\n", + " gene_1025_symbol\n", + " gene_1025_geneid\n", + " gene_1025_strand\n", + " gene_1025_source\n", + " gene_1026_symbol\n", + " gene_1026_geneid\n", + " gene_1026_strand\n", + " gene_1026_source\n", + " gene_1027_symbol\n", + " gene_1027_geneid\n", + " gene_1027_strand\n", + " gene_1027_source\n", + " gene_1028_symbol\n", + " gene_1028_geneid\n", + " gene_1028_strand\n", + " gene_1028_source\n", + " gene_1029_symbol\n", + " gene_1029_geneid\n", + " gene_1029_strand\n", + " gene_1029_source\n", " \n", " \n", " \n", " \n", " 0\n", - " 00007329\n", + " 3251429\n", + " single nucleotide variant\n", + " VCV003251429\n", + " VCV003251429.\n", + " NM_001142800.2(EYS):c.5886T>C (p.Thr1962=)\n", + " \n", " EYS\n", - " transcript variant 1\n", - " 001\n", - " NM_001142800.1\n", + " 06\n", + " 00000000000064436215\n", + " \n", + " \n", + " [synonymous variant]\n", + " \n", + " \n", + " [SCV005076913]\n", + " [RCV004587835]\n", + " Likely benign\n", + " 2024/04/08 00:00\n", + " criteria provided, single submitter\n", " \n", - " NP_001136272.1\n", " \n", + " 1/01/01 00:00\n", " \n", " \n", - " -538\n", - " 
10051\n", - " 9435\n", - " 66417118\n", - " 64429876\n", + " []\n", " \n", - " 0000-00-00 00:00:00\n", + " 1/01/01 00:00\n", " \n", " \n", + " []\n", + " 3410228\n", + " NM_001142800.2(EYS):c.5886T>C (p.Thr1962=)\n", + " c.5886T>C\n", + " []\n", + " single nucleotide variant\n", + " NC_000006.12:64436214:A:G\n", + " current\n", + " GRCh38\n", + " 6\n", + " 6q12\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", - " \n", - "\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Diseases\n" - ] - }, - { - "data": { - "text/plain": [ - " id symbol ... edited_by edited_date\n", - "0 00012 PSORS ... 00006 2019-08-12 13:38:21\n", - "1 00058 CORD ... 00006 2020-08-30 09:43:59\n", - "2 00112 RP ... 00006 2021-01-18 09:53:26\n", - "3 00139 ID ... 00006 2015-02-09 10:02:49\n", - "4 00173 SLOS ... 00006 2021-12-10 21:51:32\n", - "5 00198 ? ... 00006 2016-10-22 17:54:40\n", - "6 02156 - ... 00006 2021-12-10 21:51:32\n", - "7 02440 RP25 ... 00006 2021-12-10 21:51:32\n", - "8 04211 RPar ... \n", - "9 04214 - ... 00001 2023-03-09 14:26:26\n", - "10 04249 macular dystrophy ... 00006 2024-02-15 21:18:39\n", - "11 05086 HL ... 00006 2015-10-23 11:43:00\n", - "12 05415 USH ... \n", - "13 05468 uveitis ... \n", - "14 06906 DEE ... \n", - "\n", - "[15 rows x 12 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", " \n", " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - "
idsymbolnameinheritanceid_omimtissuesfeaturesremarkscreated_bycreated_dateedited_byedited_date
000012PSORSpsoriasis, pustular, generalized (PSORS)13246148DeletionVCV003246148VCV003246148.NC_000006.11:g.(?_66204859)_(66217229_?)delEYS0699999999999999999999[]000062012-07-06 21:50:32000062019-08-12 13:38:21
100058CORDdystrophy, cone-rod (CORD)[SCV005067530][RCV004578792]Pathogenic2023/01/02 00:00criteria provided, single submitter1/01/01 00:00[]1/01/01 00:00[]3403857NC_000006.11:g.(?_66204859)_(66217229_?)delNC_000006.11:g.(?_66204859)_(66217229_?)del[]Deletion000062012-09-22 11:31:25000062020-08-30 09:43:59previousGRCh3766q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
200112RPretinitis pigmentosa (RP)3246147DeletionVCV003246147VCV003246147.NC_000006.11:g.(?_64511633)_(64516181_?)delEYS0699999999999999999999[]268000[SCV005067529][RCV004578791]Likely pathogenic2023/03/08 00:00criteria provided, single submitter1/01/01 00:00[]1/01/01 00:00[]3403856NC_000006.11:g.(?_64511633)_(64516181_?)delNC_000006.11:g.(?_64511633)_(64516181_?)del[]Deletion000012013-02-21 17:12:36000062021-01-18 09:53:26previousGRCh3766q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
300139IDintellectual disability (ID)3246146DeletionVCV003246146VCV003246146.NC_000006.11:g.(?_65523280)_(65527746_?)delEYS0699999999999999999999[][SCV005067528][RCV004578790]Likely pathogenic2023/04/30 00:00criteria provided, single submitter1/01/01 00:00[]1/01/01 00:00[]3403855NC_000006.11:g.(?_65523280)_(65527746_?)delNC_000006.11:g.(?_65523280)_(65527746_?)del[]Deletion000842013-06-04 18:18:07000062015-02-09 10:02:49previousGRCh3766q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
400173SLOSSmith-Lemli-Opitz syndrome (SLOS)AR2704003246145DeletionVCV003246145VCV003246145.NC_000006.11:g.(?_65587645)_(65596716_?)delEYS0699999999999999999999[]000062013-08-01 11:16:14000062021-12-10 21:51:32
500198?unclassified / mixed[SCV005067527][RCV004578789]Likely pathogenic2023/06/27 00:00criteria provided, single submitter1/01/01 00:00000062013-09-13 14:21:47000062016-10-22 17:54:40
602156-retinitis pigmentosa, X-linked, and sinorespir...[]3004551/01/01 00:00[]3403854NC_000006.11:g.(?_65587645)_(65596716_?)delNC_000006.11:g.(?_65587645)_(65596716_?)del[]Deletion000062014-09-25 23:29:40000062021-12-10 21:51:32previousGRCh3766q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
......................................................................................................................................................................................................................................................
702440RP25retinitis pigmentosa, type 25 (RP25)AR6027724778538single nucleotide variantVCV000000538VCV000000538.NM_001142800.2(EYS):c.9405T>A (p.Tyr3135Ter)EYS0600000000000063720626000062014-09-25 23:29:40000062021-12-10 21:51:32
804211RParretinitis pigmentosa, autosomal recessive (RPar)[3 prime UTR variant, nonsense]Y3156*, Y3135*[SCV000020717, SCV000894389, SCV000709692, SCV...[RCV000000568, RCV000593252, RCV003914789, RCV...Pathogenic/Likely pathogenic2024/03/09 00:00criteria provided, multiple submitters, no con...1/01/01 00:00000062015-02-27 18:58:57[]1/01/01 00:00
904214-retinal disease[]15577NM_001142800.2(EYS):c.9405T>A (p.Tyr3135Ter)c.9405T>A[]single nucleotide variantNC_000006.12:63720625:A:TcurrentGRCh3866q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4779537single nucleotide variantVCV000000537VCV000000537.NM_001142800.2(EYS):c.5857G>T (p.Glu1953Ter)EYS0600000000000064436244[nonsense]E1953*000062015-02-27 19:48:07000012023-03-09 14:26:26
1004249macular dystrophydystrophy, macular[SCV000020716, SCV002519636, SCV004195857, SCV...[RCV000000567, RCV001387157]Pathogenic2024/02/15 00:00criteria provided, multiple submitters, no con...1/01/01 00:00[]000062015-05-04 22:10:58000062024-02-15 21:18:39
1105086HLhearing loss (HL)1/01/01 00:00[]15576NM_001142800.2(EYS):c.5857G>T (p.Glu1953Ter)c.5857G>T[]single nucleotide variantNC_000006.12:64436243:C:AcurrentGRCh3866q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4780536DeletionVCV000000536VCV000000536.NM_001142800.1(EYS):c.1767-24596_2023+238135delLOC4411550600000000000065057728[]000062015-10-23 11:41:05000062015-10-23 11:43:00
1205415USHUsher syndrome (USH)[SCV000020715][RCV000000566]Pathogenic2008/11/01 00:00no assertion criteria provided1/01/01 00:00000062018-04-02 16:40:44[]1/01/01 00:00
1305468uveitisuveitis[]15575NM_001142800.1(EYS):c.1767-24596_2023+238135delNM_001142800.1(EYS):c.1767-24596_2023+238135del[EX12DEL]DeletioncurrentGRCh3866q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4781535DeletionVCV000000535VCV000000535.NM_001142800.1(EYS):c.2260-51191_2992+45990delEYS0600000000000064840707[]000062018-08-22 09:47:04[SCV000020714][RCV000000565]Pathogenic2008/11/01 00:00no assertion criteria provided
1406906DEEencephalopathy, developmental and epileptic1/01/01 00:00[]1/01/01 00:00000062022-04-07 09:24:23[]15574NM_001142800.1(EYS):c.2260-51191_2992+45990delNM_001142800.1(EYS):c.2260-51191_2992+45990del[EX15-19DEL]DeletioncurrentGRCh3866q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4782534DeletionVCV000000534VCV000000534.NM_001142800.2(EYS):c.2710_2726del (p.Asp904fs)
\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genes_To_Diseases\n" - ] - }, - { - "data": { - "text/plain": [ - " geneid diseaseid\n", - "0 EYS 00112\n", - "1 EYS 02440" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
geneiddiseaseid
0EYS00112
1EYS02440
\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Individuals\n" - ] - }, - { - "data": { - "text/plain": [ - " id fatherid ... Individual/Origin/Population Individual/Individual_ID\n", - "0 00000135 ... \n", - "1 00000210 ... \n", - "2 00001962 ... white \n", - "3 00016605 ... \n", - "4 00033096 ... \n", - "... ... ... ... ... ...\n", - "1460 00451166 ... 071788\n", - "1461 00451252 ... 072857\n", - "1462 00451259 ... 073069\n", - "1463 00451333 ... 075139\n", - "1464 00451348 ... 080622\n", - "\n", - "[1465 rows x 18 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idfatheridmotheridpanelidpanel_sizelicenseowned_byIndividual/ReferenceIndividual/RemarksIndividual/GenderIndividual/ConsanguinityIndividual/Origin/GeographicIndividual/Age_of_deathIndividual/VIPIndividual/Data_avIndividual/TreatmentIndividual/Origin/PopulationIndividual/Individual_ID
000000135300006{PMID:Marrakchi 2011:21848462}5-generation family, 3 affecteds (M)MyesTunisia
100000210100039{PMID:Abu-Safieh-2013:23105016}(Saudi Arabia)
200001962100025M?Germanywhite
3000166051005520
400033096100229{PMID:Neveling 2012:22334370}Mno0
.........................................................
146000451166100006{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...0071788
146100451252100006{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...0072857
146200451259100006{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...0073069
146300451333100006{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...0075139
146400451348100006{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...0080622
\n", - "

1465 rows × 18 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Individuals_To_Diseases\n" - ] - }, - { - "data": { - "text/plain": [ - " individualid diseaseid\n", - "0 00000135 00012\n", - "1 00000210 00058\n", - "2 00001962 00173\n", - "3 00033096 04214\n", - "4 00033109 04214\n", - "... ... ...\n", - "1459 00451166 04249\n", - "1460 00451252 04249\n", - "1461 00451259 04249\n", - "1462 00451333 04249\n", - "1463 00451348 04249\n", - "\n", - "[1464 rows x 2 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
individualiddiseaseid
00000013500012
10000021000058
20000196200173
30003309604214
40003310904214
.........
14590045116604249
14600045125204249
14610045125904249
14620045133304249
14630045134804249
\n", - "

1464 rows × 2 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Phenotypes\n" - ] - }, - { - "data": { - "text/plain": [ - " id ... Phenotype/Diagnosis/Criteria\n", - "0 0000000008 ... \n", - "1 0000000026 ... \n", - "2 0000000941 ... \n", - "3 0000026525 ... \n", - "4 0000026538 ... \n", - "... ... ... ...\n", - "1272 0000339895 ... \n", - "1273 0000339896 ... \n", - "1274 0000339897 ... \n", - "1275 0000339898 ... \n", - "1276 0000339899 ... \n", - "\n", - "[1277 rows x 20 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddiseaseidindividualidowned_byPhenotype/InheritancePhenotype/AgePhenotype/AdditionalPhenotype/Biochem_paramPhenotype/Age/OnsetPhenotype/Age/DiagnosisPhenotype/Severity_scorePhenotype/OnsetPhenotype/ProteinPhenotype/Tumor/MSIPhenotype/Enzyme/CPKPhenotype/Heart/MyocardiumPhenotype/LungPhenotype/Diagnosis/DefinitePhenotype/Diagnosis/InitialPhenotype/Diagnosis/Criteria
00000000008000120000013500006Familial, autosomal recessive
10000000026000580000021000039Familial, autosomal recessive
20000000941001730000196200025Familial2-3 toe syndactyly5
30000026525042140003309600229Unknownretinitis pigmentosa
40000026538042140003310900229Unknownretinitis pigmentosa
...............................................................
12720000339895042490045084004405UnknownStargardt disease
12730000339896042490045084104405UnknownStargardt disease
12740000339897042490045084204405Unknowncone-rod dystrophy
12750000339898042490045084304405UnknownStargardt disease
12760000339899042490045084404405Unknownmacular dystrophy
\n", - "

1277 rows × 20 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings\n" - ] - }, - { - "data": { - "text/plain": [ - " id individualid ... Screening/Tissue Screening/Remarks\n", - "0 0000000126 00000135 ... \n", - "1 0000000211 00000210 ... \n", - "2 0000001640 00001962 ... \n", - "3 0000016557 00016605 ... \n", - "4 0000033164 00033096 ... \n", - "... ... ... ... ... ...\n", - "1460 0000452765 00451166 ... smMIP-based 105 iMD/AMD genes\n", - "1461 0000452851 00451252 ... smMIP-based 105 iMD/AMD genes\n", - "1462 0000452858 00451259 ... smMIP-based 105 iMD/AMD genes\n", - "1463 0000452932 00451333 ... smMIP-based 105 iMD/AMD genes\n", - "1464 0000452947 00451348 ... smMIP-based 105 iMD/AMD genes\n", - "\n", - "[1465 rows x 12 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idindividualidvariants_foundowned_bycreated_bycreated_dateedited_byedited_dateScreening/TechniqueScreening/TemplateScreening/TissueScreening/Remarks
0000000012600000135100006000062012-07-07 19:04:19000062012-07-07 19:12:08RT-PCR;SEQDNA;RNA
1000000021100000210100039000062012-09-22 11:36:24SEQDNA
2000000164000001962100025000062010-03-11 16:36:41000252012-04-13 15:18:00SEQDNA
3000001655700016605100552005522014-05-23 13:12:43SEQ-NG-IDNA
4000003316400033096100229002292012-02-04 15:20:01000062012-05-18 13:59:33SEQ;SEQ-NG-SDNA
.......................................
1460000045276500451166100006000062024-05-31 11:39:36SEQDNAsmMIP-based 105 iMD/AMD genes
1461000045285100451252100006000062024-05-31 11:39:36SEQDNAsmMIP-based 105 iMD/AMD genes
1462000045285800451259100006000062024-05-31 11:39:36SEQDNAsmMIP-based 105 iMD/AMD genes
1463000045293200451333100006000062024-05-31 11:39:36SEQDNAsmMIP-based 105 iMD/AMD genes
1464000045294700451348100006000062024-05-31 11:39:36SEQDNAsmMIP-based 105 iMD/AMD genes
\n", - "

1465 rows × 12 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings_To_Genes\n" - ] - }, - { - "data": { - "text/plain": [ - " screeningid geneid\n", - "0 0000000126 IL36RN\n", - "1 0000000211 MKS1\n", - "2 0000001640 DHCR7\n", - "3 0000033164 AHI1\n", - "4 0000033164 EYS\n", - "... ... ...\n", - "1311 0000437646 EYS\n", - "1312 0000437902 EYS\n", - "1313 0000437922 EYS\n", - "1314 0000443144 EYS\n", - "1315 0000443145 EYS\n", - "\n", - "[1316 rows x 2 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
screeningidgeneid
00000000126IL36RN
10000000211MKS1
20000001640DHCR7
30000033164AHI1
40000033164EYS
.........
13110000437646EYS
13120000437902EYS
13130000437922EYS
13140000443144EYS
13150000443145EYS
\n", - "

1316 rows × 2 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variants_On_Genome\n" - ] - }, - { - "data": { - "text/plain": [ - " id ... VariantOnGenome/ClinicalClassification/Method\n", - "0 0000036426 ... \n", - "1 0000059881 ... \n", - "2 0000059883 ... \n", - "3 0000059884 ... \n", - "4 0000059885 ... \n", - "... ... ... ...\n", - "2555 0000987292 ... ACMG\n", - "2556 0000987305 ... ACMG\n", - "2557 0000987318 ... ACMG\n", - "2558 0000987322 ... ACMG\n", - "2559 0000987333 ... ACMG\n", - "\n", - "[2560 rows x 26 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idalleleeffectidchromosomeposition_g_startposition_g_endtypeaverage_frequencyowned_byVariantOnGenome/DBIDVariantOnGenome/DNAVariantOnGenome/FrequencyVariantOnGenome/ReferenceVariantOnGenome/Restriction_siteVariantOnGenome/Published_asVariantOnGenome/RemarksVariantOnGenome/Genetic_originVariantOnGenome/SegregationVariantOnGenome/dbSNPVariantOnGenome/VIPVariantOnGenome/MethylationVariantOnGenome/ISCNVariantOnGenome/DNA/hg38VariantOnGenome/ClinVarVariantOnGenome/ClinicalClassificationVariantOnGenome/ClinicalClassification/Method
0000003642635066449897164498971subst0.00074292200552EYS_000007g.64498971A>GGermline0g.63789078A>GVUS
1000005988135566565575865655758subst0.0011529700229EYS_000001g.65655758T>GExAC: 60, 19750, 0, 0.003038{PMID:Neveling 2012:22334370}Q770PGermlineyes0g.64945865T>GVUS
2000005988311166533614365336143subst0.22418900229EYS_000002g.65336143G>AExAC: 3936, 19366, 441, 0.2032{PMID:Neveling 2012:22334370}p.?unaffected brother also this variant homozygousGermlineno0g.64626250G>Abenign
3000005988411566530086965300869subst0.00083792800229EYS_000003g.65300869G>AExAC: 12, 19406, 0, 0.0006184{PMID:Neveling 2012:22334370}(P1631S)predicted benign, disease-related variant in o...Germline0g.64590976G>Abenign
4000005988511166501699865016999del000229EYS_000004g.65016998_65016999delExAC: 9866, 18292, 921, 0.5394{PMID:Neveling 2012:22334370}6045-4_6045-3delpredicted benignGermlineyes0g.64307105_64307106delbenign
.................................................................................
2555000098729207066443111164431111subst000006EYS_000662g.64431111C>T{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...c.8879G>A (Cys2960Tyr)case unsolvedGermline0g.63721215C>Tlikely pathogenicACMG
2556000098730507066611516766115167del000006EYS_000933g.66115167del{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...no variant 2nd chromosome, case unsolvedGermline0g.65405274dellikely pathogenicACMG
2557000098731807066443114864431148subst3.95361E-500006EYS_000067g.64431148A>G{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...no variant 2nd chromosome, case unsolvedGermline0g.63721252A>Glikely pathogenicACMG
2558000098732209066443062664430629del000006EYS_000045g.64430626_64430629del{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...no variant 2nd chromosome, case unsolvedGermline0g.63720730_63720733delpathogenicACMG
2559000098733309066443647764436477del1.349E-500006EYS_000397g.64436477del{PMID:Hitti-Malin 2024:38540785}, {DOI:Hitti-M...no variant 2nd chromosome, case unsolvedGermline0g.63726584delpathogenicACMG
\n", - "

2560 rows × 26 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variants_On_Transcripts\n" - ] - }, - { - "data": { - "text/plain": [ - " id ... VariantOnTranscript/Exon\n", - "0 0000036426 ... 38\n", - "1 0000059881 ... 15\n", - "2 0000059883 ... 22i\n", - "3 0000059884 ... 26\n", - "4 0000059885 ... 29i\n", - "... ... ... ...\n", - "2555 0000987292 ... \n", - "2556 0000987305 ... \n", - "2557 0000987318 ... \n", - "2558 0000987322 ... \n", - "2559 0000987333 ... \n", - "\n", - "[2560 rows x 11 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/Exon
0000003642600007329507558075580c.7558T>Cr.(?)p.(Phe2520Leu)38
1000005988100007329552309023090c.2309A>Cr.(?)p.(Gln770Pro)15
2000005988300007329113444-53444-5c.3444-5C>Tr.(?)p.(=)22i
3000005988400007329154891048910c.4891C>Tr.(?)p.(Pro1631Ser)26
4000005988500007329116079-46079-3c.6079-4_6079-3delr.(?)p.(=)29i
....................................
2555000098729200007329708816088160c.8816G>Ar.(?)p.(Cys2939Tyr)
25560000987305000073297095709570c.957delr.(?)p.(Glu319AspfsTer20)
2557000098731800007329708779087790c.8779T>Cr.(?)p.(Cys2927Arg)
2558000098732200007329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)
2559000098733300007329908168081680c.8168delr.(?)p.(Gln2723ArgfsTer18)
\n", - "

2560 rows × 11 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings_To_Variants\n" - ] - }, - { - "data": { - "text/plain": [ - " screeningid variantid\n", - "0 0000000126 0000783293\n", - "1 0000000211 0000790459\n", - "2 0000001640 0000235838\n", - "3 0000016557 0000036426\n", - "4 0000033164 0000059884\n", - "... ... ...\n", - "2163 0000452765 0000987322\n", - "2164 0000452851 0000987196\n", - "2165 0000452858 0000987333\n", - "2166 0000452932 0000987277\n", - "2167 0000452947 0000987292\n", - "\n", - "[2168 rows x 2 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
screeningidvariantid
000000001260000783293
100000002110000790459
200000016400000235838
300000165570000036426
400000331640000059884
.........
216300004527650000987322
216400004528510000987196
216500004528580000987333
216600004529320000987277
216700004529470000987292
\n", - "

2168 rows × 2 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "execution_count": 6 - }, - { - "cell_type": "code", - "id": "ef07740b2fa63e42", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "ExecuteTime": { - "end_time": "2024-08-06T14:19:36.151167Z", - "start_time": "2024-08-06T14:19:35.969184Z" - } - }, - "source": [ - "set_lovd_dtypes(data)\n", - "for i in data:\n", - " print(i)\n", - " display(data[i].info())" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genes\n", - "\n", - "RangeIndex: 1 entries, 0 to 0\n", - "Data columns (total 34 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 1 non-null string \n", - " 1 name 1 non-null string \n", - " 2 chromosome 1 non-null Int64 \n", - " 3 chrom_band 1 non-null string \n", - " 4 imprinting 1 non-null string \n", - " 5 refseq_genomic 1 non-null string \n", - " 6 refseq_UD 1 non-null string \n", - " 7 reference 1 non-null string \n", - " 8 url_homepage 1 non-null string \n", - " 9 url_external 1 non-null string \n", - " 10 allow_download 1 non-null bool \n", - " 11 id_hgnc 1 non-null Int64 \n", - " 12 id_entrez 1 non-null Int64 \n", - " 13 id_omim 1 non-null Int64 \n", - " 14 show_hgmd 1 non-null bool \n", - " 15 show_genecards 1 non-null bool \n", - " 16 show_genetests 1 non-null bool \n", - " 17 show_orphanet 1 non-null bool \n", - " 18 note_index 1 non-null string \n", - " 19 note_listing 1 non-null string \n", - " 20 refseq 1 non-null string \n", - " 21 refseq_url 1 non-null string \n", - " 22 disclaimer 1 non-null bool \n", - " 23 disclaimer_text 1 non-null string \n", - " 24 header 1 non-null string \n", - " 25 header_align 1 non-null Int64 \n", - " 26 footer 1 non-null string \n", - " 27 footer_align 1 non-null Int64 \n", - " 28 created_by 1 non-null Int64 \n", - " 29 created_date 1 non-null datetime64[ns]\n", - " 30 edited_by 1 non-null Int64 \n", - " 31 edited_date 1 non-null datetime64[ns]\n", - " 32 updated_by 1 non-null Int64 \n", - " 33 updated_date 1 non-null datetime64[ns]\n", - "dtypes: Int64(9), bool(6), datetime64[ns](3), string(16)\n", - "memory usage: 371.0 bytes\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Transcripts\n", - "\n", - "RangeIndex: 1 entries, 0 to 0\n", - "Data columns (total 19 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 1 non-null Int64 \n", - " 1 geneid 1 non-null string \n", - " 2 name 1 non-null string \n", - " 3 id_mutalyzer 1 non-null Int64 \n", - " 4 id_ncbi 1 non-null string \n", - " 5 id_ensembl 1 non-null string \n", - " 6 id_protein_ncbi 1 non-null string \n", - " 7 id_protein_ensembl 1 non-null string \n", - " 8 id_protein_uniprot 1 non-null string \n", - " 9 remarks 1 non-null string \n", - " 10 position_c_mrna_start 1 non-null Int64 \n", - " 11 position_c_mrna_end 1 non-null Int64 \n", - " 12 position_c_cds_end 1 non-null Int64 \n", - " 13 position_g_mrna_start 1 non-null Int64 \n", - " 14 position_g_mrna_end 1 non-null Int64 \n", - " 15 created_by 0 non-null Int64 \n", - " 16 created_date 0 non-null datetime64[ns]\n", - " 17 edited_by 0 non-null Int64 \n", - " 18 edited_date 0 non-null datetime64[ns]\n", - "dtypes: Int64(9), datetime64[ns](2), string(8)\n", - "memory usage: 293.0 bytes\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": 
{}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Diseases\n", - "\n", - "RangeIndex: 15 entries, 0 to 14\n", - "Data columns (total 12 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 15 non-null Int64 \n", - " 1 symbol 15 non-null string \n", - " 2 name 15 non-null string \n", - " 3 inheritance 15 non-null string \n", - " 4 id_omim 4 non-null Int64 \n", - " 5 tissues 15 non-null string \n", - " 6 features 15 non-null string \n", - " 7 remarks 15 non-null string \n", - " 8 created_by 15 non-null Int64 \n", - " 9 created_date 15 non-null datetime64[ns]\n", - " 10 edited_by 11 non-null Int64 \n", - " 11 edited_date 11 non-null datetime64[ns]\n", - "dtypes: Int64(4), datetime64[ns](2), string(6)\n", - "memory usage: 1.6 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genes_To_Diseases\n", - "\n", - "RangeIndex: 2 entries, 0 to 1\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 geneid 2 non-null string\n", - " 1 diseaseid 2 non-null Int64 \n", - "dtypes: Int64(1), string(1)\n", - "memory usage: 166.0 bytes\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Individuals\n", - "\n", - "RangeIndex: 1465 entries, 0 to 1464\n", - "Data columns (total 18 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 1465 non-null Int64 \n", - " 1 fatherid 1465 non-null string\n", - " 2 motherid 1465 non-null string\n", - " 3 panelid 6 non-null Int64 \n", - " 4 panel_size 1465 non-null Int64 \n", - " 5 license 1465 non-null string\n", - " 6 owned_by 1465 non-null Int64 \n", - " 7 Individual/Reference 1465 non-null string\n", - " 8 Individual/Remarks 1465 non-null string\n", - " 9 Individual/Gender 1465 non-null string\n", - " 10 Individual/Consanguinity 1465 non-null string\n", - " 11 Individual/Origin/Geographic 1465 non-null string\n", - " 12 Individual/Age_of_death 1465 non-null string\n", - " 13 Individual/VIP 1465 non-null string\n", - " 14 Individual/Data_av 1465 non-null string\n", - " 15 Individual/Treatment 1465 non-null string\n", - " 16 Individual/Origin/Population 1465 non-null string\n", - " 17 Individual/Individual_ID 1465 non-null string\n", - "dtypes: Int64(4), string(14)\n", - "memory usage: 211.9 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Individuals_To_Diseases\n", - "\n", - "RangeIndex: 1464 entries, 0 to 1463\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype\n", - "--- ------ -------------- -----\n", - " 0 individualid 1464 non-null Int64\n", - " 1 diseaseid 1464 non-null Int64\n", - "dtypes: Int64(2)\n", - "memory usage: 25.9 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Phenotypes\n", - "\n", - "RangeIndex: 1277 entries, 0 to 1276\n", - "Data columns (total 20 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 1277 non-null 
Int64 \n", - " 1 diseaseid 1277 non-null Int64 \n", - " 2 individualid 1277 non-null Int64 \n", - " 3 owned_by 1277 non-null Int64 \n", - " 4 Phenotype/Inheritance 1277 non-null string\n", - " 5 Phenotype/Age 1277 non-null string\n", - " 6 Phenotype/Additional 1277 non-null string\n", - " 7 Phenotype/Biochem_param 1277 non-null string\n", - " 8 Phenotype/Age/Onset 1277 non-null string\n", - " 9 Phenotype/Age/Diagnosis 1277 non-null string\n", - " 10 Phenotype/Severity_score 1277 non-null string\n", - " 11 Phenotype/Onset 1277 non-null string\n", - " 12 Phenotype/Protein 1277 non-null string\n", - " 13 Phenotype/Tumor/MSI 1277 non-null string\n", - " 14 Phenotype/Enzyme/CPK 1277 non-null string\n", - " 15 Phenotype/Heart/Myocardium 1277 non-null string\n", - " 16 Phenotype/Lung 1277 non-null string\n", - " 17 Phenotype/Diagnosis/Definite 1277 non-null string\n", - " 18 Phenotype/Diagnosis/Initial 1277 non-null string\n", - " 19 Phenotype/Diagnosis/Criteria 1277 non-null string\n", - "dtypes: Int64(4), string(16)\n", - "memory usage: 204.6 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings\n", - "\n", - "RangeIndex: 1465 entries, 0 to 1464\n", - "Data columns (total 12 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 1465 non-null Int64 \n", - " 1 individualid 1465 non-null Int64 \n", - " 2 variants_found 1465 non-null Int64 \n", - " 3 owned_by 1465 non-null Int64 \n", - " 4 created_by 1465 non-null Int64 \n", - " 5 created_date 1465 non-null datetime64[ns]\n", - " 6 edited_by 15 non-null Int64 \n", - " 7 edited_date 15 non-null datetime64[ns]\n", - " 8 Screening/Technique 1465 non-null string \n", - " 9 Screening/Template 1465 non-null string \n", - " 10 Screening/Tissue 1465 non-null string \n", - " 11 Screening/Remarks 1465 non-null string \n", - "dtypes: Int64(6), datetime64[ns](2), string(4)\n", - "memory usage: 146.1 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings_To_Genes\n", - "\n", - "RangeIndex: 1316 entries, 0 to 1315\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 screeningid 1316 non-null Int64 \n", - " 1 geneid 1316 non-null string\n", - "dtypes: Int64(1), string(1)\n", - "memory usage: 22.0 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variants_On_Genome\n", - "\n", - "RangeIndex: 2560 entries, 0 to 2559\n", - "Data columns (total 26 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 2560 non-null Int64 \n", - " 1 allele 2560 non-null Int64 \n", - " 2 effectid 2560 non-null Int64 \n", - " 3 chromosome 2560 non-null Int64 \n", - " 4 position_g_start 2559 non-null Int64 \n", - " 5 position_g_end 2559 non-null Int64 \n", - " 6 type 2560 non-null string \n", - " 7 average_frequency 2559 non-null float64\n", - " 8 owned_by 2560 non-null Int64 \n", - " 9 VariantOnGenome/DBID 2560 non-null string \n", - " 10 VariantOnGenome/DNA 2560 non-null string \n", - " 11 VariantOnGenome/Frequency 2560 non-null string \n", - " 12 VariantOnGenome/Reference 2560 non-null string \n", - 
" 13 VariantOnGenome/Restriction_site 2560 non-null string \n", - " 14 VariantOnGenome/Published_as 2560 non-null string \n", - " 15 VariantOnGenome/Remarks 2560 non-null string \n", - " 16 VariantOnGenome/Genetic_origin 2560 non-null string \n", - " 17 VariantOnGenome/Segregation 2560 non-null string \n", - " 18 VariantOnGenome/dbSNP 2560 non-null string \n", - " 19 VariantOnGenome/VIP 2560 non-null string \n", - " 20 VariantOnGenome/Methylation 2560 non-null string \n", - " 21 VariantOnGenome/ISCN 2560 non-null string \n", - " 22 VariantOnGenome/DNA/hg38 2560 non-null string \n", - " 23 VariantOnGenome/ClinVar 2560 non-null string \n", - " 24 VariantOnGenome/ClinicalClassification 2560 non-null string \n", - " 25 VariantOnGenome/ClinicalClassification/Method 2560 non-null string \n", - "dtypes: Int64(7), float64(1), string(18)\n", - "memory usage: 537.6 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variants_On_Transcripts\n", - "\n", - "RangeIndex: 2560 entries, 0 to 2559\n", - "Data columns (total 11 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 2560 non-null Int64 \n", - " 1 transcriptid 2560 non-null Int64 \n", - " 2 effectid 2560 non-null Int64 \n", - " 3 position_c_start 2559 non-null Int64 \n", - " 4 position_c_start_intron 2560 non-null Int64 \n", - " 5 position_c_end 2559 non-null Int64 \n", - " 6 position_c_end_intron 2560 non-null Int64 \n", - " 7 VariantOnTranscript/DNA 2560 non-null string\n", - " 8 VariantOnTranscript/RNA 2560 non-null string\n", - " 9 VariantOnTranscript/Protein 2560 non-null string\n", - " 10 VariantOnTranscript/Exon 2560 non-null string\n", - "dtypes: Int64(7), string(4)\n", - "memory usage: 237.6 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings_To_Variants\n", - "\n", - "RangeIndex: 2168 entries, 0 to 2167\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype\n", - "--- ------ -------------- -----\n", - " 0 screeningid 2168 non-null Int64\n", - " 1 variantid 2168 non-null Int64\n", - "dtypes: Int64(2)\n", - "memory usage: 38.2 KB\n" - ] - }, - { - "data": { - "text/plain": [ - "None" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "execution_count": 7 - }, - { - "cell_type": "code", - "id": "c968af1617be40db", - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-06T14:19:36.920697Z", - "start_time": "2024-08-06T14:19:36.151683Z" - } - }, - "source": [ - "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")" - ], - "outputs": [ - { - "ename": "KeyError", - "evalue": "'Variants_On_Genome'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "File \u001B[1;32m~\\PycharmProjects\\KathChatGPT\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3805\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3804\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 3805\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m 
\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcasted_key\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 3806\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m err:\n", - "File \u001B[1;32mindex.pyx:167\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n", - "File \u001B[1;32mindex.pyx:196\u001B[0m, in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n", - "File \u001B[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7081\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n", - "File \u001B[1;32mpandas\\\\_libs\\\\hashtable_class_helper.pxi:7089\u001B[0m, in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n", - "\u001B[1;31mKeyError\u001B[0m: 'Variants_On_Genome'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[8], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43msave_lovd_as_vcf\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mVariants_On_Genome\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m./lovd.vcf\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n", - "File \u001B[1;32m~\\PycharmProjects\\kath\\api\\data\\refactoring.py:140\u001B[0m, in \u001B[0;36msave_lovd_as_vcf\u001B[1;34m(data, save_to)\u001B[0m\n\u001B[0;32m 134\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21msave_lovd_as_vcf\u001B[39m(data, save_to\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m./lovd.vcf\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n\u001B[0;32m 135\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 136\u001B[0m \u001B[38;5;124;03m Gets hg38 variants from LOVD and saves as VCF file.\u001B[39;00m\n\u001B[0;32m 137\u001B[0m \u001B[38;5;124;03m :param DataFrame data: LOVD DataFrame with data\u001B[39;00m\n\u001B[0;32m 138\u001B[0m \u001B[38;5;124;03m :param str save_to: path where to save VCF file.\u001B[39;00m\n\u001B[0;32m 139\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m--> 140\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[43mdata\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mVariants_On_Genome\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m]\u001B[49m\n\u001B[0;32m 141\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mVariantOnGenome/DNA/hg38\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m df\u001B[38;5;241m.\u001B[39mcolumns:\n\u001B[0;32m 142\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mVariantOnGenome/DNA/hg38 is not in the LOVD DataFrame.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", - "File \u001B[1;32m~\\PycharmProjects\\KathChatGPT\\.venv\\Lib\\site-packages\\pandas\\core\\frame.py:4102\u001B[0m, in \u001B[0;36mDataFrame.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 4100\u001B[0m 
\u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcolumns\u001B[38;5;241m.\u001B[39mnlevels \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m1\u001B[39m:\n\u001B[0;32m 4101\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_getitem_multilevel(key)\n\u001B[1;32m-> 4102\u001B[0m indexer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget_loc\u001B[49m\u001B[43m(\u001B[49m\u001B[43mkey\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 4103\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m is_integer(indexer):\n\u001B[0;32m 4104\u001B[0m indexer \u001B[38;5;241m=\u001B[39m [indexer]\n", - "File \u001B[1;32m~\\PycharmProjects\\KathChatGPT\\.venv\\Lib\\site-packages\\pandas\\core\\indexes\\base.py:3812\u001B[0m, in \u001B[0;36mIndex.get_loc\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 3807\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(casted_key, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;129;01mor\u001B[39;00m (\n\u001B[0;32m 3808\u001B[0m \u001B[38;5;28misinstance\u001B[39m(casted_key, abc\u001B[38;5;241m.\u001B[39mIterable)\n\u001B[0;32m 3809\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28many\u001B[39m(\u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mslice\u001B[39m) \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m casted_key)\n\u001B[0;32m 3810\u001B[0m ):\n\u001B[0;32m 3811\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m InvalidIndexError(key)\n\u001B[1;32m-> 3812\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key) \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01merr\u001B[39;00m\n\u001B[0;32m 3813\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m:\n\u001B[0;32m 3814\u001B[0m \u001B[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001B[39;00m\n\u001B[0;32m 3815\u001B[0m \u001B[38;5;66;03m# InvalidIndexError. 
Otherwise we fall through and re-raise\u001B[39;00m\n\u001B[0;32m 3816\u001B[0m \u001B[38;5;66;03m# the TypeError.\u001B[39;00m\n\u001B[0;32m 3817\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_check_indexing_error(key)\n", - "\u001B[1;31mKeyError\u001B[0m: 'Variants_On_Genome'" - ] - } - ], - "execution_count": 8 - }, - { - "cell_type": "code", - "id": "c7ff16903e0c52bd", - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-06T14:19:36.921706Z", - "start_time": "2024-08-06T14:19:36.921706Z" - } - }, - "source": [ - "from subprocess import Popen\n", - "\n", - "\n", - "process = Popen(\"spliceai -I ./lovd.vcf -O ./lovd_output.vcf -R ../tools/spliceai/hg38.fa -A grch38\".split())\n", - "process.wait()" - ], - "outputs": [], - "execution_count": null - }, - { - "cell_type": "code", - "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", - "metadata": {}, - "source": [ - "from api.tools import get_revel_scores\n", - "\n", - "chromosome = 6\n", - "position = 65655758\n", - "\n", - "results = get_revel_scores(chromosome, position)\n", - "\n", - "display(results)" - ], - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "from api.data import request_clinvar_api_data\n", - "\n", - "some_id = 1519786\n", - "try:\n", - " frame = request_clinvar_api_data(some_id)\n", - " display(frame)\n", - "except Exception as e:\n", - " print(e)\n" - ], - "id": "576b841842a7ab61", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "import requests\n", - "from api.data import request_clinvar_api_data\n", - "\n", - "gene_id = '1519786'\n", - "# with gene_id = '1519787' error is raised\n", - "\n", - "#TODO inside request_clinvar_api_data\n", - "# 1. dinamically expand genes to dataframe (might be one, might be more)\n", - "# 2. 
dinamically expand variation_loc to dataframe (might be one, might be more)\n", - "frames = request_clinvar_api_data(gene_id)\n", - "\n", - "display(frames)" - ], - "id": "b21c3487476b684f", - "outputs": [], - "execution_count": null - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-06T14:58:14.898227Z", - "start_time": "2024-08-06T14:58:14.228473Z" - } - }, - "cell_type": "code", - "source": [ - "import requests\n", - "\n", - "gene_id = '1519785'\n", - "\n", - "path = f\"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json\"\n", - "\n", - "request = requests.get(path)\n", - "\n", - "if request.status_code != 200:\n", - " raise ValueError(f\"Request failed with status code {request.status_code}\")\n", - "\n", - "data = request.json()\n", - "\n", - " # Extract the 'result' part of the JSON\n", - "results = data['result']\n", - "\n", - "# Extract the 'uids' part of the JSON\n", - "flattened_data = []\n", - "\n", - "for uid in results['uids']:\n", - " entry = results[uid]\n", - "\n", - " # Using pd.json_normalize to flatten the JSON data\n", - " flattened_entry = pd.json_normalize(entry, sep='_')\n", - "\n", - " flattened_variation_set = pd.json_normalize(flattened_entry['variation_set'][0], sep='_')\n", - " flattened_variation_xrefs = pd.json_normalize(flattened_variation_set['variation_xrefs'][0], sep='_')\n", - " \n", - " variation_loc_size = len(flattened_variation_set['variation_loc'][0]) \n", - " for i in range(variation_loc_size):\n", - " flattened_variation_loc = pd.json_normalize(flattened_variation_set['variation_loc'][0][i], sep='_')\n", - " flattened_variation_loc = flattened_variation_loc.add_prefix(f'{i}_')\n", - " flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_loc], axis=1)\n", - " \n", - " allele_freq_set_size = len(flattened_variation_set['allele_freq_set'][0])\n", - " for i in range(allele_freq_set_size):\n", - " flattened_allele_freq_set = pd.json_normalize(flattened_variation_set['allele_freq_set'][0][i], sep='_')\n", - " flattened_allele_freq_set = flattened_allele_freq_set.add_prefix(f'{i}_')\n", - " flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1)\n", - " \n", - " gene_size = len(flattened_entry['genes'][0])\n", - " for i in range(gene_size):\n", - " flattened_genes = pd.json_normalize(flattened_entry['genes'][0][i], sep='_')\n", - " flattened_genes = flattened_genes.add_prefix(f'{i}_')\n", - " flattened_entry = pd.concat([flattened_entry, flattened_genes], axis=1)\n", - "\n", - " gremline_classification_trait_set_size = len(flattened_entry['germline_classification_trait_set'][0])\n", - " for i in range(gremline_classification_trait_set_size):\n", - " flattened_germline_classification_trait_set = pd.json_normalize(flattened_entry['germline_classification_trait_set'][0][i], sep='_')\n", - " flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.add_prefix(f'{i}_')\n", - " \n", - " trait_xrefs_size = len(flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0])\n", - " for j in range(trait_xrefs_size):\n", - " flattened_trait_xrefs = pd.json_normalize(flattened_germline_classification_trait_set[f'{i}_trait_xrefs'][0][j], sep='_')\n", - " flattened_trait_xrefs = flattened_trait_xrefs.add_prefix(f'{j}_')\n", - "\n", - " flattened_germline_classification_trait_set = pd.concat([flattened_germline_classification_trait_set, flattened_trait_xrefs], axis=1)\n", - " \n", - " 
flattened_germline_classification_trait_set = flattened_germline_classification_trait_set.drop(columns=[f'{i}_trait_xrefs'], axis=1)\n", - " flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1)\n", - " \n", - " # dropping extracted nests\n", - " flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set'],axis=1)\n", - " flattened_variation_set = flattened_variation_set.drop(columns=['variation_xrefs', 'variation_loc', 'allele_freq_set'], axis=1)\n", - "\n", - " flattened_variation_set = pd.concat([flattened_variation_set, flattened_variation_xrefs], axis=1)\n", - " flattened_variation_set = pd.concat([flattened_variation_set, flattened_allele_freq_set], axis=1)\n", - "\n", - " flattened_entry = pd.concat([flattened_entry, flattened_variation_set], axis=1)\n", - " flattened_entry = pd.concat([flattened_entry, flattened_germline_classification_trait_set], axis=1)\n", - "\n", - " # Append the flattened entry to the list\n", - " flattened_data.append(flattened_entry)\n", - "\n", - " # Concatenate all flattened entries into a single DataFrame\n", - "df = pd.concat(flattened_data, ignore_index=True)\n", - "\n", - "display(df)\n" - ], - "id": "7e9ca83a40035c14", - "outputs": [ - { - "data": { - "text/plain": [ - " uid obj_type ... 3_db_source 3_db_id\n", - "0 1519785 single nucleotide variant ... OMIM 614702\n", - "\n", - "[1 rows x 110 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -3269,61 +1332,61 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
uidobj_typeaccessionaccession_versiontitlerecord_statusgene_sortchr_sortlocation_sortvariation_set_namevariation_set_idmolecular_consequence_listprotein_changefda_recognized_databasesupporting_submissions_scvsupporting_submissions_rcvgermline_classification_descriptiongermline_classification_last_evaluatedgermline_classification_review_statusgermline_classification_fda_recognized_databaseclinical_impact_classification_descriptionclinical_impact_classification_last_evaluatedclinical_impact_classification_review_statusclinical_impact_classification_fda_recognized_databaseclinical_impact_classification_trait_setoncogenicity_classification_descriptiononcogenicity_classification_last_evaluatedoncogenicity_classification_review_statusoncogenicity_classification_fda_recognized_databaseoncogenicity_classification_trait_set0_symbol0_geneid0_strand0_source0_trait_name0_db_source0_db_id1_db_source1_db_id2_db_source...1_stop1_inner_start1_inner_stop1_outer_start1_outer_stop1_display_start1_display_stop1_assembly_acc_ver1_annotation_release1_alt1_ref0_source0_value0_minor_allele1_source1_value1_minor_allele2_source2_value2_minor_allele3_source3_value3_minor_allele4_source4_value4_minor_alleledb_sourcedb_id4_source4_value4_minor_allele0_trait_name0_db_source0_db_id1_db_source1_db_id2_db_source2_db_id3_db_source3_db_id
01519785single nucleotide variantVCV001519785VCV001519785.NM_012123.4(MTO1):c.1465+4A>TMTO1EYS060000000000007348224800000000000064902416[intron variant][frameshift variant]D904fs[SCV002308196][RCV002024803]Uncertain significance2022/07/06 00:00criteria provided, single submitter[SCV000020713][RCV000000564]Pathogenic2008/11/01 00:00no assertion criteria provided1/01/01 00:00[]MTO125821+submittedMitochondrial hypertrophic cardiomyopathy with...Orphanet314637MedGenC4749921MONDO...741919717419197174191971GCF_000001405.25Exome Aggregation Consortium (ExAC)0.00002The Genome Aggregation Database (gnomAD)0.00003The Genome Aggregation Database (gnomAD), exomes0.00004Trans-Omics for Precision Medicine (TOPMed)0.000051000 Genomes Project0.00020TdbSNP5550946771000 Genomes Project0.00020TMitochondrial hypertrophic cardiomyopathy with...Orphanet314637MedGenC4749921MONDOMONDO:0013865OMIM61470215573NM_001142800.2(EYS):c.2710_2726del (p.Asp904fs)c.2710_2726del[]DeletionNC_000006.12:64902415:ACCATATCTTCACAGTCACCATA:...currentGRCh3866q12...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

1 rows × 110 columns

\n", + "

4783 rows × 4428 columns

\n", "
" ] }, @@ -3331,15 +1394,7 @@ "output_type": "display_data" } ], - "execution_count": 35 - }, - { - "metadata": {}, - "cell_type": "code", - "source": "", - "id": "7df7d0cb3b874157", - "outputs": [], - "execution_count": null + "execution_count": 37 } ], "metadata": { From 35f43bf27f227ecf2fc4529d4d2ccc9f34abe409 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Mon, 19 Aug 2024 11:25:54 +0300 Subject: [PATCH 04/10] Reformatted the parse --- api/data/__init__.py | 1 + api/data/refactoring.py | 81 ++- tests/pipeline.ipynb | 1295 +++++---------------------------------- 3 files changed, 178 insertions(+), 1199 deletions(-) diff --git a/api/data/__init__.py b/api/data/__init__.py index 34caf42..5ccab15 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -58,4 +58,5 @@ save_lovd_as_vcf, request_clinvar_api_data, get_variant_ids_from_clinvar_name_api, + extract_nested_json, ) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 79cbe78..09eaca1 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -3,7 +3,6 @@ import os import logging -import pandas import requests import pandas as pd @@ -160,7 +159,7 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def get_variant_ids_from_clinvar_name_api(name: str, count: int = 100): +def get_variant_ids_from_clinvar_name_api(name, count=100): """ Extracts variant ids from ClinVar `name` variable. /n key of dictionary is the size of the list of ids. @@ -172,9 +171,8 @@ def get_variant_ids_from_clinvar_name_api(name: str, count: int = 100): """ result = {} - - separator = "," - clinvar_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=clinvar&term={name}&retmode=json&retmax={count}" + base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=clinvar&term=" + clinvar_url = f"{base_url}{name}&retmode=json&retmax={count}" request = requests.get(clinvar_url) @@ -183,33 +181,46 @@ def get_variant_ids_from_clinvar_name_api(name: str, count: int = 100): data = request.json() - ids = data['esearchresult']['idlist'] - - result['idlist'] = ids + result['idlist'] = data['esearchresult']['idlist'] result['count'] = data['esearchresult']['count'] return result -def request_clinvar_api_data(gene_id: str): +def extract_nested_json(flat_parsed, parsed_from, required_column, prefix, join_prefix): + """ + Extracts nested JSON data from dictionary. + + :param DataFrame parsed_from: normalised JSON data + :param str required_column: column to extract + :param str prefix: prefix for extracted columns + """ + + data_set = parsed_from.get(required_column, []) + for idx, data in enumerate(data_set): + flat_data = pd.json_normalize(data, sep='_') + flat_data = flat_data.add_prefix(f'{prefix}_{idx}_') + flat_parsed = flat_parsed.join(flat_data, rsuffix=f'_{idx}_{join_prefix}') + + +def request_clinvar_api_data(gene_id): """ Requests ClinVar API for data about variant with given id. Converts it to pandas dataframe. 
:param str gene_id: id of variant (may be multiple) - :returns: dataframe from ClinVar API - :rtype: dataframe + :returns: DataFrame from ClinVar API + :rtype: DataFrame """ - clinvar_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json" + base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id=" + clinvar_url = f"{base_url}{gene_id}&retmode=json" request = requests.get(clinvar_url) if request.status_code != 200: raise ValueError(f"Request failed with status code {request.status_code}") - data = request.json() - - results = data['result'] + results = request.json()['result'] flattened_data = [] @@ -223,53 +234,41 @@ def request_clinvar_api_data(gene_id: str): flat_var_set = pd.json_normalize(var_set, sep='_') flat_var_set = flat_var_set.add_prefix(f'variation_set_{idx}_') - variation_loc = var_set.get('variation_loc', []) - for loc_idx, loc in enumerate(variation_loc): - flat_loc = pd.json_normalize(loc, sep='_') - flat_loc = flat_loc.add_prefix(f'variation_set_{idx}_loc_{loc_idx}_') - flat_var_set = flat_var_set.join(flat_loc, rsuffix=f'_{idx}_{loc_idx}_vl') - - var_xrefs = var_set.get('variation_xrefs', []) - for var_xrefs_idx, var_xref in enumerate(var_xrefs): - flat_var_xrefs = pd.json_normalize(var_xref, sep='_') - flat_var_xrefs = flat_var_xrefs.add_prefix(f'variation_set_{idx}_var_xrefs_{var_xrefs_idx}_') - flat_var_set = flat_var_set.join(flat_var_xrefs, rsuffix=f'_{idx}_{var_xrefs_idx}_vx') - - allele_freq = var_set.get('allele_freq_set', []) - for allele_freq_idx, allele in enumerate(allele_freq): - flat_allele = pd.json_normalize(allele, sep='_') - flat_allele = flat_allele.add_prefix(f'variation_set_{idx}_allele_freq_{allele_freq_idx}_') - flat_var_set = flat_var_set.join(flat_allele, rsuffix=f'_{idx}_{allele_freq_idx}_af') + extract_nested_json(flat_var_set, var_set, 'variation_loc', f'variation_set_{idx}_loc', 'loc') + extract_nested_json(flat_var_set, var_set, 'variation_xrefs', f'variation_set_{idx}_xrefs', 'xrefs') + extract_nested_json(flat_var_set, var_set, 'allele_freq_set', f'variation_set_{idx}_allele_freq', 'allele_freq') flat_var_set = flat_var_set.drop( - columns=[f'variation_set_{idx}_variation_loc', f'variation_set_{idx}_variation_xrefs', + columns=[f'variation_set_{idx}_variation_loc', + f'variation_set_{idx}_variation_xrefs', f'variation_set_{idx}_allele_freq_set']) flattened_entry = flattened_entry.join(flat_var_set, rsuffix=f'_{idx}_vs') + # this extraction is different from the previous ones + genes = flattened_entry.at[0, 'genes'] for idx, gene in enumerate(genes): flat_genes = pd.json_normalize(gene, sep='_') flat_genes = flat_genes.add_prefix(f'gene_{idx}_') flattened_entry = flattened_entry.join(flat_genes, rsuffix=f'_{idx}_g') - germline_classification_trait_set = flattened_entry.at[0, 'germline_classification_trait_set'] + germline_classification_trait_set = flattened_entry.at[0, + 'germline_classification_trait_set'] for idx, germline_set in enumerate(germline_classification_trait_set): flat_germline_set = pd.json_normalize(germline_set, sep='_') flat_germline_set = flat_germline_set.add_prefix(f'germline_set_{idx}_') - trait_xrefs = flat_germline_set.at[0, f'germline_set_{idx}_trait_xrefs'] - for jdx, trait_xref in enumerate(trait_xrefs): - flat_trait_xrefs = pd.json_normalize(trait_xref, sep='_') - flat_trait_xrefs = flat_trait_xrefs.add_prefix(f'trait_xref_{jdx}_') - flat_germline_set = flat_germline_set.join(flat_trait_xrefs, rsuffix=f'_{idx}_{jdx}_tx') + 
extract_nested_json(flat_germline_set, germline_set, 'trait_xrefs', f'germline_set_{idx}_trait_xrefs', 'trait_xrefs') flat_germline_set = flat_germline_set.drop(columns=[f'germline_set_{idx}_trait_xrefs']) flattened_entry = flattened_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls') - flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set']) + flattened_entry = flattened_entry.drop(columns=['variation_set', + 'genes', + 'germline_classification_trait_set']) flattened_data.append(flattened_entry) df = pd.concat(flattened_data, ignore_index=True) - return df \ No newline at end of file + return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 0fb9cfb..91f5df4 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -9,8 +9,8 @@ "outputs_hidden": true }, "ExecuteTime": { - "end_time": "2024-08-07T12:32:22.837138Z", - "start_time": "2024-08-07T12:32:21.979038Z" + "end_time": "2024-08-19T08:25:04.230992Z", + "start_time": "2024-08-19T08:25:03.038301Z" } }, "source": [ @@ -131,231 +131,30 @@ "outputs": [], "execution_count": null }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "import requests\n", - "from api.data import request_clinvar_api_data\n", - "\n", - "gene_id = '1519785,1519786'\n", - "\n", - "frames = request_clinvar_api_data(gene_id)\n", - "\n", - "display(frames)" - ], - "id": "b21c3487476b684f", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "gene_id = '1519785'\n", - "\n", - "\n", - "clinvar_url = f\"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id={gene_id}&retmode=json\"\n", - "\n", - "request = requests.get(clinvar_url)\n", - "\n", - "if request.status_code != 200:\n", - " raise ValueError(f\"Request failed with status code {request.status_code}\")\n", - "\n", - "data = request.json()\n", - "\n", - "# Extract the 'result' part of the JSON\n", - "results = data['result']\n", - "\n", - "# Extract the 'uids' part of the JSON\n", - "flattened_data = []\n", - "\n", - "for uid in results['uids']:\n", - " entry = results[uid]\n", - "\n", - " # Using pd.json_normalize to flatten the JSON data\n", - " flattened_entry = pd.json_normalize(entry, sep='_')\n", - "\n", - " # Process variation_set\n", - " variation_set = flattened_entry.at[0, 'variation_set']\n", - " for idx, var_set in enumerate(variation_set):\n", - " flat_var_set = pd.json_normalize(var_set, sep='_')\n", - " flat_var_set = flat_var_set.add_prefix(f'variation_set_{idx}_')\n", - "\n", - " # Process variation_loc within variation_set\n", - " variation_loc = var_set.get('variation_loc', [])\n", - " for loc_idx, loc in enumerate(variation_loc):\n", - " flat_loc = pd.json_normalize(loc, sep='_')\n", - " flat_loc = flat_loc.add_prefix(f'variation_set_{idx}_loc_{loc_idx}_')\n", - " flat_var_set = flat_var_set.join(flat_loc, rsuffix=f'_{idx}_{loc_idx}_vl')\n", - " \n", - " var_xrefs = var_set.get('variation_xrefs', [])\n", - " for var_xrefs_idx, var_xref in enumerate(var_xrefs):\n", - " flat_var_xrefs = pd.json_normalize(var_xref, sep='_')\n", - " flat_var_xrefs = flat_var_xrefs.add_prefix(f'variation_set_{idx}_var_xrefs_{var_xrefs_idx}_')\n", - " flat_var_set = flat_var_set.join(flat_var_xrefs, rsuffix=f'_{idx}_{var_xrefs_idx}_vx')\n", - "\n", - "\n", - " allele_freq = var_set.get('allele_freq_set', [])\n", - " for allele_freq_idx, allele in enumerate(allele_freq):\n", - " flat_allele = pd.json_normalize(allele, sep='_')\n", - " flat_allele = 
flat_allele.add_prefix(f'variation_set_{idx}_allele_freq_{allele_freq_idx}_')\n", - " flat_var_set = flat_var_set.join(flat_allele, rsuffix=f'_{idx}_{allele_freq_idx}_af')\n", - " \n", - " # drop original nested lists columns\n", - " flat_var_set = flat_var_set.drop(columns=[f'variation_set_{idx}_variation_loc', f'variation_set_{idx}_variation_xrefs', f'variation_set_{idx}_allele_freq_set'])\n", - " \n", - " flattened_entry = flattened_entry.join(flat_var_set, rsuffix=f'_{idx}_vs')\n", - "\n", - " # Process genes\n", - " genes = flattened_entry.at[0, 'genes']\n", - " for idx, gene in enumerate(genes):\n", - " flat_genes = pd.json_normalize(gene, sep='_')\n", - " flat_genes = flat_genes.add_prefix(f'gene_{idx}_')\n", - " flattened_entry = flattened_entry.join(flat_genes, rsuffix=f'_{idx}_g')\n", - " # Process germline_classification_trait_set\n", - " germline_classification_trait_set = flattened_entry.at[0, 'germline_classification_trait_set']\n", - " for idx, germline_set in enumerate(germline_classification_trait_set):\n", - " flat_germline_set = pd.json_normalize(germline_set, sep='_')\n", - " flat_germline_set = flat_germline_set.add_prefix(f'germline_set_{idx}_')\n", - "\n", - " trait_xrefs = flat_germline_set.at[0, f'germline_set_{idx}_trait_xrefs']\n", - " for jdx, trait_xref in enumerate(trait_xrefs):\n", - " flat_trait_xrefs = pd.json_normalize(trait_xref, sep='_')\n", - " flat_trait_xrefs = flat_trait_xrefs.add_prefix(f'trait_xref_{jdx}_')\n", - " flat_germline_set = flat_germline_set.join(flat_trait_xrefs, rsuffix=f'_{idx}_{jdx}_tx')\n", - "\n", - " flat_germline_set = flat_germline_set.drop(columns=[f'germline_set_{idx}_trait_xrefs'])\n", - " flattened_entry = flattened_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls')\n", - "\n", - " # Dropping original nested lists columns\n", - " flattened_entry = flattened_entry.drop(columns=['variation_set', 'genes', 'germline_classification_trait_set'])\n", - "\n", - " # Append the flattened entry to the list\n", - " flattened_data.append(flattened_entry)\n", - "\n", - "# Concatenate all flattened entries into a single DataFrame\n", - "df = pd.concat(flattened_data, ignore_index=True)\n", - "\n", - "display(df)" - ], - "id": "3b9b8bdad8bdb55d", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "Explanation of whats happening in the code below:\n", - "\n", - "Function to get all the ids from a gene name:\n", - "```python\n", - "get_variant_ids_from_clinvar_name_api(name: str, count: int)\n", - "```\n", - "\n", - "function gets the ids from the clinvar api, the name is the gene name and the count is the maximum number of ids to get (api's limit is 500)\n", - "\n", - "function returns a dictionary with the count and the list of ids:\n", - "\n", - "```json\n", - "{\n", - " 'count': int,\n", - " 'idlist': List[str]\n", - "}\n", - "```\n", - "\n", - "if the count is greater than the api's limit, the function will split the list of ids into smaller lists of 500 and then request the data from the api in chunks of 500 ids:\n", - "\n", - "```python\n", - "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", - "```\n", - "\n", - "then the function will request the data from the api and concatenate the dataframes into a single dataframe:\n", - "\n", - "```python\n", - "frames = request_clinvar_api_data(join)\n", - "variations = pd.concat([variations, frames], ignore_index=True)\n", - "```\n", - "\n", - "The variant extraction function contains a lot of nested lists and 
dictionaries, so the function will flatten the data and then concatenate the dataframes into a single dataframe\n", - "\n", - "**NOTE**\n", - "\n", - "> joining function may have been implemented wrong due to the waiting time of the api.\n" - ], - "id": "655a935b2874c218" - }, { "metadata": { "ExecuteTime": { - "end_time": "2024-08-07T12:57:28.089588Z", - "start_time": "2024-08-07T12:55:09.972813Z" + "end_time": "2024-08-19T08:25:08.217689Z", + "start_time": "2024-08-19T08:25:06.865107Z" } }, "cell_type": "code", "source": [ - "import pandas as pd\n", - "\n", - "variations = pd.DataFrame()\n", + "variation_ids = '148002'\n", "\n", - "max = 500\n", - "name = \"EYS\"\n", - "count = 2147483647\n", - "\n", - "id_array = get_variant_ids_from_clinvar_name_api(name, count)\n", - "size = int(id_array['count'])\n", - "id_list = id_array['idlist']\n", - "\n", - "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", - "\n", - "track = 0\n", - "for lists in id_lists:\n", - " join = \",\".join(lists)\n", - " frame = request_clinvar_api_data(join)\n", - " \n", - " variations = pd.concat([variations, frame], ignore_index=True)\n", - " \n", - " print(f\"{track + 1}/{len(id_lists)}\")\n", - " track += 1\n", + "frames = request_clinvar_api_data(variation_ids)\n", "\n", - "display(variations)\n" + "display(frames)" ], - "id": "129175e3a2e568be", + "id": "b21c3487476b684f", "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1/10\n", - "2/10\n", - "3/10\n", - "4/10\n", - "5/10\n", - "6/10\n", - "7/10\n", - "8/10\n", - "9/10\n", - "10/10\n" - ] - }, { "data": { "text/plain": [ - " uid obj_type ... gene_1029_strand gene_1029_source\n", - "0 3251429 single nucleotide variant ... NaN NaN\n", - "1 3246148 Deletion ... NaN NaN\n", - "2 3246147 Deletion ... NaN NaN\n", - "3 3246146 Deletion ... NaN NaN\n", - "4 3246145 Deletion ... NaN NaN\n", - "... ... ... ... ... ...\n", - "4778 538 single nucleotide variant ... NaN NaN\n", - "4779 537 single nucleotide variant ... NaN NaN\n", - "4780 536 Deletion ... NaN NaN\n", - "4781 535 Deletion ... NaN NaN\n", - "4782 534 Deletion ... NaN NaN\n", + " uid ... germline_set_0_trait_name\n", + "0 148002 ... See cases\n", "\n", - "[4783 rows x 4428 columns]" + "[1 rows x 37 columns]" ], "text/html": [ "
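For reference, a minimal sketch of the chunked download pattern the explanation above describes, assuming the api.data helpers behave as documented there (get_variant_ids_from_clinvar_name_api returns a dict with an 'idlist', and request_clinvar_api_data accepts a comma-joined string of ids); the gene name and the 10 000 id ceiling below are illustrative values, not taken from the notebook:

```python
import pandas as pd

from api.data import get_variant_ids_from_clinvar_name_api, request_clinvar_api_data

CHUNK_SIZE = 500  # per-request id limit quoted in the explanation above

# Collect ClinVar variation ids reported for the gene name (10_000 is an arbitrary upper bound).
id_info = get_variant_ids_from_clinvar_name_api("EYS", 10_000)
id_list = id_info["idlist"]

# Request the summaries in chunks of CHUNK_SIZE ids and concatenate the flattened frames.
frames = [
    request_clinvar_api_data(",".join(id_list[start:start + CHUNK_SIZE]))
    for start in range(0, len(id_list), CHUNK_SIZE)
]
variations = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(variations.shape)
```

Keeping each chunk at 500 ids follows the API limit quoted in the explanation, at the cost of one HTTP request per chunk.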
\n", @@ -412,746 +211,30 @@ " variation_set_0_aliases\n", " variation_set_0_variant_type\n", " variation_set_0_canonical_spdi\n", - " variation_set_0_loc_0_status\n", - " variation_set_0_loc_0_assembly_name\n", - " variation_set_0_loc_0_chr\n", - " variation_set_0_loc_0_band\n", - " ...\n", - " gene_1020_symbol\n", - " gene_1020_geneid\n", - " gene_1020_strand\n", - " gene_1020_source\n", - " gene_1021_symbol\n", - " gene_1021_geneid\n", - " gene_1021_strand\n", - " gene_1021_source\n", - " gene_1022_symbol\n", - " gene_1022_geneid\n", - " gene_1022_strand\n", - " gene_1022_source\n", - " gene_1023_symbol\n", - " gene_1023_geneid\n", - " gene_1023_strand\n", - " gene_1023_source\n", - " gene_1024_symbol\n", - " gene_1024_geneid\n", - " gene_1024_strand\n", - " gene_1024_source\n", - " gene_1025_symbol\n", - " gene_1025_geneid\n", - " gene_1025_strand\n", - " gene_1025_source\n", - " gene_1026_symbol\n", - " gene_1026_geneid\n", - " gene_1026_strand\n", - " gene_1026_source\n", - " gene_1027_symbol\n", - " gene_1027_geneid\n", - " gene_1027_strand\n", - " gene_1027_source\n", - " gene_1028_symbol\n", - " gene_1028_geneid\n", - " gene_1028_strand\n", - " gene_1028_source\n", - " gene_1029_symbol\n", - " gene_1029_geneid\n", - " gene_1029_strand\n", - " gene_1029_source\n", + " germline_set_0_trait_name\n", " \n", " \n", " \n", " \n", " 0\n", - " 3251429\n", - " single nucleotide variant\n", - " VCV003251429\n", - " VCV003251429.\n", - " NM_001142800.2(EYS):c.5886T>C (p.Thr1962=)\n", + " 148002\n", + " copy number gain\n", + " VCV000148002\n", + " VCV000148002.\n", + " GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3\n", " \n", - " EYS\n", + " BAG2\n", " 06\n", - " 00000000000064436215\n", - " \n", - " \n", - " [synonymous variant]\n", - " \n", - " \n", - " [SCV005076913]\n", - " [RCV004587835]\n", - " Likely benign\n", - " 2024/04/08 00:00\n", - " criteria provided, single submitter\n", - " \n", - " \n", - " 1/01/01 00:00\n", + " 00000000000053931543\n", " \n", " \n", " []\n", " \n", - " 1/01/01 00:00\n", - " \n", " \n", - " []\n", - " 3410228\n", - " NM_001142800.2(EYS):c.5886T>C (p.Thr1962=)\n", - " c.5886T>C\n", - " []\n", - " single nucleotide variant\n", - " NC_000006.12:64436214:A:G\n", - " current\n", - " GRCh38\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 1\n", - " 3246148\n", - " Deletion\n", - " VCV003246148\n", - " VCV003246148.\n", - " NC_000006.11:g.(?_66204859)_(66217229_?)del\n", - " \n", - " EYS\n", - " 06\n", - " 99999999999999999999\n", - " \n", - " \n", - " []\n", - " \n", - " \n", - " [SCV005067530]\n", - " [RCV004578792]\n", + " [SCV000177291]\n", + " [RCV000137097]\n", " Pathogenic\n", - " 2023/01/02 00:00\n", - " criteria provided, single submitter\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 3403857\n", - " NC_000006.11:g.(?_66204859)_(66217229_?)del\n", - " NC_000006.11:g.(?_66204859)_(66217229_?)del\n", - " []\n", - " Deletion\n", - " \n", - " previous\n", - " GRCh37\n", - " 6\n", - " 
6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 2\n", - " 3246147\n", - " Deletion\n", - " VCV003246147\n", - " VCV003246147.\n", - " NC_000006.11:g.(?_64511633)_(64516181_?)del\n", - " \n", - " EYS\n", - " 06\n", - " 99999999999999999999\n", - " \n", - " \n", - " []\n", - " \n", - " \n", - " [SCV005067529]\n", - " [RCV004578791]\n", - " Likely pathogenic\n", - " 2023/03/08 00:00\n", - " criteria provided, single submitter\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 3403856\n", - " NC_000006.11:g.(?_64511633)_(64516181_?)del\n", - " NC_000006.11:g.(?_64511633)_(64516181_?)del\n", - " []\n", - " Deletion\n", - " \n", - " previous\n", - " GRCh37\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 3\n", - " 3246146\n", - " Deletion\n", - " VCV003246146\n", - " VCV003246146.\n", - " NC_000006.11:g.(?_65523280)_(65527746_?)del\n", - " \n", - " EYS\n", - " 06\n", - " 99999999999999999999\n", - " \n", - " \n", - " []\n", - " \n", - " \n", - " [SCV005067528]\n", - " [RCV004578790]\n", - " Likely pathogenic\n", - " 2023/04/30 00:00\n", - " criteria provided, single submitter\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 3403855\n", - " NC_000006.11:g.(?_65523280)_(65527746_?)del\n", - " NC_000006.11:g.(?_65523280)_(65527746_?)del\n", - " []\n", - " Deletion\n", - " \n", - " previous\n", - " GRCh37\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 4\n", - " 3246145\n", - " Deletion\n", - " VCV003246145\n", - " VCV003246145.\n", - " NC_000006.11:g.(?_65587645)_(65596716_?)del\n", - " \n", - " EYS\n", - " 06\n", - " 99999999999999999999\n", - " \n", - " \n", - " []\n", - " \n", - " \n", - " [SCV005067527]\n", - " [RCV004578789]\n", - " Likely pathogenic\n", - " 2023/06/27 00:00\n", - " criteria provided, single submitter\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 3403854\n", - " 
NC_000006.11:g.(?_65587645)_(65596716_?)del\n", - " NC_000006.11:g.(?_65587645)_(65596716_?)del\n", - " []\n", - " Deletion\n", - " \n", - " previous\n", - " GRCh37\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 4778\n", - " 538\n", - " single nucleotide variant\n", - " VCV000000538\n", - " VCV000000538.\n", - " NM_001142800.2(EYS):c.9405T>A (p.Tyr3135Ter)\n", - " \n", - " EYS\n", - " 06\n", - " 00000000000063720626\n", - " \n", - " \n", - " [3 prime UTR variant, nonsense]\n", - " Y3156*, Y3135*\n", - " \n", - " [SCV000020717, SCV000894389, SCV000709692, SCV...\n", - " [RCV000000568, RCV000593252, RCV003914789, RCV...\n", - " Pathogenic/Likely pathogenic\n", - " 2024/03/09 00:00\n", - " criteria provided, multiple submitters, no con...\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 15577\n", - " NM_001142800.2(EYS):c.9405T>A (p.Tyr3135Ter)\n", - " c.9405T>A\n", - " []\n", - " single nucleotide variant\n", - " NC_000006.12:63720625:A:T\n", - " current\n", - " GRCh38\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 4779\n", - " 537\n", - " single nucleotide variant\n", - " VCV000000537\n", - " VCV000000537.\n", - " NM_001142800.2(EYS):c.5857G>T (p.Glu1953Ter)\n", - " \n", - " EYS\n", - " 06\n", - " 00000000000064436244\n", - " \n", - " \n", - " [nonsense]\n", - " E1953*\n", - " \n", - " [SCV000020716, SCV002519636, SCV004195857, SCV...\n", - " [RCV000000567, RCV001387157]\n", - " Pathogenic\n", - " 2024/02/15 00:00\n", - " criteria provided, multiple 
submitters, no con...\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 15576\n", - " NM_001142800.2(EYS):c.5857G>T (p.Glu1953Ter)\n", - " c.5857G>T\n", - " []\n", - " single nucleotide variant\n", - " NC_000006.12:64436243:C:A\n", - " current\n", - " GRCh38\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 4780\n", - " 536\n", - " Deletion\n", - " VCV000000536\n", - " VCV000000536.\n", - " NM_001142800.1(EYS):c.1767-24596_2023+238135del\n", - " \n", - " LOC441155\n", - " 06\n", - " 00000000000065057728\n", - " \n", - " \n", - " []\n", - " \n", - " \n", - " [SCV000020715]\n", - " [RCV000000566]\n", - " Pathogenic\n", - " 2008/11/01 00:00\n", + " 2010/12/22 00:00\n", " no assertion criteria provided\n", " \n", " \n", @@ -1164,229 +247,16 @@ " \n", " \n", " []\n", - " 15575\n", - " NM_001142800.1(EYS):c.1767-24596_2023+238135del\n", - " NM_001142800.1(EYS):c.1767-24596_2023+238135del\n", - " [EX12DEL]\n", - " Deletion\n", - " \n", - " current\n", - " GRCh38\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 4781\n", - " 535\n", - " Deletion\n", - " VCV000000535\n", - " VCV000000535.\n", - " NM_001142800.1(EYS):c.2260-51191_2992+45990del\n", - " \n", - " EYS\n", - " 06\n", - " 00000000000064840707\n", - " \n", - " \n", - " []\n", - " \n", - " \n", - " [SCV000020714]\n", - " [RCV000000565]\n", - " Pathogenic\n", - " 2008/11/01 00:00\n", - " no assertion criteria provided\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", + " 157753\n", + " GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3\n", + " GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3\n", " []\n", + " copy number gain\n", " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 15574\n", - " NM_001142800.1(EYS):c.2260-51191_2992+45990del\n", - " NM_001142800.1(EYS):c.2260-51191_2992+45990del\n", - " [EX15-19DEL]\n", - " Deletion\n", - " \n", - " current\n", - " GRCh38\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " \n", - " \n", - " 4782\n", - " 534\n", - " Deletion\n", - " VCV000000534\n", - " 
VCV000000534.\n", - " NM_001142800.2(EYS):c.2710_2726del (p.Asp904fs)\n", - " \n", - " EYS\n", - " 06\n", - " 00000000000064902416\n", - " \n", - " \n", - " [frameshift variant]\n", - " D904fs\n", - " \n", - " [SCV000020713]\n", - " [RCV000000564]\n", - " Pathogenic\n", - " 2008/11/01 00:00\n", - " no assertion criteria provided\n", - " \n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " \n", - " 1/01/01 00:00\n", - " \n", - " \n", - " []\n", - " 15573\n", - " NM_001142800.2(EYS):c.2710_2726del (p.Asp904fs)\n", - " c.2710_2726del\n", - " []\n", - " Deletion\n", - " NC_000006.12:64902415:ACCATATCTTCACAGTCACCATA:...\n", - " current\n", - " GRCh38\n", - " 6\n", - " 6q12\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " See cases\n", " \n", " \n", "\n", - "

4783 rows × 4428 columns

\n", "
" ] }, @@ -1394,7 +264,116 @@ "output_type": "display_data" } ], - "execution_count": 37 + "execution_count": 2 + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "clinvar_data = pd.read_csv(\"C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_results.txt\", sep='\\t')\n", + "\n", + "display(clinvar_data)" + ], + "id": "8cb4bbe3f35562d5", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "Explanation of whats happening in the code below:\n", + "\n", + "Function to get all the ids from a gene name:\n", + "```python\n", + "get_variant_ids_from_clinvar_name_api(name: str, count: int)\n", + "```\n", + "\n", + "function gets the ids from the clinvar api, the name is the gene name and the count is the maximum number of ids to get (api's limit is 500)\n", + "\n", + "function returns a dictionary with the count and the list of ids:\n", + "\n", + "```json\n", + "{\n", + " 'count': int,\n", + " 'idlist': List[str]\n", + "}\n", + "```\n", + "\n", + "if the count is greater than the api's limit, the function will split the list of ids into smaller lists of 500 and then request the data from the api in chunks of 500 ids:\n", + "\n", + "```python\n", + "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", + "```\n", + "\n", + "then the function will request the data from the api and concatenate the dataframes into a single dataframe:\n", + "\n", + "```python\n", + "frames = request_clinvar_api_data(join)\n", + "variations = pd.concat([variations, frames], ignore_index=True)\n", + "```\n", + "\n", + "The variant extraction function contains a lot of nested lists and dictionaries, so the function will flatten the data and then concatenate the dataframes into a single dataframe\n", + "\n", + "**NOTE**\n", + "\n", + "> joining function may have been implemented wrong due to the waiting time of the api.\n" + ], + "id": "655a935b2874c218" + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "variations = pd.DataFrame()\n", + "\n", + "max = 500\n", + "name = \"EYS\"\n", + "count = 2147483647\n", + "\n", + "id_array = get_variant_ids_from_clinvar_name_api(name, count)\n", + "size = int(id_array['count'])\n", + "id_list = id_array['idlist']\n", + "\n", + "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", + "\n", + "track = 0\n", + "for lists in id_lists:\n", + " join = \",\".join(lists)\n", + " frame = request_clinvar_api_data(join)\n", + " \n", + " variations = pd.concat([variations, frame], ignore_index=True)\n", + " \n", + " print(f\"{track + 1}/{len(id_lists)}\")\n", + " track += 1\n", + "\n", + "display(variations)\n" + ], + "id": "129175e3a2e568be", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "clinvar_data = pd.read_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_result.txt', sep='\\t')\n", + "\n", + "display(clinvar_data)" + ], + "id": "c85507a3e2c584da", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "", + "id": "8e0e2f2853152d96", + "outputs": [], + "execution_count": null } ], "metadata": { From 5243f8e43cfabe4b532b74c29b455cc3ccd616c5 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Thu, 22 Aug 2024 17:16:16 +0300 Subject: [PATCH 05/10] Created a function which collects EYS data from gnomAD api. 
--- api/data/__init__.py | 1 + api/data/refactoring.py | 127 ++++++++++++++++ tests/pipeline.ipynb | 324 ++++++++++++++++++++++++---------------- 3 files changed, 322 insertions(+), 130 deletions(-) diff --git a/api/data/__init__.py b/api/data/__init__.py index 5ccab15..9598171 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -59,4 +59,5 @@ request_clinvar_api_data, get_variant_ids_from_clinvar_name_api, extract_nested_json, + request_gnomad_api_data, ) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 09eaca1..de2ff91 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -272,3 +272,130 @@ def request_clinvar_api_data(gene_id): df = pd.concat(flattened_data, ignore_index=True) return df + + +def request_gnomad_api_data(to_file=True): + """ + Requests gnomAD API for data about EYS gene containing: + - variant_id + - cDNA change + - protein change + - allele frequency + - homozygote count + - popmax + - popmax population + + :param bool to_file: if True, saves data to variants.csv + :returns: DataFrame from gnomAD API + :rtype: DataFrame + """ + + url = 'https://gnomad.broadinstitute.org/api' + query = """ + query{ + gene(gene_id: "ENSG00000188107", reference_genome: GRCh38) { + variants(dataset: gnomad_r4) + { + variant_id + chrom + pos + ref + hgvsc + hgvsp + alt + exome { + ac + an + ac_hom + populations + { + id + ac + an + } + } + genome + { + ac + an + ac_hom + populations + { + id + ac + an + } + } + } + } + } + """ + response = requests.post(url, json={'query': query}) + if response.status_code == 200: + data = response.json()['data']['gene']['variants'] + + df = pd.json_normalize(data) + + df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0) + df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0) + + df['cDNA change'] = df['hgvsc'].fillna(0) + df['Protein change'] = df['hgvsp'].fillna(0) + + df['Allele Frequency'] = df['total_ac'] / df['total_an'] + df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0) + exome_populations = df['exome.populations'] + genome_populations = df['genome.populations'] + ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] + + def process_population_data(pop_data, name, pop_ids, index): + for pop_id in pop_ids: + df.loc[index, f'{name}_ac_{pop_id}'] = 0 + df.loc[index, f'{name}_an_{pop_id}'] = 0 + if type(pop_data) == list: + for pop in pop_data: + id = pop['id'] + df.loc[index, f'{name}_ac_{id}'] = pop['ac'] + df.loc[index, f'{name}_an_{id}'] = pop['an'] + + for i in range(len(exome_populations)): + exome_pop = exome_populations[i] + process_population_data(exome_pop, 'exome', ids, i) + genome_pop = genome_populations[i] + process_population_data(genome_pop, 'genome', ids, i) + + for id in ids: + df[f'Allele_Frequency_{id}'] = (df[f'exome_ac_{id}'].fillna(0) + df[f'genome_ac_{id}'].fillna(0)) / ( + df[f'exome_an_{id}'].fillna(0) + df[f'genome_an_{id}'].fillna(0)) + population_mapping = { + 'afr': 'African/African American', + 'eas': 'East Asian', + 'asj': 'Ashkenazi Jew', + 'sas': 'South Asian', + 'nfe': 'European (non-Finnish)', + 'fin': 'European (Finnish)', + 'mid': 'Middle Eastern', + 'amr': 'Admixed American', + 'ami': "Amish", + 'remaining': 'Remaining', + '': '' + } + for i in range(len(df)): + max = 0 + maxid = '' + for id in ids: + if df.loc[i, f'Allele_Frequency_{id}'] > max: + max = df.loc[i, f'Allele_Frequency_{id}'] + maxid = id + df.loc[i, 'Popmax'] = max + df.loc[i, 'Popmax population'] = 
population_mapping[maxid] + not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id', + 'cDNA change', 'Protein change'] + df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1) + if to_file: + df.to_csv('variants.csv', index=True) + + else: + print('Error:', response.status_code) + + return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 91f5df4..6c6ef65 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -9,8 +9,8 @@ "outputs_hidden": true }, "ExecuteTime": { - "end_time": "2024-08-19T08:25:04.230992Z", - "start_time": "2024-08-19T08:25:03.038301Z" + "end_time": "2024-08-21T19:51:25.871973Z", + "start_time": "2024-08-21T19:51:25.105850Z" } }, "source": [ @@ -23,6 +23,7 @@ " set_lovd_dtypes,\n", " request_clinvar_api_data,\n", " get_variant_ids_from_clinvar_name_api,\n", + " request_gnomad_api_data,\n", " )\n", "from api.data import save_lovd_as_vcf\n", "\n", @@ -132,12 +133,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-19T08:25:08.217689Z", - "start_time": "2024-08-19T08:25:06.865107Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "variation_ids = '148002'\n", @@ -147,124 +143,8 @@ "display(frames)" ], "id": "b21c3487476b684f", - "outputs": [ - { - "data": { - "text/plain": [ - " uid ... germline_set_0_trait_name\n", - "0 148002 ... See cases\n", - "\n", - "[1 rows x 37 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
uidobj_typeaccessionaccession_versiontitlerecord_statusgene_sortchr_sortlocation_sortvariation_set_namevariation_set_idmolecular_consequence_listprotein_changefda_recognized_databasesupporting_submissions_scvsupporting_submissions_rcvgermline_classification_descriptiongermline_classification_last_evaluatedgermline_classification_review_statusgermline_classification_fda_recognized_databaseclinical_impact_classification_descriptionclinical_impact_classification_last_evaluatedclinical_impact_classification_review_statusclinical_impact_classification_fda_recognized_databaseclinical_impact_classification_trait_setoncogenicity_classification_descriptiononcogenicity_classification_last_evaluatedoncogenicity_classification_review_statusoncogenicity_classification_fda_recognized_databaseoncogenicity_classification_trait_setvariation_set_0_measure_idvariation_set_0_variation_namevariation_set_0_cdna_changevariation_set_0_aliasesvariation_set_0_variant_typevariation_set_0_canonical_spdigermline_set_0_trait_name
0148002copy number gainVCV000148002VCV000148002.GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3BAG20600000000000053931543[][SCV000177291][RCV000137097]Pathogenic2010/12/22 00:00no assertion criteria provided1/01/01 00:00[]1/01/01 00:00[]157753GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3GRCh38/hg38 6p12.1-q12(chr6:53931543-68149750)x3[]copy number gainSee cases
\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "execution_count": 2 + "outputs": [], + "execution_count": null }, { "metadata": {}, @@ -319,7 +199,7 @@ "\n", "> joining function may have been implemented wrong due to the waiting time of the api.\n" ], - "id": "655a935b2874c218" + "id": "976f9632a8ef29e3" }, { "metadata": {}, @@ -367,13 +247,197 @@ "outputs": [], "execution_count": null }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-21T19:54:33.516081Z", + "start_time": "2024-08-21T19:52:03.354634Z" + } + }, + "cell_type": "code", + "source": [ + "gnomad_from_api = request_gnomad_api_data(False)\n", + "\n", + "display(gnomad_from_api)" + ], + "id": "64482c033c794fb4", + "outputs": [ + { + "data": { + "text/plain": [ + " variant_id cDNA change ... Popmax Popmax population\n", + "0 6-63720525-A-G c.*71T>C ... 0.000016 African/African American\n", + "1 6-63720525-A-T c.*71T>A ... 0.000192 East Asian\n", + "2 6-63720525-A-C c.*71T>G ... 0.000000 \n", + "3 6-63720526-T-A c.*70A>T ... 0.000020 South Asian\n", + "4 6-63720527-G-T c.*69C>A ... 0.000000 \n", + "... ... ... ... ... ...\n", + "14295 6-65495479-G-T c.-69C>A ... 0.000000 \n", + "14296 6-65495479-G-A c.-69C>T ... 0.000031 African/African American\n", + "14297 6-65495482-A-G c.-72T>C ... 0.000070 Admixed American\n", + "14298 6-65495484-T-G c.-74A>C ... 0.000060 South Asian\n", + "14299 6-65495485-T-C c.-75A>G ... 0.000012 South Asian\n", + "\n", + "[14300 rows x 7 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
[HTML render of the same 14300 × 7 gnomAD frame as the text/plain output above; columns: variant_id, cDNA change, Protein change, Allele Frequency, Homozygote Count, Popmax, Popmax population]
\n", + "

14300 rows × 7 columns

\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 2 + }, { "metadata": {}, "cell_type": "code", - "source": "", - "id": "8e0e2f2853152d96", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "", + "id": "6f0abfb50bd211a0" } ], "metadata": { From 5c30ce16e8e859bb53c98e18eff4db0e314c250a Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Thu, 22 Aug 2024 20:24:16 +0300 Subject: [PATCH 06/10] Added ability to get any gene from API --- api/data/refactoring.py | 36 +++++++++++++++++++----------------- tests/pipeline.ipynb | 10 +++++----- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index de2ff91..8057842 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -274,9 +274,9 @@ def request_clinvar_api_data(gene_id): return df -def request_gnomad_api_data(to_file=True): +def request_gnomad_api_data(gene_name, to_file=True): """ - Requests gnomAD API for data about EYS gene containing: + Requests gnomAD API for data about a specific gene containing: - variant_id - cDNA change - protein change @@ -285,17 +285,18 @@ def request_gnomad_api_data(to_file=True): - popmax - popmax population + :param str gene_name: name of gene :param bool to_file: if True, saves data to variants.csv :returns: DataFrame from gnomAD API :rtype: DataFrame """ url = 'https://gnomad.broadinstitute.org/api' - query = """ - query{ - gene(gene_id: "ENSG00000188107", reference_genome: GRCh38) { + query = f""" + query{{ + gene(gene_symbol: "{gene_name}", reference_genome: GRCh38) {{ variants(dataset: gnomad_r4) - { + {{ variant_id chrom pos @@ -303,33 +304,34 @@ def request_gnomad_api_data(to_file=True): hgvsc hgvsp alt - exome { + exome {{ ac an ac_hom populations - { + {{ id ac an - } - } + }} + }} genome - { + {{ ac an ac_hom populations - { + {{ id ac an - } - } - } - } - } + }} + }} + }} + }} + }} """ + response = requests.post(url, json={'query': query}) if response.status_code == 200: data = response.json()['data']['gene']['variants'] diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 6c6ef65..a838cd4 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -9,8 +9,8 @@ "outputs_hidden": true }, "ExecuteTime": { - "end_time": "2024-08-21T19:51:25.871973Z", - "start_time": "2024-08-21T19:51:25.105850Z" + "end_time": "2024-08-22T17:20:23.240355Z", + "start_time": "2024-08-22T17:20:21.651097Z" } }, "source": [ @@ -250,13 +250,13 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-21T19:54:33.516081Z", - "start_time": "2024-08-21T19:52:03.354634Z" + "end_time": "2024-08-22T17:23:41.828469Z", + "start_time": "2024-08-22T17:21:09.627424Z" } }, "cell_type": "code", "source": [ - "gnomad_from_api = request_gnomad_api_data(False)\n", + "gnomad_from_api = request_gnomad_api_data(\"EYS\", False)\n", "\n", "display(gnomad_from_api)" ], From 197864c56ecd4d396ddac539c55680bcce5c17c8 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Thu, 22 Aug 2024 20:28:25 +0300 Subject: [PATCH 07/10] Removed from last branch --- api/__init__.py | 2 - api/data/__init__.py | 3 -- api/data/refactoring.py | 115 ---------------------------------------- tests/pipeline.ipynb | 115 ---------------------------------------- 4 files changed, 235 deletions(-) diff --git a/api/__init__.py b/api/__init__.py index fb618dd..459952b 100644 --- a/api/__init__.py +++ b/api/__init__.py @@ -56,6 +56,4 @@ parse_lovd, 
from_clinvar_name_to_cdna_position, save_lovd_as_vcf, - request_clinvar_api_data, - get_variant_ids_from_clinvar_name_api, ) diff --git a/api/data/__init__.py b/api/data/__init__.py index 9598171..7cd3997 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -56,8 +56,5 @@ parse_lovd, from_clinvar_name_to_cdna_position, save_lovd_as_vcf, - request_clinvar_api_data, - get_variant_ids_from_clinvar_name_api, - extract_nested_json, request_gnomad_api_data, ) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 8057842..1ac916b 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -159,121 +159,6 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def get_variant_ids_from_clinvar_name_api(name, count=100): - """ - Extracts variant ids from ClinVar `name` variable. /n - key of dictionary is the size of the list of ids. - - :param str name: name of variant - :param int count: number of ids to extract - :returns: ids of variants - :rtype: str - """ - - result = {} - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=clinvar&term=" - clinvar_url = f"{base_url}{name}&retmode=json&retmax={count}" - - request = requests.get(clinvar_url) - - if request.status_code != 200: - raise ValueError(f"Request failed with status code {request.status_code}") - - data = request.json() - - result['idlist'] = data['esearchresult']['idlist'] - result['count'] = data['esearchresult']['count'] - - return result - - -def extract_nested_json(flat_parsed, parsed_from, required_column, prefix, join_prefix): - """ - Extracts nested JSON data from dictionary. - - :param DataFrame parsed_from: normalised JSON data - :param str required_column: column to extract - :param str prefix: prefix for extracted columns - """ - - data_set = parsed_from.get(required_column, []) - for idx, data in enumerate(data_set): - flat_data = pd.json_normalize(data, sep='_') - flat_data = flat_data.add_prefix(f'{prefix}_{idx}_') - flat_parsed = flat_parsed.join(flat_data, rsuffix=f'_{idx}_{join_prefix}') - - -def request_clinvar_api_data(gene_id): - """ - Requests ClinVar API for data about variant with given id. - Converts it to pandas dataframe. 
- - :param str gene_id: id of variant (may be multiple) - :returns: DataFrame from ClinVar API - :rtype: DataFrame - """ - base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=clinvar&id=" - clinvar_url = f"{base_url}{gene_id}&retmode=json" - - request = requests.get(clinvar_url) - - if request.status_code != 200: - raise ValueError(f"Request failed with status code {request.status_code}") - - results = request.json()['result'] - - flattened_data = [] - - for uid in results['uids']: - entry = results[uid] - - flattened_entry = pd.json_normalize(entry, sep='_') - - variation_set = flattened_entry.at[0, 'variation_set'] - for idx, var_set in enumerate(variation_set): - flat_var_set = pd.json_normalize(var_set, sep='_') - flat_var_set = flat_var_set.add_prefix(f'variation_set_{idx}_') - - extract_nested_json(flat_var_set, var_set, 'variation_loc', f'variation_set_{idx}_loc', 'loc') - extract_nested_json(flat_var_set, var_set, 'variation_xrefs', f'variation_set_{idx}_xrefs', 'xrefs') - extract_nested_json(flat_var_set, var_set, 'allele_freq_set', f'variation_set_{idx}_allele_freq', 'allele_freq') - - flat_var_set = flat_var_set.drop( - columns=[f'variation_set_{idx}_variation_loc', - f'variation_set_{idx}_variation_xrefs', - f'variation_set_{idx}_allele_freq_set']) - flattened_entry = flattened_entry.join(flat_var_set, rsuffix=f'_{idx}_vs') - - # this extraction is different from the previous ones - - genes = flattened_entry.at[0, 'genes'] - for idx, gene in enumerate(genes): - flat_genes = pd.json_normalize(gene, sep='_') - flat_genes = flat_genes.add_prefix(f'gene_{idx}_') - flattened_entry = flattened_entry.join(flat_genes, rsuffix=f'_{idx}_g') - - germline_classification_trait_set = flattened_entry.at[0, - 'germline_classification_trait_set'] - for idx, germline_set in enumerate(germline_classification_trait_set): - flat_germline_set = pd.json_normalize(germline_set, sep='_') - flat_germline_set = flat_germline_set.add_prefix(f'germline_set_{idx}_') - - extract_nested_json(flat_germline_set, germline_set, 'trait_xrefs', f'germline_set_{idx}_trait_xrefs', 'trait_xrefs') - - flat_germline_set = flat_germline_set.drop(columns=[f'germline_set_{idx}_trait_xrefs']) - flattened_entry = flattened_entry.join(flat_germline_set, rsuffix=f'_{idx}_gls') - - flattened_entry = flattened_entry.drop(columns=['variation_set', - 'genes', - 'germline_classification_trait_set']) - - flattened_data.append(flattened_entry) - - df = pd.concat(flattened_data, ignore_index=True) - - return df - - def request_gnomad_api_data(gene_name, to_file=True): """ Requests gnomAD API for data about a specific gene containing: diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index a838cd4..044e76d 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -132,121 +132,6 @@ "outputs": [], "execution_count": null }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "variation_ids = '148002'\n", - "\n", - "frames = request_clinvar_api_data(variation_ids)\n", - "\n", - "display(frames)" - ], - "id": "b21c3487476b684f", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "clinvar_data = pd.read_csv(\"C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_results.txt\", sep='\\t')\n", - "\n", - "display(clinvar_data)" - ], - "id": "8cb4bbe3f35562d5", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "markdown", - "source": [ - "Explanation of whats happening in the code below:\n", - "\n", - "Function to get 
all the ids from a gene name:\n", - "```python\n", - "get_variant_ids_from_clinvar_name_api(name: str, count: int)\n", - "```\n", - "\n", - "function gets the ids from the clinvar api, the name is the gene name and the count is the maximum number of ids to get (api's limit is 500)\n", - "\n", - "function returns a dictionary with the count and the list of ids:\n", - "\n", - "```json\n", - "{\n", - " 'count': int,\n", - " 'idlist': List[str]\n", - "}\n", - "```\n", - "\n", - "if the count is greater than the api's limit, the function will split the list of ids into smaller lists of 500 and then request the data from the api in chunks of 500 ids:\n", - "\n", - "```python\n", - "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", - "```\n", - "\n", - "then the function will request the data from the api and concatenate the dataframes into a single dataframe:\n", - "\n", - "```python\n", - "frames = request_clinvar_api_data(join)\n", - "variations = pd.concat([variations, frames], ignore_index=True)\n", - "```\n", - "\n", - "The variant extraction function contains a lot of nested lists and dictionaries, so the function will flatten the data and then concatenate the dataframes into a single dataframe\n", - "\n", - "**NOTE**\n", - "\n", - "> joining function may have been implemented wrong due to the waiting time of the api.\n" - ], - "id": "976f9632a8ef29e3" - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "\n", - "variations = pd.DataFrame()\n", - "\n", - "max = 500\n", - "name = \"EYS\"\n", - "count = 2147483647\n", - "\n", - "id_array = get_variant_ids_from_clinvar_name_api(name, count)\n", - "size = int(id_array['count'])\n", - "id_list = id_array['idlist']\n", - "\n", - "id_lists = [id_list[i:i + max] for i in range(0, size, max)]\n", - "\n", - "track = 0\n", - "for lists in id_lists:\n", - " join = \",\".join(lists)\n", - " frame = request_clinvar_api_data(join)\n", - " \n", - " variations = pd.concat([variations, frame], ignore_index=True)\n", - " \n", - " print(f\"{track + 1}/{len(id_lists)}\")\n", - " track += 1\n", - "\n", - "display(variations)\n" - ], - "id": "129175e3a2e568be", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "clinvar_data = pd.read_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\clinvar_result.txt', sep='\\t')\n", - "\n", - "display(clinvar_data)" - ], - "id": "c85507a3e2c584da", - "outputs": [], - "execution_count": null - }, { "metadata": { "ExecuteTime": { From 4d3575a2e77834118bf4d4a51c2ccf6ae95f83c5 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Wed, 28 Aug 2024 21:33:19 +0300 Subject: [PATCH 08/10] Downloaded data from API, formatted code --- api/data/__init__.py | 7 +- api/data/refactoring.py | 110 ++++++------ tests/pipeline.ipynb | 359 ++++++++++++++++------------------------ 3 files changed, 203 insertions(+), 273 deletions(-) diff --git a/api/data/__init__.py b/api/data/__init__.py index 7cd3997..bd40c79 100644 --- a/api/data/__init__.py +++ b/api/data/__init__.py @@ -28,7 +28,9 @@ LOVD_TABLES_DATA_TYPES, # Paths for database downloads - DATABASES_DOWNLOAD_PATHS + DATABASES_DOWNLOAD_PATHS, + + GNOMAD_PATH, ) # DATA COLLECTION IMPORT @@ -57,4 +59,7 @@ from_clinvar_name_to_cdna_position, save_lovd_as_vcf, request_gnomad_api_data, + merge_gnomad_lovd, + parse_gnomad, + set_gnomad_dtypes, ) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 8aa880d..f2fd6cd 100644 --- 
a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -247,7 +247,18 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def request_gnomad_api_data(gene_name, to_file=True): +def process_population_data(df, pop_data, name, pop_ids, index): + for pop_id in pop_ids: + df.loc[index, f'{name}_ac_{pop_id}'] = 0 + df.loc[index, f'{name}_an_{pop_id}'] = 0 + if isinstance(pop_data, list): + for pop in pop_data: + variant_id = pop['id'] + df.loc[index, f'{name}_ac_{variant_id}'] = pop['ac'] + df.loc[index, f'{name}_an_{variant_id}'] = pop['an'] + + +def request_gnomad_api_data(gene_name): """ Requests gnomAD API for data about a specific gene containing: - variant_id @@ -305,44 +316,38 @@ def request_gnomad_api_data(gene_name, to_file=True): }} """ - response = requests.post(url, json={'query': query}) - if response.status_code == 200: - data = response.json()['data']['gene']['variants'] - - df = pd.json_normalize(data) - - df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0) - df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0) - - df['cDNA change'] = df['hgvsc'].fillna(0) - df['Protein change'] = df['hgvsp'].fillna(0) - - df['Allele Frequency'] = df['total_ac'] / df['total_an'] - df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0) - exome_populations = df['exome.populations'] - genome_populations = df['genome.populations'] - ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] - - def process_population_data(pop_data, name, pop_ids, index): - for pop_id in pop_ids: - df.loc[index, f'{name}_ac_{pop_id}'] = 0 - df.loc[index, f'{name}_an_{pop_id}'] = 0 - if type(pop_data) == list: - for pop in pop_data: - id = pop['id'] - df.loc[index, f'{name}_ac_{id}'] = pop['ac'] - df.loc[index, f'{name}_an_{id}'] = pop['an'] - - for i in range(len(exome_populations)): - exome_pop = exome_populations[i] - process_population_data(exome_pop, 'exome', ids, i) - genome_pop = genome_populations[i] - process_population_data(genome_pop, 'genome', ids, i) - - for id in ids: - df[f'Allele_Frequency_{id}'] = (df[f'exome_ac_{id}'].fillna(0) + df[f'genome_ac_{id}'].fillna(0)) / ( - df[f'exome_an_{id}'].fillna(0) + df[f'genome_an_{id}'].fillna(0)) - population_mapping = { + response = requests.post(url, json={'query': query}, timeout=300)# timeout set to 5 minutes + + if response.status_code != 200: + print('Error:', response.status_code) + return None + + data = response.json()['data']['gene']['variants'] + + df = pd.json_normalize(data) + + df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0) + df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0) + + df['HGVS Consequence'] = df['hgvsc'].fillna(0) # cDNA change + df['Protein Consequence'] = df['hgvsp'].fillna(0) # Protein change + + df['Allele Frequency'] = df['total_ac'] / df['total_an'] + df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0) + exome_populations = df['exome.populations'] + genome_populations = df['genome.populations'] + ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] + + for i in range(len(exome_populations)): + exome_pop = exome_populations[i] + process_population_data(df, exome_pop, 'exome', ids, i) + genome_pop = genome_populations[i] + process_population_data(df, genome_pop, 'genome', ids, i) + + for variant_id in ids: + df[f'Allele_Frequency_{variant_id}'] = (df[f'exome_ac_{variant_id}'].fillna(0) + 
df[f'genome_ac_{variant_id}'].fillna(0)) / ( + df[f'exome_an_{variant_id}'].fillna(0) + df[f'genome_an_{variant_id}'].fillna(0)) + population_mapping = { 'afr': 'African/African American', 'eas': 'East Asian', 'asj': 'Ashkenazi Jew', @@ -355,22 +360,19 @@ def process_population_data(pop_data, name, pop_ids, index): 'remaining': 'Remaining', '': '' } - for i in range(len(df)): - max = 0 - maxid = '' - for id in ids: - if df.loc[i, f'Allele_Frequency_{id}'] > max: - max = df.loc[i, f'Allele_Frequency_{id}'] - maxid = id - df.loc[i, 'Popmax'] = max - df.loc[i, 'Popmax population'] = population_mapping[maxid] - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id', + for i in range(len(df)): + max_pop = 0 + maxid = '' + for variant_id in ids: + if df.loc[i, f'Allele_Frequency_{variant_id}'] > max_pop: + max_pop = df.loc[i, f'Allele_Frequency_{variant_id}'] + maxid = variant_id + df.loc[i, 'Popmax'] = max_pop + df.loc[i, 'Popmax population'] = population_mapping[maxid] + not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id', 'cDNA change', 'Protein change'] - df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1) - if to_file: - df.to_csv('variants.csv', index=True) + df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1) - else: - print('Error:', response.status_code) + df.rename(columns={'variant_id': 'gnomAD ID'}, inplace=True) return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index fb86c24..6734e80 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,10 +7,6 @@ "collapsed": true, "jupyter": { "outputs_hidden": true - }, - "ExecuteTime": { - "end_time": "2024-08-22T17:20:23.240355Z", - "start_time": "2024-08-22T17:20:21.651097Z" } }, "source": [ @@ -19,30 +15,28 @@ "\n", "from api.data import (store_database_for_eys_gene,\n", " parse_lovd,\n", + " parse_gnomad,\n", " LOVD_PATH,\n", " set_lovd_dtypes,\n", - " request_clinvar_api_data,\n", - " get_variant_ids_from_clinvar_name_api,\n", + " set_gnomad_dtypes,\n", " request_gnomad_api_data,\n", + " merge_gnomad_lovd,\n", + " GNOMAD_PATH,\n", " )\n", "from api.data import save_lovd_as_vcf\n", "\n", + "\n", "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": 1 + "execution_count": null }, { "cell_type": "code", "id": "f49f7691a27aa7b4", "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-08-11T16:16:57.305309Z", - "start_time": "2024-08-11T16:16:56.668571Z" - } + "collapsed": false }, - "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" ], @@ -53,12 +47,10 @@ "cell_type": "code", "id": "cf5c45c0f7b9de0f", "metadata": { - "collapsed": false, "jupyter": { "outputs_hidden": false } - }, "source": [ "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" @@ -67,62 +59,177 @@ "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "id": "8a089e29bfc8c119", + "source": [ + "gnomad_data = request_gnomad_api_data(\"EYS\")\n", + "\n", + "display(gnomad_data)" + ], + "id": "64482c033c794fb4", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-28T18:11:25.802540Z", + "start_time": "2024-08-28T18:11:25.715039Z" + } + }, + "cell_type": "code", + "source": [ + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "gnomad_data_2 = parse_gnomad(GNOMAD_PATH +'/gnomad_data.csv')" + ], + "id": "60f3f3074a9b19f4", + "outputs": [], + "execution_count": 24 + }, + { 
"metadata": {}, + "cell_type": "code", + "source": "display(gnomad_data_2)", + "id": "9d3e4d6b5f7be127", + "outputs": [], + "execution_count": null + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-08-28T18:11:35.536411Z", + "start_time": "2024-08-28T18:11:35.258009Z" + } + }, + "cell_type": "code", "source": [ - "for i in data:\n", - " print(i)\n", - " display(data[i])" + "gnomad_data_2.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv', index=False)\n", + "gnomad_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_api.csv', index=False)" + ], + "id": "2e869f5c77dbe3d3", + "outputs": [], + "execution_count": 26 + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "len(gnomad_data_2), len(gnomad_data)\n", + "\n", + "print(len(gnomad_data_2) - len(gnomad_data))" ], + "id": "9efafb201061c146", "outputs": [], "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "id": "ef07740b2fa63e42", + "source": [ + "gnomad_data_2.rename(columns={'gnomAD ID': 'variant_id'}, inplace=True)\n", + "\n", + "missing_from_api = []\n", + "\n", + "for i in gnomad_data['variant_id']:\n", + " if(i in gnomad_data_2['variant_id'].values):\n", + " continue\n", + " missing_from_api.append(i)\n", + "\n", + "len(missing_from_api)\n", + "\n", + "missing_data = gnomad_data.loc[gnomad_data['variant_id'].isin(missing_from_api)]\n", + "\n", + "missing_data" + ], + "id": "d0eb0a6db96d31c8", + "outputs": [], + "execution_count": null + }, + { "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false + "ExecuteTime": { + "end_time": "2024-08-28T18:06:31.488622Z", + "start_time": "2024-08-28T18:06:31.471299Z" } }, + "cell_type": "code", + "source": "missing_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_missing.csv', index=False)", + "id": "388120b03b094511", + "outputs": [], + "execution_count": 23 + }, + { + "metadata": {}, + "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()\n", + "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "final_data" + ], + "id": "96453d88e353aeb1", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ "for i in data:\n", " print(i)\n", - " display(data[i].info())" + " display(data[i])" ], + "id": "8a089e29bfc8c119", "outputs": [], "execution_count": null }, { - "cell_type": "code", - "id": "c968af1617be40db", "metadata": {}, + "cell_type": "code", "source": [ - "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")" + "set_lovd_dtypes(data)\n", + "for i in data:\n", + " print(i)\n", + " display(data[i].info())" ], + "id": "ef07740b2fa63e42", "outputs": [], "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "id": "c7ff16903e0c52bd", + "source": "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")", + "id": "c968af1617be40db", + "outputs": [], + "execution_count": null + }, + { "metadata": {}, + "cell_type": "code", "source": [ "from subprocess import Popen\n", "\n", "process = Popen(\"spliceai -I ./lovd.vcf -O ./lovd_output.vcf -R ../tools/spliceai/hg38.fa -A grch38\".split())\n", "process.wait()" ], + "id": "c7ff16903e0c52bd", "outputs": [], "execution_count": null }, { 
- "cell_type": "code", - "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", "metadata": {}, + "cell_type": "code", "source": [ "from api.tools import get_revel_scores\n", "\n", @@ -133,201 +240,17 @@ "\n", "display(results)" ], + "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", "outputs": [], "execution_count": null }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-22T17:23:41.828469Z", - "start_time": "2024-08-22T17:21:09.627424Z" - } - }, - "cell_type": "code", - "source": [ - "gnomad_from_api = request_gnomad_api_data(\"EYS\", False)\n", - "\n", - "display(gnomad_from_api)" - ], - "id": "64482c033c794fb4", - "outputs": [ - { - "data": { - "text/plain": [ - " variant_id cDNA change ... Popmax Popmax population\n", - "0 6-63720525-A-G c.*71T>C ... 0.000016 African/African American\n", - "1 6-63720525-A-T c.*71T>A ... 0.000192 East Asian\n", - "2 6-63720525-A-C c.*71T>G ... 0.000000 \n", - "3 6-63720526-T-A c.*70A>T ... 0.000020 South Asian\n", - "4 6-63720527-G-T c.*69C>A ... 0.000000 \n", - "... ... ... ... ... ...\n", - "14295 6-65495479-G-T c.-69C>A ... 0.000000 \n", - "14296 6-65495479-G-A c.-69C>T ... 0.000031 African/African American\n", - "14297 6-65495482-A-G c.-72T>C ... 0.000070 Admixed American\n", - "14298 6-65495484-T-G c.-74A>C ... 0.000060 South Asian\n", - "14299 6-65495485-T-C c.-75A>G ... 0.000012 South Asian\n", - "\n", - "[14300 rows x 7 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
[HTML render of the removed 14300 × 7 gnomAD output, duplicating the text/plain rows above; columns: variant_id, cDNA change, Protein change, Allele Frequency, Homozygote Count, Popmax, Popmax population]
\n", - "

14300 rows × 7 columns

\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "execution_count": 2 - }, { "metadata": {}, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": "", - "id": "6f0abfb50bd211a0" - + "id": "6f0abfb50bd211a0", + "outputs": [], + "execution_count": null } ], "metadata": { From 437954ab2c5781cffeae7161b3e5b96016045ccc Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Mon, 2 Sep 2024 21:50:20 +0300 Subject: [PATCH 09/10] Refactored and resolved PR comments --- api/data/refactoring.py | 56 ++++----- tests/pipeline.ipynb | 253 +++++++++++++++++++++++++++++++++++----- 2 files changed, 256 insertions(+), 53 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index f2fd6cd..0c32241 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -316,37 +316,37 @@ def request_gnomad_api_data(gene_name): }} """ - response = requests.post(url, json={'query': query}, timeout=300)# timeout set to 5 minutes + response = requests.post(url, json={'query': query}, timeout=300) # timeout set to 5 minutes if response.status_code != 200: print('Error:', response.status_code) - return None data = response.json()['data']['gene']['variants'] df = pd.json_normalize(data) - df['total_ac'] = df['exome.ac'].fillna(0) + df['genome.ac'].fillna(0) - df['total_an'] = df['exome.an'].fillna(0) + df['genome.an'].fillna(0) + df.loc[:, 'total_ac'] = df.loc[:, 'exome.ac'].fillna(0) + df.loc[:, 'genome.ac'].fillna(0) + df.loc[:, 'total_an'] = df.loc[:, 'exome.an'].fillna(0) + df.loc[:, 'genome.an'].fillna(0) - df['HGVS Consequence'] = df['hgvsc'].fillna(0) # cDNA change - df['Protein Consequence'] = df['hgvsp'].fillna(0) # Protein change + df.loc[:, 'HGVS Consequence'] = df.loc[:, 'hgvsc'].fillna(0) # cDNA change + df.loc[:, 'Protein Consequence'] = df.loc[:, 'hgvsp'].fillna(0) # Protein change - df['Allele Frequency'] = df['total_ac'] / df['total_an'] - df['Homozygote Count'] = df['exome.ac_hom'].fillna(0) + df['genome.ac_hom'].fillna(0) - exome_populations = df['exome.populations'] - genome_populations = df['genome.populations'] - ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] + df.loc[:, 'Allele Frequency'] = df.loc[:, 'total_ac'] / df.loc[:, 'total_an'] + df.loc[:, 'Homozygote Count'] = df.loc[:, 'exome.ac_hom'].fillna(0) + df.loc[:, 'genome.ac_hom'].fillna(0) + exome_populations = df.loc[:, 'exome.populations'] + genome_populations = df.loc[:, 'genome.populations'] + population_ids = ['afr', 'eas', 'asj', 'sas', 'nfe', 'fin', 'mid', 'amr', 'ami', 'remaining'] for i in range(len(exome_populations)): exome_pop = exome_populations[i] - process_population_data(df, exome_pop, 'exome', ids, i) + process_population_data(df, exome_pop, 'exome', population_ids, i) genome_pop = genome_populations[i] - process_population_data(df, genome_pop, 'genome', ids, i) + process_population_data(df, genome_pop, 'genome', population_ids, i) - for variant_id in ids: - df[f'Allele_Frequency_{variant_id}'] = (df[f'exome_ac_{variant_id}'].fillna(0) + df[f'genome_ac_{variant_id}'].fillna(0)) / ( - df[f'exome_an_{variant_id}'].fillna(0) + df[f'genome_an_{variant_id}'].fillna(0)) + for population_id in population_ids: + df.loc[:, f'Allele_Frequency_{population_id}'] = ( + (df.loc[:, f'exome_ac_{population_id}'].fillna(0) + df.loc[:, f'genome_ac_{population_id}'].fillna(0)) / ( + df.loc[:, f'exome_an_{population_id}'].fillna(0) + df.loc[:, f'genome_an_{population_id}'].fillna(0))) population_mapping = { 
'afr': 'African/African American', 'eas': 'East Asian', @@ -360,19 +360,21 @@ def request_gnomad_api_data(gene_name): 'remaining': 'Remaining', '': '' } - for i in range(len(df)): + + for i in range(df.shape[0]): max_pop = 0 - maxid = '' - for variant_id in ids: - if df.loc[i, f'Allele_Frequency_{variant_id}'] > max_pop: - max_pop = df.loc[i, f'Allele_Frequency_{variant_id}'] - maxid = variant_id + max_id = '' + for population_id in population_ids: + if df.loc[i, f'Allele_Frequency_{population_id}'] > max_pop: + max_pop = df.loc[i, f'Allele_Frequency_{population_id}'] + max_id = population_id df.loc[i, 'Popmax'] = max_pop - df.loc[i, 'Popmax population'] = population_mapping[maxid] - not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', 'variant_id', - 'cDNA change', 'Protein change'] - df = df.drop([col for col in df.columns if col not in not_to_drop], axis=1) + df.loc[i, 'Popmax population'] = population_mapping[max_id] + not_to_drop = ['Popmax', 'Popmax population', 'Homozygote Count', 'Allele Frequency', + 'variant_id', 'cDNA change', 'Protein change'] + + df = df.filter(not_to_drop, axis="columns") - df.rename(columns={'variant_id': 'gnomAD ID'}, inplace=True) + df.rename(columns={'variant_id': 'gnomAD ID'}) return df diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 6734e80..45c74af 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,6 +7,10 @@ "collapsed": true, "jupyter": { "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-09-02T18:45:02.492330Z", + "start_time": "2024-09-02T18:45:02.488185Z" } }, "source": [ @@ -29,7 +33,7 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": null + "execution_count": 11 }, { "cell_type": "code", @@ -59,7 +63,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:42:20.091398Z" + } + }, "cell_type": "code", "source": [ "gnomad_data = request_gnomad_api_data(\"EYS\")\n", @@ -73,8 +81,7 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-28T18:11:25.802540Z", - "start_time": "2024-08-28T18:11:25.715039Z" + "start_time": "2024-09-02T18:44:44.422287Z" } }, "cell_type": "code", @@ -85,10 +92,14 @@ ], "id": "60f3f3074a9b19f4", "outputs": [], - "execution_count": 24 + "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:44.497881Z" + } + }, "cell_type": "code", "source": "display(gnomad_data_2)", "id": "9d3e4d6b5f7be127", @@ -98,8 +109,7 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-28T18:11:35.536411Z", - "start_time": "2024-08-28T18:11:35.258009Z" + "start_time": "2024-09-02T18:44:44.546361Z" } }, "cell_type": "code", @@ -109,10 +119,14 @@ ], "id": "2e869f5c77dbe3d3", "outputs": [], - "execution_count": 26 + "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:44.806484Z" + } + }, "cell_type": "code", "source": [ "len(gnomad_data_2), len(gnomad_data)\n", @@ -124,21 +138,180 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-02T18:45:06.035450Z", + "start_time": "2024-09-02T18:45:06.022832Z" + } + }, + "cell_type": "code", + "source": "gnomad_data", + "id": "96283480cccf641", + "outputs": [ + { + "data": { + "text/plain": [ + " Popmax Popmax population ... Allele Frequency variant_id\n", + "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", + "1 0.000192 East Asian ... 
6.573844e-06 6-63720525-A-T\n", + "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", + "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", + "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", + "... ... ... ... ... ...\n", + "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", + "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", + "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", + "14298 0.000060 South Asian ... 3.645085e-06 6-65495484-T-G\n", + "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", + "\n", + "[14300 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
00.000016African/African American0.01.807419e-066-63720525-A-G
10.000192East Asian0.06.573844e-066-63720525-A-T
20.0000000.00.000000e+006-63720525-A-C
30.000020South Asian0.01.045299e-066-63720526-T-A
40.0000000.00.000000e+006-63720527-G-T
..................
142950.0000000.00.000000e+006-65495479-G-T
142960.000031African/African American0.01.446349e-066-65495479-G-A
142970.000070Admixed American0.02.629510e-066-65495482-A-G
142980.000060South Asian0.03.645085e-066-65495484-T-G
142990.000012South Asian0.07.310070e-076-65495485-T-C
\n", + "

14300 rows × 5 columns

\n", + "
" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:44.827926Z" + } + }, "cell_type": "code", "source": [ - "gnomad_data_2.rename(columns={'gnomAD ID': 'variant_id'}, inplace=True)\n", - "\n", "missing_from_api = []\n", "\n", - "for i in gnomad_data['variant_id']:\n", - " if(i in gnomad_data_2['variant_id'].values):\n", + "for i in gnomad_data['gnomAD ID']:\n", + " if(i in gnomad_data_2['gnomAD ID'].values):\n", " continue\n", " missing_from_api.append(i)\n", "\n", "len(missing_from_api)\n", "\n", - "missing_data = gnomad_data.loc[gnomad_data['variant_id'].isin(missing_from_api)]\n", + "missing_data = gnomad_data.loc[gnomad_data['gnomAD ID'].isin(missing_from_api)]\n", "\n", "missing_data" ], @@ -149,18 +322,21 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-08-28T18:06:31.488622Z", - "start_time": "2024-08-28T18:06:31.471299Z" + "start_time": "2024-09-02T18:44:45.626358Z" } }, "cell_type": "code", "source": "missing_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_missing.csv', index=False)", "id": "388120b03b094511", "outputs": [], - "execution_count": 23 + "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.626358Z" + } + }, "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", @@ -182,7 +358,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.627863Z" + } + }, "cell_type": "code", "source": [ "for i in data:\n", @@ -194,7 +374,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.628871Z" + } + }, "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", @@ -207,7 +391,12 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-02T18:44:45.646110Z", + "start_time": "2024-09-02T18:44:45.629871Z" + } + }, "cell_type": "code", "source": "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")", "id": "c968af1617be40db", @@ -215,7 +404,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.630870Z" + } + }, "cell_type": "code", "source": [ "from subprocess import Popen\n", @@ -228,7 +421,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.631870Z" + } + }, "cell_type": "code", "source": [ "from api.tools import get_revel_scores\n", @@ -245,7 +442,11 @@ "execution_count": null }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "start_time": "2024-09-02T18:44:45.631870Z" + } + }, "cell_type": "code", "source": "", "id": "6f0abfb50bd211a0", From 522cf7049a910502006920b60792d3df73409a14 Mon Sep 17 00:00:00 2001 From: Kajus CC <42713684+KajusC@users.noreply.github.com> Date: Mon, 9 Sep 2024 23:09:42 +0300 Subject: [PATCH 10/10] Extra PR refactoring --- api/data/refactoring.py | 17 ++- tests/pipeline.ipynb | 246 +++------------------------------------- 2 files changed, 32 insertions(+), 231 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 0c32241..51f9a4c 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -247,7 +247,18 @@ def save_lovd_as_vcf(data, save_to="./lovd.vcf"): f.write("\n") -def process_population_data(df, pop_data, name, pop_ids, index): +def 
prepare_popmax_calculation(df, pop_data, name, pop_ids, index): + """ + prepares the calculation of popmax and popmax population for a variant. + genome and exome data of ac and an. + + :param DataFrame df: DataFrame containing gnomAD data + :param dict pop_data: dictionary containing population data + :param str name: name of the population + :param list[str] pop_ids: list of population ids + :param int index: index of the variant + """ + for pop_id in pop_ids: df.loc[index, f'{name}_ac_{pop_id}'] = 0 df.loc[index, f'{name}_an_{pop_id}'] = 0 @@ -339,9 +350,9 @@ def request_gnomad_api_data(gene_name): for i in range(len(exome_populations)): exome_pop = exome_populations[i] - process_population_data(df, exome_pop, 'exome', population_ids, i) + prepare_popmax_calculation(df, exome_pop, 'exome', population_ids, i) genome_pop = genome_populations[i] - process_population_data(df, genome_pop, 'genome', population_ids, i) + prepare_popmax_calculation(df, genome_pop, 'genome', population_ids, i) for population_id in population_ids: df.loc[:, f'Allele_Frequency_{population_id}'] = ( diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 45c74af..71cf21d 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -7,10 +7,6 @@ "collapsed": true, "jupyter": { "outputs_hidden": true - }, - "ExecuteTime": { - "end_time": "2024-09-02T18:45:02.492330Z", - "start_time": "2024-09-02T18:45:02.488185Z" } }, "source": [ @@ -33,7 +29,7 @@ "pd.options.display.max_columns = 0" ], "outputs": [], - "execution_count": 11 + "execution_count": null }, { "cell_type": "code", @@ -63,11 +59,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:42:20.091398Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "gnomad_data = request_gnomad_api_data(\"EYS\")\n", @@ -79,11 +71,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:44.422287Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "store_database_for_eys_gene('gnomad', False)\n", @@ -95,11 +83,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:44.497881Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "display(gnomad_data_2)", "id": "9d3e4d6b5f7be127", @@ -107,11 +91,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:44.546361Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "gnomad_data_2.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv', index=False)\n", @@ -122,11 +102,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:44.806484Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "len(gnomad_data_2), len(gnomad_data)\n", @@ -138,168 +114,15 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-02T18:45:06.035450Z", - "start_time": "2024-09-02T18:45:06.022832Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "gnomad_data", "id": "96283480cccf641", - "outputs": [ - { - "data": { - "text/plain": [ - " Popmax Popmax population ... Allele Frequency variant_id\n", - "0 0.000016 African/African American ... 1.807419e-06 6-63720525-A-G\n", - "1 0.000192 East Asian ... 6.573844e-06 6-63720525-A-T\n", - "2 0.000000 ... 0.000000e+00 6-63720525-A-C\n", - "3 0.000020 South Asian ... 1.045299e-06 6-63720526-T-A\n", - "4 0.000000 ... 0.000000e+00 6-63720527-G-T\n", - "... ... ... ... ... 
...\n", - "14295 0.000000 ... 0.000000e+00 6-65495479-G-T\n", - "14296 0.000031 African/African American ... 1.446349e-06 6-65495479-G-A\n", - "14297 0.000070 Admixed American ... 2.629510e-06 6-65495482-A-G\n", - "14298 0.000060 South Asian ... 3.645085e-06 6-65495484-T-G\n", - "14299 0.000012 South Asian ... 7.310070e-07 6-65495485-T-C\n", - "\n", - "[14300 rows x 5 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
PopmaxPopmax populationHomozygote CountAllele Frequencyvariant_id
00.000016African/African American0.01.807419e-066-63720525-A-G
10.000192East Asian0.06.573844e-066-63720525-A-T
20.0000000.00.000000e+006-63720525-A-C
30.000020South Asian0.01.045299e-066-63720526-T-A
40.0000000.00.000000e+006-63720527-G-T
..................
142950.0000000.00.000000e+006-65495479-G-T
142960.000031African/African American0.01.446349e-066-65495479-G-A
142970.000070Admixed American0.02.629510e-066-65495482-A-G
142980.000060South Asian0.03.645085e-066-65495484-T-G
142990.000012South Asian0.07.310070e-076-65495485-T-C
\n", - "

14300 rows × 5 columns

\n", - "
" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 12 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:44.827926Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "missing_from_api = []\n", @@ -320,11 +143,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:45.626358Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "missing_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_missing.csv', index=False)", "id": "388120b03b094511", @@ -332,11 +151,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:45.626358Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", @@ -358,11 +173,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:45.627863Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "for i in data:\n", @@ -374,11 +185,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:45.628871Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "set_lovd_dtypes(data)\n", @@ -391,12 +198,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-02T18:44:45.646110Z", - "start_time": "2024-09-02T18:44:45.629871Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")", "id": "c968af1617be40db", @@ -404,11 +206,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:45.630870Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from subprocess import Popen\n", @@ -421,11 +219,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:45.631870Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from api.tools import get_revel_scores\n", @@ -442,11 +236,7 @@ "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "start_time": "2024-09-02T18:44:45.631870Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "", "id": "6f0abfb50bd211a0",