From 1e7003c36251bbc7f66fc3bb752e7f2fb37468e3 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Sat, 31 Aug 2024 19:51:21 +0300 Subject: [PATCH 01/15] Mut_position standartisation for merging LOVD and gnomAD --- api/data/refactoring.py | 107 ++++++++++++++++++++++++++++++++++------ requirements.txt | 9 ++-- tests/pipeline.ipynb | 12 ++--- 3 files changed, 102 insertions(+), 26 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 5154c44..c3b2a09 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -6,6 +6,8 @@ import pandas as pd from pandas import DataFrame +from pyliftover import LiftOver + from .constants import LOVD_TABLES_DATA_TYPES, LOVD_PATH, GNOMAD_TABLES_DATA_TYPES, GNOMAD_PATH @@ -174,45 +176,118 @@ def from_clinvar_name_to_cdna_position(name): return name[start:end] -def add_g_position_to_gnomad(gnomad): +def lovd_fill_hg38(lovd: pd.DataFrame): """ - Create new column 'hg38_gnomAD' from 'gnomAD ID' in the gnomAD dataframe. + Fills missing hg38 values in the LOVD dataframe by converting hg19 values to hg38. + New column 'hg19/hg38_lovd' is added to store the converted positions in the format '6-position-ref-alt'. Parameters: - gnomad : pd.DataFrame - gnomAD dataframe. This function modifies it in-place. + - lovd (pd.DataFrame): A pandas DataFrame containing following columns: + - 'VariantOnGenome/DNA': hg19 values. + - 'VariantOnGenome/DNA/hg38': hg38 values. + + Returns: + None: Modifies the input DataFrame in-place by adding or updating the 'hg19/hg38_lovd' column. """ - gnomad[['chromosome', 'position', 'ref', 'alt']] = gnomad['gnomAD ID'].str.split('-', expand=True) - gnomad['hg38'] = 'g.' + gnomad['position'] + gnomad['ref'] + '>' + gnomad['alt'] - gnomad.drop(columns=['chromosome', 'position', 'ref', 'alt'], inplace=True) + def convert_hg19_if_missing(row): + """ + converts hg19 variant to hg38 if hg38 is missing. -def merge_gnomad_lovd(lovd, gnomad): + checks if the hg38 value is missing (NaN) in a given row. If it is, the hg19 variant + is converted to hg38 using the `convert_hg19_to_hg38` function. Otherwise, the existing hg38 value is formatted. + + Parameters: + - row (pd.Series): A pandas Series representing a single row of the DataFrame. + + Returns: + - str: The hg38 value or a conversion of the hg19 value in the format '6-position-ref-alt'. + """ + if pd.isna(row['VariantOnGenome/DNA/hg38']): + return convert_hg19_to_hg38(convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA'])) + return convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA/hg38']) + + def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): + """ + converts a genomic position from hg19 to hg38 using the LiftOver tool. + + parameters: + - position (str): A string representing the hg19 variant in the format 'g.positionRef>Alt'. + - lo (LiftOver): Converter for coordinates between genome builds + + returns: + - str: converted hg38 position in the format '6-position-ref-alt'. + """ + try: + if '?' in position: + return '?' + new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1] + return f"6-{new_pos}-{position[-3:]}" + except Exception as e: + return f"Error processing variant: {str(e)}" + + lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) + lovd['hg19/hg38_lovd'] = lovd.apply(convert_hg19_if_missing, axis=1) + + +def convert_to_gnomad_gen_pos(variant: str): """ - merge LOVD and gnomAD dataframes on genomic positions. + Converts a variant string from hg19 or hg38 format to the format used by gnomAD ('6-position-ref-alt'). + + This function processes the variant string, checks if it contains complex cases like intervals or uncertainties, + and formats the string accordingly. For special cases like 'dup' or 'del', it adds appropriate postfixes. parameters: + - variant (str): string representing the variant in the format 'g.startRef>Alt' + (or other formats like 'g.startdup'). + + returns: + - str: variant formatted as '6-position-ref-alt' or '?' if the input is ambiguous or invalid. + """ + + if '_' in variant or '?' in variant: + return '?' + variant = variant[2:] + position = variant[:-3] + ref = variant[-3] + alt = variant[-1] + + if 'dup' in variant: + return f"6-{position}-dup" + + if 'del' in variant: + return f"6-{position}-del" + + return f"6-{position}-{ref}-{alt}" + + +def merge_gnomad_lovd(lovd, gnomad): + """ + Merge LOVD and gnomAD dataframes on genomic positions. + + Parameters: lovd : pd.DataFrame LOVD dataframe. gnomAD : pd.DataFrame gnomAD dataframe. - returns: + Returns: pd.DataFrame - merged dataframe with combined information from LOVD and gnomAD. + Merged dataframe with combined information from LOVD and gnomAD. """ - add_g_position_to_gnomad(gnomad) + lovd_fill_hg38(lovd) gnomad.columns = [col + '_gnomad' for col in gnomad.columns] - main_frame = pd.merge( + merged_frame = pd.merge( lovd, gnomad, how="outer", - left_on="VariantOnGenome/DNA/hg38", - right_on="hg38_gnomad") + left_on="hg19/hg38_lovd", + right_on="gnomAD ID_gnomad" + ) - return main_frame + return merged_frame def save_lovd_as_vcf(data, save_to="./lovd.vcf"): diff --git a/requirements.txt b/requirements.txt index 0cae80e..5945f87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -requests -pandas -selenium +requests~=2.32.3 +pandas~=2.2.2 +selenium~=4.23.1 spliceai tensorflow -flask \ No newline at end of file +flask~=3.0.3 +pyliftover~=0.4.1 \ No newline at end of file diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 5e51efc..06705ed 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -3996,8 +3996,8 @@ }, { "data": { - "text/plain": " id transcriptid effectid position_c_start \\\n0 170936 7329 90 -538 \n1 235579 7329 99 -332 \n2 235593 7329 99 1300 \n3 235595 7329 99 1300 \n4 235603 7329 99 6572 \n... ... ... ... ... \n13272 822052 7329 70 1767 \n13273 822775 7329 70 0 \n13274 822785 7329 70 0 \n13275 822816 7329 70 0 \n13276 867648 7329 70 0 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 1599 1 \n1 -1 748 1 \n2 -1 1459 1 \n3 -1 1459 1 \n4 -1 6725 1 \n... ... ... ... \n13272 -1 2023 1 \n13273 0 0 0 \n13274 0 0 0 \n13275 0 0 0 \n13276 0 0 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.(?_-538)_(1599+1_1600-1)del r.0? \n1 c.(-333+1_-332-1)_(748+1_749-1)del r.? \n2 c.(1299+1_1300-1)_(1459+1_1460-1)del r.? \n3 c.(1299+1_1300-1)_(1459+1_1460-1)del r.(?) \n4 c.(6571+1_6572-1)_(6725+1_6726-1)del r.? \n... ... ... \n13272 c.(1766+1_1767-1)_(2023+1_2024-1)del r.spl \n13273 c.? r.(?) \n13274 c.? r.(?) \n13275 c.? r.(?) \n13276 c.? r.(?) \n\n VariantOnTranscript/Protein VariantOnTranscript/Exon \\\n0 p.0? _1_10i \n1 p.? 2i_4i \n2 p.? 8i_9i \n3 p.? 8i_9i \n4 p.(Ser2191Thrfs*14) 32i_33i \n... ... ... \n13272 p.(?) \n13273 p.(Tyr2555fs) \n13274 p.(Asp498fs) \n13275 p.(Gln3101fs) \n13276 p.? \n\n VariantOnGenome/DNA/hg38 gnomAD ID_gnomad hg38_gnomad \n0 \n1 \n2 \n3 \n4 \n... ... ... ... \n13272 g.? \n13273 g.? \n13274 g.? \n13275 g.? \n13276 g.? \n\n[13277 rows x 14 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/ExonVariantOnGenome/DNA/hg38gnomAD ID_gnomadhg38_gnomad
0170936732990-538015991c.(?_-538)_(1599+1_1600-1)delr.0?p.0?_1_10i<NA><NA>
1235579732999-332-17481c.(-333+1_-332-1)_(748+1_749-1)delr.?p.?2i_4i<NA><NA>
22355937329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.?p.?8i_9i<NA><NA>
32355957329991300-114591c.(1299+1_1300-1)_(1459+1_1460-1)delr.(?)p.?8i_9i<NA><NA>
42356037329996572-167251c.(6571+1_6572-1)_(6725+1_6726-1)delr.?p.(Ser2191Thrfs*14)32i_33i<NA><NA>
.............................................
132728220527329701767-120231c.(1766+1_1767-1)_(2023+1_2024-1)delr.splp.(?)g.?<NA><NA>
132738227757329700000c.?r.(?)p.(Tyr2555fs)g.?<NA><NA>
132748227857329700000c.?r.(?)p.(Asp498fs)g.?<NA><NA>
132758228167329700000c.?r.(?)p.(Gln3101fs)g.?<NA><NA>
132768676487329700000c.?r.(?)p.?g.?<NA><NA>
\n

13277 rows × 14 columns

\n
" + "text/plain": " id transcriptid effectid position_c_start \\\n0 822823 7329 70 632 \n1 822787 7329 70 8391 \n2 822843 7329 70 5608 \n3 822771 7329 70 8206 \n4 \n... ... ... ... ... \n13218 959060 7329 70 9383 \n13219 959064 7329 50 0 \n13220 985494 7329 70 2137 \n13221 986425 7329 90 4361 \n13222 987322 7329 90 9299 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 632 0 \n1 0 8391 0 \n2 0 5608 0 \n3 0 8206 0 \n4 \n... ... ... ... \n13218 0 9387 0 \n13219 0 0 0 \n13220 20590 3444 -29847 \n13221 0 4362 0 \n13222 0 9302 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.632G>A r.(?) \n1 c.8391del r.(?) \n2 c.5608C>T r.(?) \n3 c.8206G>C r.(?) \n4 \n... ... ... \n13218 c.9383_9387del r.(?) \n13219 c.-538_862+10652{1}inv r.? \n13220 c.2137+20590_3444-29847del r.? \n13221 c.4361_4362delinsAG r.(?) \n13222 c.9299_9302del r.(?) \n\n VariantOnTranscript/Protein ... Homozygote Count Amish_gnomad \\\n0 p.(Cys211Tyr) ... \n1 p.(Gly2799Valfs*31) ... \n2 p.(Arg1870Trp) ... \n3 p.(Ala2736Pro) ... \n4 ... 0 \n... ... ... ... \n13218 p.(Lys3128ArgfsTer7) ... \n13219 p.? ... \n13220 p.(Val713AspfsTer14) ... \n13221 p.(Ser1454Ter) ... \n13222 p.(Thr3100LysfsTer26) ... \n\n Hemizygote Count Amish_gnomad Allele Count South Asian_gnomad \\\n0 \n1 \n2 \n3 \n4 0 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Allele Number South Asian_gnomad Homozygote Count South Asian_gnomad \\\n0 \n1 \n2 \n3 \n4 55362 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Hemizygote Count South Asian_gnomad Allele Count Remaining_gnomad \\\n0 \n1 \n2 \n3 \n4 0 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Allele Number Remaining_gnomad Homozygote Count Remaining_gnomad \\\n0 \n1 \n2 \n3 \n4 44082 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Hemizygote Count Remaining_gnomad \n0 \n1 \n2 \n3 \n4 0 \n... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n[13223 rows x 86 columns]", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/Protein...Homozygote Count Amish_gnomadHemizygote Count Amish_gnomadAllele Count South Asian_gnomadAllele Number South Asian_gnomadHomozygote Count South Asian_gnomadHemizygote Count South Asian_gnomadAllele Count Remaining_gnomadAllele Number Remaining_gnomadHomozygote Count Remaining_gnomadHemizygote Count Remaining_gnomad
082282373297063206320c.632G>Ar.(?)p.(Cys211Tyr)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
18227877329708391083910c.8391delr.(?)p.(Gly2799Valfs*31)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
28228437329705608056080c.5608C>Tr.(?)p.(Arg1870Trp)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
38227717329708206082060c.8206G>Cr.(?)p.(Ala2736Pro)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>...000553620004408200
..................................................................
132189590607329709383093870c.9383_9387delr.(?)p.(Lys3128ArgfsTer7)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132199590647329500000c.-538_862+10652{1}invr.?p.?...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132209854947329702137205903444-29847c.2137+20590_3444-29847delr.?p.(Val713AspfsTer14)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132219864257329904361043620c.4361_4362delinsAGr.(?)p.(Ser1454Ter)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132229873227329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n

13223 rows × 86 columns

\n
" }, "execution_count": 1, "metadata": {}, @@ -4025,7 +4025,7 @@ "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", "\n", "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", - " variants_on_genome[['id','VariantOnGenome/DNA/hg38']],\n", + " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n", " on='id',\n", " how='left')\n", "\n", @@ -4036,8 +4036,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-21T18:35:42.249375Z", - "start_time": "2024-08-21T18:35:33.312752Z" + "end_time": "2024-08-31T16:18:56.746641Z", + "start_time": "2024-08-31T16:18:47.798219Z" } }, "id": "dd9b17623f26a07c", @@ -4050,7 +4050,7 @@ "metadata": { "collapsed": false }, - "id": "1a3b6e41853817ca" + "id": "50b0e50e88fa0914" } ], "metadata": { From c0e5a7448b6549d0cdc7ba440cae75dc6f3af824 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Sat, 31 Aug 2024 19:57:07 +0300 Subject: [PATCH 02/15] pylint issues fix --- api/data/refactoring.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index c3b2a09..cba9f6d 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -178,8 +178,10 @@ def from_clinvar_name_to_cdna_position(name): def lovd_fill_hg38(lovd: pd.DataFrame): """ - Fills missing hg38 values in the LOVD dataframe by converting hg19 values to hg38. - New column 'hg19/hg38_lovd' is added to store the converted positions in the format '6-position-ref-alt'. + Fills missing hg38 values in the LOVD dataframe + by converting hg19 values to hg38. + New column 'hg19/hg38_lovd' is added to store + the converted positions in the format '6-position-ref-alt'. Parameters: - lovd (pd.DataFrame): A pandas DataFrame containing following columns: @@ -187,15 +189,18 @@ def lovd_fill_hg38(lovd: pd.DataFrame): - 'VariantOnGenome/DNA/hg38': hg38 values. Returns: - None: Modifies the input DataFrame in-place by adding or updating the 'hg19/hg38_lovd' column. + None: Modifies the input DataFrame in-place by adding or + updating the 'hg19/hg38_lovd' column. """ def convert_hg19_if_missing(row): """ converts hg19 variant to hg38 if hg38 is missing. - checks if the hg38 value is missing (NaN) in a given row. If it is, the hg19 variant - is converted to hg38 using the `convert_hg19_to_hg38` function. Otherwise, the existing hg38 value is formatted. + checks if the hg38 value is missing (NaN) in a given row. + If it is, the hg19 variant is converted to hg38 + using the `convert_hg19_to_hg38` function. + Otherwise, the existing hg38 value is formatted. Parameters: - row (pd.Series): A pandas Series representing a single row of the DataFrame. @@ -218,13 +223,13 @@ def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): returns: - str: converted hg38 position in the format '6-position-ref-alt'. """ + if '?' in position: + return '?' try: - if '?' in position: - return '?' new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1] - return f"6-{new_pos}-{position[-3:]}" except Exception as e: return f"Error processing variant: {str(e)}" + return f"6-{new_pos}-{position[-3:]}" lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) lovd['hg19/hg38_lovd'] = lovd.apply(convert_hg19_if_missing, axis=1) @@ -232,17 +237,21 @@ def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): def convert_to_gnomad_gen_pos(variant: str): """ - Converts a variant string from hg19 or hg38 format to the format used by gnomAD ('6-position-ref-alt'). + Converts a variant string from hg19 or hg38 format + to the format used by gnomAD ('6-position-ref-alt'). - This function processes the variant string, checks if it contains complex cases like intervals or uncertainties, - and formats the string accordingly. For special cases like 'dup' or 'del', it adds appropriate postfixes. + This function processes the variant string, + checks if it contains complex cases like intervals or uncertainties, + and formats the string accordingly. + For special cases like 'dup' or 'del', it adds appropriate postfixes. parameters: - variant (str): string representing the variant in the format 'g.startRef>Alt' (or other formats like 'g.startdup'). returns: - - str: variant formatted as '6-position-ref-alt' or '?' if the input is ambiguous or invalid. + - str: variant formatted as '6-position-ref-alt' + or '?' if the input is ambiguous or invalid. """ if '_' in variant or '?' in variant: From 3c4f3485de693c47bbe3c9e0cd77bbc1edc9f799 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Sat, 31 Aug 2024 20:00:11 +0300 Subject: [PATCH 03/15] KBE-25/pylint_fix --- api/data/refactoring.py | 8 ++++++-- api/tools/__init__.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index cba9f6d..d28d0aa 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -227,8 +227,12 @@ def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): return '?' try: new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1] - except Exception as e: - return f"Error processing variant: {str(e)}" + except ValueError as ve: + return f"Error processing variant (ValueError): {str(ve)}" + except IndexError as ie: + return f"Error processing variant (IndexError): {str(ie)}" + except TypeError as te: + return f"Error processing variant (TypeError): {str(te)}" return f"6-{new_pos}-{position[-3:]}" lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) diff --git a/api/tools/__init__.py b/api/tools/__init__.py index e1023e4..f8d75a8 100644 --- a/api/tools/__init__.py +++ b/api/tools/__init__.py @@ -4,4 +4,4 @@ from .revel.revel import ( get_revel_scores -) \ No newline at end of file +) From 0d89e1c9d34dbbe0d6191f688fd6b974f5e16b48 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Sat, 31 Aug 2024 21:39:51 +0300 Subject: [PATCH 04/15] improved convert_to_gnomad_gen_pos: if value is string but wrong format its rejected --- api/data/refactoring.py | 71 ++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index d28d0aa..4311a2b 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -2,6 +2,8 @@ import os import logging +import re +from operator import is_not import pandas as pd from pandas import DataFrame @@ -178,35 +180,37 @@ def from_clinvar_name_to_cdna_position(name): def lovd_fill_hg38(lovd: pd.DataFrame): """ - Fills missing hg38 values in the LOVD dataframe + fills missing hg38 values in the LOVD dataframe by converting hg19 values to hg38. New column 'hg19/hg38_lovd' is added to store the converted positions in the format '6-position-ref-alt'. - Parameters: + parameters: - lovd (pd.DataFrame): A pandas DataFrame containing following columns: - 'VariantOnGenome/DNA': hg19 values. - 'VariantOnGenome/DNA/hg38': hg38 values. - Returns: + returns: None: Modifies the input DataFrame in-place by adding or updating the 'hg19/hg38_lovd' column. """ + if lovd.empty: + return + def convert_hg19_if_missing(row): """ converts hg19 variant to hg38 if hg38 is missing. - - checks if the hg38 value is missing (NaN) in a given row. + Checks if the hg38 value is missing (NaN) in a given row. If it is, the hg19 variant is converted to hg38 using the `convert_hg19_to_hg38` function. Otherwise, the existing hg38 value is formatted. - Parameters: - - row (pd.Series): A pandas Series representing a single row of the DataFrame. + parameters: + - row (pd.Series): single row of the DataFrame. - Returns: - - str: The hg38 value or a conversion of the hg19 value in the format '6-position-ref-alt'. + returns: + - str: hg38 value or a conversion of the hg19 value in the format '6-position-ref-alt'. """ if pd.isna(row['VariantOnGenome/DNA/hg38']): return convert_hg19_to_hg38(convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA'])) @@ -217,8 +221,9 @@ def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): converts a genomic position from hg19 to hg38 using the LiftOver tool. parameters: - - position (str): A string representing the hg19 variant in the format 'g.positionRef>Alt'. - - lo (LiftOver): Converter for coordinates between genome builds + - position (str): string representing the hg19 variant + in the format 'g.positionRef>Alt'. + - lo (LiftOver): converter for coordinates between genome builds returns: - str: converted hg38 position in the format '6-position-ref-alt'. @@ -241,37 +246,45 @@ def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): def convert_to_gnomad_gen_pos(variant: str): """ - Converts a variant string from hg19 or hg38 format + converts a variant string from hg19 or hg38 format to the format used by gnomAD ('6-position-ref-alt'). - This function processes the variant string, - checks if it contains complex cases like intervals or uncertainties, - and formats the string accordingly. - For special cases like 'dup' or 'del', it adds appropriate postfixes. - parameters: - - variant (str): string representing the variant in the format 'g.startRef>Alt' - (or other formats like 'g.startdup'). + - variant (str): string representing the variant + in the format 'g.startRef>Alt'. returns: - str: variant formatted as '6-position-ref-alt' - or '?' if the input is ambiguous or invalid. + or '?' if the input contains interval ranges or is invalid. """ - if '_' in variant or '?' in variant: - return '?' - variant = variant[2:] - position = variant[:-3] - ref = variant[-3] - alt = variant[-1] + if not isinstance(variant, str): + return "?" - if 'dup' in variant: + patterns = { + 'dup': re.compile(r'^g\.(\d+)dup$'), + 'del': re.compile(r'^g\.(\d+)del$'), + 'ref_alt': re.compile(r'^g\.(\d+)([A-Z])>([A-Z])$') + } + + match = patterns['dup'].match(variant) + if match: + position = match.group(1) return f"6-{position}-dup" - if 'del' in variant: + match = patterns['del'].match(variant) + if match: + position = match.group(1) return f"6-{position}-del" - return f"6-{position}-{ref}-{alt}" + match = patterns['ref_alt'].match(variant) + if match: + position = match.group(1) + ref = match.group(2) + alt = match.group(3) + return f"6-{position}-{ref}-{alt}" + + return "?" def merge_gnomad_lovd(lovd, gnomad): From 5bfc3b2b0ca6ac48cb82324e3de2102865f0f192 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Sat, 31 Aug 2024 21:41:30 +0300 Subject: [PATCH 05/15] removed unused import --- api/data/refactoring.py | 1 - 1 file changed, 1 deletion(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 4311a2b..27114f2 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -3,7 +3,6 @@ import os import logging import re -from operator import is_not import pandas as pd from pandas import DataFrame From baed83d11dfac8716abb4df4d200c68dac6688bc Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Sat, 31 Aug 2024 22:56:41 +0300 Subject: [PATCH 06/15] revert changes reqs.txt --- requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5945f87..99a5bc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -requests~=2.32.3 -pandas~=2.2.2 -selenium~=4.23.1 +requests +pandas +selenium spliceai tensorflow -flask~=3.0.3 -pyliftover~=0.4.1 \ No newline at end of file +flask +pyliftover \ No newline at end of file From b65e97132ccc6e36c4f25235d598d3419691e8d8 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Mon, 2 Sep 2024 20:56:16 +0300 Subject: [PATCH 07/15] added exceptions for git ignore, fixing naming issues --- .gitignore | 1 + api/data/refactoring.py | 107 ++++++++++++++--------------------- tests/test_lovd_fill_hg38.py | 87 ++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 63 deletions(-) create mode 100644 tests/test_lovd_fill_hg38.py diff --git a/.gitignore b/.gitignore index fc2dc10..344a6e8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__ # specific folders data/* tests/* +!test_lovd_fill_hg38.py # large gene reference files *.fa \ No newline at end of file diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 27114f2..8bf5e30 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -179,86 +179,67 @@ def from_clinvar_name_to_cdna_position(name): def lovd_fill_hg38(lovd: pd.DataFrame): """ - fills missing hg38 values in the LOVD dataframe + Fills missing hg38 values in the LOVD dataframe by converting hg19 values to hg38. New column 'hg19/hg38_lovd' is added to store the converted positions in the format '6-position-ref-alt'. - - parameters: - - lovd (pd.DataFrame): A pandas DataFrame containing following columns: - - 'VariantOnGenome/DNA': hg19 values. - - 'VariantOnGenome/DNA/hg38': hg38 values. - - returns: - None: Modifies the input DataFrame in-place by adding or - updating the 'hg19/hg38_lovd' column. + :param lovd: pandas DataFrame containing following columns: + - 'VariantOnGenome/DNA': hg19 values. + - 'VariantOnGenome/DNA/hg38': hg38 values. + :return: None: Modifies the input DataFrame in-place by adding or + updating the 'hg19/hg38_lovd' column. """ if lovd.empty: return + lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) + lovd['hg38_gnomad_format'] = lovd.apply(convert_hg19_if_missing, axis=1) - def convert_hg19_if_missing(row): - """ - converts hg19 variant to hg38 if hg38 is missing. - Checks if the hg38 value is missing (NaN) in a given row. - If it is, the hg19 variant is converted to hg38 - using the `convert_hg19_to_hg38` function. - Otherwise, the existing hg38 value is formatted. - - parameters: - - row (pd.Series): single row of the DataFrame. - - returns: - - str: hg38 value or a conversion of the hg19 value in the format '6-position-ref-alt'. - """ - if pd.isna(row['VariantOnGenome/DNA/hg38']): - return convert_hg19_to_hg38(convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA'])) - return convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA/hg38']) - - def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): - """ - converts a genomic position from hg19 to hg38 using the LiftOver tool. - - parameters: - - position (str): string representing the hg19 variant - in the format 'g.positionRef>Alt'. - - lo (LiftOver): converter for coordinates between genome builds - - returns: - - str: converted hg38 position in the format '6-position-ref-alt'. - """ - if '?' in position: - return '?' - try: - new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1] - except ValueError as ve: - return f"Error processing variant (ValueError): {str(ve)}" - except IndexError as ie: - return f"Error processing variant (IndexError): {str(ie)}" - except TypeError as te: - return f"Error processing variant (TypeError): {str(te)}" - return f"6-{new_pos}-{position[-3:]}" - lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) - lovd['hg19/hg38_lovd'] = lovd.apply(convert_hg19_if_missing, axis=1) +def convert_hg19_if_missing(row): + """ + converts hg19 variant to hg38 if hg38 is missing. + Checks if the hg38 value is missing (NaN) in a given row. + If it is, the hg19 variant is converted to hg38 + using the `convert_hg19_to_hg38` function. + Otherwise, the existing hg38 value is formatted. + :param row: single row of the DataFrame. + :return: + - str: hg38 value or a conversion of + the hg19 value in the format '6-position-ref-alt'. + """ + + if pd.isna(row['VariantOnGenome/DNA/hg38']): + return convert_hg19_to_hg38(convert_to_gnomad_gen(row['VariantOnGenome/DNA'])) + return convert_to_gnomad_gen(row['VariantOnGenome/DNA/hg38']) -def convert_to_gnomad_gen_pos(variant: str): +def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): """ - converts a variant string from hg19 or hg38 format - to the format used by gnomAD ('6-position-ref-alt'). + converts a genomic position from hg19 to hg38 using the LiftOver tool. + :param position: string representing the hg19 variant + in the format 'g.positionRef>Alt'. + :param lo: converter for coordinates between genome builds + :return: string converted hg38 position in the format '6-position-ref-alt'. + """ + + if '?' in position: + return '?' + new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1] + return f"6-{new_pos}-{position[-3:]}" - parameters: - - variant (str): string representing the variant - in the format 'g.startRef>Alt'. - returns: - - str: variant formatted as '6-position-ref-alt' +def convert_to_gnomad_gen(variant: str): + """ + converts a variant string from hg19 or hg38 format + to the format used by gnomAD ('6-position-ref-alt'). + :param variant: str: the variant in the format 'g.startRef>Alt'. + :return: str: variant formatted as '6-position-ref-alt' or '?' if the input contains interval ranges or is invalid. """ if not isinstance(variant, str): - return "?" + raise TypeError(f"Expected a string for 'variant', got {type(variant).__name__} instead") patterns = { 'dup': re.compile(r'^g\.(\d+)dup$'), @@ -308,7 +289,7 @@ def merge_gnomad_lovd(lovd, gnomad): lovd, gnomad, how="outer", - left_on="hg19/hg38_lovd", + left_on="hg38_gnomad_format", right_on="gnomAD ID_gnomad" ) diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py new file mode 100644 index 0000000..1b4b164 --- /dev/null +++ b/tests/test_lovd_fill_hg38.py @@ -0,0 +1,87 @@ +import unittest +import pandas as pd +from api.data.refactoring import lovd_fill_hg38 + + +class TestLOVDFillHg38(unittest.TestCase): + + def setUp(self): + """Set up any initial data used in multiple tests.""" + self.df = pd.DataFrame() + + def test_fill_hg38_with_no_missing_values(self): + """Test filling hg38 values when no values are missing.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430517C>T', 'g.64430535C>G'], + 'VariantOnGenome/DNA/hg38': ['g.63720621C>T', 'g.63720639C>G'] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720621-C-T', '6-63720639-C-G'] + self.assertIn('hg38_gnomad_format', self.df.columns, + "Column 'hg38_gnomad_format' should be added.") + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_missing_values_nan(self): + """Test filling hg38 values when they are missing (NaN case).""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518C>T'], + 'VariantOnGenome/DNA/hg38': [pd.NA] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-C-T'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_missing_values_empty_string(self): + """Test filling hg38 values when they are missing (empty string case).""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518C>T'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-C-T'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_dup_cases(self): + """Test filling hg38 values when they have 'dup' postfix.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518dup'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-dup'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_del_cases(self): + """Test filling hg38 values when they have 'del' postfix.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430518del'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ['6-63720622-del'] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_with_interval_cases(self): + """Test filling hg38 values when they have intervals (e.g., 'del' range).""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': ['g.64430540_64430544del'], + 'VariantOnGenome/DNA/hg38': [""] + }) + lovd_fill_hg38(self.df) + expected_values = ["?"] + self.assertIn('hg38_gnomad_format', self.df.columns) + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + + def test_fill_hg38_no_variants(self): + """Test filling hg38 values when there are no variants in the dataframe.""" + self.df = pd.DataFrame(columns=['VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38']) + lovd_fill_hg38(self.df) + self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.") + + +if __name__ == '__main__': + unittest.main() From f830acbbbdd6b08b6458eb6c99516ea10960ccad Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Mon, 2 Sep 2024 21:10:46 +0300 Subject: [PATCH 08/15] updated test_lovd_fill_hg38 --- tests/pipeline.ipynb | 473 +++++++++++++++++++++++++++++++++-- tests/test_lovd_fill_hg38.py | 11 + 2 files changed, 463 insertions(+), 21 deletions(-) diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 06705ed..6cdacd5 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -3986,24 +3986,6 @@ }, { "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The file at ../data/lovd/lovd_data.txt already exists.\n" - ] - }, - { - "data": { - "text/plain": " id transcriptid effectid position_c_start \\\n0 822823 7329 70 632 \n1 822787 7329 70 8391 \n2 822843 7329 70 5608 \n3 822771 7329 70 8206 \n4 \n... ... ... ... ... \n13218 959060 7329 70 9383 \n13219 959064 7329 50 0 \n13220 985494 7329 70 2137 \n13221 986425 7329 90 4361 \n13222 987322 7329 90 9299 \n\n position_c_start_intron position_c_end position_c_end_intron \\\n0 0 632 0 \n1 0 8391 0 \n2 0 5608 0 \n3 0 8206 0 \n4 \n... ... ... ... \n13218 0 9387 0 \n13219 0 0 0 \n13220 20590 3444 -29847 \n13221 0 4362 0 \n13222 0 9302 0 \n\n VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n0 c.632G>A r.(?) \n1 c.8391del r.(?) \n2 c.5608C>T r.(?) \n3 c.8206G>C r.(?) \n4 \n... ... ... \n13218 c.9383_9387del r.(?) \n13219 c.-538_862+10652{1}inv r.? \n13220 c.2137+20590_3444-29847del r.? \n13221 c.4361_4362delinsAG r.(?) \n13222 c.9299_9302del r.(?) \n\n VariantOnTranscript/Protein ... Homozygote Count Amish_gnomad \\\n0 p.(Cys211Tyr) ... \n1 p.(Gly2799Valfs*31) ... \n2 p.(Arg1870Trp) ... \n3 p.(Ala2736Pro) ... \n4 ... 0 \n... ... ... ... \n13218 p.(Lys3128ArgfsTer7) ... \n13219 p.? ... \n13220 p.(Val713AspfsTer14) ... \n13221 p.(Ser1454Ter) ... \n13222 p.(Thr3100LysfsTer26) ... \n\n Hemizygote Count Amish_gnomad Allele Count South Asian_gnomad \\\n0 \n1 \n2 \n3 \n4 0 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Allele Number South Asian_gnomad Homozygote Count South Asian_gnomad \\\n0 \n1 \n2 \n3 \n4 55362 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Hemizygote Count South Asian_gnomad Allele Count Remaining_gnomad \\\n0 \n1 \n2 \n3 \n4 0 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Allele Number Remaining_gnomad Homozygote Count Remaining_gnomad \\\n0 \n1 \n2 \n3 \n4 44082 0 \n... ... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n Hemizygote Count Remaining_gnomad \n0 \n1 \n2 \n3 \n4 0 \n... ... \n13218 \n13219 \n13220 \n13221 \n13222 \n\n[13223 rows x 86 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/Protein...Homozygote Count Amish_gnomadHemizygote Count Amish_gnomadAllele Count South Asian_gnomadAllele Number South Asian_gnomadHomozygote Count South Asian_gnomadHemizygote Count South Asian_gnomadAllele Count Remaining_gnomadAllele Number Remaining_gnomadHomozygote Count Remaining_gnomadHemizygote Count Remaining_gnomad
082282373297063206320c.632G>Ar.(?)p.(Cys211Tyr)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
18227877329708391083910c.8391delr.(?)p.(Gly2799Valfs*31)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
28228437329705608056080c.5608C>Tr.(?)p.(Arg1870Trp)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
38227717329708206082060c.8206G>Cr.(?)p.(Ala2736Pro)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>...000553620004408200
..................................................................
132189590607329709383093870c.9383_9387delr.(?)p.(Lys3128ArgfsTer7)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132199590647329500000c.-538_862+10652{1}invr.?p.?...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132209854947329702137205903444-29847c.2137+20590_3444-29847delr.?p.(Val713AspfsTer14)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132219864257329904361043620c.4361_4362delinsAGr.(?)p.(Ser1454Ter)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132229873227329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n

13223 rows × 86 columns

\n
" - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", "import pandas as pd\n", @@ -4036,11 +4018,459 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-31T16:18:56.746641Z", - "start_time": "2024-08-31T16:18:47.798219Z" + "end_time": "2024-09-02T18:03:53.205784Z", + "start_time": "2024-09-02T18:03:43.273240Z" } }, "id": "dd9b17623f26a07c", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The file at ../data/lovd/lovd_data.txt already exists.\n" + ] + }, + { + "data": { + "text/plain": [ + " id transcriptid effectid position_c_start \\\n", + "0 822823 7329 70 632 \n", + "1 822787 7329 70 8391 \n", + "2 822843 7329 70 5608 \n", + "3 822771 7329 70 8206 \n", + "4 \n", + "... ... ... ... ... \n", + "13218 959060 7329 70 9383 \n", + "13219 959064 7329 50 0 \n", + "13220 985494 7329 70 2137 \n", + "13221 986425 7329 90 4361 \n", + "13222 987322 7329 90 9299 \n", + "\n", + " position_c_start_intron position_c_end position_c_end_intron \\\n", + "0 0 632 0 \n", + "1 0 8391 0 \n", + "2 0 5608 0 \n", + "3 0 8206 0 \n", + "4 \n", + "... ... ... ... \n", + "13218 0 9387 0 \n", + "13219 0 0 0 \n", + "13220 20590 3444 -29847 \n", + "13221 0 4362 0 \n", + "13222 0 9302 0 \n", + "\n", + " VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n", + "0 c.632G>A r.(?) \n", + "1 c.8391del r.(?) \n", + "2 c.5608C>T r.(?) \n", + "3 c.8206G>C r.(?) \n", + "4 \n", + "... ... ... \n", + "13218 c.9383_9387del r.(?) \n", + "13219 c.-538_862+10652{1}inv r.? \n", + "13220 c.2137+20590_3444-29847del r.? \n", + "13221 c.4361_4362delinsAG r.(?) \n", + "13222 c.9299_9302del r.(?) \n", + "\n", + " VariantOnTranscript/Protein ... Homozygote Count Amish_gnomad \\\n", + "0 p.(Cys211Tyr) ... \n", + "1 p.(Gly2799Valfs*31) ... \n", + "2 p.(Arg1870Trp) ... \n", + "3 p.(Ala2736Pro) ... \n", + "4 ... 0 \n", + "... ... ... ... \n", + "13218 p.(Lys3128ArgfsTer7) ... \n", + "13219 p.? ... \n", + "13220 p.(Val713AspfsTer14) ... \n", + "13221 p.(Ser1454Ter) ... \n", + "13222 p.(Thr3100LysfsTer26) ... \n", + "\n", + " Hemizygote Count Amish_gnomad Allele Count South Asian_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Allele Number South Asian_gnomad Homozygote Count South Asian_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 55362 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Hemizygote Count South Asian_gnomad Allele Count Remaining_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Allele Number Remaining_gnomad Homozygote Count Remaining_gnomad \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 44082 0 \n", + "... ... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + " Hemizygote Count Remaining_gnomad \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 0 \n", + "... ... \n", + "13218 \n", + "13219 \n", + "13220 \n", + "13221 \n", + "13222 \n", + "\n", + "[13223 rows x 86 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/Protein...Homozygote Count Amish_gnomadHemizygote Count Amish_gnomadAllele Count South Asian_gnomadAllele Number South Asian_gnomadHomozygote Count South Asian_gnomadHemizygote Count South Asian_gnomadAllele Count Remaining_gnomadAllele Number Remaining_gnomadHomozygote Count Remaining_gnomadHemizygote Count Remaining_gnomad
082282373297063206320c.632G>Ar.(?)p.(Cys211Tyr)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
18227877329708391083910c.8391delr.(?)p.(Gly2799Valfs*31)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
28228437329705608056080c.5608C>Tr.(?)p.(Arg1870Trp)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
38227717329708206082060c.8206G>Cr.(?)p.(Ala2736Pro)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>...000553620004408200
..................................................................
132189590607329709383093870c.9383_9387delr.(?)p.(Lys3128ArgfsTer7)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132199590647329500000c.-538_862+10652{1}invr.?p.?...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132209854947329702137205903444-29847c.2137+20590_3444-29847delr.?p.(Val713AspfsTer14)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132219864257329904361043620c.4361_4362delinsAGr.(?)p.(Ser1454Ter)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132229873227329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n", + "

13223 rows × 86 columns

\n", + "
" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "execution_count": 1 }, { @@ -4050,7 +4480,8 @@ "metadata": { "collapsed": false }, - "id": "50b0e50e88fa0914" + "id": "50b0e50e88fa0914", + "execution_count": null } ], "metadata": { diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py index 1b4b164..fa9db1d 100644 --- a/tests/test_lovd_fill_hg38.py +++ b/tests/test_lovd_fill_hg38.py @@ -82,6 +82,17 @@ def test_fill_hg38_no_variants(self): lovd_fill_hg38(self.df) self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.") + def test_fill_hg38_NA_variants(self): + """Test filling hg38 values when there pd. NA variants in the dataframe.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': [pd.NA], + 'VariantOnGenome/DNA/hg38': [pd.NA] + }) + with self.assertRaises(TypeError) as context: + lovd_fill_hg38(self.df) + + self.assertEqual(str(context.exception), "Expected a string for 'variant', got NAType instead") + if __name__ == '__main__': unittest.main() From 97fcf1f5733b0c066516058b525b82d3bfff8f9f Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Mon, 2 Sep 2024 21:12:52 +0300 Subject: [PATCH 09/15] removed space in test_lovd_fill_hg38 --- tests/test_lovd_fill_hg38.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py index fa9db1d..822fa3a 100644 --- a/tests/test_lovd_fill_hg38.py +++ b/tests/test_lovd_fill_hg38.py @@ -90,7 +90,6 @@ def test_fill_hg38_NA_variants(self): }) with self.assertRaises(TypeError) as context: lovd_fill_hg38(self.df) - self.assertEqual(str(context.exception), "Expected a string for 'variant', got NAType instead") From f7c133e88580f4cea461692bef7ae97588d0f5a8 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Tue, 3 Sep 2024 11:11:10 +0300 Subject: [PATCH 10/15] refactor lovd_fill_hg38 and all functions used inside it(same result, different approach) --- .gitignore | 2 -- api/data/refactoring.py | 56 +++++++++++++----------------------- tests/pipeline.ipynb | 4 +-- tests/test_lovd_fill_hg38.py | 9 ------ 4 files changed, 22 insertions(+), 49 deletions(-) diff --git a/.gitignore b/.gitignore index 344a6e8..17a5b3c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,6 @@ __pycache__ # specific folders data/* -tests/* -!test_lovd_fill_hg38.py # large gene reference files *.fa \ No newline at end of file diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 8bf5e30..0293adb 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -181,66 +181,50 @@ def lovd_fill_hg38(lovd: pd.DataFrame): """ Fills missing hg38 values in the LOVD dataframe by converting hg19 values to hg38. - New column 'hg19/hg38_lovd' is added to store + New column 'hg38_gnomad_format' is added to store the converted positions in the format '6-position-ref-alt'. :param lovd: pandas DataFrame containing following columns: - 'VariantOnGenome/DNA': hg19 values. - 'VariantOnGenome/DNA/hg38': hg38 values. - :return: None: Modifies the input DataFrame in-place by adding or - updating the 'hg19/hg38_lovd' column. + :return: None: Modifies the input DataFrame in-place by adding + 'hg38_gnomad_format' column. """ if lovd.empty: return - lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) - lovd['hg38_gnomad_format'] = lovd.apply(convert_hg19_if_missing, axis=1) + lovd['hg38_gnomad_format'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) + missing_hg38_mask = lovd['hg38_gnomad_format'].isna() + lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply( + convert_hg19_if_missing) + lovd['hg38_gnomad_format'] = lovd['hg38_gnomad_format'].apply(convert_to_gnomad_gen) -def convert_hg19_if_missing(row): +def convert_hg19_if_missing(hg19: pd.Series, lo = LiftOver('hg19', 'hg38')): """ - converts hg19 variant to hg38 if hg38 is missing. - Checks if the hg38 value is missing (NaN) in a given row. - If it is, the hg19 variant is converted to hg38 - using the `convert_hg19_to_hg38` function. - Otherwise, the existing hg38 value is formatted. - :param row: single row of the DataFrame. - :return: - - str: hg38 value or a conversion of - the hg19 value in the format '6-position-ref-alt'. + Converts hg19 variant to hg38 if hg38 is missing. + :param hg19: a row from the DataFrame. + :param lo: converter for genomic data between reference assemblies + :return: hg38 value or a conversion of the hg19 value in the format 'g.positionref>alt'. """ - if pd.isna(row['VariantOnGenome/DNA/hg38']): - return convert_hg19_to_hg38(convert_to_gnomad_gen(row['VariantOnGenome/DNA'])) - return convert_to_gnomad_gen(row['VariantOnGenome/DNA/hg38']) - - -def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): - """ - converts a genomic position from hg19 to hg38 using the LiftOver tool. - :param position: string representing the hg19 variant - in the format 'g.positionRef>Alt'. - :param lo: converter for coordinates between genome builds - :return: string converted hg38 position in the format '6-position-ref-alt'. - """ - - if '?' in position: + if pd.isna(hg19): + return "?" + if '?' in hg19 or "_" in hg19: return '?' - new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1] - return f"6-{new_pos}-{position[-3:]}" + position_str = hg19[2:10] + new_pos = lo.convert_coordinate('chr6', int(position_str))[0][1] + return f"g.{new_pos}{hg19[-3:]}" def convert_to_gnomad_gen(variant: str): """ - converts a variant string from hg19 or hg38 format + converts a variant string from hg38 format to the format used by gnomAD ('6-position-ref-alt'). :param variant: str: the variant in the format 'g.startRef>Alt'. :return: str: variant formatted as '6-position-ref-alt' or '?' if the input contains interval ranges or is invalid. """ - if not isinstance(variant, str): - raise TypeError(f"Expected a string for 'variant', got {type(variant).__name__} instead") - patterns = { 'dup': re.compile(r'^g\.(\d+)dup$'), 'del': re.compile(r'^g\.(\d+)del$'), diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 6cdacd5..69c80f3 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -4018,8 +4018,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-02T18:03:53.205784Z", - "start_time": "2024-09-02T18:03:43.273240Z" + "end_time": "2024-09-03T07:46:50.003340Z", + "start_time": "2024-09-03T07:46:40.913231Z" } }, "id": "dd9b17623f26a07c", diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py index 822fa3a..c4008d7 100644 --- a/tests/test_lovd_fill_hg38.py +++ b/tests/test_lovd_fill_hg38.py @@ -82,15 +82,6 @@ def test_fill_hg38_no_variants(self): lovd_fill_hg38(self.df) self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.") - def test_fill_hg38_NA_variants(self): - """Test filling hg38 values when there pd. NA variants in the dataframe.""" - self.df = pd.DataFrame({ - 'VariantOnGenome/DNA': [pd.NA], - 'VariantOnGenome/DNA/hg38': [pd.NA] - }) - with self.assertRaises(TypeError) as context: - lovd_fill_hg38(self.df) - self.assertEqual(str(context.exception), "Expected a string for 'variant', got NAType instead") if __name__ == '__main__': From e16bde3094c14a664bbd1622d40e11fc7d92ce9b Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Tue, 3 Sep 2024 11:16:26 +0300 Subject: [PATCH 11/15] added docstring for test module and for test class in test_lovd_hg38.py --- tests/pipeline.ipynb | 4 ++-- tests/test_lovd_fill_hg38.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 69c80f3..017c90f 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -4018,8 +4018,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-03T07:46:50.003340Z", - "start_time": "2024-09-03T07:46:40.913231Z" + "end_time": "2024-09-03T08:10:37.655840Z", + "start_time": "2024-09-03T08:10:28.838482Z" } }, "id": "dd9b17623f26a07c", diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py index c4008d7..df93c67 100644 --- a/tests/test_lovd_fill_hg38.py +++ b/tests/test_lovd_fill_hg38.py @@ -1,9 +1,15 @@ +""" +Module for testing the `lovd_fill_hg38` function from the `api.data.refactoring` module. +""" import unittest import pandas as pd from api.data.refactoring import lovd_fill_hg38 class TestLOVDFillHg38(unittest.TestCase): + """ + Unit tests for the `lovd_fill_hg38` function. + """ def setUp(self): """Set up any initial data used in multiple tests.""" From 94156a9e303df52fccf6836d8c6e759e7ff9f366 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Tue, 3 Sep 2024 17:23:32 +0300 Subject: [PATCH 12/15] added pd. NA case in test_lovd_fill_hg38.py, added mask for values in lovd_fill_hg38 --- api/data/refactoring.py | 18 +++++++++++------- tests/pipeline.ipynb | 4 ++-- tests/test_lovd_fill_hg38.py | 12 ++++++++++++ 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 0293adb..3607c64 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -192,14 +192,14 @@ def lovd_fill_hg38(lovd: pd.DataFrame): if lovd.empty: return - lovd['hg38_gnomad_format'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) - missing_hg38_mask = lovd['hg38_gnomad_format'].isna() + lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'VariantOnGenome/DNA/hg38'].replace('', pd.NA) + missing_hg38_mask = lovd.loc[:,'hg38_gnomad_format'].isna() lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply( convert_hg19_if_missing) - lovd['hg38_gnomad_format'] = lovd['hg38_gnomad_format'].apply(convert_to_gnomad_gen) + lovd.loc[:,'hg38_gnomad_format'] = lovd.loc[:,'hg38_gnomad_format'].apply(convert_to_gnomad_gen) -def convert_hg19_if_missing(hg19: pd.Series, lo = LiftOver('hg19', 'hg38')): +def convert_hg19_if_missing(hg19: str, lo = LiftOver('hg19', 'hg38')): """ Converts hg19 variant to hg38 if hg38 is missing. :param hg19: a row from the DataFrame. @@ -207,15 +207,19 @@ def convert_hg19_if_missing(hg19: pd.Series, lo = LiftOver('hg19', 'hg38')): :return: hg38 value or a conversion of the hg19 value in the format 'g.positionref>alt'. """ - if pd.isna(hg19): + if pd.isna(hg19) or '_' in hg19: return "?" - if '?' in hg19 or "_" in hg19: + + match = re.search(r'g\.(\d+)', hg19) + if not match: return '?' - position_str = hg19[2:10] + + position_str = match.group(1) new_pos = lo.convert_coordinate('chr6', int(position_str))[0][1] return f"g.{new_pos}{hg19[-3:]}" + def convert_to_gnomad_gen(variant: str): """ converts a variant string from hg38 format diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 017c90f..9a1aa3c 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -4018,8 +4018,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-03T08:10:37.655840Z", - "start_time": "2024-09-03T08:10:28.838482Z" + "end_time": "2024-09-03T14:19:14.730427Z", + "start_time": "2024-09-03T14:19:05.969159Z" } }, "id": "dd9b17623f26a07c", diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py index df93c67..d8cf37e 100644 --- a/tests/test_lovd_fill_hg38.py +++ b/tests/test_lovd_fill_hg38.py @@ -88,6 +88,18 @@ def test_fill_hg38_no_variants(self): lovd_fill_hg38(self.df) self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.") + def test_fill_hg38_NA_variants(self): + """Test filling hg38 values when there are pd. NA variants in the dataframe.""" + self.df = pd.DataFrame({ + 'VariantOnGenome/DNA': [pd.NA], + 'VariantOnGenome/DNA/hg38': [pd.NA] + }) + lovd_fill_hg38(self.df) + expected_values = ['?'] + self.assertIn('hg38_gnomad_format', self.df.columns, + "Column 'hg38_gnomad_format' should be added.") + self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values) + if __name__ == '__main__': From 5384fe93315bb5a2e638603bf7ab5ff8d4476193 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Tue, 10 Sep 2024 16:56:23 +0300 Subject: [PATCH 13/15] updated pipeline.ipynb to prevent conflict --- tests/pipeline.ipynb | 4090 ++---------------------------------------- 1 file changed, 173 insertions(+), 3917 deletions(-) diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 9a1aa3c..e85aff1 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -2,3902 +2,227 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, "id": "initial_id", "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:38:18.029744Z", - "start_time": "2024-05-13T15:38:17.807980Z" - }, "collapsed": true, "jupyter": { "outputs_hidden": true } }, - "outputs": [], "source": [ "import pandas as pd\n", + "import requests\n", "\n", "from api.data import (store_database_for_eys_gene,\n", " parse_lovd,\n", + " parse_gnomad,\n", " LOVD_PATH,\n", - " set_lovd_dtypes)\n", + " set_lovd_dtypes,\n", + " set_gnomad_dtypes,\n", + " request_gnomad_api_data,\n", + " merge_gnomad_lovd,\n", + " GNOMAD_PATH,\n", + " )\n", "from api.data import save_lovd_as_vcf\n", + "\n", + "\n", "pd.options.display.max_columns = 0" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": 10, "id": "f49f7691a27aa7b4", "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-08-11T16:16:57.305309Z", - "start_time": "2024-08-11T16:16:56.668571Z" - } + "collapsed": false }, - "outputs": [], "source": [ "store_database_for_eys_gene(\"lovd\", override=False)" - ] + ], + "outputs": [], + "execution_count": null }, { "cell_type": "code", - "execution_count": 2, "id": "cf5c45c0f7b9de0f", "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:38:24.591752Z", - "start_time": "2024-05-13T15:38:19.498594Z" - }, - "collapsed": false + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } }, - "outputs": [], "source": [ "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")" - ] + ], + "outputs": [], + "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "execution_count": 8, - "id": "8a089e29bfc8c119", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:12:07.510712Z", - "start_time": "2024-05-13T15:12:07.366319Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genes\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamechromosomechrom_bandimprintingrefseq_genomicrefseq_UDreferenceurl_homepageurl_externalallow_downloadid_hgncid_entrezid_omimshow_hgmdshow_genecardsshow_genetestsshow_orphanetnote_indexnote_listingrefseqrefseq_urldisclaimerdisclaimer_textheaderheader_alignfooterfooter_aligncreated_bycreated_dateedited_byedited_dateupdated_byupdated_date
0EYSeyes shut homolog (Drosophila)6q12unknownNG_023443.2UD_132085377375http://www.LOVD.nl/EYSNaN21555346007612424NaNNaNNaNNaN<font color=\\\"#FF0000\\\">This database is one o...ghttp://databases.lovd.nl/shared/refseq/EYS_NM_...NaN<font color=\\\"#FF0000\\\">This database is one o...-1-112012-02-1362023-08-30 13:08:1902024-04-19 20:27:30
\n", - "
" - ], - "text/plain": [ - " id name ... updated_by updated_date\n", - "0 EYS eyes shut homolog (Drosophila) ... 0 2024-04-19 20:27:30\n", - "\n", - "[1 rows x 34 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Transcripts\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idgeneidnameid_mutalyzerid_ncbiid_ensemblid_protein_ncbiid_protein_ensemblid_protein_uniprotremarksposition_c_mrna_startposition_c_mrna_endposition_c_cds_endposition_g_mrna_startposition_g_mrna_endcreated_bycreated_dateedited_byedited_date
07329EYStranscript variant 11NM_001142800.1NP_001136272.1-5381005194356641711864429876<NA>NaT<NA>NaT
\n", - "
" - ], - "text/plain": [ - " id geneid name ... created_date edited_by edited_date\n", - "0 7329 EYS transcript variant 1 ... NaT NaT\n", - "\n", - "[1 rows x 19 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Diseases\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idsymbolnameinheritanceid_omimtissuesfeaturesremarkscreated_bycreated_dateedited_byedited_date
012PSORSpsoriasis, pustular, generalized (PSORS)<NA>62012-07-06 21:50:3262019-08-12 13:38:21
158CORDdystrophy, cone-rod (CORD)<NA>62012-09-22 11:31:2562020-08-30 09:43:59
2112RPretinitis pigmentosa (RP)26800012013-02-21 17:12:3662021-01-18 09:53:26
3139IDintellectual disability (ID)<NA>842013-06-04 18:18:0762015-02-09 10:02:49
4173SLOSSmith-Lemli-Opitz syndrome (SLOS)AR27040062013-08-01 11:16:1462021-12-10 21:51:32
5198?unclassified / mixed<NA>62013-09-13 14:21:4762016-10-22 17:54:40
62156-retinitis pigmentosa, X-linked, and sinorespir...30045562014-09-25 23:29:4062021-12-10 21:51:32
72440RP25retinitis pigmentosa, type 25 (RP25)AR60277262014-09-25 23:29:4062021-12-10 21:51:32
84211RParretinitis pigmentosa, autosomal recessive (RPar)<NA>62015-02-27 18:58:57<NA>NaT
94214-retinal disease<NA>62015-02-27 19:48:0712023-03-09 14:26:26
104249macular dystrophydystrophy, macular<NA>62015-05-04 22:10:5862024-02-15 21:18:39
115086HLhearing loss (HL)<NA>62015-10-23 11:41:0562015-10-23 11:43:00
125415USHUsher syndrome (USH)<NA>62018-04-02 16:40:44<NA>NaT
135468uveitisuveitis<NA>62018-08-22 09:47:04<NA>NaT
146906DEEencephalopathy, developmental and epileptic<NA>62022-04-07 09:24:23<NA>NaT
\n", - "
" - ], - "text/plain": [ - " id symbol ... edited_by edited_date\n", - "0 12 PSORS ... 6 2019-08-12 13:38:21\n", - "1 58 CORD ... 6 2020-08-30 09:43:59\n", - "2 112 RP ... 6 2021-01-18 09:53:26\n", - "3 139 ID ... 6 2015-02-09 10:02:49\n", - "4 173 SLOS ... 6 2021-12-10 21:51:32\n", - "5 198 ? ... 6 2016-10-22 17:54:40\n", - "6 2156 - ... 6 2021-12-10 21:51:32\n", - "7 2440 RP25 ... 6 2021-12-10 21:51:32\n", - "8 4211 RPar ... NaT\n", - "9 4214 - ... 1 2023-03-09 14:26:26\n", - "10 4249 macular dystrophy ... 6 2024-02-15 21:18:39\n", - "11 5086 HL ... 6 2015-10-23 11:43:00\n", - "12 5415 USH ... NaT\n", - "13 5468 uveitis ... NaT\n", - "14 6906 DEE ... NaT\n", - "\n", - "[15 rows x 12 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Genes_To_Diseases\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
geneiddiseaseid
0EYS112
1EYS2440
\n", - "
" - ], - "text/plain": [ - " geneid diseaseid\n", - "0 EYS 112\n", - "1 EYS 2440" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Individuals\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idfatheridmotheridpanelidpanel_sizelicenseowned_byIndividual/ReferenceIndividual/RemarksIndividual/GenderIndividual/ConsanguinityIndividual/Origin/GeographicIndividual/Age_of_deathIndividual/VIPIndividual/Data_avIndividual/TreatmentIndividual/Origin/PopulationIndividual/Individual_ID
0135<NA>36{PMID:Marrakchi 2011:21848462}5-generation family, 3 affecteds (M)MyesTunisia
1210<NA>139{PMID:Abu-Safieh-2013:23105016}(Saudi Arabia)
21962<NA>125M?Germanywhite
316605<NA>15520
433096<NA>1229{PMID:Neveling 2012:22334370}Mno0
.........................................................
1445447702<NA>16{PMID:Weisschuh 2024:37734845}patient, no family historyFGermany0SRP-1105
1446447707<NA>16{PMID:Weisschuh 2024:37734845}patient, no family historyMGermany0SRP-1167
1447447716<NA>16{PMID:Weisschuh 2024:37734845}patient, no family historyFGermany0SRP-1249
1448447718<NA>16{PMID:Weisschuh 2024:37734845}patient, no family historyMGermany0SRP-1274
1449447720<NA>16{PMID:Weisschuh 2024:37734845}patientMGermany0SRP-1299
\n", - "

1450 rows × 18 columns

\n", - "
" - ], - "text/plain": [ - " id fatherid ... Individual/Origin/Population Individual/Individual_ID\n", - "0 135 ... \n", - "1 210 ... \n", - "2 1962 ... white \n", - "3 16605 ... \n", - "4 33096 ... \n", - "... ... ... ... ... ...\n", - "1445 447702 ... SRP-1105\n", - "1446 447707 ... SRP-1167\n", - "1447 447716 ... SRP-1249\n", - "1448 447718 ... SRP-1274\n", - "1449 447720 ... SRP-1299\n", - "\n", - "[1450 rows x 18 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Individuals_To_Diseases\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
individualiddiseaseid
013512
121058
21962173
3330964214
4331094214
.........
1444447702198
1445447707198
1446447716198
1447447718198
1448447720198
\n", - "

1449 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " individualid diseaseid\n", - "0 135 12\n", - "1 210 58\n", - "2 1962 173\n", - "3 33096 4214\n", - "4 33109 4214\n", - "... ... ...\n", - "1444 447702 198\n", - "1445 447707 198\n", - "1446 447716 198\n", - "1447 447718 198\n", - "1448 447720 198\n", - "\n", - "[1449 rows x 2 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Phenotypes\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddiseaseidindividualidowned_byPhenotype/InheritancePhenotype/AgePhenotype/AdditionalPhenotype/Biochem_paramPhenotype/Age/OnsetPhenotype/Age/DiagnosisPhenotype/Severity_scorePhenotype/OnsetPhenotype/ProteinPhenotype/Tumor/MSIPhenotype/Enzyme/CPKPhenotype/Heart/MyocardiumPhenotype/LungPhenotype/Diagnosis/DefinitePhenotype/Diagnosis/InitialPhenotype/Diagnosis/Criteria
08121356Familial, autosomal recessive
1265821039Familial, autosomal recessive
2941173196225Familial2-3 toe syndactyly5
326525421433096229Unknownretinitis pigmentosa
426538421433109229Unknownretinitis pigmentosa
...............................................................
12663369011984477026Unknownretinitis pigmentosa, simplex
12673369061984477076Unknownretinitis pigmentosa, simplex
12683369151984477166Unknownretinitis pigmentosa, simplex
12693369171984477186Unknownretinitis pigmentosa, simplex
12703369191984477206Unknownretinitis pigmentosa, simplex
\n", - "

1271 rows × 20 columns

\n", - "
" - ], - "text/plain": [ - " id ... Phenotype/Diagnosis/Criteria\n", - "0 8 ... \n", - "1 26 ... \n", - "2 941 ... \n", - "3 26525 ... \n", - "4 26538 ... \n", - "... ... ... ...\n", - "1266 336901 ... \n", - "1267 336906 ... \n", - "1268 336915 ... \n", - "1269 336917 ... \n", - "1270 336919 ... \n", - "\n", - "[1271 rows x 20 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idindividualidvariants_foundowned_bycreated_bycreated_dateedited_byedited_dateScreening/TechniqueScreening/TemplateScreening/TissueScreening/Remarks
01261351662012-07-07 19:04:1962012-07-07 19:12:08RT-PCR;SEQDNA;RNA
121121013962012-09-22 11:36:24<NA>NaTSEQDNA
21640196212562010-03-11 16:36:41252012-04-13 15:18:00SEQDNA
3165571660515525522014-05-23 13:12:43<NA>NaTSEQ-NG-IDNA
4331643309612292292012-02-04 15:20:0162012-05-18 13:59:33SEQ;SEQ-NG-SDNA
.......................................
14454492794477021662024-01-26 10:23:59<NA>NaTSEQ-NGDNAWGS
14464492844477071662024-01-26 10:23:59<NA>NaTSEQ-NGDNAWGS
14474492934477161662024-01-26 10:23:59<NA>NaTSEQ-NGDNAWGS
14484492954477181662024-01-26 10:23:59<NA>NaTSEQ-NGDNAWGS
14494492974477201662024-01-26 10:23:59<NA>NaTSEQ-NGDNAWGS
\n", - "

1450 rows × 12 columns

\n", - "
" - ], - "text/plain": [ - " id individualid ... Screening/Tissue Screening/Remarks\n", - "0 126 135 ... \n", - "1 211 210 ... \n", - "2 1640 1962 ... \n", - "3 16557 16605 ... \n", - "4 33164 33096 ... \n", - "... ... ... ... ... ...\n", - "1445 449279 447702 ... WGS\n", - "1446 449284 447707 ... WGS\n", - "1447 449293 447716 ... WGS\n", - "1448 449295 447718 ... WGS\n", - "1449 449297 447720 ... WGS\n", - "\n", - "[1450 rows x 12 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings_To_Genes\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
screeningidgeneid
0126IL36RN
1211MKS1
21640DHCR7
333164AHI1
433164EYS
.........
1311437646EYS
1312437902EYS
1313437922EYS
1314443144EYS
1315443145EYS
\n", - "

1316 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " screeningid geneid\n", - "0 126 IL36RN\n", - "1 211 MKS1\n", - "2 1640 DHCR7\n", - "3 33164 AHI1\n", - "4 33164 EYS\n", - "... ... ...\n", - "1311 437646 EYS\n", - "1312 437902 EYS\n", - "1313 437922 EYS\n", - "1314 443144 EYS\n", - "1315 443145 EYS\n", - "\n", - "[1316 rows x 2 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variants_On_Genome\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idalleleeffectidchromosomeposition_g_startposition_g_endtypeaverage_frequencyowned_byVariantOnGenome/DBIDVariantOnGenome/DNAVariantOnGenome/FrequencyVariantOnGenome/ReferenceVariantOnGenome/Restriction_siteVariantOnGenome/Published_asVariantOnGenome/RemarksVariantOnGenome/Genetic_originVariantOnGenome/SegregationVariantOnGenome/dbSNPVariantOnGenome/VIPVariantOnGenome/MethylationVariantOnGenome/ISCNVariantOnGenome/DNA/hg38VariantOnGenome/ClinVarVariantOnGenome/ClinicalClassificationVariantOnGenome/ClinicalClassification/Method
03642635066449897164498971subst0.000743552EYS_000007g.64498971A>GGermline0g.63789078A>GVUS
15988135566565575865655758subst0.001153229EYS_000001g.65655758T>GExAC: 60, 19750, 0, 0.003038{PMID:Neveling 2012:22334370}Q770PGermlineyes0g.64945865T>GVUS
25988311166533614365336143subst0.224189229EYS_000002g.65336143G>AExAC: 3936, 19366, 441, 0.2032{PMID:Neveling 2012:22334370}p.?unaffected brother also this variant homozygousGermlineno0g.64626250G>Abenign
35988411566530086965300869subst0.000838229EYS_000003g.65300869G>AExAC: 12, 19406, 0, 0.0006184{PMID:Neveling 2012:22334370}(P1631S)predicted benign, disease-related variant in o...Germline0g.64590976G>Abenign
45988511166501699865016999del0.000000229EYS_000004g.65016998_65016999delExAC: 9866, 18292, 921, 0.5394{PMID:Neveling 2012:22334370}6045-4_6045-3delpredicted benignGermlineyes0g.64307105_64307106delbenign
.................................................................................
253696421103066576763465767634subst0.2430222330EYS_000248g.65767634G>AEYS(NM_001292009.2):c.2024-15_2024-14delTCinsTTVKGL data sharing initiative NederlandCLASSIFICATION recordlikely benign
253796421203066576764365767643del0.0000002330EYS_000926g.65767643delEYS(NM_001292009.2):c.2024-15delTVKGL data sharing initiative NederlandCLASSIFICATION recordlikely benign
253896421505066600592766005927subst0.0001122327EYS_000253g.66005927C>TEYS(NM_001142800.1):c.1852G>A (p.G618S), EYS(N...VKGL data sharing initiative NederlandCLASSIFICATION recordVUS
253996421605066604487466044874subst0.0000822327EYS_000256g.66044874T>CEYS(NM_001292009.2):c.1765A>G (p.R589G)VKGL data sharing initiative NederlandCLASSIFICATION recordVUS
254097731409066443094364430943subst0.0000071804EYS_000060g.64430943A>TEYS(NM_001142800.2):c.8984T>A (p.(Ile2995Asn))...VKGL data sharing initiative NederlandCLASSIFICATION recordpathogenic
\n", - "

2541 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " id ... VariantOnGenome/ClinicalClassification/Method\n", - "0 36426 ... \n", - "1 59881 ... \n", - "2 59883 ... \n", - "3 59884 ... \n", - "4 59885 ... \n", - "... ... ... ...\n", - "2536 964211 ... \n", - "2537 964212 ... \n", - "2538 964215 ... \n", - "2539 964216 ... \n", - "2540 977314 ... \n", - "\n", - "[2541 rows x 26 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Variants_On_Transcripts\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/ProteinVariantOnTranscript/Exon
0364267329507558075580c.7558T>Cr.(?)p.(Phe2520Leu)38
1598817329552309023090c.2309A>Cr.(?)p.(Gln770Pro)15
2598837329113444-53444-5c.3444-5C>Tr.(?)p.(=)22i
3598847329154891048910c.4891C>Tr.(?)p.(Pro1631Ser)26
4598857329116079-46079-3c.6079-4_6079-3delr.(?)p.(=)29i
....................................
25369642117329302024-142024-14c.2024-14C>Tr.(=)p.(=)
25379642127329302024-152024-15c.2024-15delr.(=)p.(=)
25389642157329501852018520c.1852G>Ar.(?)p.(Gly618Ser)
25399642167329501765017650c.1765A>Gr.(?)p.(Arg589Gly)
25409773147329908984089840c.8984T>Ar.(?)p.(Ile2995Asn)
\n", - "

2541 rows × 11 columns

\n", - "
" - ], - "text/plain": [ - " id transcriptid ... VariantOnTranscript/Protein VariantOnTranscript/Exon\n", - "0 36426 7329 ... p.(Phe2520Leu) 38\n", - "1 59881 7329 ... p.(Gln770Pro) 15\n", - "2 59883 7329 ... p.(=) 22i\n", - "3 59884 7329 ... p.(Pro1631Ser) 26\n", - "4 59885 7329 ... p.(=) 29i\n", - "... ... ... ... ... ...\n", - "2536 964211 7329 ... p.(=) \n", - "2537 964212 7329 ... p.(=) \n", - "2538 964215 7329 ... p.(Gly618Ser) \n", - "2539 964216 7329 ... p.(Arg589Gly) \n", - "2540 977314 7329 ... p.(Ile2995Asn) \n", - "\n", - "[2541 rows x 11 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Screenings_To_Variants\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
screeningidvariantid
0126783293
1211790459
21640235838
31655736426
43316459884
.........
2144449279959046
2145449284959051
2146449293959060
2147449295959474
2148449297959064
\n", - "

2149 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " screeningid variantid\n", - "0 126 783293\n", - "1 211 790459\n", - "2 1640 235838\n", - "3 16557 36426\n", - "4 33164 59884\n", - "... ... ...\n", - "2144 449279 959046\n", - "2145 449284 959051\n", - "2146 449293 959060\n", - "2147 449295 959474\n", - "2148 449297 959064\n", - "\n", - "[2149 rows x 2 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } + "source": [ + "gnomad_data = request_gnomad_api_data(\"EYS\")\n", + "\n", + "display(gnomad_data)" + ], + "id": "64482c033c794fb4", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "gnomad_data_2 = parse_gnomad(GNOMAD_PATH +'/gnomad_data.csv')" + ], + "id": "60f3f3074a9b19f4", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "display(gnomad_data_2)", + "id": "9d3e4d6b5f7be127", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "gnomad_data_2.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_downloaded.csv', index=False)\n", + "gnomad_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_api.csv', index=False)" + ], + "id": "2e869f5c77dbe3d3", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "len(gnomad_data_2), len(gnomad_data)\n", + "\n", + "print(len(gnomad_data_2) - len(gnomad_data))" + ], + "id": "9efafb201061c146", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "gnomad_data", + "id": "96283480cccf641", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "missing_from_api = []\n", + "\n", + "for i in gnomad_data['gnomAD ID']:\n", + " if(i in gnomad_data_2['gnomAD ID'].values):\n", + " continue\n", + " missing_from_api.append(i)\n", + "\n", + "len(missing_from_api)\n", + "\n", + "missing_data = gnomad_data.loc[gnomad_data['gnomAD ID'].isin(missing_from_api)]\n", + "\n", + "missing_data" + ], + "id": "d0eb0a6db96d31c8", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "missing_data.to_csv('C:\\\\Users\\\\Kajus\\\\Desktop\\\\gnomad_data_missing.csv', index=False)", + "id": "388120b03b094511", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "set_lovd_dtypes(data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()\n", + "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "final_data" ], + "id": "96453d88e353aeb1", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", "source": [ "for i in data:\n", " print(i)\n", " display(data[i])" - ] + ], + "id": "8a089e29bfc8c119", + "outputs": [], + "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "execution_count": null, - "id": "ef07740b2fa63e42", - "metadata": { - "collapsed": false - }, - "outputs": [], "source": [ "set_lovd_dtypes(data)\n", "for i in data:\n", " print(i)\n", " display(data[i].info())" - ] + ], + "id": "ef07740b2fa63e42", + "outputs": [], + "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "execution_count": 3, + "source": "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")", "id": "c968af1617be40db", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:38:25.149624Z", - "start_time": "2024-05-13T15:38:24.807199Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:Skipping variant g.64307105_64307106del\n", - "WARNING:root:Skipping variant g.65495379dup\n", - "WARNING:root:Skipping variant g.65495379dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64902422_64902438del\n", - "WARNING:root:Skipping variant g.64902422_64902438del\n", - "WARNING:root:Skipping variant g.64840707_64997105del\n", - "WARNING:root:Skipping variant g.64840707_64997105del\n", - "WARNING:root:Skipping variant g.64840707_64997105del\n", - "WARNING:root:Skipping variant g.65295915del\n", - "WARNING:root:Skipping variant g.65295915del\n", - "WARNING:root:Skipping variant g.65295915del\n", - "WARNING:root:Skipping variant g.65057728_65320715del\n", - "WARNING:root:Skipping variant g.65057728_65320715del\n", - "WARNING:root:Skipping variant g.65057728_65320715del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65384425del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64439165del\n", - "WARNING:root:Skipping variant g.64439165del\n", - "WARNING:root:Skipping variant g.64626122del\n", - "WARNING:root:Skipping variant g.65494867del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65494867del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65494988_65495003del\n", - "WARNING:root:Skipping variant g.63720919_63720920del\n", - "WARNING:root:Skipping variant g.63720668dup\n", - "WARNING:root:Skipping variant g.63720919_63720920del\n", - "WARNING:root:Skipping variant g.63720668dup\n", - "WARNING:root:Skipping variant g.63999116del\n", - "WARNING:root:Skipping variant g.64591039_64591042del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64822643dup\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.64591505_64591506delinsCT\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.65384480dup\n", - "WARNING:root:Skipping variant g.63726599_63726600del\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.63726599_63726600del\n", - "WARNING:root:Skipping variant g.65495005_65495008delinsAAG\n", - "WARNING:root:Skipping variant g.63726599_63726600del\n", - "WARNING:root:Skipping variant g.65335102del\n", - "WARNING:root:Skipping variant g.65335102del\n", - "WARNING:root:Skipping variant g.65335102del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63999116del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63999110_63999111del\n", - "WARNING:root:Skipping variant g.63999110_63999111del\n", - "WARNING:root:Skipping variant g.65321830_65370656del\n", - "WARNING:root:Skipping variant g.63720850_63720853del\n", - "WARNING:root:Skipping variant g.65321830_65370656del\n", - "WARNING:root:Skipping variant g.63720850_63720853del\n", - "WARNING:root:Skipping variant g.65321830_65370656del\n", - "WARNING:root:Skipping variant g.63720850_63720853del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63806240del\n", - "WARNING:root:Skipping variant g.63806240del\n", - "WARNING:root:Skipping variant g.65495332_65495333dup\n", - "WARNING:root:Skipping variant g.65324960_65416038del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65295915del\n", - "WARNING:root:Skipping variant g.65494988_65495003del\n", - "WARNING:root:Skipping variant g.65494988_65495003del\n", - "WARNING:root:Skipping variant g.65494988_65495003del\n", - "WARNING:root:Skipping variant g.65494988_65495003del\n", - "WARNING:root:Skipping variant g.65494988_65495003del\n", - "WARNING:root:Skipping variant g.65494988_65495003del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.65384480dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.65344144_65344152delinsCTTTTCG\n", - "WARNING:root:Skipping variant g.63984409_63984410delinsACGAT\n", - "WARNING:root:Skipping variant g.63788163_63788164del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64912603dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590700dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590700dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64591845del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63984390del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63721651_63721652insCA\n", - "WARNING:root:Skipping variant g.64590665_64590666del\n", - "WARNING:root:Skipping variant g.63721651_63721652insCA\n", - "WARNING:root:Skipping variant g.64590665_64590666del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64591466dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64591480del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63788136del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63720730_63720733del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64912603dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590665_64590666del\n", - "WARNING:root:Skipping variant g.63721652_63721655dup\n", - "WARNING:root:Skipping variant g.64590665_64590666del\n", - "WARNING:root:Skipping variant g.63721652_63721655dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64912603dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64591313del\n", - "WARNING:root:Skipping variant g.64439355_64439356insA\n", - "WARNING:root:Skipping variant g.63984543_63984570del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63984537_63984542dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64614795_64939832del\n", - "WARNING:root:Skipping variant g.64614795_64939832del\n", - "WARNING:root:Skipping variant g.64614795_64939832del\n", - "WARNING:root:Skipping variant g.63721576del\n", - "WARNING:root:Skipping variant g.64902132_64902133del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63720889dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.65495379dup\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65495379dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63721625dup\n", - "WARNING:root:Skipping variant g.65405300dup\n", - "WARNING:root:Skipping variant g.64813506del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63720753_63720754dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.65405325dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.65707136_65707226del\n", - "WARNING:root:Skipping variant g.65495348del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63720845_63720846del\n", - "WARNING:root:Skipping variant g.63726607del\n", - "WARNING:root:Skipping variant g.63721786dup\n", - "WARNING:root:Skipping variant g.63721640del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65344138dup\n", - "WARNING:root:Skipping variant g.65402503del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590699_64590700del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590699_64590700del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65384384_65384387del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65384480dup\n", - "WARNING:root:Skipping variant g.63806228dup\n", - "WARNING:root:Skipping variant g.63999116del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.65384384_65384387del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63999110_63999111del\n", - "WARNING:root:Skipping variant g.63999110_63999111del\n", - "WARNING:root:Skipping variant g.63999110_63999111del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64439331del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65494957dup\n", - "WARNING:root:Skipping variant g.63720995del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63721771_63721776del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.65494957dup\n", - "WARNING:root:Skipping variant g.64591977del\n", - "WARNING:root:Skipping variant g.64886728_64886736del\n", - "WARNING:root:Skipping variant g.65495379dup\n", - "WARNING:root:Skipping variant g.63721138del\n", - "WARNING:root:Skipping variant g.65353541del\n", - "WARNING:root:Skipping variant g.65353541del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65494988_65495002del\n", - "WARNING:root:Skipping variant g.65295857dup\n", - "WARNING:root:Skipping variant g.65295856_65295857dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64307105_64307106del\n", - "WARNING:root:Skipping variant g.64307103_64307106del\n", - "WARNING:root:Skipping variant g.63726618_63726622del\n", - "WARNING:root:Skipping variant g.63720642_63720644del\n", - "WARNING:root:Skipping variant g.65295857dup\n", - "WARNING:root:Skipping variant g.64307105_64307106del\n", - "WARNING:root:Skipping variant g.65353541del\n", - "WARNING:root:Skipping variant g.64591918_64591919del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64590525_64590548del\n", - "WARNING:root:Skipping variant g.64388841_64388843del\n", - "WARNING:root:Skipping variant g.64307084_64307085del\n", - "WARNING:root:Skipping variant g.64307105_64307106del\n", - "WARNING:root:Skipping variant g.64307103_64307106del\n", - "WARNING:root:Skipping variant g.64307105_64307106del\n", - "WARNING:root:Skipping variant g.64593097_64593101del\n", - "WARNING:root:Skipping variant g.65384384_65384387del\n", - "WARNING:root:Skipping variant g.63999110_63999111del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64439213del\n", - "WARNING:root:Skipping variant g.63721771_63721776del\n", - "WARNING:root:Skipping variant g.65494957dup\n", - "WARNING:root:Skipping variant g.65494988_65495008delinsAAAAG\n", - "WARNING:root:Skipping variant g.63720799_63720808del\n", - "WARNING:root:Skipping variant g.63726584del\n", - "WARNING:root:Skipping variant g.63726599_63726600del\n", - "WARNING:root:Skipping variant g.63726648del\n", - "WARNING:root:Skipping variant g.64349976_64426764del\n", - "WARNING:root:Skipping variant g.64591505_64591506delinsCT\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63720728_63720729del\n", - "WARNING:root:Skipping variant g.63720845_63720846del\n", - "WARNING:root:Skipping variant g.63721162del\n", - "WARNING:root:Skipping variant g.63721237_63721240del\n", - "WARNING:root:Skipping variant g.63721343del\n", - "WARNING:root:Skipping variant g.63721730_63721733del\n", - "WARNING:root:Skipping variant g.63721763_63721767del\n", - "WARNING:root:Skipping variant g.63721787dup\n", - "WARNING:root:Skipping variant g.63726618_63726622del\n", - "WARNING:root:Skipping variant g.63788163_63788164del\n", - "WARNING:root:Skipping variant g.63984389del\n", - "WARNING:root:Skipping variant g.64066348del\n", - "WARNING:root:Skipping variant g.64590909dup\n", - "WARNING:root:Skipping variant g.64591309del\n", - "WARNING:root:Skipping variant g.64591858_64591859del\n", - "WARNING:root:Skipping variant g.64886728_64886736del\n", - "WARNING:root:Skipping variant g.64945814del\n", - "WARNING:root:Skipping variant g.65344143_65344144insCTTT\n", - "WARNING:root:Skipping variant g.65344146_65344151del\n", - "WARNING:root:Skipping variant g.65344181dup\n", - "WARNING:root:Skipping variant g.65384473dup\n", - "WARNING:root:Skipping variant g.65405287del\n", - "WARNING:root:Skipping variant g.65494885_65494887del\n", - "WARNING:root:Skipping variant g.65494961del\n", - "WARNING:root:Skipping variant g.65495205del\n", - "WARNING:root:Skipping variant g.65495296_65495297del\n", - "WARNING:root:Skipping variant g.64590909dup\n", - "WARNING:root:Skipping variant g.65384473dup\n", - "WARNING:root:Skipping variant g.65405287del\n", - "WARNING:root:Skipping variant g.65494885_65494887del\n", - "WARNING:root:Skipping variant g.63762589del\n", - "WARNING:root:Skipping variant g.63720874del\n", - "WARNING:root:Skipping variant g.63721619_63721620insGT\n", - "WARNING:root:Skipping variant g.63788268dup\n", - "WARNING:root:Skipping variant g.63984369_63984392del\n", - "WARNING:root:Skipping variant g.65057740_65057741insA\n", - "WARNING:root:Skipping variant g.65057740_65057741insAA\n", - "WARNING:root:Skipping variant g.65057750dup\n", - "WARNING:root:Skipping variant g.65384384_65384387del\n", - "WARNING:root:Skipping variant g.65402624dup\n", - "WARNING:root:Skipping variant g.65405377dup\n", - "WARNING:root:Skipping variant g.65494957dup\n", - "WARNING:root:Skipping variant g.65494957dup\n", - "WARNING:root:Skipping variant g.65495379dup\n", - "WARNING:root:Skipping variant g.63721625dup\n", - "WARNING:root:Skipping variant g.63721704dup\n", - "WARNING:root:Skipping variant g.65353537dup\n", - "WARNING:root:Skipping variant g.65335105_65335108del\n", - "WARNING:root:Skipping variant g.63721385_63721386del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64439195del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63720867_63720868del\n", - "WARNING:root:Skipping variant g.63721314_63721321del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63765706_63791377del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63781919_63803805del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64892926_64948294del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64591845del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64945857dup\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63721557_63721558insA\n", - "WARNING:root:Skipping variant g.64439319_64439323del\n", - "WARNING:root:Skipping variant g.65296021del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65384425del\n", - "WARNING:root:Skipping variant g.64591039_64591042del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63721599_63721604del\n", - "WARNING:root:Skipping variant g.64902132_64902133del\n", - "WARNING:root:Skipping variant g.65494988_65495008delinsAAAAG\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65495236del\n", - "WARNING:root:Skipping variant g.64590908_64590909insT\n", - "WARNING:root:Skipping variant g.64617409_64617411dup\n", - "WARNING:root:Skipping variant g.65295857dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63720845_63720846del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63720668dup\n", - "WARNING:root:Skipping variant g.(63741975_63762461)_(63778180_63788105)dup\n", - "WARNING:root:Skipping variant g.65296051_65296052del\n", - "WARNING:root:Skipping variant g.65296051_65296052del\n", - "WARNING:root:Skipping variant g.64439200dup\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.65494885_65494887del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63984390del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.65295897del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65494885_65494887del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590615dup\n", - "WARNING:root:Skipping variant g.63721705del\n", - "WARNING:root:Skipping variant g.64081884del\n", - "WARNING:root:Skipping variant g.65405342_65405355delinsAAA\n", - "WARNING:root:Skipping variant g.65490640_65490643dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65296058del\n", - "WARNING:root:Skipping variant g.65405345del\n", - "WARNING:root:Skipping variant g.(65405368_65490593)_(65495411_?)del\n", - "WARNING:root:Skipping variant g.(65353618_65384385)_(65405368_65490593)del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64081885_64081888del\n", - "WARNING:root:Skipping variant g.64591401_64591408dup\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63721237_63721240del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64591256_64591272del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.63721436del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64591039_64591042del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.63720695_63720714del\n", - "WARNING:root:Skipping variant g.65384480dup\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.65495064del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64066470_64066473del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64066470_64066473del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.63720695_63720714del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63720668del\n", - "WARNING:root:Skipping variant g.64912603dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63720730_63720733del\n", - "WARNING:root:Skipping variant g.64902132_64902133del\n", - "WARNING:root:Skipping variant g.63720954dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64066335del\n", - "WARNING:root:Skipping variant g.64886711del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.63778166_63778172del\n", - "WARNING:root:Skipping variant g.64590914dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63726618_63726622del\n", - "WARNING:root:Skipping variant g.64886728_64886736del\n", - "WARNING:root:Skipping variant g.64893157_64947352del\n", - "WARNING:root:Skipping variant g.64797009_64846087del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.63726519del\n", - "WARNING:root:Skipping variant g.64997275_64998015del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64081605_64082252del\n", - "WARNING:root:Skipping variant g.63777755_63789474dup\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.64081605_64082252del\n", - "WARNING:root:Skipping variant g.65353208_65353867del\n", - "WARNING:root:Skipping variant g.63984134_63984854del\n", - "WARNING:root:Skipping variant g.63743557_63907234del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.64662532_64979780del\n", - "WARNING:root:Skipping variant g.64839119_64970113del\n", - "WARNING:root:Skipping variant g.63720872_63720873del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64081885_64081888del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.63720682dup\n", - "WARNING:root:Skipping variant g.63721619dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64388841del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63720649_63720653del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63720695_63720714del\n", - "WARNING:root:Skipping variant g.65384384_65384387del\n", - "WARNING:root:Skipping variant g.63720730_63720733del\n", - "WARNING:root:Skipping variant g.65335105_65335108del\n", - "WARNING:root:Skipping variant g.64439165del\n", - "WARNING:root:Skipping variant g.63999116del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63998527_64002156del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.65495379dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63720730_63720733del\n", - "WARNING:root:Skipping variant g.63720730_63720733del\n", - "WARNING:root:Skipping variant g.63721237_63721240del\n", - "WARNING:root:Skipping variant g.63721237_63721240del\n", - "WARNING:root:Skipping variant g.63721237_63721240del\n", - "WARNING:root:Skipping variant g.64081885_64081888del\n", - "WARNING:root:Skipping variant g.65495348dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64439331del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64912705del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64439200dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.65495181del\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.63721625dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.65384480del\n", - "WARNING:root:Skipping variant g.64388841_64388843del\n", - "WARNING:root:Skipping variant g.63998527_64002156del\n", - "WARNING:root:Skipping variant g.64590525_64590550delinsTA\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64081884del\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.216327637C>T\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.215879068C>T\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.215878931del\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.65384480dup\n", - "WARNING:root:Skipping variant g.216073265G>A\n", - "WARNING:root:Skipping variant g.65495206_65495207insTGCCAGTTTA\n", - "WARNING:root:Skipping variant g.63721227dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.63720990_63720991insATAT\n", - "WARNING:root:Skipping variant g.63720728_63720729del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63720947_63720948insT\n", - "WARNING:root:Skipping variant g.64590875_64590876insTCTT\n", - "WARNING:root:Skipping variant g.63721432del\n", - "WARNING:root:Skipping variant g.64591501_64591502insAGAA\n", - "WARNING:root:Skipping variant g.64590556_64590566del\n", - "WARNING:root:Skipping variant g.63999116del\n", - "WARNING:root:Skipping variant g.63720753_63720754dup\n", - "WARNING:root:Skipping variant g.(?_64945792)_(64945915_64997581)del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63720730_63720733del\n", - "WARNING:root:Skipping variant g.63721377_63721384del\n", - "WARNING:root:Skipping variant g.63726524del\n", - "WARNING:root:Skipping variant g.64423168_64798957delinsATGA\n", - "WARNING:root:Skipping variant g.63942752_64337822delinsATTATG\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant g.?\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64423168_64798957delinsATGA\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63942752_64337822delinsATTATG\n", - "WARNING:root:Skipping variant g.63957115_63958454del\n", - "WARNING:root:Skipping variant g.65001113_65005820del\n", - "WARNING:root:Skipping variant g.65550144_65552138del\n", - "WARNING:root:Skipping variant g.65689153_65694794del\n", - "WARNING:root:Skipping variant g.65454073_65454074insN[305]\n", - "WARNING:root:Skipping variant g.64296539_64296632del\n", - "WARNING:root:Skipping variant g.65204982_65205044del\n", - "WARNING:root:Skipping variant g.65564961_65565284del\n", - "WARNING:root:Skipping variant :g.64295412_64295413insN[118]\n", - "WARNING:root:Skipping variant g.65278328_65278329insN[59]\n", - "WARNING:root:Skipping variant g.64590911dup\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64590875_64590876insTCTT\n", - "WARNING:root:Skipping variant g.63720947_63720948insT\n", - "WARNING:root:Skipping variant g.64591501_64591502insAGAA\n", - "WARNING:root:Skipping variant g.63721432del\n", - "WARNING:root:Skipping variant g.64590556_64590566del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.65658176_65718924del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant NM_001142800.2:c.6079-2A>G\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.63999116del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.64986218_65013355del\n", - "WARNING:root:Skipping variant g.64388690_64388840del\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant g.(?_63719980)_(63726681_63762460)dup\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.65274506_65316845delinsAGATCA\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.65494100_65508832del\n", - "WARNING:root:Skipping variant g.65213025_65296862delinsGTTTTCTTTTTA\n", - "WARNING:root:Skipping variant g.64066349del\n", - "WARNING:root:Skipping variant g.64602159_64657461dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63762589del\n", - "WARNING:root:Skipping variant g.65383303_65441305delinsAACTTTTACT\n", - "WARNING:root:Skipping variant g.63720737_63720746del\n", - "WARNING:root:Skipping variant g.65284957_66872862delinsT\n", - "WARNING:root:Skipping variant g.64122444_64129159delins64204448_64235506inv\n", - "WARNING:root:Skipping variant g.64937848_64948401delins[64944099_64944163inv;CAATTTTGTAT]\n", - "WARNING:root:Skipping variant g.63721385_63721386del\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63720654_63720657del\n", - "WARNING:root:Skipping variant g.64591069_64591081del\n", - "WARNING:root:Skipping variant g.64886841del\n", - "WARNING:root:Skipping variant g.64790603_64977512del\n", - "WARNING:root:Skipping variant g.63721625dup\n", - "WARNING:root:Skipping variant g.64591514_64591520del\n", - "WARNING:root:Skipping variant g.63720649_63720653del\n", - "WARNING:root:Skipping variant g.65479942_67131267inv\n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n", - "WARNING:root:Skipping variant \n" - ] - } - ], - "source": [ - "save_lovd_as_vcf(data[\"Variants_On_Genome\"], \"./lovd.vcf\")" - ] + "outputs": [], + "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "execution_count": 4, - "id": "c7ff16903e0c52bd", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-13T15:58:47.868055Z", - "start_time": "2024-05-13T15:58:41.380466Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-05-13 18:58:41.794056: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-05-13 18:58:41.794769: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2024-05-13 18:58:41.797917: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2024-05-13 18:58:41.857361: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-05-13 18:58:42.410244: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", - "2024-05-13 18:58:42.957291: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", - "2024-05-13 18:58:42.957684: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", - "Skipping registering GPU devices...\n", - "WARNING:absl:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.\n", - "WARNING:absl:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.\n", - "WARNING:absl:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.\n", - "WARNING:absl:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.\n", - "WARNING:absl:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m1s\u001B[0m 595ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m1s\u001B[0m 554ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m1s\u001B[0m 553ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m1s\u001B[0m 548ms/step\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:5 out of the last 5 calls to .one_step_on_data_distributed at 0x7dee981abf40> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n", - "WARNING:tensorflow:5 out of the last 5 calls to .one_step_on_data_distributed at 0x7dee981abf40> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m1s\u001B[0m 548ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 49ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 51ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 51ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 59ms/step\n", - "\u001B[1m1/1\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 53ms/step\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:root:dede['G|EYS|0.00|0.00|0.00|0.00|3|9|-20|9']\n" - ] - }, - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "from subprocess import Popen\n", "\n", - "\n", "process = Popen(\"spliceai -I ./lovd.vcf -O ./lovd_output.vcf -R ../tools/spliceai/hg38.fa -A grch38\".split())\n", "process.wait()" - ] + ], + "id": "c7ff16903e0c52bd", + "outputs": [], + "execution_count": null }, { - "cell_type": "code", - "execution_count": 1, - "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/nojsaj/kath/tools/revel/revel.py:9: DtypeWarning: Columns (0,2) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " revel_data = pd.read_csv(revel_file)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
chrhg19_posgrch38_posrefaltaarefaaaltREVELEnsembl_transcriptid
2875612766565575864945865TAQL0.188ENST00000503581;ENST00000370621;ENST00000370616
2875612866565575864945865TCQR0.111ENST00000503581;ENST00000370621;ENST00000370616
2875612966565575864945865TGQP0.344ENST00000503581;ENST00000370621;ENST00000370616
\n", - "
" - ], - "text/plain": [ - " chr hg19_pos grch38_pos ref alt aaref aaalt REVEL \\\n", - "28756127 6 65655758 64945865 T A Q L 0.188 \n", - "28756128 6 65655758 64945865 T C Q R 0.111 \n", - "28756129 6 65655758 64945865 T G Q P 0.344 \n", - "\n", - " Ensembl_transcriptid \n", - "28756127 ENST00000503581;ENST00000370621;ENST00000370616 \n", - "28756128 ENST00000503581;ENST00000370621;ENST00000370616 \n", - "28756129 ENST00000503581;ENST00000370621;ENST00000370616 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "cell_type": "code", "source": [ - "from tools import get_revel_scores\n", + "from api.tools import get_revel_scores\n", "\n", "chromosome = 6\n", "position = 65655758\n", @@ -3905,86 +230,26 @@ "results = get_revel_scores(chromosome, position)\n", "\n", "display(results)" - ] - }, - { - "cell_type": "code", - "outputs": [], - "source": [ - "from api.data.collection import store_database_for_eys_gene\n", - "store_database_for_eys_gene(\"clinvar\", override=False)" ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-08-12T16:46:07.973915Z", - "start_time": "2024-08-12T16:46:07.970874Z" - } - }, - "id": "b80a1049abe7596e", - "execution_count": 12 + "id": "0514ccc3-5c91-41ad-ab15-f4158030ea14", + "outputs": [], + "execution_count": null }, { + "metadata": {}, "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The file at ../data/lovd/lovd_data.txt already exists.\n" - ] - } - ], - "source": [ - "from api.data.collection import store_database_for_eys_gene\n", - "store_database_for_eys_gene(\"lovd\", override=False)" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-08-12T16:47:56.094297Z", - "start_time": "2024-08-12T16:47:56.090300Z" - } - }, - "id": "a1e10fc8175753a0", - "execution_count": 4 + "source": "", + "id": "6f0abfb50bd211a0", + "outputs": [], + "execution_count": null }, { - "cell_type": "code", - "outputs": [ - { - "data": { - "text/plain": " gnomAD ID Chromosome Position rsIDs Reference \\\n0 6-63720525-A-G 6 63720525 rs1768331164 A \n1 6-63720525-A-T 6 63720525 rs1768331164 A \n2 6-63720526-T-A 6 63720526 T \n3 6-63720531-C-CAA 6 63720531 C \n4 6-63720531-C-G 6 63720531 rs927390284 C \n... ... ... ... ... ... \n11083 6-65495478-G-A 6 65495478 rs530118054 G \n11084 6-65495479-G-A 6 65495479 rs1766225632 G \n11085 6-65495482-A-G 6 65495482 rs1766225707 A \n11086 6-65495484-T-G 6 65495484 rs1766225807 T \n11087 6-65495485-T-C 6 65495485 T \n\n Alternate Source Filters - exomes \\\n0 G gnomAD Exomes,gnomAD Genomes PASS \n1 T gnomAD Genomes \n2 A gnomAD Exomes PASS \n3 CAA gnomAD Exomes PASS \n4 G gnomAD Exomes,gnomAD Genomes PASS \n... ... ... ... \n11083 A gnomAD Exomes,gnomAD Genomes PASS \n11084 A gnomAD Exomes PASS \n11085 G gnomAD Exomes,gnomAD Genomes PASS \n11086 G gnomAD Exomes PASS \n11087 C gnomAD Exomes PASS \n\n Filters - genomes Transcript ... Homozygote Count Amish \\\n0 PASS ENST00000503581.6 ... 0 \n1 PASS ENST00000503581.6 ... 0 \n2 ENST00000503581.6 ... 0 \n3 ENST00000503581.6 ... 0 \n4 PASS ENST00000503581.6 ... 0 \n... ... ... ... ... \n11083 PASS ENST00000503581.6 ... 0 \n11084 ENST00000503581.6 ... 0 \n11085 PASS ENST00000503581.6 ... 0 \n11086 ENST00000503581.6 ... 0 \n11087 ENST00000503581.6 ... 0 \n\n Hemizygote Count Amish Allele Count South Asian \\\n0 0 0 \n1 0 0 \n2 0 1 \n3 0 0 \n4 0 0 \n... ... ... \n11083 0 4 \n11084 0 0 \n11085 0 0 \n11086 0 5 \n11087 0 1 \n\n Allele Number South Asian Homozygote Count South Asian \\\n0 55362 0 \n1 55362 0 \n2 55360 0 \n3 57520 0 \n4 57524 0 \n... ... ... \n11083 88352 0 \n11084 88316 0 \n11085 87828 0 \n11086 87818 0 \n11087 87636 0 \n\n Hemizygote Count South Asian Allele Count Remaining \\\n0 0 0 \n1 0 0 \n2 0 0 \n3 0 1 \n4 0 0 \n... ... ... \n11083 0 8 \n11084 0 0 \n11085 0 0 \n11086 0 0 \n11087 0 0 \n\n Allele Number Remaining Homozygote Count Remaining \\\n0 44082 0 \n1 44082 0 \n2 44162 0 \n3 47700 0 \n4 47678 0 \n... ... ... \n11083 60164 0 \n11084 60142 0 \n11085 59586 0 \n11086 59748 0 \n11087 59608 0 \n\n Hemizygote Count Remaining \n0 0 \n1 0 \n2 0 \n3 0 \n4 0 \n... ... \n11083 0 \n11084 0 \n11085 0 \n11086 0 \n11087 0 \n\n[11088 rows x 72 columns]", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
gnomAD IDChromosomePositionrsIDsReferenceAlternateSourceFilters - exomesFilters - genomesTranscript...Homozygote Count AmishHemizygote Count AmishAllele Count South AsianAllele Number South AsianHomozygote Count South AsianHemizygote Count South AsianAllele Count RemainingAllele Number RemainingHomozygote Count RemainingHemizygote Count Remaining
06-63720525-A-G663720525rs1768331164AGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6...000553620004408200
16-63720525-A-T663720525rs1768331164ATgnomAD Genomes<NA>PASSENST00000503581.6...000553620004408200
26-63720526-T-A663720526<NA>TAgnomAD ExomesPASS<NA>ENST00000503581.6...001553600004416200
36-63720531-C-CAA663720531<NA>CCAAgnomAD ExomesPASS<NA>ENST00000503581.6...000575200014770000
46-63720531-C-G663720531rs927390284CGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6...000575240004767800
..................................................................
110836-65495478-G-A665495478rs530118054GAgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6...004883520086016400
110846-65495479-G-A665495479rs1766225632GAgnomAD ExomesPASS<NA>ENST00000503581.6...000883160006014200
110856-65495482-A-G665495482rs1766225707AGgnomAD Exomes,gnomAD GenomesPASSPASSENST00000503581.6...000878280005958600
110866-65495484-T-G665495484rs1766225807TGgnomAD ExomesPASS<NA>ENST00000503581.6...005878180005974800
110876-65495485-T-C665495485<NA>TCgnomAD ExomesPASS<NA>ENST00000503581.6...001876360005960800
\n

11088 rows × 72 columns

\n
" - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from api.data.refactoring import parse_gnomad, set_gnomad_dtypes\n", - "from api import (store_database_for_eys_gene,GNOMAD_PATH)\n", - "\n", - "store_database_for_eys_gene('gnomad', False)\n", - "\n", - "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", - "\n", - "set_gnomad_dtypes(gnomad_data)\n", - "\n", - "gnomad_data" - ], "metadata": { - "collapsed": false, "ExecuteTime": { - "end_time": "2024-08-20T18:43:31.996838Z", - "start_time": "2024-08-20T18:43:30.905943Z" + "end_time": "2024-09-10T13:54:32.598753Z", + "start_time": "2024-09-10T13:54:22.948649Z" } }, - "id": "4ba7fd02a60f5693", - "execution_count": 1 - }, - { "cell_type": "code", "source": [ "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", @@ -4015,14 +280,7 @@ "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", "final_data" ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-09-03T14:19:14.730427Z", - "start_time": "2024-09-03T14:19:05.969159Z" - } - }, - "id": "dd9b17623f26a07c", + "id": "895c77782fbf2e21", "outputs": [ { "name": "stdout", @@ -4474,14 +732,12 @@ "execution_count": 1 }, { + "metadata": {}, "cell_type": "code", "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "id": "50b0e50e88fa0914", - "execution_count": null + "execution_count": null, + "source": "", + "id": "a84eee9bd9b294c9" } ], "metadata": { From 50739383e4e6e16df3a87db233511013b765762d Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Tue, 10 Sep 2024 17:00:59 +0300 Subject: [PATCH 14/15] updated pipeline.ipynb to prevent conflict --- tests/pipeline.ipynb | 496 ------------------------------------------- 1 file changed, 496 deletions(-) diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index e85aff1..71cf21d 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -242,502 +242,6 @@ "id": "6f0abfb50bd211a0", "outputs": [], "execution_count": null - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-10T13:54:32.598753Z", - "start_time": "2024-09-10T13:54:22.948649Z" - } - }, - "cell_type": "code", - "source": [ - "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", - "import pandas as pd\n", - "from api import (store_database_for_eys_gene,\n", - " parse_lovd,\n", - " set_lovd_dtypes,\n", - " LOVD_PATH,\n", - " GNOMAD_PATH)\n", - "\n", - "store_database_for_eys_gene('lovd', False)\n", - "store_database_for_eys_gene('gnomad', False)\n", - "\n", - "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", - "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", - "\n", - "set_lovd_dtypes(lovd_data)\n", - "set_gnomad_dtypes(gnomad_data)\n", - "\n", - "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", - "\n", - "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", - " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n", - " on='id',\n", - " how='left')\n", - "\n", - "gnomad_data = gnomad_data.copy()\n", - "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", - "final_data" - ], - "id": "895c77782fbf2e21", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The file at ../data/lovd/lovd_data.txt already exists.\n" - ] - }, - { - "data": { - "text/plain": [ - " id transcriptid effectid position_c_start \\\n", - "0 822823 7329 70 632 \n", - "1 822787 7329 70 8391 \n", - "2 822843 7329 70 5608 \n", - "3 822771 7329 70 8206 \n", - "4 \n", - "... ... ... ... ... \n", - "13218 959060 7329 70 9383 \n", - "13219 959064 7329 50 0 \n", - "13220 985494 7329 70 2137 \n", - "13221 986425 7329 90 4361 \n", - "13222 987322 7329 90 9299 \n", - "\n", - " position_c_start_intron position_c_end position_c_end_intron \\\n", - "0 0 632 0 \n", - "1 0 8391 0 \n", - "2 0 5608 0 \n", - "3 0 8206 0 \n", - "4 \n", - "... ... ... ... \n", - "13218 0 9387 0 \n", - "13219 0 0 0 \n", - "13220 20590 3444 -29847 \n", - "13221 0 4362 0 \n", - "13222 0 9302 0 \n", - "\n", - " VariantOnTranscript/DNA VariantOnTranscript/RNA \\\n", - "0 c.632G>A r.(?) \n", - "1 c.8391del r.(?) \n", - "2 c.5608C>T r.(?) \n", - "3 c.8206G>C r.(?) \n", - "4 \n", - "... ... ... \n", - "13218 c.9383_9387del r.(?) \n", - "13219 c.-538_862+10652{1}inv r.? \n", - "13220 c.2137+20590_3444-29847del r.? \n", - "13221 c.4361_4362delinsAG r.(?) \n", - "13222 c.9299_9302del r.(?) \n", - "\n", - " VariantOnTranscript/Protein ... Homozygote Count Amish_gnomad \\\n", - "0 p.(Cys211Tyr) ... \n", - "1 p.(Gly2799Valfs*31) ... \n", - "2 p.(Arg1870Trp) ... \n", - "3 p.(Ala2736Pro) ... \n", - "4 ... 0 \n", - "... ... ... ... \n", - "13218 p.(Lys3128ArgfsTer7) ... \n", - "13219 p.? ... \n", - "13220 p.(Val713AspfsTer14) ... \n", - "13221 p.(Ser1454Ter) ... \n", - "13222 p.(Thr3100LysfsTer26) ... \n", - "\n", - " Hemizygote Count Amish_gnomad Allele Count South Asian_gnomad \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 0 0 \n", - "... ... ... \n", - "13218 \n", - "13219 \n", - "13220 \n", - "13221 \n", - "13222 \n", - "\n", - " Allele Number South Asian_gnomad Homozygote Count South Asian_gnomad \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 55362 0 \n", - "... ... ... \n", - "13218 \n", - "13219 \n", - "13220 \n", - "13221 \n", - "13222 \n", - "\n", - " Hemizygote Count South Asian_gnomad Allele Count Remaining_gnomad \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 0 0 \n", - "... ... ... \n", - "13218 \n", - "13219 \n", - "13220 \n", - "13221 \n", - "13222 \n", - "\n", - " Allele Number Remaining_gnomad Homozygote Count Remaining_gnomad \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 44082 0 \n", - "... ... ... \n", - "13218 \n", - "13219 \n", - "13220 \n", - "13221 \n", - "13222 \n", - "\n", - " Hemizygote Count Remaining_gnomad \n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 0 \n", - "... ... \n", - "13218 \n", - "13219 \n", - "13220 \n", - "13221 \n", - "13222 \n", - "\n", - "[13223 rows x 86 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idtranscriptideffectidposition_c_startposition_c_start_intronposition_c_endposition_c_end_intronVariantOnTranscript/DNAVariantOnTranscript/RNAVariantOnTranscript/Protein...Homozygote Count Amish_gnomadHemizygote Count Amish_gnomadAllele Count South Asian_gnomadAllele Number South Asian_gnomadHomozygote Count South Asian_gnomadHemizygote Count South Asian_gnomadAllele Count Remaining_gnomadAllele Number Remaining_gnomadHomozygote Count Remaining_gnomadHemizygote Count Remaining_gnomad
082282373297063206320c.632G>Ar.(?)p.(Cys211Tyr)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
18227877329708391083910c.8391delr.(?)p.(Gly2799Valfs*31)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
28228437329705608056080c.5608C>Tr.(?)p.(Arg1870Trp)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
38227717329708206082060c.8206G>Cr.(?)p.(Ala2736Pro)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>...000553620004408200
..................................................................
132189590607329709383093870c.9383_9387delr.(?)p.(Lys3128ArgfsTer7)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132199590647329500000c.-538_862+10652{1}invr.?p.?...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132209854947329702137205903444-29847c.2137+20590_3444-29847delr.?p.(Val713AspfsTer14)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132219864257329904361043620c.4361_4362delinsAGr.(?)p.(Ser1454Ter)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
132229873227329909299093020c.9299_9302delr.(?)p.(Thr3100LysfsTer26)...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n", - "

13223 rows × 86 columns

\n", - "
" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 1 - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "a84eee9bd9b294c9" } ], "metadata": { From 68b8472c2be1a1f8810cbff6f8079f53837734a3 Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Tue, 10 Sep 2024 17:08:54 +0300 Subject: [PATCH 15/15] updated pipeline.ipynb to prevent conflict --- tests/pipeline.ipynb | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 71cf21d..1382b31 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -238,10 +238,38 @@ { "metadata": {}, "cell_type": "code", - "source": "", - "id": "6f0abfb50bd211a0", "outputs": [], - "execution_count": null + "execution_count": null, + "source": [ + "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", + "import pandas as pd\n", + "from api import (store_database_for_eys_gene,\n", + " parse_lovd,\n", + " set_lovd_dtypes,\n", + " LOVD_PATH,\n", + " GNOMAD_PATH)\n", + "\n", + "store_database_for_eys_gene('lovd', False)\n", + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", + "\n", + "set_lovd_dtypes(lovd_data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()\n", + "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "final_data" + ], + "id": "ba435cd29d565f7d" } ], "metadata": {