@@ -47,75 +47,80 @@ def calculate_max_frequency(row):
47
47
return pd .Series ([max_freq , max_pop ], index = ['PopMax' , 'PopMax population' ])
48
48
49
49
50
- # MAIN
51
- # Download all data
52
- store_database_for_eys_gene ('lovd' , True )
53
- store_database_for_eys_gene ('gnomad' , True )
54
- store_database_for_eys_gene ('clinvar' , True )
55
-
56
- # Read and convert data
57
- lovd_data = parse_lovd (LOVD_PATH + "/lovd_data.txt" )
58
- gnomad_data = pd .read_csv (GNOMAD_PATH + "/gnomad_data.csv" )
59
- clinvar_data = pd .read_csv (CLINVAR_PATH + "/clinvar_data.txt" , sep = '\t ' )
60
-
61
- convert_lovd_to_datatype (lovd_data )
62
-
63
- # renaming databases' columns
64
- gnomad_data .columns += "(gnomad)"
65
- clinvar_data .columns += "(clinvar)"
66
-
67
- # Reading main working table
68
- main_frame = lovd_data ["Variants_On_Transcripts" ][0 ].copy ()
69
- notes = lovd_data ["Variants_On_Transcripts" ][1 ][::]
70
-
71
- # Merging Clinvar
72
- clinvar = clinvar_data .copy ()[["Name(clinvar)" ,
73
- "Germline classification(clinvar)" ,
74
- "Accession(clinvar)" ]]
75
- clinvar ["VariantOnTranscript/DNA" ] = (clinvar ["Name(clinvar)" ].
76
- apply (from_clinvar_name_to_cdna_position ))
77
-
78
- main_frame = pd .merge (main_frame ,
79
- clinvar ,
80
- how = "outer" ,
81
- on = ["VariantOnTranscript/DNA" ]).drop ("Name(clinvar)" , axis = 1 )
82
-
83
- # MERGING GnomAd
84
- main_frame = (pd .merge (main_frame ,
85
- gnomad_data ,
86
- how = "left" ,
87
- left_on = "VariantOnTranscript/DNA" ,
88
- right_on = "HGVS Consequence(gnomad)" ).drop ("HGVS Consequence(gnomad)" ,
89
- axis = 1 ))
90
-
91
- # Calculating frequencies
92
- lovd_without_association_in_gnomad = pd .isnull (main_frame ["Hemizygote Count Remaining(gnomad)" ])
93
- lovd_with_gnomad = main_frame [~ lovd_without_association_in_gnomad ].copy ()
94
- max_values = lovd_with_gnomad .apply (calculate_max_frequency , axis = 1 )
95
- lovd_with_gnomad [['PopMax(gnomad)' , 'PopMax population(gnomad)' ]] = max_values
96
-
97
- # Leaving necessary columns
98
-
99
- lovd_with_gnomad = lovd_with_gnomad .loc [:, ['id' ,
100
- 'transcriptid' ,
101
- 'effectid' ,
102
- 'position_c_start' ,
103
- 'position_c_start_intron' ,
104
- 'position_c_end' ,
105
- 'position_c_end_intron' ,
106
- 'VariantOnTranscript/DNA' ,
107
- 'VariantOnTranscript/RNA' ,
108
- 'VariantOnTranscript/Protein' ,
109
- 'VariantOnTranscript/Exon' ,
110
- 'Germline classification(clinvar)' ,
111
- 'Accession(clinvar)' ,
112
- 'Allele Frequency(gnomad)' ,
113
- 'Homozygote Count(gnomad)' ,
114
- 'PopMax(gnomad)' ,
115
- 'PopMax population(gnomad)' ]]
116
-
117
- # final join
118
- main_frame = main_frame .iloc [:, range (13 )]
119
- main_frame = pd .merge (main_frame , lovd_with_gnomad , how = "left" , on = list (main_frame .columns [:13 ]))
120
-
121
- main_frame .to_csv (DATA_PATH + "/final.csv" )
50
def main():
    """Build the merged LOVD / ClinVar / gnomAD variant table and save it.

    Steps, in order:

    1. Download the LOVD, gnomAD and ClinVar data for the EYS gene.
    2. Load the three data sets and convert the LOVD tables to their data types.
    3. Outer-join ClinVar and left-join gnomAD onto the LOVD
       "Variants_On_Transcripts" table, matching on the cDNA notation.
    4. For rows that have gnomAD data, compute the population maximum
       frequency with ``calculate_max_frequency``.
    5. Write the resulting table to ``DATA_PATH + "/final.csv"``.
    """
    # Download all data.
    # NOTE(review): the meaning of the second argument (True) is defined by
    # store_database_for_eys_gene, which is not visible here - confirm.
    for database_name in ('lovd', 'gnomad', 'clinvar'):
        store_database_for_eys_gene(database_name, True)

    # Read and convert data.
    lovd_data = parse_lovd(LOVD_PATH + "/lovd_data.txt")
    gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
    # The separator must be a single tab: a multi-character separator such as
    # tab + space is interpreted by pandas as a regular expression and would
    # not split a tab-delimited file into columns.
    clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')

    convert_lovd_to_datatype(lovd_data)

    # Suffix every column with its source database so that the origin of a
    # column stays visible after the merges below.
    gnomad_data.columns += "(gnomad)"
    clinvar_data.columns += "(clinvar)"

    # Main working table: first element of the LOVD "Variants_On_Transcripts"
    # entry (copied so the parsed data is not modified).
    main_frame = lovd_data["Variants_On_Transcripts"][0].copy()

    # Merging ClinVar: derive the cDNA notation from the ClinVar name and use
    # it as the join key; the raw name is dropped afterwards.
    clinvar = clinvar_data.copy()[["Name(clinvar)",
                                   "Germline classification(clinvar)",
                                   "Accession(clinvar)"]]
    clinvar["VariantOnTranscript/DNA"] = (
        clinvar["Name(clinvar)"].apply(from_clinvar_name_to_cdna_position)
    )

    main_frame = pd.merge(
        main_frame,
        clinvar,
        how="outer",
        on=["VariantOnTranscript/DNA"],
    ).drop("Name(clinvar)", axis=1)

    # Merging gnomAD: keep every row of the working table and attach gnomAD
    # columns where the HGVS consequence equals the cDNA notation.
    main_frame = pd.merge(
        main_frame,
        gnomad_data,
        how="left",
        left_on="VariantOnTranscript/DNA",
        right_on="HGVS Consequence(gnomad)",
    ).drop("HGVS Consequence(gnomad)", axis=1)

    # Calculating frequencies: a missing "Hemizygote Count Remaining(gnomad)"
    # value is used as the marker for "no matching gnomAD row".
    lovd_without_association_in_gnomad = pd.isnull(
        main_frame["Hemizygote Count Remaining(gnomad)"]
    )
    lovd_with_gnomad = main_frame[~lovd_without_association_in_gnomad].copy()
    max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
    lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values

    # Leaving necessary columns.
    lovd_with_gnomad = lovd_with_gnomad.loc[:, [
        'id',
        'transcriptid',
        'effectid',
        'position_c_start',
        'position_c_start_intron',
        'position_c_end',
        'position_c_end_intron',
        'VariantOnTranscript/DNA',
        'VariantOnTranscript/RNA',
        'VariantOnTranscript/Protein',
        'VariantOnTranscript/Exon',
        'Germline classification(clinvar)',
        'Accession(clinvar)',
        'Allele Frequency(gnomad)',
        'Homozygote Count(gnomad)',
        'PopMax(gnomad)',
        'PopMax population(gnomad)',
    ]]

    # Final join: keep only the leading columns of the working table and use
    # all of them as the join key to attach the gnomAD-derived columns.
    # NOTE(review): 13 presumably corresponds to the LOVD and ClinVar columns
    # selected above (the first 13 names in that list) - confirm against the
    # actual column order of the LOVD table.
    key_column_count = 13
    main_frame = main_frame.iloc[:, :key_column_count]
    main_frame = pd.merge(
        main_frame,
        lovd_with_gnomad,
        how="left",
        on=list(main_frame.columns),
    )

    main_frame.to_csv(DATA_PATH + "/final.csv")
123
+
124
+
125
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger downloads or file writes.
if __name__ == "__main__":
    main()
0 commit comments