@@ -47,75 +47,86 @@ def calculate_max_frequency(row):
47
47
return pd .Series ([max_freq , max_pop ], index = ['PopMax' , 'PopMax population' ])
48
48
49
49
50
- # MAIN
51
- # Download all data
52
- store_database_for_eys_gene ('lovd' , True )
53
- store_database_for_eys_gene ('gnomad' , True )
54
- store_database_for_eys_gene ('clinvar' , True )
55
-
56
- # Read and convert data
57
- lovd_data = parse_lovd (LOVD_PATH + "/lovd_data.txt" )
58
- gnomad_data = pd .read_csv (GNOMAD_PATH + "/gnomad_data.csv" )
59
- clinvar_data = pd .read_csv (CLINVAR_PATH + "/clinvar_data.txt" , sep = '\t ' )
60
-
61
- convert_lovd_to_datatype (lovd_data )
62
-
63
- # renaming databases' columns
64
- gnomad_data .columns += "(gnomad)"
65
- clinvar_data .columns += "(clinvar)"
66
-
67
- # Reading main working table
68
- main_frame = lovd_data ["Variants_On_Transcripts" ][0 ].copy ()
69
- notes = lovd_data ["Variants_On_Transcripts" ][1 ][::]
70
-
71
- # Merging Clinvar
72
- clinvar = clinvar_data .copy ()[["Name(clinvar)" ,
73
- "Germline classification(clinvar)" ,
74
- "Accession(clinvar)" ]]
75
- clinvar ["VariantOnTranscript/DNA" ] = (clinvar ["Name(clinvar)" ].
76
- apply (from_clinvar_name_to_cdna_position ))
77
-
78
- main_frame = pd .merge (main_frame ,
79
- clinvar ,
80
- how = "outer" ,
81
- on = ["VariantOnTranscript/DNA" ]).drop ("Name(clinvar)" , axis = 1 )
82
-
83
- # MERGING GnomAd
84
- main_frame = (pd .merge (main_frame ,
85
- gnomad_data ,
86
- how = "left" ,
87
- left_on = "VariantOnTranscript/DNA" ,
88
- right_on = "HGVS Consequence(gnomad)" ).drop ("HGVS Consequence(gnomad)" ,
89
- axis = 1 ))
90
-
91
- # Calculating frequencies
92
- lovd_without_association_in_gnomad = pd .isnull (main_frame ["Hemizygote Count Remaining(gnomad)" ])
93
- lovd_with_gnomad = main_frame [~ lovd_without_association_in_gnomad ].copy ()
94
- max_values = lovd_with_gnomad .apply (calculate_max_frequency , axis = 1 )
95
- lovd_with_gnomad [['PopMax(gnomad)' , 'PopMax population(gnomad)' ]] = max_values
96
-
97
- # Leaving necessary columns
98
-
99
- lovd_with_gnomad = lovd_with_gnomad .loc [:, ['id' ,
100
- 'transcriptid' ,
101
- 'effectid' ,
102
- 'position_c_start' ,
103
- 'position_c_start_intron' ,
104
- 'position_c_end' ,
105
- 'position_c_end_intron' ,
106
- 'VariantOnTranscript/DNA' ,
107
- 'VariantOnTranscript/RNA' ,
108
- 'VariantOnTranscript/Protein' ,
109
- 'VariantOnTranscript/Exon' ,
110
- 'Germline classification(clinvar)' ,
111
- 'Accession(clinvar)' ,
112
- 'Allele Frequency(gnomad)' ,
113
- 'Homozygote Count(gnomad)' ,
114
- 'PopMax(gnomad)' ,
115
- 'PopMax population(gnomad)' ]]
116
-
117
- # final join
118
- main_frame = main_frame .iloc [:, range (13 )]
119
- main_frame = pd .merge (main_frame , lovd_with_gnomad , how = "left" , on = list (main_frame .columns [:13 ]))
120
-
121
- main_frame .to_csv (DATA_PATH + "/final.csv" )
50
def main():
    """
    Run the pipeline that collects and merges data from LOVD, gnomAD and ClinVar.

    Steps, in order:

    1. Download the three databases with ``store_database_for_eys_gene``.
    2. Load the downloaded files (LOVD through ``parse_lovd``, the other two
       through ``pandas.read_csv``) and normalise LOVD column types with
       ``convert_lovd_to_datatype``.
    3. Outer-join ClinVar onto the LOVD ``Variants_On_Transcripts`` table by
       the ``VariantOnTranscript/DNA`` column.
    4. Left-join gnomAD by matching ``VariantOnTranscript/DNA`` against
       ``HGVS Consequence(gnomad)``.
    5. For the rows that found a gnomAD match, compute the ``PopMax`` columns
       with ``calculate_max_frequency``.
    6. Join the reduced gnomAD summary back onto the LOVD/ClinVar columns and
       write the result to ``DATA_PATH + "/final.csv"``.

    Takes no arguments and returns ``None``; the only output is the CSV file.
    """

    # Download all data.
    # NOTE(review): the second argument is presumably an "override existing
    # file" flag - confirm against store_database_for_eys_gene.
    for database_name in ('lovd', 'gnomad', 'clinvar'):
        store_database_for_eys_gene(database_name, True)

    # Read and convert data
    lovd_data = parse_lovd(LOVD_PATH + "/lovd_data.txt")
    gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
    # A single tab is used as the separator: a multi-character separator would
    # be interpreted by pandas as a regular expression and would not split a
    # plain tab-delimited file.
    clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep="\t")

    convert_lovd_to_datatype(lovd_data)

    # Suffix every column with its source database so that the origin of a
    # column stays visible after the merges below.
    gnomad_data.columns += "(gnomad)"
    clinvar_data.columns += "(clinvar)"

    # Reading main working table
    main_frame = lovd_data["Variants_On_Transcripts"][0].copy()

    # Merging ClinVar: keep only the columns that are needed and derive the
    # join key from the ClinVar variant name.
    clinvar = clinvar_data[["Name(clinvar)",
                            "Germline classification(clinvar)",
                            "Accession(clinvar)"]].copy()
    clinvar["VariantOnTranscript/DNA"] = (
        clinvar["Name(clinvar)"].apply(from_clinvar_name_to_cdna_position)
    )

    # Outer join: variants present in only one of the two sources are kept.
    main_frame = pd.merge(main_frame,
                          clinvar,
                          how="outer",
                          on=["VariantOnTranscript/DNA"])
    main_frame = main_frame.drop("Name(clinvar)", axis=1)

    # Merging gnomAD: left join, so no rows are added for gnomAD-only variants.
    main_frame = pd.merge(main_frame,
                          gnomad_data,
                          how="left",
                          left_on="VariantOnTranscript/DNA",
                          right_on="HGVS Consequence(gnomad)")
    main_frame = main_frame.drop("HGVS Consequence(gnomad)", axis=1)

    # Calculating frequencies, only for rows that found a gnomAD match. A
    # missing "Hemizygote Count Remaining(gnomad)" value is used as the marker
    # for "no gnomAD record was joined to this row".
    has_gnomad_match = main_frame["Hemizygote Count Remaining(gnomad)"].notna()
    lovd_with_gnomad = main_frame[has_gnomad_match].copy()
    max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
    lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values

    # Leaving necessary columns
    necessary_columns = [
        'id',
        'transcriptid',
        'effectid',
        'position_c_start',
        'position_c_start_intron',
        'position_c_end',
        'position_c_end_intron',
        'VariantOnTranscript/DNA',
        'VariantOnTranscript/RNA',
        'VariantOnTranscript/Protein',
        'VariantOnTranscript/Exon',
        'Germline classification(clinvar)',
        'Accession(clinvar)',
        'Allele Frequency(gnomad)',
        'Homozygote Count(gnomad)',
        'PopMax(gnomad)',
        'PopMax population(gnomad)',
    ]
    lovd_with_gnomad = lovd_with_gnomad.loc[:, necessary_columns]

    # Final join: drop the raw gnomAD columns from the main table by keeping
    # only its leading columns, then attach the gnomAD summary computed above.
    # Every kept column is used as a join key, so each of them must also be
    # present in lovd_with_gnomad.
    # NOTE(review): 13 is presumably the LOVD columns plus the two ClinVar
    # columns - confirm against the LOVD table layout.
    key_column_count = 13
    main_frame = main_frame.iloc[:, :key_column_count]
    main_frame = pd.merge(main_frame,
                          lovd_with_gnomad,
                          how="left",
                          on=list(main_frame.columns))

    main_frame.to_csv(DATA_PATH + "/final.csv")


if __name__ == "__main__":
    main()
0 commit comments