@@ -179,86 +179,67 @@ def from_clinvar_name_to_cdna_position(name):
179
179
180
180
def lovd_fill_hg38 (lovd : pd .DataFrame ):
181
181
"""
182
- fills missing hg38 values in the LOVD dataframe
182
+ Fills missing hg38 values in the LOVD dataframe
183
183
by converting hg19 values to hg38.
184
184
New column 'hg19/hg38_lovd' is added to store
185
185
the converted positions in the format '6-position-ref-alt'.
186
-
187
- parameters:
188
- - lovd (pd.DataFrame): A pandas DataFrame containing following columns:
189
- - 'VariantOnGenome/DNA': hg19 values.
190
- - 'VariantOnGenome/DNA/hg38': hg38 values.
191
-
192
- returns:
193
- None: Modifies the input DataFrame in-place by adding or
194
- updating the 'hg19/hg38_lovd' column.
186
+ :param lovd: pandas DataFrame containing the following columns:
187
+ - 'VariantOnGenome/DNA': hg19 values.
188
+ - 'VariantOnGenome/DNA/hg38': hg38 values.
189
+ :return: None: Modifies the input DataFrame in-place by adding or
190
+ updating the 'hg38_gnomad_format' column.
195
191
"""
196
192
197
193
if lovd .empty :
198
194
return
195
+ lovd ['VariantOnGenome/DNA/hg38' ] = lovd ['VariantOnGenome/DNA/hg38' ].replace ('' , pd .NA )
196
+ lovd ['hg38_gnomad_format' ] = lovd .apply (convert_hg19_if_missing , axis = 1 )
199
197
200
- def convert_hg19_if_missing (row ):
201
- """
202
- converts hg19 variant to hg38 if hg38 is missing.
203
- Checks if the hg38 value is missing (NaN) in a given row.
204
- If it is, the hg19 variant is converted to hg38
205
- using the `convert_hg19_to_hg38` function.
206
- Otherwise, the existing hg38 value is formatted.
207
-
208
- parameters:
209
- - row (pd.Series): single row of the DataFrame.
210
-
211
- returns:
212
- - str: hg38 value or a conversion of the hg19 value in the format '6-position-ref-alt'.
213
- """
214
- if pd .isna (row ['VariantOnGenome/DNA/hg38' ]):
215
- return convert_hg19_to_hg38 (convert_to_gnomad_gen_pos (row ['VariantOnGenome/DNA' ]))
216
- return convert_to_gnomad_gen_pos (row ['VariantOnGenome/DNA/hg38' ])
217
-
218
- def convert_hg19_to_hg38 (position : str , lo = LiftOver ('hg19' , 'hg38' )):
219
- """
220
- converts a genomic position from hg19 to hg38 using the LiftOver tool.
221
-
222
- parameters:
223
- - position (str): string representing the hg19 variant
224
- in the format 'g.positionRef>Alt'.
225
- - lo (LiftOver): converter for coordinates between genome builds
226
-
227
- returns:
228
- - str: converted hg38 position in the format '6-position-ref-alt'.
229
- """
230
- if '?' in position :
231
- return '?'
232
- try :
233
- new_pos = lo .convert_coordinate ('chr6' , int (position [2 :10 ]))[0 ][1 ]
234
- except ValueError as ve :
235
- return f"Error processing variant (ValueError): { str (ve )} "
236
- except IndexError as ie :
237
- return f"Error processing variant (IndexError): { str (ie )} "
238
- except TypeError as te :
239
- return f"Error processing variant (TypeError): { str (te )} "
240
- return f"6-{ new_pos } -{ position [- 3 :]} "
241
198
242
- lovd ['VariantOnGenome/DNA/hg38' ] = lovd ['VariantOnGenome/DNA/hg38' ].replace ('' , pd .NA )
243
- lovd ['hg19/hg38_lovd' ] = lovd .apply (convert_hg19_if_missing , axis = 1 )
199
+ def convert_hg19_if_missing (row ):
200
+ """
201
+ Converts hg19 variant to hg38 if hg38 is missing.
202
+ Checks if the hg38 value is missing (NaN) in a given row.
203
+ If it is, the hg19 variant is converted to hg38
204
+ using the `convert_hg19_to_hg38` function.
205
+ Otherwise, the existing hg38 value is formatted.
206
+ :param row: single row of the DataFrame.
207
+ :return:
208
+ - str: hg38 value or a conversion of
209
+ the hg19 value in the format '6-position-ref-alt'.
210
+ """
211
+
212
+ if pd .isna (row ['VariantOnGenome/DNA/hg38' ]):
213
+ return convert_hg19_to_hg38 (convert_to_gnomad_gen (row ['VariantOnGenome/DNA' ]))
214
+ return convert_to_gnomad_gen (row ['VariantOnGenome/DNA/hg38' ])
244
215
245
216
246
- def convert_to_gnomad_gen_pos ( variant : str ):
217
+ def convert_hg19_to_hg38 ( position : str , lo = LiftOver ( 'hg19' , 'hg38' ) ):
247
218
"""
248
- converts a variant string from hg19 or hg38 format
249
- to the format used by gnomAD ('6-position-ref-alt').
219
+ Converts a genomic position from hg19 to hg38 using the LiftOver tool.
220
+ :param position: string representing the hg19 variant
221
+ in the format 'g.positionRef>Alt'.
222
+ :param lo: converter for coordinates between genome builds
223
+ :return: string converted hg38 position in the format '6-position-ref-alt'.
224
+ """
225
+
226
+ if '?' in position :
227
+ return '?'
228
+ new_pos = lo .convert_coordinate ('chr6' , int (position [2 :10 ]))[0 ][1 ]
229
+ return f"6-{ new_pos } -{ position [- 3 :]} "
250
230
251
- parameters:
252
- - variant (str): string representing the variant
253
- in the format 'g.startRef>Alt'.
254
231
255
- returns:
256
- - str: variant formatted as '6-position-ref-alt'
232
+ def convert_to_gnomad_gen (variant : str ):
233
+ """
234
+ Converts a variant string from hg19 or hg38 format
235
+ to the format used by gnomAD ('6-position-ref-alt').
236
+ :param variant: str: the variant in the format 'g.startRef>Alt'.
237
+ :return: str: variant formatted as '6-position-ref-alt'
257
238
or '?' if the input contains interval ranges or is invalid.
258
239
"""
259
240
260
241
if not isinstance (variant , str ):
261
- return "?"
242
+ raise TypeError ( f"Expected a string for 'variant', got { type ( variant ). __name__ } instead" )
262
243
263
244
patterns = {
264
245
'dup' : re .compile (r'^g\.(\d+)dup$' ),
@@ -308,7 +289,7 @@ def merge_gnomad_lovd(lovd, gnomad):
308
289
lovd ,
309
290
gnomad ,
310
291
how = "outer" ,
311
- left_on = "hg19/hg38_lovd " ,
292
+ left_on = "hg38_gnomad_format " ,
312
293
right_on = "gnomAD ID_gnomad"
313
294
)
314
295
0 commit comments