Skip to content

Commit b65e971

Browse files
committed
added exceptions for git ignore, fixing naming issues
1 parent baed83d commit b65e971

File tree

3 files changed

+132
-63
lines changed

3 files changed

+132
-63
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ __pycache__
66
# specific folders
77
data/*
88
tests/*
9+
!test_lovd_fill_hg38.py
910

1011
# large gene reference files
1112
*.fa

api/data/refactoring.py

Lines changed: 44 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -179,86 +179,67 @@ def from_clinvar_name_to_cdna_position(name):
179179

180180
def lovd_fill_hg38(lovd: pd.DataFrame):
    """
    Fills missing hg38 values in the LOVD dataframe
    by converting hg19 values to hg38.
    New column 'hg38_gnomad_format' is added to store
    the resulting positions in the format '6-position-ref-alt'.

    :param lovd: pandas DataFrame containing following columns:
        - 'VariantOnGenome/DNA': hg19 values.
        - 'VariantOnGenome/DNA/hg38': hg38 values.
    :return: None: Modifies the input DataFrame in-place by replacing
        empty strings in 'VariantOnGenome/DNA/hg38' with pd.NA and by
        adding or updating the 'hg38_gnomad_format' column.
    """

    if lovd.empty:
        # Nothing to convert, but still create the column:
        # merge_gnomad_lovd joins on 'hg38_gnomad_format' and would fail
        # with a KeyError if an empty frame came through without it.
        lovd['hg38_gnomad_format'] = pd.Series(index=lovd.index, dtype=object)
        return

    # Empty strings count as "missing" just like NaN/NA.
    lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA)
    lovd['hg38_gnomad_format'] = lovd.apply(convert_hg19_if_missing, axis=1)
199197

200-
def convert_hg19_if_missing(row):
201-
"""
202-
converts hg19 variant to hg38 if hg38 is missing.
203-
Checks if the hg38 value is missing (NaN) in a given row.
204-
If it is, the hg19 variant is converted to hg38
205-
using the `convert_hg19_to_hg38` function.
206-
Otherwise, the existing hg38 value is formatted.
207-
208-
parameters:
209-
- row (pd.Series): single row of the DataFrame.
210-
211-
returns:
212-
- str: hg38 value or a conversion of the hg19 value in the format '6-position-ref-alt'.
213-
"""
214-
if pd.isna(row['VariantOnGenome/DNA/hg38']):
215-
return convert_hg19_to_hg38(convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA']))
216-
return convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA/hg38'])
217-
218-
def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')):
219-
"""
220-
converts a genomic position from hg19 to hg38 using the LiftOver tool.
221-
222-
parameters:
223-
- position (str): string representing the hg19 variant
224-
in the format 'g.positionRef>Alt'.
225-
- lo (LiftOver): converter for coordinates between genome builds
226-
227-
returns:
228-
- str: converted hg38 position in the format '6-position-ref-alt'.
229-
"""
230-
if '?' in position:
231-
return '?'
232-
try:
233-
new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1]
234-
except ValueError as ve:
235-
return f"Error processing variant (ValueError): {str(ve)}"
236-
except IndexError as ie:
237-
return f"Error processing variant (IndexError): {str(ie)}"
238-
except TypeError as te:
239-
return f"Error processing variant (TypeError): {str(te)}"
240-
return f"6-{new_pos}-{position[-3:]}"
241198

242-
lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA)
243-
lovd['hg19/hg38_lovd'] = lovd.apply(convert_hg19_if_missing, axis=1)
199+
def convert_hg19_if_missing(row):
    """
    Returns the hg38 variant of a row in the format '6-position-ref-alt'.

    When the row already carries an hg38 value it is only reformatted
    with `convert_to_gnomad_gen`. When the hg38 value is missing (NaN/NA),
    the hg19 value is reformatted the same way and then lifted over to
    hg38 with `convert_hg19_to_hg38`.

    :param row: single row of the DataFrame; must provide the keys
        'VariantOnGenome/DNA' and 'VariantOnGenome/DNA/hg38'.
    :return: str: the formatted hg38 value, or the converted hg19 value.
    """

    hg38_variant = row['VariantOnGenome/DNA/hg38']
    if not pd.isna(hg38_variant):
        return convert_to_gnomad_gen(hg38_variant)

    hg19_formatted = convert_to_gnomad_gen(row['VariantOnGenome/DNA'])
    return convert_hg19_to_hg38(hg19_formatted)
244215

245216

246-
def convert_to_gnomad_gen_pos(variant: str):
217+
_HG19_TO_HG38_LIFTOVER = None


def _default_liftover():
    """Returns the shared hg19 -> hg38 LiftOver, creating it on first use."""
    global _HG19_TO_HG38_LIFTOVER
    if _HG19_TO_HG38_LIFTOVER is None:
        _HG19_TO_HG38_LIFTOVER = LiftOver('hg19', 'hg38')
    return _HG19_TO_HG38_LIFTOVER


def convert_hg19_to_hg38(position: str, lo=None):
    """
    converts a genomic position from hg19 to hg38 using the LiftOver tool.

    :param position: string representing the hg19 variant: a two-character
        prefix ('6-' or 'g.'), the numeric position and then the allele
        part, e.g. '6-64430518-C-T', '6-64430518-dup' or 'g.64430518C>T'.
    :param lo: converter for coordinates between genome builds; it must
        offer `convert_coordinate(chromosome, position)`. When omitted,
        a shared LiftOver('hg19', 'hg38') is created lazily, so importing
        this module no longer builds the converter as a side effect.
    :return: string converted hg38 position in the format
        '6-position-ref-alt', or '?' when the input contains '?' or the
        position has no hg38 counterpart.
    :raises ValueError: if no numeric position follows the prefix.
    """

    if '?' in position:
        return '?'

    # Parse instead of slicing fixed offsets: positions are not always
    # 8 digits long and the allele part is not always 3 characters.
    parsed = re.match(r'(\d+)-?(.*)', position[2:])
    if parsed is None:
        raise ValueError(f"No numeric position found in variant '{position}'")
    hg19_pos, alleles = int(parsed.group(1)), parsed.group(2)

    if lo is None:
        lo = _default_liftover()

    hits = lo.convert_coordinate('chr6', hg19_pos)
    if not hits:
        # None (unknown chromosome) or [] (locus absent in hg38):
        # report it as unknown instead of crashing on hits[0][1].
        return '?'
    new_pos = hits[0][1]
    return f"6-{new_pos}-{alleles}"
250230

251-
parameters:
252-
- variant (str): string representing the variant
253-
in the format 'g.startRef>Alt'.
254231

255-
returns:
256-
- str: variant formatted as '6-position-ref-alt'
232+
def convert_to_gnomad_gen(variant: str):
233+
"""
234+
converts a variant string from hg19 or hg38 format
235+
to the format used by gnomAD ('6-position-ref-alt').
236+
:param variant: str: the variant in the format 'g.startRef>Alt'.
237+
:return: str: variant formatted as '6-position-ref-alt'
257238
or '?' if the input contains interval ranges or is invalid; a non-string input raises TypeError.
258239
"""
259240

260241
if not isinstance(variant, str):
261-
return "?"
242+
raise TypeError(f"Expected a string for 'variant', got {type(variant).__name__} instead")
262243

263244
patterns = {
264245
'dup': re.compile(r'^g\.(\d+)dup$'),
@@ -308,7 +289,7 @@ def merge_gnomad_lovd(lovd, gnomad):
308289
lovd,
309290
gnomad,
310291
how="outer",
311-
left_on="hg19/hg38_lovd",
292+
left_on="hg38_gnomad_format",
312293
right_on="gnomAD ID_gnomad"
313294
)
314295

tests/test_lovd_fill_hg38.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import unittest
2+
import pandas as pd
3+
from api.data.refactoring import lovd_fill_hg38
4+
5+
6+
class TestLOVDFillHg38(unittest.TestCase):
    """Checks the 'hg38_gnomad_format' column produced by lovd_fill_hg38."""

    HG19_COLUMN = 'VariantOnGenome/DNA'
    HG38_COLUMN = 'VariantOnGenome/DNA/hg38'
    RESULT_COLUMN = 'hg38_gnomad_format'

    def setUp(self):
        """Set up any initial data used in multiple tests."""
        self.df = pd.DataFrame()

    def _fill_and_check(self, hg19, hg38, expected_values, msg=None):
        """Build the frame, run lovd_fill_hg38 and compare the new column."""
        self.df = pd.DataFrame({
            self.HG19_COLUMN: hg19,
            self.HG38_COLUMN: hg38,
        })
        lovd_fill_hg38(self.df)
        self.assertIn(self.RESULT_COLUMN, self.df.columns, msg)
        self.assertListEqual(self.df[self.RESULT_COLUMN].tolist(), expected_values)

    def test_fill_hg38_with_no_missing_values(self):
        """Test filling hg38 values when no values are missing."""
        self._fill_and_check(
            ['g.64430517C>T', 'g.64430535C>G'],
            ['g.63720621C>T', 'g.63720639C>G'],
            ['6-63720621-C-T', '6-63720639-C-G'],
            msg="Column 'hg38_gnomad_format' should be added.",
        )

    def test_fill_hg38_with_missing_values_nan(self):
        """Test filling hg38 values when they are missing (NaN case)."""
        self._fill_and_check(['g.64430518C>T'], [pd.NA], ['6-63720622-C-T'])

    def test_fill_hg38_with_missing_values_empty_string(self):
        """Test filling hg38 values when they are missing (empty string case)."""
        self._fill_and_check(['g.64430518C>T'], [""], ['6-63720622-C-T'])

    def test_fill_hg38_with_dup_cases(self):
        """Test filling hg38 values when they have 'dup' postfix."""
        self._fill_and_check(['g.64430518dup'], [""], ['6-63720622-dup'])

    def test_fill_hg38_with_del_cases(self):
        """Test filling hg38 values when they have 'del' postfix."""
        self._fill_and_check(['g.64430518del'], [""], ['6-63720622-del'])

    def test_fill_hg38_with_interval_cases(self):
        """Test filling hg38 values when they have intervals (e.g., 'del' range)."""
        self._fill_and_check(['g.64430540_64430544del'], [""], ["?"])

    def test_fill_hg38_no_variants(self):
        """Test filling hg38 values when there are no variants in the dataframe."""
        self.df = pd.DataFrame(columns=[self.HG19_COLUMN, self.HG38_COLUMN])
        lovd_fill_hg38(self.df)
        self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")
84+
85+
86+
if __name__ == '__main__':
87+
unittest.main()

0 commit comments

Comments
 (0)