Skip to content

Commit

Permalink
refactor lovd_fill_hg38 and all functions used inside it(same result,…
Browse files Browse the repository at this point in the history
… different approach)
  • Loading branch information
Akaud committed Sep 3, 2024
1 parent 97fcf1f commit f7c133e
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 49 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ __pycache__

# specific folders
data/*
tests/*
!test_lovd_fill_hg38.py

# large gene reference files
*.fa
56 changes: 20 additions & 36 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,66 +181,50 @@ def lovd_fill_hg38(lovd: pd.DataFrame):
"""
Fills missing hg38 values in the LOVD dataframe
by converting hg19 values to hg38.
New column 'hg19/hg38_lovd' is added to store
New column 'hg38_gnomad_format' is added to store
the converted positions in the format '6-position-ref-alt'.
:param lovd: pandas DataFrame containing following columns:
- 'VariantOnGenome/DNA': hg19 values.
- 'VariantOnGenome/DNA/hg38': hg38 values.
:return: None: Modifies the input DataFrame in-place by adding or
updating the 'hg19/hg38_lovd' column.
:return: None: Modifies the input DataFrame in-place by adding
'hg38_gnomad_format' column.
"""

if lovd.empty:
return
lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA)
lovd['hg38_gnomad_format'] = lovd.apply(convert_hg19_if_missing, axis=1)
lovd['hg38_gnomad_format'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA)
missing_hg38_mask = lovd['hg38_gnomad_format'].isna()
lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply(
convert_hg19_if_missing)
lovd['hg38_gnomad_format'] = lovd['hg38_gnomad_format'].apply(convert_to_gnomad_gen)


def convert_hg19_if_missing(row):
def convert_hg19_if_missing(hg19: pd.Series, lo = LiftOver('hg19', 'hg38')):
"""
converts hg19 variant to hg38 if hg38 is missing.
Checks if the hg38 value is missing (NaN) in a given row.
If it is, the hg19 variant is converted to hg38
using the `convert_hg19_to_hg38` function.
Otherwise, the existing hg38 value is formatted.
:param row: single row of the DataFrame.
:return:
- str: hg38 value or a conversion of
the hg19 value in the format '6-position-ref-alt'.
Converts hg19 variant to hg38 if hg38 is missing.
:param hg19: a row from the DataFrame.
:param lo: converter for genomic data between reference assemblies
:return: hg38 value or a conversion of the hg19 value in the format 'g.positionref>alt'.
"""

if pd.isna(row['VariantOnGenome/DNA/hg38']):
return convert_hg19_to_hg38(convert_to_gnomad_gen(row['VariantOnGenome/DNA']))
return convert_to_gnomad_gen(row['VariantOnGenome/DNA/hg38'])


def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')):
"""
converts a genomic position from hg19 to hg38 using the LiftOver tool.
:param position: string representing the hg19 variant
in the format 'g.positionRef>Alt'.
:param lo: converter for coordinates between genome builds
:return: string converted hg38 position in the format '6-position-ref-alt'.
"""

if '?' in position:
if pd.isna(hg19):
return "?"
if '?' in hg19 or "_" in hg19:
return '?'
new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1]
return f"6-{new_pos}-{position[-3:]}"
position_str = hg19[2:10]
new_pos = lo.convert_coordinate('chr6', int(position_str))[0][1]
return f"g.{new_pos}{hg19[-3:]}"


def convert_to_gnomad_gen(variant: str):
"""
converts a variant string from hg19 or hg38 format
converts a variant string from hg38 format
to the format used by gnomAD ('6-position-ref-alt').
:param variant: str: the variant in the format 'g.startRef>Alt'.
:return: str: variant formatted as '6-position-ref-alt'
or '?' if the input contains interval ranges or is invalid.
"""

if not isinstance(variant, str):
raise TypeError(f"Expected a string for 'variant', got {type(variant).__name__} instead")

patterns = {
'dup': re.compile(r'^g\.(\d+)dup$'),
'del': re.compile(r'^g\.(\d+)del$'),
Expand Down
4 changes: 2 additions & 2 deletions tests/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4018,8 +4018,8 @@
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-09-02T18:03:53.205784Z",
"start_time": "2024-09-02T18:03:43.273240Z"
"end_time": "2024-09-03T07:46:50.003340Z",
"start_time": "2024-09-03T07:46:40.913231Z"
}
},
"id": "dd9b17623f26a07c",
Expand Down
9 changes: 0 additions & 9 deletions tests/test_lovd_fill_hg38.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,6 @@ def test_fill_hg38_no_variants(self):
lovd_fill_hg38(self.df)
self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")

def test_fill_hg38_NA_variants(self):
"""Test filling hg38 values when there pd. NA variants in the dataframe."""
self.df = pd.DataFrame({
'VariantOnGenome/DNA': [pd.NA],
'VariantOnGenome/DNA/hg38': [pd.NA]
})
with self.assertRaises(TypeError) as context:
lovd_fill_hg38(self.df)
self.assertEqual(str(context.exception), "Expected a string for 'variant', got NAType instead")


if __name__ == '__main__':
Expand Down

0 comments on commit f7c133e

Please sign in to comment.