Add .gitignore exception for test_lovd_fill_hg38.py; fix naming issues

Akaud · Akaud · commit b65e97132ccc · 2024-09-02T20:56:16.000+03:00
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ __pycache__
 # specific folders
 data/*
 tests/*
+!test_lovd_fill_hg38.py
 
 # large gene reference files
 *.fa
diff --git a/api/data/refactoring.py b/api/data/refactoring.py
@@ -179,86 +179,67 @@ def from_clinvar_name_to_cdna_position(name):
 
 def lovd_fill_hg38(lovd: pd.DataFrame):
     """
-    fills missing hg38 values in the LOVD dataframe
+    Fills missing hg38 values in the LOVD dataframe
     by converting hg19 values to hg38.
     New column 'hg38_gnomad_format' is added to store
     the converted positions in the format '6-position-ref-alt'.
-
-    parameters:
-    - lovd (pd.DataFrame): A pandas DataFrame containing following columns:
-        - 'VariantOnGenome/DNA': hg19 values.
-        - 'VariantOnGenome/DNA/hg38': hg38 values.
-
-    returns:
-    None: Modifies the input DataFrame in-place by adding or
-    updating the 'hg19/hg38_lovd' column.
+    :param lovd: pandas DataFrame containing the following columns:
+               - 'VariantOnGenome/DNA': hg19 values.
+               - 'VariantOnGenome/DNA/hg38': hg38 values.
+    :return: None. Modifies the input DataFrame in place by adding or
+               updating the 'hg38_gnomad_format' column.
     """
 
     if lovd.empty:
         return
+    lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA)
+    lovd['hg38_gnomad_format'] = lovd.apply(convert_hg19_if_missing, axis=1)
 
-    def convert_hg19_if_missing(row):
-        """
-        converts hg19 variant to hg38 if hg38 is missing.
-        Checks if the hg38 value is missing (NaN) in a given row.
-        If it is, the hg19 variant is converted to hg38
-        using the `convert_hg19_to_hg38` function.
-        Otherwise, the existing hg38 value is formatted.
-
-        parameters:
-        - row (pd.Series):  single row of the DataFrame.
-
-        returns:
-        - str: hg38 value or a conversion of the hg19 value in the format '6-position-ref-alt'.
-        """
-        if pd.isna(row['VariantOnGenome/DNA/hg38']):
-            return convert_hg19_to_hg38(convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA']))
-        return convert_to_gnomad_gen_pos(row['VariantOnGenome/DNA/hg38'])
-
-    def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')):
-        """
-        converts a genomic position from hg19 to hg38 using the LiftOver tool.
-
-        parameters:
-        - position (str): string representing the hg19 variant
-        in the format 'g.positionRef>Alt'.
-        - lo (LiftOver): converter for coordinates between genome builds
-
-        returns:
-        - str: converted hg38 position in the format '6-position-ref-alt'.
-        """
-        if '?' in position:
-            return '?'
-        try:
-            new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1]
-        except ValueError as ve:
-            return f"Error processing variant (ValueError): {str(ve)}"
-        except IndexError as ie:
-            return f"Error processing variant (IndexError): {str(ie)}"
-        except TypeError as te:
-            return f"Error processing variant (TypeError): {str(te)}"
-        return f"6-{new_pos}-{position[-3:]}"
 
-    lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA)
-    lovd['hg19/hg38_lovd'] = lovd.apply(convert_hg19_if_missing, axis=1)
+def convert_hg19_if_missing(row):
+    """
+    converts hg19 variant to hg38 if hg38 is missing.
+    Checks if the hg38 value is missing (NaN) in a given row.
+    If it is, the hg19 variant is converted to hg38
+    using the `convert_hg19_to_hg38` function.
+    Otherwise, the existing hg38 value is formatted.
+    :param row: single row of the DataFrame.
+    :return:
+    - str: the formatted hg38 value, or the hg19 value converted
+    to hg38, in the format '6-position-ref-alt'.
+    """
+
+    if pd.isna(row['VariantOnGenome/DNA/hg38']):
+        return convert_hg19_to_hg38(convert_to_gnomad_gen(row['VariantOnGenome/DNA']))
+    return convert_to_gnomad_gen(row['VariantOnGenome/DNA/hg38'])
 
 
-def convert_to_gnomad_gen_pos(variant: str):
+def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')):
     """
-    converts a variant string from hg19 or hg38 format
-    to the format used by gnomAD ('6-position-ref-alt').
+    converts a genomic position from hg19 to hg38 using the LiftOver tool.
+    :param position: string representing the hg19 variant
+                    in the format 'g.positionRef>Alt'.
+    :param lo: converter for coordinates between genome builds
+    :return: string converted hg38 position in the format '6-position-ref-alt'.
+    """
+
+    if '?' in position:
+        return '?'
+    new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1]
+    return f"6-{new_pos}-{position[-3:]}"
 
-    parameters:
-    - variant (str): string representing the variant
-    in the format 'g.startRef>Alt'.
 
-    returns:
-    - str: variant formatted as '6-position-ref-alt'
+def convert_to_gnomad_gen(variant: str):
+    """
+    converts a variant string from hg19 or hg38 format
+    to the format used by gnomAD ('6-position-ref-alt').
+    :param variant: str: the variant in the format 'g.startRef>Alt'; a non-string raises TypeError.
+    :return: str: variant formatted as '6-position-ref-alt'
     or '?' if the input contains interval ranges or is invalid.
     """
 
     if not isinstance(variant, str):
-        return "?"
+        raise TypeError(f"Expected a string for 'variant', got {type(variant).__name__} instead")
 
     patterns = {
         'dup': re.compile(r'^g\.(\d+)dup$'),
@@ -308,7 +289,7 @@ def merge_gnomad_lovd(lovd, gnomad):
         lovd,
         gnomad,
         how="outer",
-        left_on="hg19/hg38_lovd",
+        left_on="hg38_gnomad_format",
         right_on="gnomAD ID_gnomad"
     )
 
diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py
@@ -0,0 +1,87 @@
+import unittest
+import pandas as pd
+from api.data.refactoring import lovd_fill_hg38
+
+
+class TestLOVDFillHg38(unittest.TestCase):
+
+    def setUp(self):
+        """Set up any initial data used in multiple tests."""
+        self.df = pd.DataFrame()
+
+    def test_fill_hg38_with_no_missing_values(self):
+        """Test filling hg38 values when no values are missing."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430517C>T', 'g.64430535C>G'],
+            'VariantOnGenome/DNA/hg38': ['g.63720621C>T', 'g.63720639C>G']
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720621-C-T', '6-63720639-C-G']
+        self.assertIn('hg38_gnomad_format', self.df.columns,
+                      "Column 'hg38_gnomad_format' should be added.")
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_missing_values_nan(self):
+        """Test filling hg38 values when they are missing (NaN case)."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518C>T'],
+            'VariantOnGenome/DNA/hg38': [pd.NA]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-C-T']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_missing_values_empty_string(self):
+        """Test filling hg38 values when they are missing (empty string case)."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518C>T'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-C-T']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_dup_cases(self):
+        """Test filling hg38 values when they have 'dup' postfix."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518dup'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-dup']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_del_cases(self):
+        """Test filling hg38 values when they have 'del' postfix."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430518del'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ['6-63720622-del']
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_with_interval_cases(self):
+        """Test filling hg38 values when they have intervals (e.g., 'del' range)."""
+        self.df = pd.DataFrame({
+            'VariantOnGenome/DNA': ['g.64430540_64430544del'],
+            'VariantOnGenome/DNA/hg38': [""]
+        })
+        lovd_fill_hg38(self.df)
+        expected_values = ["?"]
+        self.assertIn('hg38_gnomad_format', self.df.columns)
+        self.assertListEqual(self.df['hg38_gnomad_format'].tolist(), expected_values)
+
+    def test_fill_hg38_no_variants(self):
+        """Test filling hg38 values when there are no variants in the dataframe."""
+        self.df = pd.DataFrame(columns=['VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38'])
+        lovd_fill_hg38(self.df)
+        self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.")
+
+
+if __name__ == '__main__':
+    unittest.main()