- Small QoL update to test_categorical_processor.py

SietsmaRJ · SietsmaRJ · commit 75dbec3b4dab · 2022-12-16T10:34:06.000+01:00
- Processed mypy feedback
- Processed flake8 feedback
diff --git a/src/molgenis/capice/cli/args_handler_parent.py b/src/molgenis/capice/cli/args_handler_parent.py
@@ -119,6 +119,7 @@ def _retrieve_argument_from_list(self,
             return self._single_argument_retriever(arg, arg_name, has_default)
         except IOError as e:
             self.parser.error(e)
+            return None
 
     @staticmethod
     def _single_argument_retriever(arg: list | None,
diff --git a/src/molgenis/capice/cli/args_handler_predict.py b/src/molgenis/capice/cli/args_handler_predict.py
@@ -20,7 +20,8 @@ def _extension(self):
 
     @property
     def _model_extension(self) -> tuple[str]:
-        return '.json', '.ubj'
+        # Type ignore: the number of values in the tuple does not matter.
+        return '.json', '.ubj'  # type: ignore
 
     def _model_extension_str(self) -> str:
         return self._join_extensions(self._model_extension)
diff --git a/src/molgenis/capice/utilities/categorical_processor.py b/src/molgenis/capice/utilities/categorical_processor.py
@@ -20,7 +20,7 @@ def __init__(self):
     def process(
             self,
             dataset: pd.DataFrame,
-            processable_features: list | None = None,
+            processable_features: list[str] | None = None,
             predetermined_features: dict[str, list] | None = None
     ) -> tuple[pd.DataFrame, dict[str, list]]:
         """
@@ -54,7 +54,12 @@ def process(
         self._validate_one_feature_list_present(processable_features, predetermined_features)
         self._create_preservation_col(dataset)
         if predetermined_features is None:
-            processing_features = self._get_categorical_columns(dataset, processable_features)
+            # Type ignore: processable_features is declared as list[str] | None, so mypy treats
+            # it as Optional[list[str]] here, while _get_categorical_columns expects list[str].
+            processing_features = self._get_categorical_columns(
+                dataset,
+                processable_features  # type: ignore
+            )
         else:
             processing_features = predetermined_features
 
@@ -88,7 +93,8 @@ def _create_preservation_col(dataset: pd.DataFrame) -> None:
             [Column.chr.value, Column.pos.value, Column.ref.value, Column.alt.value]
         ].astype(str).agg(UniqueSeparator.unique_separator.value.join, axis=1)
 
-    def _get_categorical_columns(self, dataset: pd.DataFrame, processable_features: list) -> dict:
+    def _get_categorical_columns(self, dataset: pd.DataFrame,
+                                 processable_features: list[str]) -> dict[str, list]:
         """
         Method for when the predetermined_features is None, usually in case of train,
         to determine the top 5 features that should be used for pandas.get_dummies().
diff --git a/src/molgenis/capice/utilities/dynamic_loader.py b/src/molgenis/capice/utilities/dynamic_loader.py
@@ -21,7 +21,7 @@ def __init__(self, required_attributes: list, path):
         self.path = path
         self._check_dir_exists()
         self.required_attributes = required_attributes
-        self.modules = {}
+        self.modules: dict[str, object] = {}
 
     def load_manual_annotators(self):
         """
@@ -82,14 +82,15 @@ def _load_modules_from_path(path):
                 modules.append(module)
         return modules
 
-    def _import(self, usable_modules: list):
+    def _import(self, usable_modules: list[str]) -> dict[str, object]:
         """
         Function  to dynamically load in the modules using the
         import_module library.
         :param usable_modules: list of absolute paths to potential modules
         :return: list of usable modules
         """
-        return_modules = {}
+        # For some reason, mypy wants this variable to be annotated rather than just the method.
+        return_modules: dict[str, object] = {}
         for module in usable_modules:
             name = os.path.basename(module).split('.py')[0]
             spec = util.spec_from_file_location(name=name, location=module)
diff --git a/src/molgenis/capice/validators/input_validator.py b/src/molgenis/capice/validators/input_validator.py
@@ -16,7 +16,7 @@ def validate_input_path(input_path: os.PathLike, extension: tuple[str]):
         """
         if not os.path.exists(input_path):
             raise FileNotFoundError(f'{input_path} does not exist!')
-        if not (input_path.endswith(extension)):
+        if not str(input_path).endswith(extension):
             raise IOError(f'{input_path} does not match required extension: '
                           f'{", ".join(extension)}')
 
diff --git a/src/molgenis/capice/validators/predict_validator.py b/src/molgenis/capice/validators/predict_validator.py
@@ -23,7 +23,7 @@ def validate_data_predict_ready(self, dataset: pd.DataFrame, model: xgb.XGBClass
                 Raised when a required predict feature is missing from dataset.
         """
         missing = []
-        for feature in model.get_booster().feature_names:
+        for feature in model.get_booster().feature_names:  # type: ignore
             if feature not in dataset.columns:
                 missing.append(feature)
         if len(missing) > 0:
diff --git a/src/molgenis/capice/validators/version_validator.py b/src/molgenis/capice/validators/version_validator.py
@@ -79,16 +79,18 @@ def validate_versions_compatible(self, capice_version: str, model_version: str):
         ValueError
             Raised when the model and framework versions are not compatible.
         """
+        # All type ignores below exist because mypy cannot find these attributes.
         capice = match(self.regex, capice_version)
         model = match(self.regex, model_version)
-        if capice.group('major') != model.group('major'):
+        if capice.group('major') != model.group('major'):  # type: ignore
             raise ValueError(
-                f'CAPICE major version {capice.string} does not match with the model '
-                f'{model.string}!'
+                f'CAPICE major version {capice.string} '  # type: ignore
+                f'does not match with the model '
+                f'{model.string}!'  # type: ignore
             )
 
-        if capice.group('prerelease') or model.group('prerelease'):
-            self._validate_prerelease(capice, model)
+        if capice.group('prerelease') or model.group('prerelease'):  # type: ignore
+            self._validate_prerelease(capice, model)  # type: ignore
 
     @staticmethod
     def _validate_prerelease(capice_version: re.Match,
diff --git a/tests/capice/utilities/test_categorical_processor.py b/tests/capice/utilities/test_categorical_processor.py
@@ -26,6 +26,10 @@ def setUp(cls):
     def tearDownClass(cls) -> None:
         teardown()
 
+    @staticmethod
+    def creat_other_column(value: str) -> str:
+        return '_'.join([value, Column.other.value])
+
     def test_unit_preprocessing_file(self):
         """
         Unit test for the preprocessor to see if the preprocessor works just
@@ -88,15 +92,15 @@ def test_preprocessing_train(self):
                 'foo_a': [1, 0, 0, 0, 0, 0],
                 'foo_b': [0, 1, 0, 0, 0, 0],
                 'foo_c': [0, 0, 1, 0, 0, 0],
-                'foo_other_CAPICE_value': [0, 0, 0, 1, 1, 1],
+                self.creat_other_column('foo'): [0, 0, 0, 1, 1, 1],
                 'bar_a': [1, 0, 0, 0, 0, 0],
-                'bar_other_CAPICE_value': [0, 1, 1, 1, 1, 1],
+                self.creat_other_column('bar'): [0, 1, 1, 1, 1, 1],
                 'baz_a': [1, 0, 0, 0, 0, 0],
                 'baz_b': [0, 1, 0, 0, 0, 0],
                 'baz_c': [0, 0, 1, 0, 0, 0],
                 'baz_d': [0, 0, 0, 1, 0, 0],
                 'baz_e': [0, 0, 0, 0, 1, 0],
-                'baz_other_CAPICE_value': [0, 0, 0, 0, 0, 1],
+                self.creat_other_column('baz'): [0, 0, 0, 0, 0, 1],
                 'REF': ['A', 'T', 'A', 'T', 'A', 'T'],
                 'ALT': ['G', 'C', 'G', 'C', 'G', 'C'],
                 'feature_1': [1, 2, 3, 4, np.nan, np.nan],
@@ -144,7 +148,7 @@ def test_creation_other(self):
             Column.other.value,
             observed_dict['foo']
         )
-        self.assertIn('foo_other_CAPICE_value', observed_df.columns)
+        self.assertIn(self.creat_other_column('foo'), observed_df.columns)
 
     def test_creation_other_notin(self):
         test_case = pd.concat(
@@ -167,7 +171,7 @@ def test_creation_other_notin(self):
             Column.other.value,
             observed_dict['foo']
         )
-        self.assertNotIn('foo_other_CAPICE_value', observed_df.columns)
+        self.assertNotIn(self.creat_other_column('foo'), observed_df.columns)
 
     def test_other_in_top_5(self):
         # Tests that, if "other" occurs in the top 5 categories, only this "other" feature gets
@@ -188,7 +192,7 @@ def test_other_in_top_5(self):
         self.assertFalse(test_series[test_series > 0].size > 2,
                          msg=f'Actual size: {test_series[test_series > 0].size}')
         self.assertIn(
-            'foo_other_CAPICE_value',
+            self.creat_other_column('foo'),
             observed_df.columns
         )