Fix mypy issue by adding plugin, add tests (#534)

amrit110 · web-flow · commit 3b3142555220 · 2023-12-15T17:16:18.000-05:00
* Fix mypy issue by adding plugin, add tests

* Add test for sklearn model wrapper find_best method

* Small fix, add [all] option to extras install

* Additional fixes, test
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
       types_or: [python, jupyter]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.1
+    rev: v1.7.1
     hooks:
     - id: mypy
       entry: python3 -m mypy --config-file pyproject.toml
diff --git a/cyclops/models/__init__.py b/cyclops/models/__init__.py
@@ -40,7 +40,7 @@
 register_model(name="sgd_regressor", model_type="static")(SGDRegressor)
 register_model("rf_classifier", model_type="static")(RandomForestClassifier)
 register_model("logistic_regression", model_type="static")(LogisticRegression)
-register_model("mlp", model_type="static")(MLPClassifier)
+register_model("mlp_classifier", model_type="static")(MLPClassifier)
 if XGBClassifier is not None:
     register_model("xgb_classifier", model_type="static")(XGBClassifier)
 if DenseNet is not None:
diff --git a/cyclops/models/catalog.py b/cyclops/models/catalog.py
@@ -224,7 +224,7 @@ def create_model(
             raise RuntimeError(_xgboost_unavailable_message)
         if model_name in ["densenet", "resnet"]:
             raise RuntimeError(_torchxrayvision_unavailable_message)
-        if model_name in ["gru", "lstm", "mlp", "rnn"]:
+        if model_name in ["gru", "lstm", "mlp_pt", "rnn"]:
             raise RuntimeError(_torch_unavailable_message)
         similar_keys_list: List[str] = get_close_matches(
             model_name,
diff --git a/cyclops/models/configs/mlp_classifier.yaml b/cyclops/models/configs/mlp_classifier.yaml
diff --git a/cyclops/models/wrappers/sk_model.py b/cyclops/models/wrappers/sk_model.py
@@ -187,7 +187,7 @@ def find_best(  # noqa: PLR0912, PLR0915
         if isinstance(X, (Dataset, DatasetDict)):
             if feature_columns is None:
                 raise ValueError(
-                    "Missing target columns 'target_columns'. Please provide \
+                    "Missing feature columns 'feature_columns'. Please provide \
                     the name of feature columns when using a \
                     Hugging Face dataset as the input.",
                 )
@@ -336,10 +336,11 @@ def find_best(  # noqa: PLR0912, PLR0915
                 )
             clf.fit(X, y)
 
-        for key, value in clf["clf"].best_params_.items():
+        if isinstance(clf, Pipeline):
+            clf = clf["clf"]
+        for key, value in clf.best_params_.items():
             LOGGER.info("Best %s: %s", key, value)
-
-        self.model_ = clf["clf"].best_estimator_
+        self.model_ = clf.best_estimator_
 
         return self
 
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -98,7 +98,7 @@ black = "^22.1.0"
 pytest-cov = "^3.0.0"
 codecov = "^2.1.13"
 nbstripout = "^0.6.1"
-mypy = "^1.0.0"
+mypy = "^1.7.0"
 ruff = "^0.1.0"
 nbqa = { version = "^1.7.0", extras = ["toolchain"] }
 cycquery = "^0.1.2" # used for integration test
@@ -146,8 +146,10 @@ monai = ["torch", "monai"]
 xgboost = ["xgboost"]
 alibi = ["llvmlite", "alibi"]
 alibi-detect = ["torch", "llvmlite", "alibi-detect"]
+all = ["torch", "torchvision", "torchxrayvision", "llvmlite", "monai", "xgboost", "alibi", "alibi-detect"]
 
 [tool.mypy]
+plugins = ["numpy.typing.mypy_plugin"]
 ignore_missing_imports = true
 install_types = true
 pretty = true
diff --git a/tests/cyclops/models/wrappers/test_sk_model.py b/tests/cyclops/models/wrappers/test_sk_model.py
@@ -0,0 +1,67 @@
+"""Tests for scikit-learn model wrapper."""
+
+import pandas as pd
+from datasets import Dataset
+from sklearn.datasets import load_diabetes
+
+from cyclops.models import create_model
+from cyclops.models.wrappers import SKModel
+
+
+def test_find_best_grid_search():
+    """Test find_best method with grid search."""
+    parameters = {"C": [1], "l1_ratio": [0.5]}
+    X, y = load_diabetes(return_X_y=True)
+    metric = "accuracy"
+    method = "grid"
+
+    model = create_model("logistic_regression", penalty="elasticnet", solver="saga")
+    best_estimator = model.find_best(
+        parameters=parameters,
+        X=X,
+        y=y,
+        metric=metric,
+        method=method,
+    )
+    assert isinstance(best_estimator, SKModel)
+
+
+def test_find_best_random_search():
+    """Test find_best method with random search."""
+    parameters = {"alpha": [0.001], "hidden_layer_sizes": [10]}
+    X, y = load_diabetes(return_X_y=True)
+    metric = "accuracy"
+    method = "random"
+
+    model = create_model("mlp_classifier", early_stopping=True)
+    best_estimator = model.find_best(
+        parameters=parameters,
+        X=X,
+        y=y,
+        metric=metric,
+        method=method,
+    )
+    assert isinstance(best_estimator, SKModel)
+
+
+def test_find_best_hf_dataset_input():
+    """Test find_best method with Hugging Face dataset input."""
+    parameters = {"alpha": [0.001], "hidden_layer_sizes": [10]}
+    data = load_diabetes(as_frame=True)
+    X, y = data["data"], data["target"]
+    X_y = pd.concat([X, y], axis=1)
+    features_names = data["feature_names"]
+    dataset = Dataset.from_pandas(X_y)
+    metric = "accuracy"
+    method = "random"
+
+    model = create_model("mlp_classifier", early_stopping=True)
+    best_estimator = model.find_best(
+        parameters=parameters,
+        X=dataset,
+        metric=metric,
+        method=method,
+        feature_columns=features_names,
+        target_columns="target",
+    )
+    assert isinstance(best_estimator, SKModel)
diff --git a/tests/cyclops/models/wrappers/test_utils.py b/tests/cyclops/models/wrappers/test_utils.py
@@ -5,7 +5,53 @@
 import torch
 from datasets import Dataset
 
-from cyclops.models.wrappers.utils import DatasetColumn, to_numpy, to_tensor
+from cyclops.models.wrappers.utils import (
+    DatasetColumn,
+    get_params,
+    set_params,
+    to_numpy,
+    to_tensor,
+)
+
+
+def test_set_params():
+    """Test set_params function."""
+
+    class ExampleClass:
+        """Example class for testing."""
+
+        def __init__(self, param1, param2, param3):
+            """Initialize the class."""
+            self.param1 = param1
+            self.param2 = param2
+            self.param3 = param3
+
+    params = {"param1": 10, "param2": "hello", "param3": True}
+    example_class = ExampleClass(1, "world", False)
+    set_params(example_class, **params)
+    assert example_class.param1 == 10
+    assert example_class.param2 == "hello"
+    assert example_class.param3 is True
+
+
+def test_get_params():
+    """Test get_params function."""
+
+    class ExampleClass:
+        """Example class for testing."""
+
+        def __init__(self, param1, param2, param3):
+            """Initialize the class."""
+            self.param1 = param1
+            self.param2 = param2
+            self.param3 = param3
+
+    result = get_params(ExampleClass(10, "hello", True))
+    assert isinstance(result, dict)
+    assert len(result) == 3
+    assert result["param1"] == 10
+    assert result["param2"] == "hello"
+    assert result["param3"] is True
 
 
 @pytest.mark.integration_test()
diff --git a/tests/cyclops/tasks/test_classification.py b/tests/cyclops/tasks/test_classification.py
@@ -19,7 +19,7 @@ class TestBinaryTabularClassificationTask(TestCase):
 
     def setUp(self):
         """Set up for testing."""
-        self.model_name = "mlp"
+        self.model_name = "mlp_classifier"
         self.model = create_model(self.model_name)
         self.test_task = BinaryTabularClassificationTask(
             {self.model_name: self.model},