From b8dc2c6cde8d79db1bd256347a86c70ee31075f2 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 15 Aug 2023 14:29:31 -0400
Subject: [PATCH] Fixed unit tests

Signed-off-by: Adam Li
---
 sktree/ensemble/_honest_forest.py     |  96 +++++++++++++++++++-
 sktree/tests/test_honest_forest.py    |  52 ++++++-----
 sktree/tree/_classes.py               | 124 +++++++++-----------------
 sktree/tree/_honest_tree.py           | 100 +++++++++++++++++++--
 sktree/tree/tests/test_honest_tree.py |   8 +-
 5 files changed, 268 insertions(+), 112 deletions(-)

diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py
index 295617025..35f1eb706 100644
--- a/sktree/ensemble/_honest_forest.py
+++ b/sktree/ensemble/_honest_forest.py
@@ -298,7 +298,10 @@ class labels (multi-output problem).
     >>> X, y = make_classification(n_samples=1000, n_features=4,
     ...                            n_informative=2, n_redundant=0,
     ...                            random_state=0, shuffle=False)
-    >>> clf = HonestForestClassifier(max_depth=2, random_state=0)
+    >>> clf = HonestForestClassifier(
+    ...     max_depth=2,
+    ...     random_state=0,
+    ...     tree_estimator=ObliqueDecisionTreeClassifier())
    >>> clf.fit(X, y)
    HonestForestClassifier(...)
    >>> print(clf.predict([[0, 0, 0, 0]]))
@@ -399,8 +402,8 @@ def fit(self, X, y, sample_weight=None):
         self : HonestForestClassifier
             Fitted tree estimator.
         """
-        super().fit(X, y, sample_weight)
         X, y = check_X_y(X, y, multi_output=True)
+        super().fit(X, y, sample_weight)
 
         # Compute honest decision function
         self.honest_decision_function_ = self._predict_proba(
@@ -434,6 +437,7 @@ def predict_proba(self, X):
 
     def _predict_proba(self, X, indices=None, impute_missing=None):
         """predict_proba helper class"""
+        check_is_fitted(self)
         X = self._validate_X_predict(X)
         n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
 
@@ -471,17 +475,105 @@
     @property
     def structure_indices_(self):
+        """The indices used to learn the structure of the trees."""
         check_is_fitted(self)
         return [tree.structure_indices_ for tree in self.estimators_]
 
     @property
     def honest_indices_(self):
+        """The indices used to fit the leaf nodes."""
         check_is_fitted(self)
         return [tree.honest_indices_ for tree in self.estimators_]
 
+    @property
+    def feature_importances_(self):
+        """The feature importances."""
+        return self.estimator_.feature_importances_
+
     def _more_tags(self):
         return {"multioutput": False}
 
+    def apply(self, X):
+        """
+        Apply trees in the forest to X, return leaf indices.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, its dtype will be converted to
+            ``dtype=np.float32``. If a sparse matrix is provided, it will be
+            converted into a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        X_leaves : ndarray of shape (n_samples, n_estimators)
+            For each datapoint x in X and for each tree in the forest,
+            return the index of the leaf x ends up in.
+        """
+        return self.estimator_.apply(X)
+
+    def decision_path(self, X):
+        """
+        Return the decision path in the forest.
+
+        .. versionadded:: 0.18
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, its dtype will be converted to
+            ``dtype=np.float32``. If a sparse matrix is provided, it will be
+            converted into a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        indicator : sparse matrix of shape (n_samples, n_nodes)
+            Return a node indicator matrix where non-zero elements indicate
+            that the samples go through the nodes. The matrix is of CSR
+            format.
+
+        n_nodes_ptr : ndarray of shape (n_estimators + 1,)
+            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
+            give the indicator value for the i-th estimator.
+        """
+        return self.estimator_.decision_path(X)
+
+    def predict_quantiles(self, X, quantiles=0.5, method="nearest"):
+        """Predict class or regression value for X at given quantiles.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Input data.
+        quantiles : float, optional
+            The quantiles at which to evaluate, by default 0.5 (median).
+        method : str, optional
+            The interpolation method, by default 'nearest'. Can be any
+            ``method`` value accepted by :func:`~np.quantile`.
+
+        Returns
+        -------
+        y : ndarray of shape (n_samples, n_quantiles, [n_outputs])
+            The predicted values. The ``n_outputs`` dimension is present only
+            for multi-output regressors.
+        """
+        return self.estimator_.predict_quantiles(X, quantiles, method)
+
+    def get_leaf_node_samples(self, X):
+        """Get samples in each leaf node across the forest.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data array.
+
+        Returns
+        -------
+        leaf_node_samples : array-like of shape (n_samples, n_estimators)
+            Samples within each leaf node.
+        """
+        return self.estimator_.get_leaf_node_samples(X)
+
 
 def _accumulate_prediction(tree, X, out, lock, indices=None):
     """
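
To make the forwarded honest-forest API above concrete, here is a minimal usage sketch (illustrative only, not part of the patch); it assumes the public sktree imports already used by this patch's tests, and the dataset size and n_estimators are arbitrary:

from sklearn.datasets import make_classification

from sktree.ensemble import HonestForestClassifier

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
clf = HonestForestClassifier(n_estimators=10, random_state=0)
clf.fit(X, y)

# One list entry per tree: structure indices grew the tree,
# honest indices refit its leaves.
assert len(clf.structure_indices_) == 10
assert len(clf.honest_indices_) == 10

# Honest posteriors over the training set, stored during fit.
assert clf.honest_decision_function_.shape == (200, 2)
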
diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py
index a9846642a..64e9aedd2 100644
--- a/sktree/tests/test_honest_forest.py
+++ b/sktree/tests/test_honest_forest.py
@@ -2,9 +2,9 @@
 import pytest
 from sklearn import datasets
 from sklearn.metrics import accuracy_score, r2_score
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
+from sktree._lib.sklearn.tree import DecisionTreeClassifier
 from sktree.ensemble import HonestForestClassifier
 from sktree.tree import ObliqueDecisionTreeClassifier, PatchObliqueDecisionTreeClassifier
 
@@ -34,6 +34,7 @@ def test_toy_accuracy():
 @pytest.mark.parametrize(
     "estimator",
     [
+        None,
         DecisionTreeClassifier(),
         ObliqueDecisionTreeClassifier(),
         PatchObliqueDecisionTreeClassifier(),
@@ -49,19 +50,21 @@ def test_iris(criterion, max_features, honest_prior, estimator):
         honest_prior=honest_prior,
         tree_estimator=estimator,
     )
-    try:
+    if honest_prior == "error":
+        with pytest.raises(ValueError, match="honest_prior error not a valid input."):
+            clf.fit(iris.data, iris.target)
+    else:
         clf.fit(iris.data, iris.target)
         score = accuracy_score(clf.predict(iris.data), iris.target)
-    except ValueError:
-        return
 
-    assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
-        "HForest", criterion, score
-    )
-    score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
-    assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
-        "HForest", criterion, score
-    )
+        assert (
+            score > 0.5 and score < 1.0
+        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
+
+        score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
+        assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
+            "HForest", criterion, score
+        )
 
 
 @pytest.mark.parametrize("criterion", ["gini", "entropy"])
@@ -90,19 +93,24 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator):
     X = iris.data
     y = np.stack((iris.target, second_y[perm])).T
-    try:
+    if honest_prior == "error":
+        with pytest.raises(ValueError, match="honest_prior error not a valid input."):
+            clf.fit(X, y)
+    else:
         clf.fit(X, y)
         score = r2_score(clf.predict(X), y)
-    except ValueError:
-        return
-    if honest_prior == "ignore":
-        assert (
-            score > 0.6 and score < 1.0
-        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
-    else:
-        assert (
-            score > 0.9 and score < 1.0
-        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
+        if honest_prior == "ignore":
+            assert (
+                score > 0.6 and score < 1.0
+            ), "Failed with {0}, criterion = {1} and score = {2}".format(
+                "HForest", criterion, score
+            )
+        else:
+            assert (
+                score > 0.9 and score < 1.0
+            ), "Failed with {0}, criterion = {1} and score = {2}".format(
+                "HForest", criterion, score
+            )
 
 
 def test_max_samples():
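
For context on the honest_prior values these tests parametrize over, a small sketch assuming the same public API: the three valid strategies fit cleanly, while any other string (such as the "error" sentinel above) is expected to raise the ValueError that the tests now assert via pytest.raises:

from sklearn import datasets

from sktree.ensemble import HonestForestClassifier

iris = datasets.load_iris()
for prior in ("empirical", "uniform", "ignore"):
    # Each valid prior changes how leaf posteriors are filled in,
    # but all of them should fit without error.
    clf = HonestForestClassifier(n_estimators=5, honest_prior=prior, random_state=0)
    clf.fit(iris.data, iris.target)
    print(prior, clf.predict_proba(iris.data[:1]).round(2))
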
diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py
index c8e7b2463..0538920fc 100644
--- a/sktree/tree/_classes.py
+++ b/sktree/tree/_classes.py
@@ -2010,8 +2010,20 @@ def __init__(
         self.boundary = boundary
         self.feature_weight = feature_weight
 
-    def fit(self, X, y, sample_weight=None, check_input=True):
-        """Fit tree.
+    def _build_tree(
+        self,
+        X,
+        y,
+        sample_weight,
+        missing_values_in_feature_mask,
+        min_samples_leaf,
+        min_weight_leaf,
+        max_leaf_nodes,
+        min_samples_split,
+        max_depth,
+        random_state,
+    ):
+        """Build the actual tree.
 
         Parameters
         ----------
@@ -2029,37 +2041,42 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
             The training input samples. Internally, it will be converted to
             ``dtype=np.float32`` and if a sparse matrix is provided
             to a sparse ``csc_matrix``.
 
         y : array-like of shape (n_samples,) or (n_samples, n_outputs)
             The target values (real numbers). Use ``dtype=np.float64`` and
             ``order='C'`` for maximum efficiency.
 
         sample_weight : array-like of shape (n_samples,), default=None
             Sample weights. If None, then samples are equally weighted. Splits
             that would create child nodes with net zero or negative weight are
             ignored while searching for a split in each node.
 
-        check_input : bool, optional
-            Whether or not to check input, by default True.
+        missing_values_in_feature_mask : ndarray of shape (n_features,), default=None
+            Boolean mask indicating which features contain missing values.
+
+        min_samples_leaf : int or float
+            The minimum number of samples required to be at a leaf node.
+
+        min_weight_leaf : float, default=0.0
+            The minimum weighted fraction of the sum total of weights.
+
+        max_leaf_nodes : int, default=None
+            Grow a tree with ``max_leaf_nodes`` in best-first fashion.
+
+        min_samples_split : int or float, default=2
+            The minimum number of samples required to split an internal node.
+
+        max_depth : int, default=None
+            The maximum depth of the tree. If None, then nodes are expanded until
+            all leaves are pure or until all leaves contain less than
+            min_samples_split samples.
+
+        random_state : int, RandomState instance or None, default=None
+            Controls the randomness of the estimator.
         """
-        if check_input:
-            # Need to validate separately here.
-            # We can't pass multi_output=True because that would allow y to be
-            # csr.
-            check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
-            check_y_params = dict(ensure_2d=False, dtype=None)
-            X, y = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))
-            if self.feature_weight is not None:
-                self.feature_weight = self._validate_data(
-                    self.feature_weight, ensure_2d=True, dtype=DTYPE
+        if self.feature_weight is not None:
+            self.feature_weight = self._validate_data(
+                self.feature_weight, ensure_2d=True, dtype=DTYPE
+            )
+            if self.feature_weight.shape != X.shape:
+                raise ValueError(
+                    f"feature_weight has shape {self.feature_weight.shape} but X has "
+                    f"shape {X.shape}"
                 )
-                if self.feature_weight.shape != X.shape:
-                    raise ValueError(
-                        f"feature_weight has shape {self.feature_weight.shape} but X has "
-                        f"shape {X.shape}"
-                    )
-            if issparse(X):
-                X.sort_indices()
-
-                if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
-                    raise ValueError("No support for np.int64 index based sparse matrices")
 
         if self.data_dims is None:
-            self.data_dims_ = np.array((1, X.shape[1]))
+            self.data_dims_ = np.array((1, X.shape[1]), dtype=np.intp)
         else:
             if np.prod(self.data_dims) != X.shape[1]:
                 raise RuntimeError(f"Data dimensions {self.data_dims} do not match {X.shape[1]}.")
-            self.data_dims_ = np.array(self.data_dims)
+            self.data_dims_ = np.array(self.data_dims, dtype=np.intp)
         ndim = len(self.data_dims_)
 
         # validate contiguous parameter
@@ -2074,7 +2091,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         if self.min_patch_dims is None:
             self.min_patch_dims_ = np.ones((ndim,), dtype=np.intp)
         else:
-            self.min_patch_dims_ = np.array(self.min_patch_dims)
+            self.min_patch_dims_ = np.array(self.min_patch_dims, dtype=np.intp)
 
         if self.max_patch_dims is None:
             self.max_patch_dims_ = np.ones((ndim,), dtype=np.intp)
@@ -2105,59 +2122,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
                     f"greater than the data width {self.data_dims_[idx]}"
                 )
 
-        return super().fit(X, y, sample_weight, check_input=False)
-
-    def _build_tree(
-        self,
-        X,
-        y,
-        sample_weight,
-        missing_values_in_feature_mask,
-        min_samples_leaf,
-        min_weight_leaf,
-        max_leaf_nodes,
-        min_samples_split,
-        max_depth,
-        random_state,
-    ):
-        """Build the actual tree.
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
-            The training input samples. Internally, it will be converted to
-            ``dtype=np.float32`` and if a sparse matrix is provided
-            to a sparse ``csc_matrix``.
-
-        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
-            The target values (real numbers). Use ``dtype=np.float64`` and
-            ``order='C'`` for maximum efficiency.
-
-        sample_weight : array-like of shape (n_samples,), default=None
-            Sample weights. If None, then samples are equally weighted. Splits
-            that would create child nodes with net zero or negative weight are
-            ignored while searching for a split in each node.
-
-        min_samples_leaf : int or float
-            The minimum number of samples required to be at a leaf node.
-
-        min_weight_leaf : float, default=0.0
-            The minimum weighted fraction of the sum total of weights.
-
-        max_leaf_nodes : int, default=None
-            Grow a tree with ``max_leaf_nodes`` in best-first fashion.
-
-        min_samples_split : int or float, default=2
-            The minimum number of samples required to split an internal node.
-
-        max_depth : int, default=None
-            The maximum depth of the tree. If None, then nodes are expanded until
-            all leaves are pure or until all leaves contain less than
-            min_samples_split samples.
-
-        random_state : int, RandomState instance or None, default=None
-            Controls the randomness of the estimator.
-        """
         monotonic_cst = None
         n_samples = X.shape[0]
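
The data_dims handling moved into _build_tree above reinterprets each flat feature vector as an n-dimensional grid, so the product of the declared dimensions must equal n_features. A standalone sketch of that check (resolve_data_dims is a hypothetical helper name; the logic mirrors the patch):

import numpy as np

def resolve_data_dims(data_dims, n_features):
    # Default: treat the features as a single 1 x n_features row.
    if data_dims is None:
        return np.array((1, n_features), dtype=np.intp)
    # The grid must account for every feature, no more, no less.
    if np.prod(data_dims) != n_features:
        raise RuntimeError(f"Data dimensions {data_dims} do not match {n_features}.")
    return np.array(data_dims, dtype=np.intp)

print(resolve_data_dims((8, 8), 64))  # image-like 8x8 grid -> [8 8]
print(resolve_data_dims(None, 64))    # defaults to [ 1 64]
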
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py
index 343830429..a107b7939 100644
--- a/sktree/tree/_honest_tree.py
+++ b/sktree/tree/_honest_tree.py
@@ -4,7 +4,7 @@
 from copy import deepcopy
 
 import numpy as np
-from sklearn.base import MetaEstimatorMixin
+from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.utils.validation import check_is_fitted, check_X_y
 
@@ -12,7 +12,7 @@
 from sktree._lib.sklearn.tree._classes import BaseDecisionTree
 
 
-class HonestTreeClassifier(MetaEstimatorMixin, BaseDecisionTree):
+class HonestTreeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseDecisionTree):
     """
     A decision tree classifier with honest predictions.
 
@@ -133,6 +133,23 @@
         ``ccp_alpha`` will be chosen. By default, no pruning is performed.
         See :ref:`minimal_cost_complexity_pruning` for details.
 
+    monotonic_cst : array-like of int of shape (n_features), default=None
+        Indicates the monotonicity constraint to enforce on each feature.
+          - 1: monotonic increase
+          - 0: no constraint
+          - -1: monotonic decrease
+
+        If monotonic_cst is None, no constraints are applied.
+
+        Monotonicity constraints are not supported for:
+          - multiclass classifications (i.e. when `n_classes > 2`),
+          - multioutput classifications (i.e. when `n_outputs_ > 1`),
+          - classifications trained on data with missing values.
+
+        The constraints hold over the probability of the positive class.
+
+        Read more in the :ref:`User Guide `.
+
     tree_estimator : object, default=None
         Instantiated tree of type BaseDecisionTree. If None, then
         DecisionTreeClassifier with default parameters will
@@ -262,6 +279,7 @@
         min_impurity_decrease=0.0,
         class_weight=None,
         ccp_alpha=0.0,
+        monotonic_cst=None,
         tree_estimator=None,
         honest_fraction=0.5,
         honest_prior="empirical",
@@ -281,8 +299,71 @@
         self.ccp_alpha = ccp_alpha
         self.honest_fraction = honest_fraction
         self.honest_prior = honest_prior
+        self.monotonic_cst = monotonic_cst
+
+        # XXX: to enable this, we need to also reset the leaf node samples during `_set_leaf_nodes`
+        self.store_leaf_values = False
 
-    def fit(self, X, y, sample_weight=None, check_input=True):
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(
+        self,
+        X,
+        y,
+        sample_weight=None,
+        check_input=True,
+        classes=None,
+    ):
+        """Build a decision tree classifier from the training set (X, y).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The training input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csc_matrix``.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            The target values (class labels) as integers or strings.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, then samples are equally weighted. Splits
+            that would create child nodes with net zero or negative weight are
+            ignored while searching for a split in each node. Splits are also
+            ignored if they would result in any single class carrying a
+            negative weight in either child node.
+
+        check_input : bool, default=True
+            Allows bypassing several input checks.
+            Don't use this parameter unless you know what you're doing.
+
+        classes : array-like of shape (n_classes,), default=None
+            List of all the classes that can possibly appear in the y vector.
+            Must be provided at the first call to partial_fit; it can be
+            omitted in subsequent calls.
+
+        Returns
+        -------
+        self : HonestTreeClassifier
+            Fitted estimator.
+        """
+        self._fit(
+            X,
+            y,
+            sample_weight=sample_weight,
+            check_input=check_input,
+            classes=classes,
+        )
+        return self
+
+    def _fit(
+        self,
+        X,
+        y,
+        sample_weight=None,
+        classes=None,
+        check_input=True,
+        missing_values_in_feature_mask=None,
+    ):
         """Build an honest tree classifier from the training set (X, y).
 
         Parameters
@@ -313,7 +394,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         """
         rng = np.random.default_rng(self.random_state)
         if check_input:
-            X, y = check_X_y(X, y)
+            X, y = check_X_y(X, y, multi_output=True)
 
         # Account for bootstrapping too
         if sample_weight is None:
@@ -346,17 +427,21 @@ def fit(self, X, y, sample_weight=None, check_input=True):
                 random_state=self.random_state,
                 min_impurity_decrease=self.min_impurity_decrease,
                 ccp_alpha=self.ccp_alpha,
+                monotonic_cst=self.monotonic_cst,
+                store_leaf_values=self.store_leaf_values,
             )
         else:
             # XXX: maybe error out if the tree_estimator is already fitted
             self.estimator_ = deepcopy(self.tree_estimator)
 
         # Learn structure on subsample
-        self.estimator_.fit(
+        self.estimator_._fit(
             X,
             y,
             sample_weight=_sample_weight,
+            classes=classes,
             check_input=check_input,
+            missing_values_in_feature_mask=missing_values_in_feature_mask,
         )
         self._inherit_estimator_attributes()
 
         # [:, np.newaxis] that does not.
         y = np.reshape(y, (-1, 1))
         check_classification_targets(y)
-        y = np.copy(y).astype(int)
+        y = np.copy(y)  # .astype(int)
 
+        # Normally called by the superclass' fit
         X = self.estimator_._validate_X_predict(X, True)
+
         # Fit leaves using other subsample
         honest_leaves = self.tree_.apply(X[self.honest_indices_])
 
@@ -527,5 +614,6 @@ def predict(self, X, check_input=True):
         y : array-like of shape (n_samples,) or (n_samples, n_outputs)
             The predicted classes, or the predicted values.
         """
+        check_is_fitted(self)
         X = self._validate_X_predict(X, check_input)
         return self.estimator_.predict(X, False)
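
To make the honesty scheme in _fit concrete, a usage sketch assuming only the public API: a fraction honest_fraction of the training rows is withheld from structure learning and used solely to repopulate the leaves, so the two index sets should be disjoint:

from sklearn import datasets

from sktree.tree import HonestTreeClassifier

iris = datasets.load_iris()
clf = HonestTreeClassifier(honest_fraction=0.5, random_state=0)
clf.fit(iris.data, iris.target)

structure, honest = set(clf.structure_indices_), set(clf.honest_indices_)
# Disjoint subsamples: no row both grows the tree and refits its leaves.
assert structure.isdisjoint(honest)
assert len(structure) + len(honest) <= len(iris.target)
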
diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py
index ace165206..65a77d7bf 100644
--- a/sktree/tree/tests/test_honest_tree.py
+++ b/sktree/tree/tests/test_honest_tree.py
@@ -2,9 +2,9 @@
 import pytest
 from sklearn import datasets
 from sklearn.metrics import accuracy_score
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
+from sktree._lib.sklearn.tree import DecisionTreeClassifier
 from sktree.tree import (
     HonestTreeClassifier,
     ObliqueDecisionTreeClassifier,
@@ -46,6 +46,10 @@ def test_iris(criterion, max_features, estimator):
         "HonestTree", criterion, score
     )
 
+    print(clf.honest_indices_)
+    assert len(clf.honest_indices_) < len(iris.target)
+    assert len(clf.structure_indices_) < len(iris.target)
+
 
 def test_toy_accuracy():
     clf = HonestTreeClassifier()
@@ -104,6 +108,6 @@ def test_impute_classes():
     assert y_proba.shape[1] == 3
 
 
-@parametrize_with_checks([HonestTreeClassifier(random_state=0)])
+@parametrize_with_checks([HonestTreeClassifier(random_state=12345)])
 def test_sklearn_compatible_estimator(estimator, check):
     check(estimator)
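
Finally, a sketch of the behavior test_impute_classes exercises; the fixture below is illustrative rather than the test's actual data. With a large honest_fraction, some classes may be absent from the honest subsample, yet predict_proba still returns one column per class seen at fit time:

import numpy as np

from sktree.tree import HonestTreeClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(30, 4))
y = np.array([0] * 10 + [1] * 10 + [2] * 10)

# Leave only a small structure subsample so the honest half is likely
# to miss a class entirely.
clf = HonestTreeClassifier(honest_fraction=0.9, random_state=0)
clf.fit(X, y)
y_proba = clf.predict_proba(X)
assert y_proba.shape[1] == 3  # one column per class seen at fit time
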