Fixed unit tests
Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 committed Aug 15, 2023
1 parent e86cc0a commit b8dc2c6
Showing 5 changed files with 268 additions and 112 deletions.
96 changes: 94 additions & 2 deletions sktree/ensemble/_honest_forest.py
@@ -298,7 +298,10 @@ class labels (multi-output problem).
>>> X, y = make_classification(n_samples=1000, n_features=4,
... n_informative=2, n_redundant=0,
... random_state=0, shuffle=False)
>>> clf = HonestForestClassifier(max_depth=2, random_state=0)
>>> clf = HonestForestClassifier(
...     max_depth=2,
...     random_state=0,
...     tree_estimator=ObliqueDecisionTreeClassifier())
>>> clf.fit(X, y)
HonestForestClassifier(...)
>>> print(clf.predict([[0, 0, 0, 0]]))
@@ -399,8 +402,8 @@ def fit(self, X, y, sample_weight=None):
self : HonestForestClassifier
Fitted tree estimator.
"""
super().fit(X, y, sample_weight)
X, y = check_X_y(X, y, multi_output=True)
super().fit(X, y, sample_weight)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
@@ -434,6 +437,7 @@ def predict_proba(self, X):

def _predict_proba(self, X, indices=None, impute_missing=None):
"""predict_proba helper class"""
check_is_fitted(self)
X = self._validate_X_predict(X)
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

@@ -471,17 +475,105 @@ def _predict_proba(self, X, indices=None, impute_missing=None):

@property
def structure_indices_(self):
"""The indices used to learn the structure of the trees."""
check_is_fitted(self)
return [tree.structure_indices_ for tree in self.estimators_]

@property
def honest_indices_(self):
"""The indices used to fit the leaf nodes."""
check_is_fitted(self)
return [tree.honest_indices_ for tree in self.estimators_]

@property
def feature_importances_(self):
"""The feature importances."""
return self.estimator_.feature_importances_

def _more_tags(self):
return {"multioutput": False}

def apply(self, X):
"""
Apply trees in the forest to X, return leaf indices.

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, its dtype will be converted to
``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.

Returns
-------
X_leaves : ndarray of shape (n_samples, n_estimators)
For each datapoint x in X and for each tree in the forest,
return the index of the leaf x ends up in.
"""
return self.estimator_.apply(X)

def decision_path(self, X):
"""
Return the decision path in the forest.

.. versionadded:: 0.18

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, its dtype will be converted to
``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.

Returns
-------
indicator : sparse matrix of shape (n_samples, n_nodes)
Return a node indicator matrix where non-zero elements indicate
that the samples go through the nodes. The matrix is in CSR
format.
n_nodes_ptr : ndarray of shape (n_estimators + 1,)
The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
give the indicator value for the i-th estimator.
"""
return self.estimator_.decision_path(X)

def predict_quantiles(self, X, quantiles=0.5, method="nearest"):
"""Predict class or regression value for X at given quantiles.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
quantiles : float, optional
The quantiles at which to evaluate, by default 0.5 (median).
method : str, optional
The interpolation method, by default 'nearest'. Can be any
``method`` value accepted by :func:`~np.quantile`.

Returns
-------
y : ndarray of shape (n_samples, n_quantiles, [n_outputs])
The predicted values. The ``n_outputs`` dimension is present only
for multi-output regressors.
"""
return self.estimator_.predict_quantiles(X, quantiles, method)

def get_leaf_node_samples(self, X):
"""Get samples in each leaf node across the forest.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data array.

Returns
-------
leaf_node_samples : array-like of shape (n_samples, n_estimators)
Samples within each leaf node.
"""
return self.estimator_.get_leaf_node_samples(X)


def _accumulate_prediction(tree, X, out, lock, indices=None):
"""
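
As context for the forest-level API touched above, here is a minimal usage sketch. It is illustrative only, not part of the commit: it assumes the names visible in this diff (``HonestForestClassifier``, the ``tree_estimator`` keyword, the ``structure_indices_`` and ``honest_indices_`` properties, and the ``honest_decision_function_`` attribute set by ``fit``) behave as documented, and the disjointness assertion is an assumption about how honest splitting partitions each tree's samples.

from sklearn.datasets import make_classification

from sktree.ensemble import HonestForestClassifier
from sktree.tree import ObliqueDecisionTreeClassifier

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)
clf = HonestForestClassifier(
    max_depth=2, random_state=0, tree_estimator=ObliqueDecisionTreeClassifier()
)
clf.fit(X, y)

# Per-tree sample split: "structure" indices grow each tree, while
# "honest" indices fit its leaf posteriors (see the properties above).
for structure, honest in zip(clf.structure_indices_, clf.honest_indices_):
    assert set(structure).isdisjoint(honest)  # assumed disjoint by construction

# fit() also caches honest posteriors for the training inputs.
proba = clf.honest_decision_function_
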
52 changes: 30 additions & 22 deletions sktree/tests/test_honest_forest.py
@@ -2,9 +2,9 @@
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

from sktree._lib.sklearn.tree import DecisionTreeClassifier
from sktree.ensemble import HonestForestClassifier
from sktree.tree import ObliqueDecisionTreeClassifier, PatchObliqueDecisionTreeClassifier

@@ -34,6 +34,7 @@ def test_toy_accuracy():
@pytest.mark.parametrize(
"estimator",
[
None,
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
@@ -49,19 +50,21 @@ def test_iris(criterion, max_features, honest_prior, estimator):
honest_prior=honest_prior,
tree_estimator=estimator,
)
try:
if honest_prior == "error":
with pytest.raises(ValueError, match="honest_prior error not a valid input."):
clf.fit(iris.data, iris.target)
else:
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
except ValueError:
return
assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)

score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)
assert (
score > 0.5 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)

score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)


@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@@ -90,19 +93,24 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator):

X = iris.data
y = np.stack((iris.target, second_y[perm])).T
try:
if honest_prior == "error":
with pytest.raises(ValueError, match="honest_prior error not a valid input."):
clf.fit(X, y)
else:
clf.fit(X, y)
score = r2_score(clf.predict(X), y)
except ValueError:
return
if honest_prior == "ignore":
assert (
score > 0.6 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
else:
assert (
score > 0.9 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
if honest_prior == "ignore":
assert (
score > 0.6 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)
else:
assert (
score > 0.9 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)


def test_max_samples():
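
The rewrite above replaces a broad ``try/except ValueError`` (which silently returned) with an explicit ``pytest.raises`` expectation, so a missing error now fails the test instead of being swallowed. A minimal sketch of the pattern, using a hypothetical validator rather than the forest itself:

import pytest


def set_honest_prior(honest_prior):
    # Hypothetical stand-in for the forest's honest_prior validation.
    if honest_prior not in ("empirical", "uniform", "ignore"):
        raise ValueError(f"honest_prior {honest_prior} not a valid input.")


def test_invalid_honest_prior_raises():
    # Fails if no ValueError is raised, unlike a silent try/except.
    with pytest.raises(ValueError, match="honest_prior error not a valid input."):
        set_honest_prior("error")
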
124 changes: 44 additions & 80 deletions sktree/tree/_classes.py
@@ -2010,8 +2010,20 @@ def __init__(
self.boundary = boundary
self.feature_weight = feature_weight

def fit(self, X, y, sample_weight=None, check_input=True):
"""Fit tree.
def _build_tree(
self,
X,
y,
sample_weight,
missing_values_in_feature_mask,
min_samples_leaf,
min_weight_leaf,
max_leaf_nodes,
min_samples_split,
max_depth,
random_state,
):
"""Build the actual tree.
Parameters
----------
@@ -2029,37 +2041,42 @@ def fit(self, X, y, sample_weight=None, check_input=True):
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node.
check_input : bool, optional
Whether or not to check input, by default True.
min_samples_leaf : int or float
The minimum number of samples required to be at a leaf node.
min_weight_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights required
to be at a leaf node.
max_leaf_nodes : int, default=None
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
min_samples_split : int or float, default=2
The minimum number of samples required to split an internal node.
max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
random_state : int, RandomState instance or None, default=None
Controls the randomness of the estimator.
"""
if check_input:
# Need to validate separately here.
# We can't pass multi_output=True because that would allow y to be
# csr.
check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
check_y_params = dict(ensure_2d=False, dtype=None)
X, y = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))
if self.feature_weight is not None:
self.feature_weight = self._validate_data(
self.feature_weight, ensure_2d=True, dtype=DTYPE
if self.feature_weight is not None:
self.feature_weight = self._validate_data(
self.feature_weight, ensure_2d=True, dtype=DTYPE
)
if self.feature_weight.shape != X.shape:
raise ValueError(
f"feature_weight has shape {self.feature_weight.shape} but X has "
f"shape {X.shape}"
)
if self.feature_weight.shape != X.shape:
raise ValueError(
f"feature_weight has shape {self.feature_weight.shape} but X has "
f"shape {X.shape}"
)
if issparse(X):
X.sort_indices()

if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
raise ValueError("No support for np.int64 index based sparse matrices")

if self.data_dims is None:
self.data_dims_ = np.array((1, X.shape[1]))
self.data_dims_ = np.array((1, X.shape[1]), dtype=np.intp)
else:
if np.prod(self.data_dims) != X.shape[1]:
raise RuntimeError(f"Data dimensions {self.data_dims} do not match {X.shape[1]}.")
self.data_dims_ = np.array(self.data_dims)
self.data_dims_ = np.array(self.data_dims, dtype=np.intp)
ndim = len(self.data_dims_)

# validate contiguous parameter
@@ -2074,7 +2091,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
if self.min_patch_dims is None:
self.min_patch_dims_ = np.ones((ndim,), dtype=np.intp)
else:
self.min_patch_dims_ = np.array(self.min_patch_dims)
self.min_patch_dims_ = np.array(self.min_patch_dims, dtype=np.intp)

if self.max_patch_dims is None:
self.max_patch_dims_ = np.ones((ndim,), dtype=np.intp)
@@ -2105,59 +2122,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
f"greater than the data width {self.data_dims_[idx]}"
)

return super().fit(X, y, sample_weight, check_input=False)

def _build_tree(
self,
X,
y,
sample_weight,
missing_values_in_feature_mask,
min_samples_leaf,
min_weight_leaf,
max_leaf_nodes,
min_samples_split,
max_depth,
random_state,
):
"""Build the actual tree.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The training input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csc_matrix``.
y : array-like of shape (n_samples,) or (n_samples, n_outputs)
The target values (real numbers). Use ``dtype=np.float64`` and
``order='C'`` for maximum efficiency.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted. Splits
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node.
min_samples_leaf : int or float
The minimum number of samples required to be at a leaf node.
min_weight_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights required
to be at a leaf node.
max_leaf_nodes : int, default=None
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
min_samples_split : int or float, default=2
The minimum number of samples required to split an internal node.
max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
random_state : int, RandomState instance or None, default=None
Controls the randomness of the estimator.
"""
monotonic_cst = None
n_samples = X.shape[0]

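
The validation consolidated into ``_build_tree`` above enforces one invariant: the declared ``data_dims`` must multiply out to the flattened feature count, and the resolved arrays are stored as ``np.intp``. A standalone sketch of that check; ``resolve_data_dims`` is a hypothetical helper for illustration, not the class method:

import numpy as np


def resolve_data_dims(data_dims, n_features):
    # Mirrors the data_dims checks shown in the diff above.
    if data_dims is None:
        # Default: a single row of n_features columns.
        return np.array((1, n_features), dtype=np.intp)
    if np.prod(data_dims) != n_features:
        raise RuntimeError(f"Data dimensions {data_dims} do not match {n_features}.")
    return np.array(data_dims, dtype=np.intp)


resolve_data_dims(None, 16)    # array([ 1, 16])
resolve_data_dims((4, 4), 16)  # array([4, 4])
# resolve_data_dims((3, 4), 16) would raise RuntimeError
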
