From b8dc2c6cde8d79db1bd256347a86c70ee31075f2 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 15 Aug 2023 14:29:31 -0400
Subject: [PATCH] Fixed unit tests

Signed-off-by: Adam Li
---
 sktree/ensemble/_honest_forest.py     |  96 +++++++++++++++++++-
 sktree/tests/test_honest_forest.py    |  52 ++++++-----
 sktree/tree/_classes.py               | 124 +++++++++-----------------
 sktree/tree/_honest_tree.py           | 100 +++++++++++++++++++--
 sktree/tree/tests/test_honest_tree.py |   8 +-
 5 files changed, 268 insertions(+), 112 deletions(-)

diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py
index 295617025..35f1eb706 100644
--- a/sktree/ensemble/_honest_forest.py
+++ b/sktree/ensemble/_honest_forest.py
@@ -298,7 +298,10 @@ class labels (multi-output problem).
     >>> X, y = make_classification(n_samples=1000, n_features=4,
     ...                            n_informative=2, n_redundant=0,
     ...                            random_state=0, shuffle=False)
-    >>> clf = HonestForestClassifier(max_depth=2, random_state=0)
+    >>> clf = HonestForestClassifier(
+    ...     max_depth=2,
+    ...     random_state=0,
+    ...     tree_estimator=ObliqueDecisionTreeClassifier())
    >>> clf.fit(X, y)
    HonestForestClassifier(...)
    >>> print(clf.predict([[0, 0, 0, 0]]))
@@ -399,8 +402,8 @@ def fit(self, X, y, sample_weight=None):
         self : HonestForestClassifier
             Fitted tree estimator.
         """
-        super().fit(X, y, sample_weight)
         X, y = check_X_y(X, y, multi_output=True)
+        super().fit(X, y, sample_weight)
 
         # Compute honest decision function
         self.honest_decision_function_ = self._predict_proba(
@@ -434,6 +437,7 @@ def predict_proba(self, X):
 
     def _predict_proba(self, X, indices=None, impute_missing=None):
         """predict_proba helper class"""
+        check_is_fitted(self)
         X = self._validate_X_predict(X)
         n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
 
@@ -471,17 +475,105 @@
     @property
     def structure_indices_(self):
+        """The indices used to learn the structure of the trees."""
         check_is_fitted(self)
         return [tree.structure_indices_ for tree in self.estimators_]
 
     @property
     def honest_indices_(self):
+        """The indices used to fit the leaf nodes."""
         check_is_fitted(self)
         return [tree.honest_indices_ for tree in self.estimators_]
 
+    @property
+    def feature_importances_(self):
+        """The feature importances."""
+        return self.estimator_.feature_importances_
+
     def _more_tags(self):
         return {"multioutput": False}
 
+    def apply(self, X):
+        """
+        Apply trees in the forest to X, return leaf indices.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, its dtype will be converted to
+            ``dtype=np.float32``. If a sparse matrix is provided, it will be
+            converted into a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        X_leaves : ndarray of shape (n_samples, n_estimators)
+            For each datapoint x in X and for each tree in the forest,
+            return the index of the leaf x ends up in.
+        """
+        return self.estimator_.apply(X)
+
+    def decision_path(self, X):
+        """
+        Return the decision path in the forest.
+
+        .. versionadded:: 0.18
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The input samples. Internally, its dtype will be converted to
+            ``dtype=np.float32``. If a sparse matrix is provided, it will be
+            converted into a sparse ``csr_matrix``.
+
+        Returns
+        -------
+        indicator : sparse matrix of shape (n_samples, n_nodes)
+            Return a node indicator matrix where non-zero elements indicate
+            that the samples go through the nodes. The matrix is of CSR
+            format.
+
+        n_nodes_ptr : ndarray of shape (n_estimators + 1,)
+            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
+            give the indicator value for the i-th estimator.
+        """
+        return self.estimator_.decision_path(X)
+
+    def predict_quantiles(self, X, quantiles=0.5, method="nearest"):
+        """Predict class or regression value for X at given quantiles.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Input data.
+        quantiles : float, optional
+            The quantiles at which to evaluate, by default 0.5 (median).
+        method : str, optional
+            The interpolation method, by default 'nearest'. Can be any
+            ``method`` value accepted by :func:`~np.quantile`.
+
+        Returns
+        -------
+        y : ndarray of shape (n_samples, n_quantiles, [n_outputs])
+            The predicted values. The ``n_outputs`` dimension is present only
+            for multi-output regressors.
+        """
+        return self.estimator_.predict_quantiles(X, quantiles, method)
+
+    def get_leaf_node_samples(self, X):
+        """Get samples in each leaf node across the forest.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The data array.
+
+        Returns
+        -------
+        leaf_node_samples : array-like of shape (n_samples, n_estimators)
+            Samples within each leaf node.
+        """
+        return self.estimator_.get_leaf_node_samples(X)
+
 
 def _accumulate_prediction(tree, X, out, lock, indices=None):
     """
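
To make the forwarded honest-forest API above concrete, here is a minimal usage sketch (illustrative only, not part of the patch); it assumes the public sktree imports already used by this patch's tests, and the dataset size and n_estimators are arbitrary:

from sklearn.datasets import make_classification

from sktree.ensemble import HonestForestClassifier

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
clf = HonestForestClassifier(n_estimators=10, random_state=0)
clf.fit(X, y)

# One list entry per tree: structure indices grew the tree,
# honest indices refit its leaves.
assert len(clf.structure_indices_) == 10
assert len(clf.honest_indices_) == 10

# Honest posteriors over the training set, stored during fit.
assert clf.honest_decision_function_.shape == (200, 2)
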
diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py
index a9846642a..64e9aedd2 100644
--- a/sktree/tests/test_honest_forest.py
+++ b/sktree/tests/test_honest_forest.py
@@ -2,9 +2,9 @@
 import pytest
 from sklearn import datasets
 from sklearn.metrics import accuracy_score, r2_score
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
+from sktree._lib.sklearn.tree import DecisionTreeClassifier
 from sktree.ensemble import HonestForestClassifier
 from sktree.tree import ObliqueDecisionTreeClassifier, PatchObliqueDecisionTreeClassifier
 
@@ -34,6 +34,7 @@ def test_toy_accuracy():
 @pytest.mark.parametrize(
     "estimator",
     [
+        None,
         DecisionTreeClassifier(),
         ObliqueDecisionTreeClassifier(),
         PatchObliqueDecisionTreeClassifier(),
@@ -49,19 +50,21 @@ def test_iris(criterion, max_features, honest_prior, estimator):
         honest_prior=honest_prior,
         tree_estimator=estimator,
     )
-    try:
+    if honest_prior == "error":
+        with pytest.raises(ValueError, match="honest_prior error not a valid input."):
+            clf.fit(iris.data, iris.target)
+    else:
         clf.fit(iris.data, iris.target)
         score = accuracy_score(clf.predict(iris.data), iris.target)
-    except ValueError:
-        return
 
-    assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
-        "HForest", criterion, score
-    )
-    score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
-    assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
-        "HForest", criterion, score
-    )
+        assert (
+            score > 0.5 and score < 1.0
+        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
+
+        score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
+        assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
+            "HForest", criterion, score
+        )
 
 
 @pytest.mark.parametrize("criterion", ["gini", "entropy"])
@@ -90,19 +93,24 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator):
     X = iris.data
     y = np.stack((iris.target, second_y[perm])).T
-    try:
+    if honest_prior == "error":
+        with pytest.raises(ValueError, match="honest_prior error not a valid input."):
+            clf.fit(X, y)
+    else:
         clf.fit(X, y)
         score = r2_score(clf.predict(X), y)
-    except ValueError:
-        return
-    if honest_prior == "ignore":
-        assert (
-            score > 0.6 and score < 1.0
-        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
-    else:
-        assert (
-            score > 0.9 and score < 1.0
-        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
+        if honest_prior == "ignore":
+            assert (
+                score > 0.6 and score < 1.0
+            ), "Failed with {0}, criterion = {1} and score = {2}".format(
+                "HForest", criterion, score
+            )
+        else:
+            assert (
+                score > 0.9 and score < 1.0
+            ), "Failed with {0}, criterion = {1} and score = {2}".format(
+                "HForest", criterion, score
+            )
 
 
 def test_max_samples():
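
For context on the honest_prior values these tests parametrize over, a small sketch assuming the same public API: the three valid strategies fit cleanly, while any other string (such as the "error" sentinel above) is expected to raise the ValueError that the tests now assert via pytest.raises:

from sklearn import datasets

from sktree.ensemble import HonestForestClassifier

iris = datasets.load_iris()
for prior in ("empirical", "uniform", "ignore"):
    # Each valid prior changes how leaf posteriors are filled in,
    # but all of them should fit without error.
    clf = HonestForestClassifier(n_estimators=5, honest_prior=prior, random_state=0)
    clf.fit(iris.data, iris.target)
    print(prior, clf.predict_proba(iris.data[:1]).round(2))
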
diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py
index c8e7b2463..0538920fc 100644
--- a/sktree/tree/_classes.py
+++ b/sktree/tree/_classes.py
@@ -2010,8 +2010,20 @@ def __init__(
         self.boundary = boundary
         self.feature_weight = feature_weight
 
-    def fit(self, X, y, sample_weight=None, check_input=True):
-        """Fit tree.
+    def _build_tree(
+        self,
+        X,
+        y,
+        sample_weight,
+        missing_values_in_feature_mask,
+        min_samples_leaf,
+        min_weight_leaf,
+        max_leaf_nodes,
+        min_samples_split,
+        max_depth,
+        random_state,
+    ):
+        """Build the actual tree.
 
         Parameters
         ----------
@@ -2029,37 +2041,42 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
             The training input samples. Internally, it will be converted to
             ``dtype=np.float32`` and if a sparse matrix is provided
             to a sparse ``csc_matrix``.
 
         y : array-like of shape (n_samples,) or (n_samples, n_outputs)
             The target values (real numbers). Use ``dtype=np.float64`` and
             ``order='C'`` for maximum efficiency.
 
         sample_weight : array-like of shape (n_samples,), default=None
             Sample weights. If None, then samples are equally weighted. Splits
             that would create child nodes with net zero or negative weight are
             ignored while searching for a split in each node.
 
-        check_input : bool, optional
-            Whether or not to check input, by default True.
+        missing_values_in_feature_mask : ndarray of shape (n_features,), default=None
+            Boolean mask indicating which features contain missing values.
+
+        min_samples_leaf : int or float
+            The minimum number of samples required to be at a leaf node.
+
+        min_weight_leaf : float, default=0.0
+            The minimum weighted fraction of the sum total of weights.
+
+        max_leaf_nodes : int, default=None
+            Grow a tree with ``max_leaf_nodes`` in best-first fashion.
+
+        min_samples_split : int or float, default=2
+            The minimum number of samples required to split an internal node.
+
+        max_depth : int, default=None
+            The maximum depth of the tree. If None, then nodes are expanded until
+            all leaves are pure or until all leaves contain less than
+            min_samples_split samples.
+
+        random_state : int, RandomState instance or None, default=None
+            Controls the randomness of the estimator.
         """
-        if check_input:
-            # Need to validate separately here.
-            # We can't pass multi_output=True because that would allow y to be
-            # csr.
-            check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
-            check_y_params = dict(ensure_2d=False, dtype=None)
-            X, y = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))
-            if self.feature_weight is not None:
-                self.feature_weight = self._validate_data(
-                    self.feature_weight, ensure_2d=True, dtype=DTYPE
+        if self.feature_weight is not None:
+            self.feature_weight = self._validate_data(
+                self.feature_weight, ensure_2d=True, dtype=DTYPE
+            )
+            if self.feature_weight.shape != X.shape:
+                raise ValueError(
+                    f"feature_weight has shape {self.feature_weight.shape} but X has "
+                    f"shape {X.shape}"
                 )
-                if self.feature_weight.shape != X.shape:
-                    raise ValueError(
-                        f"feature_weight has shape {self.feature_weight.shape} but X has "
-                        f"shape {X.shape}"
-                    )
-            if issparse(X):
-                X.sort_indices()
-
-                if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
-                    raise ValueError("No support for np.int64 index based sparse matrices")
 
         if self.data_dims is None:
-            self.data_dims_ = np.array((1, X.shape[1]))
+            self.data_dims_ = np.array((1, X.shape[1]), dtype=np.intp)
         else:
             if np.prod(self.data_dims) != X.shape[1]:
                 raise RuntimeError(f"Data dimensions {self.data_dims} do not match {X.shape[1]}.")
-            self.data_dims_ = np.array(self.data_dims)
+            self.data_dims_ = np.array(self.data_dims, dtype=np.intp)
         ndim = len(self.data_dims_)
 
         # validate contiguous parameter
@@ -2074,7 +2091,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         if self.min_patch_dims is None:
             self.min_patch_dims_ = np.ones((ndim,), dtype=np.intp)
         else:
-            self.min_patch_dims_ = np.array(self.min_patch_dims)
+            self.min_patch_dims_ = np.array(self.min_patch_dims, dtype=np.intp)
 
         if self.max_patch_dims is None:
             self.max_patch_dims_ = np.ones((ndim,), dtype=np.intp)
@@ -2105,59 +2122,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
                     f"greater than the data width {self.data_dims_[idx]}"
                 )
 
-        return super().fit(X, y, sample_weight, check_input=False)
-
-    def _build_tree(
-        self,
-        X,
-        y,
-        sample_weight,
-        missing_values_in_feature_mask,
-        min_samples_leaf,
-        min_weight_leaf,
-        max_leaf_nodes,
-        min_samples_split,
-        max_depth,
-        random_state,
-    ):
-        """Build the actual tree.
-
-        Parameters
-        ----------
-        X : {array-like, sparse matrix} of shape (n_samples, n_features)
-            The training input samples. Internally, it will be converted to
-            ``dtype=np.float32`` and if a sparse matrix is provided
-            to a sparse ``csc_matrix``.
-
-        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
-            The target values (real numbers). Use ``dtype=np.float64`` and
-            ``order='C'`` for maximum efficiency.
-
-        sample_weight : array-like of shape (n_samples,), default=None
-            Sample weights. If None, then samples are equally weighted. Splits
-            that would create child nodes with net zero or negative weight are
-            ignored while searching for a split in each node.
-
-        min_samples_leaf : int or float
-            The minimum number of samples required to be at a leaf node.
-
-        min_weight_leaf : float, default=0.0
-            The minimum weighted fraction of the sum total of weights.
-
-        max_leaf_nodes : int, default=None
-            Grow a tree with ``max_leaf_nodes`` in best-first fashion.
-
-        min_samples_split : int or float, default=2
-            The minimum number of samples required to split an internal node.
-
-        max_depth : int, default=None
-            The maximum depth of the tree. If None, then nodes are expanded until
-            all leaves are pure or until all leaves contain less than
-            min_samples_split samples.
-
-        random_state : int, RandomState instance or None, default=None
-            Controls the randomness of the estimator.
-        """
         monotonic_cst = None
         n_samples = X.shape[0]
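
The data_dims handling moved into _build_tree above reinterprets each flat feature vector as an n-dimensional grid, so the product of the declared dimensions must equal n_features. A standalone sketch of that check (resolve_data_dims is a hypothetical helper name; the logic mirrors the patch):

import numpy as np

def resolve_data_dims(data_dims, n_features):
    # Default: treat the features as a single 1 x n_features row.
    if data_dims is None:
        return np.array((1, n_features), dtype=np.intp)
    # The grid must account for every feature, no more, no less.
    if np.prod(data_dims) != n_features:
        raise RuntimeError(f"Data dimensions {data_dims} do not match {n_features}.")
    return np.array(data_dims, dtype=np.intp)

print(resolve_data_dims((8, 8), 64))  # image-like 8x8 grid -> [8 8]
print(resolve_data_dims(None, 64))    # defaults to [ 1 64]
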
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py
index 343830429..a107b7939 100644
--- a/sktree/tree/_honest_tree.py
+++ b/sktree/tree/_honest_tree.py
@@ -4,7 +4,7 @@
 from copy import deepcopy
 
 import numpy as np
-from sklearn.base import MetaEstimatorMixin
+from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.utils.validation import check_is_fitted, check_X_y
 
@@ -12,7 +12,7 @@
 from sktree._lib.sklearn.tree._classes import BaseDecisionTree
 
 
-class HonestTreeClassifier(MetaEstimatorMixin, BaseDecisionTree):
+class HonestTreeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseDecisionTree):
     """
     A decision tree classifier with honest predictions.
 
@@ -133,6 +133,23 @@
         ``ccp_alpha`` will be chosen. By default, no pruning is performed.
         See :ref:`minimal_cost_complexity_pruning` for details.
 
+    monotonic_cst : array-like of int of shape (n_features), default=None
+        Indicates the monotonicity constraint to enforce on each feature.
+          - 1: monotonic increase
+          - 0: no constraint
+          - -1: monotonic decrease
+
+        If monotonic_cst is None, no constraints are applied.
+
+        Monotonicity constraints are not supported for:
+          - multiclass classifications (i.e. when `n_classes > 2`),
+          - multioutput classifications (i.e. when `n_outputs_ > 1`),
+          - classifications trained on data with missing values.
+
+        The constraints hold over the probability of the positive class.
+
+        Read more in the :ref:`User Guide `.
+
     tree_estimator : object, default=None
         Instantiated tree of type BaseDecisionTree. If None, then
         DecisionTreeClassifier with default parameters will
@@ -262,6 +279,7 @@
         min_impurity_decrease=0.0,
         class_weight=None,
         ccp_alpha=0.0,
+        monotonic_cst=None,
         tree_estimator=None,
         honest_fraction=0.5,
         honest_prior="empirical",
@@ -281,8 +299,71 @@
         self.ccp_alpha = ccp_alpha
         self.honest_fraction = honest_fraction
         self.honest_prior = honest_prior
+        self.monotonic_cst = monotonic_cst
+
+        # XXX: to enable this, we need to also reset the leaf node samples during `_set_leaf_nodes`
+        self.store_leaf_values = False
 
-    def fit(self, X, y, sample_weight=None, check_input=True):
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(
+        self,
+        X,
+        y,
+        sample_weight=None,
+        check_input=True,
+        classes=None,
+    ):
+        """Build a decision tree classifier from the training set (X, y).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The training input samples. Internally, it will be converted to
+            ``dtype=np.float32`` and if a sparse matrix is provided
+            to a sparse ``csc_matrix``.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            The target values (class labels) as integers or strings.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, then samples are equally weighted. Splits
+            that would create child nodes with net zero or negative weight are
+            ignored while searching for a split in each node. Splits are also
+            ignored if they would result in any single class carrying a
+            negative weight in either child node.
+
+        check_input : bool, default=True
+            Allows bypassing several input checks.
+            Don't use this parameter unless you know what you're doing.
+
+        classes : array-like of shape (n_classes,), default=None
+            List of all the classes that can possibly appear in the y vector.
+            Must be provided at the first call to partial_fit; it can be
+            omitted in subsequent calls.
+
+        Returns
+        -------
+        self : HonestTreeClassifier
+            Fitted estimator.
+        """
+        self._fit(
+            X,
+            y,
+            sample_weight=sample_weight,
+            check_input=check_input,
+            classes=classes,
+        )
+        return self
+
+    def _fit(
+        self,
+        X,
+        y,
+        sample_weight=None,
+        classes=None,
+        check_input=True,
+        missing_values_in_feature_mask=None,
+    ):
         """Build an honest tree classifier from the training set (X, y).
 
         Parameters
@@ -313,7 +394,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
         """
         rng = np.random.default_rng(self.random_state)
         if check_input:
-            X, y = check_X_y(X, y)
+            X, y = check_X_y(X, y, multi_output=True)
 
         # Account for bootstrapping too
         if sample_weight is None:
@@ -346,17 +427,21 @@ def fit(self, X, y, sample_weight=None, check_input=True):
                 random_state=self.random_state,
                 min_impurity_decrease=self.min_impurity_decrease,
                 ccp_alpha=self.ccp_alpha,
+                monotonic_cst=self.monotonic_cst,
+                store_leaf_values=self.store_leaf_values,
             )
         else:
             # XXX: maybe error out if the tree_estimator is already fitted
             self.estimator_ = deepcopy(self.tree_estimator)
 
         # Learn structure on subsample
-        self.estimator_.fit(
+        self.estimator_._fit(
             X,
             y,
             sample_weight=_sample_weight,
+            classes=classes,
             check_input=check_input,
+            missing_values_in_feature_mask=missing_values_in_feature_mask,
         )
         self._inherit_estimator_attributes()
 
         # [:, np.newaxis] that does not.
         y = np.reshape(y, (-1, 1))
         check_classification_targets(y)
-        y = np.copy(y).astype(int)
+        y = np.copy(y)  # .astype(int)
 
+        # Normally called by the superclass' fit
         X = self.estimator_._validate_X_predict(X, True)
+
         # Fit leaves using other subsample
         honest_leaves = self.tree_.apply(X[self.honest_indices_])
 
@@ -527,5 +614,6 @@ def predict(self, X, check_input=True):
         y : array-like of shape (n_samples,) or (n_samples, n_outputs)
             The predicted classes, or the predicted values.
         """
+        check_is_fitted(self)
         X = self._validate_X_predict(X, check_input)
         return self.estimator_.predict(X, False)
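
To make the honesty scheme in _fit concrete, a usage sketch assuming only the public API: a fraction honest_fraction of the training rows is withheld from structure learning and used solely to repopulate the leaves, so the two index sets should be disjoint:

from sklearn import datasets

from sktree.tree import HonestTreeClassifier

iris = datasets.load_iris()
clf = HonestTreeClassifier(honest_fraction=0.5, random_state=0)
clf.fit(iris.data, iris.target)

structure, honest = set(clf.structure_indices_), set(clf.honest_indices_)
# Disjoint subsamples: no row both grows the tree and refits its leaves.
assert structure.isdisjoint(honest)
assert len(structure) + len(honest) <= len(iris.target)
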
diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py
index ace165206..65a77d7bf 100644
--- a/sktree/tree/tests/test_honest_tree.py
+++ b/sktree/tree/tests/test_honest_tree.py
@@ -2,9 +2,9 @@
 import pytest
 from sklearn import datasets
 from sklearn.metrics import accuracy_score
-from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
+from sktree._lib.sklearn.tree import DecisionTreeClassifier
 from sktree.tree import (
     HonestTreeClassifier,
     ObliqueDecisionTreeClassifier,
@@ -46,6 +46,10 @@ def test_iris(criterion, max_features, estimator):
         "HonestTree", criterion, score
     )
 
+    print(clf.honest_indices_)
+    assert len(clf.honest_indices_) < len(iris.target)
+    assert len(clf.structure_indices_) < len(iris.target)
+
 
 def test_toy_accuracy():
     clf = HonestTreeClassifier()
@@ -104,6 +108,6 @@ def test_impute_classes():
     assert y_proba.shape[1] == 3
 
 
-@parametrize_with_checks([HonestTreeClassifier(random_state=0)])
+@parametrize_with_checks([HonestTreeClassifier(random_state=12345)])
 def test_sklearn_compatible_estimator(estimator, check):
     check(estimator)
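
Finally, a sketch of the behavior test_impute_classes exercises; the fixture below is illustrative rather than the test's actual data. With a large honest_fraction, some classes may be absent from the honest subsample, yet predict_proba still returns one column per class seen at fit time:

import numpy as np

from sktree.tree import HonestTreeClassifier

rng = np.random.RandomState(0)
X = rng.normal(size=(30, 4))
y = np.array([0] * 10 + [1] * 10 + [2] * 10)

# Leave only a small structure subsample so the honest half is likely
# to miss a class entirely.
clf = HonestTreeClassifier(honest_fraction=0.9, random_state=0)
clf.fit(X, y)
y_proba = clf.predict_proba(X)
assert y_proba.shape[1] == 3  # one column per class seen at fit time
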