Fixed unit tests
Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 committed Aug 15, 2023
1 parent e86cc0a commit b8dc2c6
Showing 5 changed files with 268 additions and 112 deletions.
96 changes: 94 additions & 2 deletions sktree/ensemble/_honest_forest.py
@@ -298,7 +298,10 @@ class labels (multi-output problem).
>>> X, y = make_classification(n_samples=1000, n_features=4,
... n_informative=2, n_redundant=0,
... random_state=0, shuffle=False)
>>> clf = HonestForestClassifier(max_depth=2, random_state=0)
>>> clf = HonestForestClassifier(
...     max_depth=2,
...     random_state=0,
...     tree_estimator=ObliqueDecisionTreeClassifier())
>>> clf.fit(X, y)
HonestForestClassifier(...)
>>> print(clf.predict([[0, 0, 0, 0]]))
@@ -399,8 +402,8 @@ def fit(self, X, y, sample_weight=None):
self : HonestForestClassifier
Fitted tree estimator.
"""
super().fit(X, y, sample_weight)
X, y = check_X_y(X, y, multi_output=True)
super().fit(X, y, sample_weight)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
@@ -434,6 +437,7 @@ def predict_proba(self, X):

def _predict_proba(self, X, indices=None, impute_missing=None):
"""predict_proba helper class"""
check_is_fitted(self)
X = self._validate_X_predict(X)
n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

@@ -471,17 +475,105 @@ def _predict_proba(self, X, indices=None, impute_missing=None):

@property
def structure_indices_(self):
"""The indices used to learn the structure of the trees."""
check_is_fitted(self)
return [tree.structure_indices_ for tree in self.estimators_]

@property
def honest_indices_(self):
"""The indices used to fit the leaf nodes."""
check_is_fitted(self)
return [tree.honest_indices_ for tree in self.estimators_]

@property
def feature_importances_(self):
"""The feature importances."""
return self.estimator_.feature_importances_

def _more_tags(self):
return {"multioutput": False}

def apply(self, X):
"""
Apply trees in the forest to X, return leaf indices.

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, its dtype will be converted to
``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.

Returns
-------
X_leaves : ndarray of shape (n_samples, n_estimators)
For each datapoint x in X and for each tree in the forest,
return the index of the leaf x ends up in.
"""
return self.estimator_.apply(X)

def decision_path(self, X):
"""
Return the decision path in the forest.

.. versionadded:: 0.18

Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The input samples. Internally, its dtype will be converted to
``dtype=np.float32``. If a sparse matrix is provided, it will be
converted into a sparse ``csr_matrix``.

Returns
-------
indicator : sparse matrix of shape (n_samples, n_nodes)
Return a node indicator matrix where non-zero elements indicate
that the samples go through the nodes. The matrix is in CSR
format.
n_nodes_ptr : ndarray of shape (n_estimators + 1,)
The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
give the indicator value for the i-th estimator.
"""
return self.estimator_.decision_path(X)

def predict_quantiles(self, X, quantiles=0.5, method="nearest"):
"""Predict class or regression value for X at given quantiles.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Input data.
quantiles : float, optional
The quantiles at which to evaluate, by default 0.5 (median).
method : str, optional
The interpolation method, by default 'nearest'. Can be any
``method`` value accepted by :func:`~np.quantile`.

Returns
-------
y : ndarray of shape (n_samples, n_quantiles, [n_outputs])
The predicted values. The ``n_outputs`` dimension is present only
for multi-output regressors.
"""
return self.estimator_.predict_quantiles(X, quantiles, method)

def get_leaf_node_samples(self, X):
"""Get samples in each leaf node across the forest.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data array.

Returns
-------
leaf_node_samples : array-like of shape (n_samples, n_estimators)
Samples within each leaf node.
"""
return self.estimator_.get_leaf_node_samples(X)


def _accumulate_prediction(tree, X, out, lock, indices=None):
"""
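
As context for the forest-level API touched above, here is a minimal usage sketch. It is illustrative only, not part of the commit: it assumes the names visible in this diff (``HonestForestClassifier``, the ``tree_estimator`` keyword, the ``structure_indices_`` and ``honest_indices_`` properties, and the ``honest_decision_function_`` attribute set by ``fit``) behave as documented, and the disjointness assertion is an assumption about how honest splitting partitions each tree's samples.

from sklearn.datasets import make_classification

from sktree.ensemble import HonestForestClassifier
from sktree.tree import ObliqueDecisionTreeClassifier

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)
clf = HonestForestClassifier(
    max_depth=2, random_state=0, tree_estimator=ObliqueDecisionTreeClassifier()
)
clf.fit(X, y)

# Per-tree sample split: "structure" indices grow each tree, while
# "honest" indices fit its leaf posteriors (see the properties above).
for structure, honest in zip(clf.structure_indices_, clf.honest_indices_):
    assert set(structure).isdisjoint(honest)  # assumed disjoint by construction

# fit() also caches honest posteriors for the training inputs.
proba = clf.honest_decision_function_
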
52 changes: 30 additions & 22 deletions sktree/tests/test_honest_forest.py
@@ -2,9 +2,9 @@
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

from sktree._lib.sklearn.tree import DecisionTreeClassifier
from sktree.ensemble import HonestForestClassifier
from sktree.tree import ObliqueDecisionTreeClassifier, PatchObliqueDecisionTreeClassifier

@@ -34,6 +34,7 @@ def test_toy_accuracy():
@pytest.mark.parametrize(
"estimator",
[
None,
DecisionTreeClassifier(),
ObliqueDecisionTreeClassifier(),
PatchObliqueDecisionTreeClassifier(),
@@ -49,19 +50,21 @@ def test_iris(criterion, max_features, honest_prior, estimator):
honest_prior=honest_prior,
tree_estimator=estimator,
)
try:
if honest_prior == "error":
with pytest.raises(ValueError, match="honest_prior error not a valid input."):
clf.fit(iris.data, iris.target)
else:
clf.fit(iris.data, iris.target)
score = accuracy_score(clf.predict(iris.data), iris.target)
except ValueError:
return
assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)

score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)
assert (
score > 0.5 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)

score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)


@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@@ -90,19 +93,24 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator):

X = iris.data
y = np.stack((iris.target, second_y[perm])).T
try:
if honest_prior == "error":
with pytest.raises(ValueError, match="honest_prior error not a valid input."):
clf.fit(X, y)
else:
clf.fit(X, y)
score = r2_score(clf.predict(X), y)
except ValueError:
return
if honest_prior == "ignore":
assert (
score > 0.6 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
else:
assert (
score > 0.9 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
if honest_prior == "ignore":
assert (
score > 0.6 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)
else:
assert (
score > 0.9 and score < 1.0
), "Failed with {0}, criterion = {1} and score = {2}".format(
"HForest", criterion, score
)


def test_max_samples():
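
The rewrite above replaces a broad ``try/except ValueError`` (which silently returned) with an explicit ``pytest.raises`` expectation, so a missing error now fails the test instead of being swallowed. A minimal sketch of the pattern, using a hypothetical validator rather than the forest itself:

import pytest


def set_honest_prior(honest_prior):
    # Hypothetical stand-in for the forest's honest_prior validation.
    if honest_prior not in ("empirical", "uniform", "ignore"):
        raise ValueError(f"honest_prior {honest_prior} not a valid input.")


def test_invalid_honest_prior_raises():
    # Fails if no ValueError is raised, unlike a silent try/except.
    with pytest.raises(ValueError, match="honest_prior error not a valid input."):
        set_honest_prior("error")
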
124 changes: 44 additions & 80 deletions sktree/tree/_classes.py
@@ -2010,8 +2010,20 @@ def __init__(
self.boundary = boundary
self.feature_weight = feature_weight

def fit(self, X, y, sample_weight=None, check_input=True):
"""Fit tree.
def _build_tree(
self,
X,
y,
sample_weight,
missing_values_in_feature_mask,
min_samples_leaf,
min_weight_leaf,
max_leaf_nodes,
min_samples_split,
max_depth,
random_state,
):
"""Build the actual tree.
Parameters
----------
@@ -2029,37 +2041,42 @@ def fit(self, X, y, sample_weight=None, check_input=True):
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node.
check_input : bool, optional
Whether or not to check input, by default True.
min_samples_leaf : int or float
The minimum number of samples required to be at a leaf node.
min_weight_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights required
to be at a leaf node.
max_leaf_nodes : int, default=None
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
min_samples_split : int or float, default=2
The minimum number of samples required to split an internal node.
max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
random_state : int, RandomState instance or None, default=None
Controls the randomness of the estimator.
"""
if check_input:
# Need to validate separately here.
# We can't pass multi_output=True because that would allow y to be
# csr.
check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
check_y_params = dict(ensure_2d=False, dtype=None)
X, y = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))
if self.feature_weight is not None:
self.feature_weight = self._validate_data(
self.feature_weight, ensure_2d=True, dtype=DTYPE
if self.feature_weight is not None:
self.feature_weight = self._validate_data(
self.feature_weight, ensure_2d=True, dtype=DTYPE
)
if self.feature_weight.shape != X.shape:
raise ValueError(
f"feature_weight has shape {self.feature_weight.shape} but X has "
f"shape {X.shape}"
)
if self.feature_weight.shape != X.shape:
raise ValueError(
f"feature_weight has shape {self.feature_weight.shape} but X has "
f"shape {X.shape}"
)
if issparse(X):
X.sort_indices()

if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
raise ValueError("No support for np.int64 index based sparse matrices")

if self.data_dims is None:
self.data_dims_ = np.array((1, X.shape[1]))
self.data_dims_ = np.array((1, X.shape[1]), dtype=np.intp)
else:
if np.prod(self.data_dims) != X.shape[1]:
raise RuntimeError(f"Data dimensions {self.data_dims} do not match {X.shape[1]}.")
self.data_dims_ = np.array(self.data_dims)
self.data_dims_ = np.array(self.data_dims, dtype=np.intp)
ndim = len(self.data_dims_)

# validate contiguous parameter
@@ -2074,7 +2091,7 @@ def fit(self, X, y, sample_weight=None, check_input=True):
if self.min_patch_dims is None:
self.min_patch_dims_ = np.ones((ndim,), dtype=np.intp)
else:
self.min_patch_dims_ = np.array(self.min_patch_dims)
self.min_patch_dims_ = np.array(self.min_patch_dims, dtype=np.intp)

if self.max_patch_dims is None:
self.max_patch_dims_ = np.ones((ndim,), dtype=np.intp)
@@ -2105,59 +2122,6 @@ def fit(self, X, y, sample_weight=None, check_input=True):
f"greater than the data width {self.data_dims_[idx]}"
)

return super().fit(X, y, sample_weight, check_input=False)

def _build_tree(
self,
X,
y,
sample_weight,
missing_values_in_feature_mask,
min_samples_leaf,
min_weight_leaf,
max_leaf_nodes,
min_samples_split,
max_depth,
random_state,
):
"""Build the actual tree.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The training input samples. Internally, it will be converted to
``dtype=np.float32`` and if a sparse matrix is provided
to a sparse ``csc_matrix``.
y : array-like of shape (n_samples,) or (n_samples, n_outputs)
The target values (real numbers). Use ``dtype=np.float64`` and
``order='C'`` for maximum efficiency.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights. If None, then samples are equally weighted. Splits
that would create child nodes with net zero or negative weight are
ignored while searching for a split in each node.
min_samples_leaf : int or float
The minimum number of samples required to be at a leaf node.
min_weight_leaf : float, default=0.0
The minimum weighted fraction of the sum total of weights required
to be at a leaf node.
max_leaf_nodes : int, default=None
Grow a tree with ``max_leaf_nodes`` in best-first fashion.
min_samples_split : int or float, default=2
The minimum number of samples required to split an internal node.
max_depth : int, default=None
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
random_state : int, RandomState instance or None, default=None
Controls the randomness of the estimator.
"""
monotonic_cst = None
n_samples = X.shape[0]

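
The validation consolidated into ``_build_tree`` above enforces one invariant: the declared ``data_dims`` must multiply out to the flattened feature count, and the resolved arrays are stored as ``np.intp``. A standalone sketch of that check; ``resolve_data_dims`` is a hypothetical helper for illustration, not the class method:

import numpy as np


def resolve_data_dims(data_dims, n_features):
    # Mirrors the data_dims checks shown in the diff above.
    if data_dims is None:
        # Default: a single row of n_features columns.
        return np.array((1, n_features), dtype=np.intp)
    if np.prod(data_dims) != n_features:
        raise RuntimeError(f"Data dimensions {data_dims} do not match {n_features}.")
    return np.array(data_dims, dtype=np.intp)


resolve_data_dims(None, 16)    # array([ 1, 16])
resolve_data_dims((4, 4), 16)  # array([4, 4])
# resolve_data_dims((3, 4), 16) would raise RuntimeError
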
