Skip to content

Commit

Permalink
Merge branch 'main' into rename
Browse files Browse the repository at this point in the history
  • Loading branch information
adam2392 committed Jul 15, 2024
2 parents 7e02443 + fae7e5c commit f215201
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 16 deletions.
2 changes: 1 addition & 1 deletion doc/whats_new/v0.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_

* `Sambit Panda`_
8 changes: 6 additions & 2 deletions doc/whats_new/v0.9.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Version 0.9

**In Development**

This release include a rename of the package to from ``scikit-learn`` to ``treeple``
This release includes a rename of the package from ``scikit-tree`` to ``treeple``.
Users can replace the previous usage as follows:
``import sktree`` to ``import treeple``
``from sktree import tree`` to ``from treeple import tree``
Expand All @@ -21,7 +21,10 @@ Changelog
---------

- |API| Rename the package to ``treeple``. By `SUKI-O`_ (:pr:`#292`)

- |Fix| Fixed a bug in the predict_proba function of the :class:`treeple.HonestForestClassifier` where posteriors
estimated on an empty leaf with the ``ignore`` prior would result in ``np.nan``
values for all trees on that sample.
By `Haoyin Xu`_ (:pr:`#291`)

Code and Documentation Contributors
-----------------------------------
Expand All @@ -31,3 +34,4 @@ the project since version inception, including:

* `Adam Li`_
* `SUKI-O`_
* `Haoyin Xu`_
15 changes: 6 additions & 9 deletions treeple/ensemble/_honest_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin):
- If int, then draw `max_samples` samples.
- If float, then draw `max_samples * X.shape[0]` samples.
honest_prior : {"ignore", "uniform", "empirical"}, default="empirical"
honest_prior : {"ignore", "uniform", "empirical"}, default="ignore"
Method for dealing with empty leaves during evaluation of a test
sample. If "ignore", the tree is ignored. If "uniform", the prior tree
posterior is 1/(number of classes). If "empirical", the prior tree
Expand Down Expand Up @@ -444,7 +444,7 @@ def __init__(
class_weight=None,
ccp_alpha=0.0,
max_samples=None,
honest_prior="empirical",
honest_prior="ignore",
honest_fraction=0.5,
tree_estimator=None,
stratify=False,
Expand Down Expand Up @@ -648,7 +648,7 @@ def predict_proba(self, X):
"""
return self._predict_proba(X)

def _predict_proba(self, X, indices=None, impute_missing=None):
def _predict_proba(self, X, indices=None, impute_missing=np.nan):
"""predict_proba helper class"""
check_is_fitted(self)
X = self._validate_X_predict(X)
Expand All @@ -672,10 +672,7 @@ def _predict_proba(self, X, indices=None, impute_missing=None):
zero_mask = posteriors.sum(2) == 0
posteriors[~zero_mask] /= posteriors[~zero_mask].sum(1, keepdims=True)

if impute_missing is None:
pass
else:
posteriors[zero_mask] = impute_missing
posteriors[zero_mask] = impute_missing

# preserve shape of multi-outputs
if self.n_outputs_ > 1:
Expand Down Expand Up @@ -823,7 +820,7 @@ def _accumulate_prediction(predict, X, out, lock, indices=None):

with lock:
if len(out) == 1:
out[0][indices] += proba
out[0][indices] = np.nansum([out[0][indices], proba], axis=0)
else:
for i in range(len(out)):
out[i][indices] += proba[i]
out[i][indices] = np.nansum([out[i][indices], proba[i]], axis=0)
8 changes: 5 additions & 3 deletions treeple/stats/tests/test_forestht.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,10 @@ def test_small_dataset_independent(seed):
@flaky(max_runs=3)
@pytest.mark.parametrize("seed", [10, 0])
def test_small_dataset_dependent(seed):
n_samples = 50
n_samples = 100
n_features = 5
rng = np.random.default_rng(seed)

X = rng.uniform(size=(n_samples, n_features))
X = rng.uniform(size=(n_samples // 2, n_features))
X2 = X + 3
X = np.vstack([X, X2])
Expand Down Expand Up @@ -157,12 +156,15 @@ def test_small_dataset_dependent(seed):
n_repeats=1000,
metric="mi",
return_posteriors=False,
seed=seed,
)
assert ~np.isnan(result.pvalue)
assert ~np.isnan(result.observe_test_stat)
assert result.pvalue <= 0.05

result = build_coleman_forest(clf, perm_clf, X, y, metric="mi", return_posteriors=False)
result = build_coleman_forest(
clf, perm_clf, X, y, metric="mi", return_posteriors=False, seed=seed
)
assert result.pvalue <= 0.05


Expand Down
5 changes: 4 additions & 1 deletion treeple/tests/test_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ def test_predict_proba_per_tree(Forest, n_classes):
)

# Call the method being tested
est = Forest(n_estimators=10, bootstrap=True, random_state=0)
if Forest == HonestForestClassifier:
est = Forest(n_estimators=10, bootstrap=True, random_state=0, honest_prior="empirical")
else:
est = Forest(n_estimators=10, bootstrap=True, random_state=0)
est.fit(X, y)
proba_per_tree = est.predict_proba_per_tree(X)

Expand Down
17 changes: 17 additions & 0 deletions treeple/tests/test_honest_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,23 @@ def test_impute_posteriors(honest_prior, val):
), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}"


def test_honestforest_predict_proba_with_honest_prior():
    """Check predict_proba yields no NaN posteriors under the ``ignore`` honest prior.

    Fits a 100-tree :class:`HonestForestClassifier` with ``honest_prior="ignore"``
    on a small imbalanced binary problem and asserts that no sample's posterior
    for class 0 is NaN — with enough trees, every sample should land in a
    non-empty honest leaf for at least one tree.
    """
    # Small 2D Gaussian dataset with a 75/25 class imbalance.
    X = rng.normal(0, 1, (100, 2))
    y = [0] * 75 + [1] * 25
    honest_prior = "ignore"
    clf = HonestForestClassifier(
        honest_fraction=0.5, random_state=0, honest_prior=honest_prior, n_estimators=100
    )
    clf = clf.fit(X, y)

    y_proba = clf.predict_proba(X)

    # With enough trees no nan values should exist
    assert (
        len(np.where(np.isnan(y_proba[:, 0]))[0]) == 0
    ), f"Failed with {honest_prior}, prior {clf.estimators_[0].empirical_prior_}"


@pytest.mark.parametrize(
"honest_fraction, val",
[
Expand Down

0 comments on commit f215201

Please sign in to comment.