diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e9b5f5105..aa63cda74 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,23 +1,30 @@ - + - -Fixes # +#### Reference Issues/PRs + -Changes proposed in this pull request: - -- -## Before submitting +#### What does this implement/fix? Explain your changes. - -- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/py-why/pywhy-graphs/blob/main/CONTRIBUTING.md#making-a-pull-request) - section of the `CONTRIBUTING` docs. -- [ ] I've updated or added any relevant docstrings following the syntax described in the - [Writing docstrings](https://github.com/py-why/pywhy-graphs/blob/main/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs. -- [ ] If this PR fixes a bug, I've added a test that will fail without my fix. -- [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality. -## After submitting +#### Any other comments? - -- [ ] All GitHub Actions jobs for my pull request have passed. + + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22194c830..d378f05a3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 24.1.1 + rev: 24.4.2 hooks: - id: black args: [--quiet] @@ -15,14 +15,14 @@ repos: types: [cython] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.0 + rev: v0.16.2 hooks: - id: cython-lint - id: double-quote-cython-strings # Ruff sktree - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.0 + rev: v0.4.10 hooks: - id: ruff name: ruff sktree @@ -31,7 +31,7 @@ repos: # Ruff tutorials and examples - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.0 + rev: v0.4.10 hooks: - id: ruff name: ruff tutorials and examples @@ -42,7 +42,7 @@ repos: # Codespell - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell additional_dependencies: @@ -52,7 +52,7 @@ repos: # yamllint - repo: https://github.com/adrienverge/yamllint.git - rev: v1.33.0 + rev: v1.35.1 hooks: - id: yamllint args: [--strict, -c, .yamllint.yml] @@ -67,7 +67,7 @@ repos: # mypy - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 + rev: v1.10.0 hooks: - id: mypy # Avoid the conflict between mne/__init__.py and mne/__init__.pyi by ignoring the former @@ -84,4 +84,4 @@ repos: files: ^(?!doc/use\.rst$).*\.(rst|inc)$ ci: - autofix_prs: false + autofix_prs: true diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index 7b31bea3f..224f9fbc8 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -25,6 +25,10 @@ Changelog argument. By `Adam Li`_ (:pr:`#274`) - |API| Removed all instances of ``FeatureImportanceForestClassifier`` and outdated MIGHT code. By `Adam Li`_ (:pr:`#274`) +- |Fix| Fixed a bug in the ``sktree.HonestForestClassifier`` where posteriors + estimated on oob samples were biased when there was a low number of samples + due to imbalance in the classes when ``bootstrap=True``. + By `Adam Li`_ (:pr:`#283`) Code and Documentation Contributors ----------------------------------- diff --git a/sktree/cv.py b/sktree/cv.py deleted file mode 100644 index cd38e7793..000000000 --- a/sktree/cv.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Any, List, Tuple - -from numpy.typing import ArrayLike -from sklearn.base import clone -from sklearn.model_selection import BaseCrossValidator, StratifiedKFold - -from sktree._lib.sklearn.ensemble._forest import BaseForest - - -def cross_fit_forest( - forest: BaseForest, - X: ArrayLike, - y: ArrayLike, - cv: BaseCrossValidator = None, - n_splits: int = 5, - random_state=None, -) -> Tuple[List[Any], List[ArrayLike]]: - """Perform cross-fitting of a forest estimator. - - Parameters - ---------- - forest : BaseForest - Forest. - X : ArrayLike of shape (n_samples, n_features) - Input data. - y : ArrayLike of shape (n_samples, [n_outputs]) - Target data. - cv : BaseCrossValidator, optional - Cross validation object, by default None, which defaults to - :class:`sklearn.model_selection.StratifiedKFold`. - n_splits : int, optional - Number of folds to generate, by default 5. - random_state : int, optional - Random seed. - - Returns - ------- - fitted_forests : List[BaseForest] - List of fitted forests. - test_indices : List[ArrayLike] - List of test indices over ``n_samples``. - """ - if cv is None: - cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True) - - fitted_forests = [] - test_indices = [] - for train_index, test_index in cv.split(X, y): - new_forest = clone(forest) - new_forest.fit(X[train_index], y[train_index]) - - fitted_forests.append(new_forest) - test_indices.append(test_index) - - return fitted_forests, test_indices diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py index 1fcaa1d1d..3da38a44b 100644 --- a/sktree/datasets/hyppo.py +++ b/sktree/datasets/hyppo.py @@ -388,13 +388,11 @@ def make_trunk_mixture_classification( mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix]) # type: ignore norm_params = [[mu_0_vec, cov], [mu_1_vec, cov]] - X_mixture = np.fromiter( - ( - rng_children.multivariate_normal(*(norm_params[i]), size=1, method=method) - for i, rng_children in zip(mixture_idx, rng.spawn(n_samples // 2)) - ), - dtype=np.dtype((float, n_informative)), - ) + dim_sample = len(norm_params[0][0]) # Dimensionality of the samples + X_mixture = np.empty((n_samples // 2, dim_sample)) # Pre-allocate array for samples + for idx, (i, rng_child) in enumerate(zip(mixture_idx, rng.spawn(n_samples // 2))): + mean, cov = norm_params[i] + X_mixture[idx, :] = rng_child.multivariate_normal(mean, cov, size=1, method=method) # create new generator instance to ensure reproducibility with multiple runs with the same seed rng_F = np.random.default_rng(seed=seed) @@ -474,6 +472,7 @@ def make_trunk_classification( Either 'ma', or 'ar'. return_params : bool, optional Whether or not to return the distribution parameters of the classes normal distributions. + Default false. scaling_factor : float, optional The scaling factor for the covariance matrix. By default 1. seed : int, optional diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py index 484fca881..2ea11ea93 100644 --- a/sktree/ensemble/_honest_forest.py +++ b/sktree/ensemble/_honest_forest.py @@ -9,8 +9,8 @@ from joblib import Parallel, delayed from sklearn.base import _fit_context, clone from sklearn.ensemble._base import _partition_estimators, _set_random_states -from sklearn.utils import compute_sample_weight -from sklearn.utils._param_validation import Interval, RealNotInt +from sklearn.utils import compute_sample_weight, resample +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils.validation import check_is_fitted from .._lib.sklearn.ensemble._forest import ForestClassifier @@ -31,41 +31,67 @@ def _parallel_build_trees( n_samples_bootstrap=None, missing_values_in_feature_mask=None, classes=None, + stratify=False, ): """ Private function used to fit a single tree in parallel. - XXX: this is copied over from scikit-learn and modified to allow sampling with + XXX: + 1. this is copied over from scikit-learn and modified to allow sampling with and without replacement given ``bootstrap``. + + 2. Overrides the scikit-learn implementation to allow for stratification during bootstrapping + via the `stratify` parameter. """ if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) - n_samples = X.shape[0] - if sample_weight is None: - curr_sample_weight = np.ones((n_samples,), dtype=np.float64) + if bootstrap: + n_samples = X.shape[0] + if sample_weight is None: + curr_sample_weight = np.ones((n_samples,), dtype=np.float64) + else: + curr_sample_weight = sample_weight.copy() + + if stratify: + indices = resample( + np.arange(n_samples), + n_samples=n_samples_bootstrap, + stratify=y, + replace=True, + random_state=tree.random_state, + ) + else: + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap, bootstrap=bootstrap + ) + sample_counts = np.bincount(indices, minlength=n_samples) + curr_sample_weight *= sample_counts + + if class_weight == "subsample": + with catch_warnings(): + simplefilter("ignore", DeprecationWarning) + curr_sample_weight *= compute_sample_weight("auto", y, indices=indices) + elif class_weight == "balanced_subsample": + curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) + + tree._fit( + X, + y, + sample_weight=curr_sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) else: - curr_sample_weight = sample_weight.copy() - - indices = _generate_sample_indices(tree.random_state, n_samples, n_samples_bootstrap, bootstrap) - sample_counts = np.bincount(indices, minlength=n_samples) - curr_sample_weight *= sample_counts - - if class_weight == "subsample": - with catch_warnings(): - simplefilter("ignore", DeprecationWarning) - curr_sample_weight *= compute_sample_weight("auto", y, indices=indices) - elif class_weight == "balanced_subsample": - curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) - - tree._fit( - X, - y, - sample_weight=curr_sample_weight, - check_input=False, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) + tree._fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) return tree @@ -254,6 +280,8 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin): stratify : bool Whether or not to stratify sample when considering structure and leaf indices. + This will also stratify samples when bootstrap sampling is used. For more + information, see :func:`sklearn.utils.resample`. By default False. **tree_estimator_params : dict @@ -389,6 +417,11 @@ class labels (multi-output problem). Interval(RealNotInt, 0.0, None, closed="right"), Interval(Integral, 1, None, closed="left"), ] + _parameter_constraints["honest_fraction"] = [Interval(RealNotInt, 0.0, 1.0, closed="both")] + _parameter_constraints["honest_prior"] = [ + StrOptions({"empirical", "uniform", "ignore"}), + ] + _parameter_constraints["stratify"] = ["boolean"] def __init__( self, @@ -515,6 +548,55 @@ def fit(self, X, y, sample_weight=None, classes=None, **fit_params): return self + def _construct_trees( + self, + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, + ): + """Override construction of trees to allow stratification during bootstrapping.""" + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + stratify=self.stratify, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + def _make_estimator(self, append=True, random_state=None): """Make and configure a copy of the `estimator_` attribute. diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index a03a7dd73..bbc20d032 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -307,6 +307,9 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs): ---------- est : Forest The type of forest to use. Must be enabled with ``bootstrap=True``. + The forest should have either ``oob_samples_`` or ``estimators_samples_`` + property defined, which will be used to compute the out of bag samples + per tree. X : ArrayLike of shape (n_samples, n_features) Data. y : ArrayLike of shape (n_samples, n_outputs) @@ -314,7 +317,7 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs): verbose : bool, optional Verbosity, by default False. **est_kwargs : dict, optional - Additional keyword arguments to pass to the forest estimator. + Additional keyword arguments to pass to the forest estimator ``fit`` function. Returns ------- @@ -351,10 +354,22 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs): all_proba = np.full( (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64 ) - Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")( - delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock) - for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_)) - ) + if hasattr(est, "oob_samples_"): + Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")( + delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock) + for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_)) + ) + else: + inbag_samples = est.estimators_samples_ + all_samples = np.arange(X.shape[0]) + oob_samples_list = [ + np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples)) + ] + Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")( + delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock) + for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list)) + ) + return est, all_proba @@ -365,6 +380,7 @@ def build_cv_forest( cv=5, test_size=0.2, verbose=False, + return_indices=False, seed=None, ): """Build a hypothesis testing forest using using cross-validation. @@ -383,6 +399,8 @@ def build_cv_forest( Proportion of samples per tree to use for the test set, by default 0.2. verbose : bool, optional Verbosity, by default False. + return_indices : bool, optional + Whether or not to return the train and test indices, by default False. seed : int, optional Random seed, by default None. @@ -393,6 +411,7 @@ def build_cv_forest( all_proba_list : list of ArrayLike of shape (n_estimators, n_samples, n_outputs) The predicted posterior probabilities for each estimator on their out of bag samples. Length of list is equal to the number of splits. + tr """ X = X.astype(np.float32) if cv is not None: @@ -415,6 +434,9 @@ def build_cv_forest( est_list = [] all_proba_list = [] for isplit in range(n_splits): + # clone the estimator to remove all fitted attributes + est = clone(est) + X_train, y_train = X[train_idx_list[isplit], :], y[train_idx_list[isplit]] # X_test = X[test_idx_list[isplit], :] @@ -447,4 +469,7 @@ def build_cv_forest( all_proba_list.append(posterior_arr) est_list.append(est) + if return_indices: + return est_list, all_proba_list, train_idx_list, test_idx_list + return est_list, all_proba_list diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index e20de8ea8..f4b397ffc 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -4,10 +4,12 @@ from numpy.testing import assert_almost_equal, assert_array_equal from sklearn import datasets -from sktree import HonestForestClassifier +from sktree import HonestForestClassifier, RandomForestClassifier from sktree.stats import ( PermutationHonestForestClassifier, build_coleman_forest, + build_cv_forest, + build_oob_forest, build_permutation_forest, ) from sktree.tree import MultiViewDecisionTreeClassifier @@ -32,7 +34,7 @@ def test_small_dataset_independent(seed): # XXX: unit test interestingly does not work for MI, possibly due to bias bootstrap = True n_samples = 100 - n_features = 1000 + n_features = 500 n_estimators = 100 rng = np.random.default_rng(seed) @@ -65,8 +67,8 @@ def test_small_dataset_independent(seed): clf, perm_clf, X, y, n_repeats=1000, metric="s@98", return_posteriors=False, seed=seed ) print(result.observe_stat, result.permuted_stat, result.pvalue, result.observe_test_stat) - assert_almost_equal(np.abs(result.observe_test_stat), 0.0, decimal=1) assert result.pvalue > 0.05, f"{result.pvalue}" + assert_almost_equal(np.abs(result.observe_test_stat), 0.0, decimal=1) # now permute only some of the features feature_set_ends = [3, n_features] @@ -392,3 +394,111 @@ def test_build_permutation_forest(): ) assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}" + + +def test_build_oob_honest_forest(): + bootstrap = True + max_samples = 1.6 + + n_estimators = 100 + est = HonestForestClassifier( + n_estimators=n_estimators, + random_state=0, + bootstrap=bootstrap, + max_samples=max_samples, + honest_fraction=0.5, + stratify=True, + ) + X = rng.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = np.array([0, 1] * 50) + samples = np.arange(len(y)) + + est, proba = build_oob_forest(est, X, y) + + structure_samples = est.structure_indices_ + leaf_samples = est.honest_indices_ + oob_samples = est.oob_samples_ + for tree_idx in range(est.n_estimators): + assert len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) + len( + oob_samples[tree_idx] + ) == len( + samples + ), f"{tree_idx} {len(structure_samples[tree_idx])} {len(leaf_samples[tree_idx])} {len(samples)}" + + +def test_build_oob_random_forest(): + """Test building oob random forest.""" + bootstrap = True + max_samples = 1.0 + + n_estimators = 100 + est = RandomForestClassifier( + n_estimators=n_estimators, random_state=0, bootstrap=bootstrap, max_samples=max_samples + ) + X = rng.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = np.array([0, 1] * 50) + samples = np.arange(len(y)) + + est, proba = build_oob_forest(est, X, y) + + structure_samples = est.estimators_samples_ + all_samples = np.arange(X.shape[0]) + oob_samples_list = [ + np.setdiff1d(all_samples, structure_samples[i]) for i in range(len(structure_samples)) + ] + for tree_idx in range(est.n_estimators): + assert len(np.unique(structure_samples[tree_idx])) + len(oob_samples_list[tree_idx]) == len( + samples + ), f"{tree_idx} {len(structure_samples[tree_idx])} + {len(oob_samples_list[tree_idx])} != {len(samples)}" + + +@pytest.mark.parametrize("bootstrap, max_samples", [(True, 1.6), (False, None)]) +def test_build_cv_honest_forest(bootstrap, max_samples): + n_estimators = 100 + est = HonestForestClassifier( + n_estimators=n_estimators, + random_state=0, + bootstrap=bootstrap, + max_samples=max_samples, + honest_fraction=0.5, + stratify=True, + ) + X = rng.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = np.array([0, 1] * 50) + samples = np.arange(len(y)) + + est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest( + est, + X, + y, + return_indices=True, + seed=seed, + cv=3, + ) + + assert isinstance(est_list, list) + assert isinstance(proba_list, list) + + for est, proba, train_idx, test_idx in zip(est_list, proba_list, train_idx_list, test_idx_list): + assert len(train_idx) + len(test_idx) == len(samples) + structure_samples = est.structure_indices_ + leaf_samples = est.honest_indices_ + + if not bootstrap: + oob_samples = [[] for _ in range(est.n_estimators)] + else: + oob_samples = est.oob_samples_ + + # compared to oob samples, now the train samples are comprised of the entire dataset + # seen over the entire forest. The test dataset is completely disjoint + for tree_idx in range(est.n_estimators): + n_samples_in_tree = len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) + assert n_samples_in_tree + len(oob_samples[tree_idx]) == len(train_idx), ( + f"For tree: " + f"{tree_idx} {len(structure_samples[tree_idx])} + " + f"{len(leaf_samples[tree_idx])} + {len(oob_samples[tree_idx])} " + f"!= {len(train_idx)} {len(test_idx)}" + ) diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py index 31232a38e..3610475d6 100644 --- a/sktree/tests/test_honest_forest.py +++ b/sktree/tests/test_honest_forest.py @@ -9,7 +9,7 @@ from sklearn.utils.estimator_checks import parametrize_with_checks from sktree._lib.sklearn.tree import DecisionTreeClassifier -from sktree.datasets import make_quadratic_classification +from sktree.datasets import make_quadratic_classification, make_trunk_classification from sktree.ensemble import HonestForestClassifier from sktree.stats.utils import _mutual_information from sktree.tree import ( @@ -20,6 +20,8 @@ CLF_CRITERIONS = ("gini", "entropy") +seed = 12345 + # also load the iris dataset # and randomly permute it iris = datasets.load_iris() @@ -51,7 +53,7 @@ def test_toy_accuracy(): @pytest.mark.parametrize("criterion", ["gini", "entropy"]) @pytest.mark.parametrize("max_features", [None, 2]) -@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"]) +@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"]) @pytest.mark.parametrize( "estimator", [ @@ -71,26 +73,22 @@ def test_iris(criterion, max_features, honest_prior, estimator): honest_prior=honest_prior, tree_estimator=estimator, ) - if honest_prior == "error": - with pytest.raises(ValueError, match="honest_prior error not a valid input."): - clf.fit(iris.data, iris.target) - else: - clf.fit(iris.data, iris.target) - score = accuracy_score(clf.predict(iris.data), iris.target) + clf.fit(iris.data, iris.target) + score = accuracy_score(clf.predict(iris.data), iris.target) - assert ( - score > 0.5 and score < 1.0 - ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score) + assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( + "HForest", criterion, score + ) - score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1)) - assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( - "HForest", criterion, score - ) + score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1)) + assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format( + "HForest", criterion, score + ) @pytest.mark.parametrize("criterion", ["gini", "entropy"]) @pytest.mark.parametrize("max_features", [None, 2]) -@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"]) +@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"]) @pytest.mark.parametrize( "estimator", [ @@ -120,24 +118,16 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator): X = iris.data y = np.stack((iris.target, second_y[perm])).T - if honest_prior == "error": - with pytest.raises(ValueError, match="honest_prior error not a valid input."): - clf.fit(X, y) + clf.fit(X, y) + score = r2_score(clf.predict(X), y) + if honest_prior == "ignore": + assert ( + score > 0.4 and score < 1.0 + ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score) else: - clf.fit(X, y) - score = r2_score(clf.predict(X), y) - if honest_prior == "ignore": - assert ( - score > 0.4 and score < 1.0 - ), "Failed with {0}, criterion = {1} and score = {2}".format( - "HForest", criterion, score - ) - else: - assert ( - score > 0.9 and score < 1.0 - ), "Failed with {0}, criterion = {1} and score = {2}".format( - "HForest", criterion, score - ) + assert ( + score > 0.9 and score < 1.0 + ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score) def test_max_samples(): @@ -163,6 +153,37 @@ def test_max_samples(): uf = uf.fit(X, y) +def test_honest_forest_samples(): + bootstrap = True + max_samples = 1.6 + + n_estimators = 5 + est = HonestForestClassifier( + n_estimators=n_estimators, + random_state=0, + bootstrap=bootstrap, + max_samples=max_samples, + honest_fraction=0.5, + stratify=True, + ) + X = rng.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = [0, 1] * 50 + samples = np.arange(len(y)) + + est.fit(X, y) + + structure_samples = est.structure_indices_ + leaf_samples = est.honest_indices_ + oob_samples = est.oob_samples_ + for tree_idx in range(est.n_estimators): + assert len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) + len( + oob_samples[tree_idx] + ) == len( + samples + ), f"{tree_idx} {len(structure_samples[tree_idx])} {len(leaf_samples[tree_idx])} {len(samples)}" + + @pytest.mark.parametrize("max_samples", [0.75, 1.0]) def test_honest_forest_has_deterministic_sampling_for_oob_structure_and_leaves(max_samples): """Test that honest forest can produce the oob, structure and leaf-node samples. @@ -489,3 +510,35 @@ def test_honest_forest_with_tree_estimator_params(tree, tree_kwargs): ) else: assert getattr(clf, attr_name) == getattr(clf.estimators_[0], attr_name) + + +def test_honest_forest_posteriors_on_independent(): + """Test regression from :gh:`283`. + + Posteriors were biased when the classes were independent and using the bootstrap and oob sample + technique to estimate the final population test statistic. This resulted in a biased estimate + of the AUC score. Stratification of the bootstrapping samples was the solution to this problem. + """ + scores = [] + for idx in range(5): + # create a dataset with overlapping classes + X, y = make_trunk_classification( + n_samples=128, n_dim=4096, n_informative=1, mu_0=0.0, mu_1=0.0, seed=idx + ) + clf = HonestForestClassifier( + n_estimators=100, + random_state=idx, + bootstrap=True, + max_samples=1.6, + n_jobs=-1, + honest_prior="ignore", + stratify=True, + ) + clf.fit(X, y) + + oob_posteriors = clf.predict_proba_per_tree(X, clf.oob_samples_) + auc_score = roc_auc_score(y, np.nanmean(oob_posteriors, axis=0)[:, 1]) + scores.append(auc_score) + + # Without stratification, this test should fail + assert np.mean(scores) > 0.49 and np.mean(scores) < 0.51, f"{np.mean(scores)} {scores}" diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py index a3b6d0813..61ae1da31 100644 --- a/sktree/tree/_honest_tree.py +++ b/sktree/tree/_honest_tree.py @@ -1,6 +1,6 @@ -# Authors: Ronan Perry, Sambit Panda, Haoyin Xu # Adopted from: https://github.com/neurodata/honest-forests + import numpy as np from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context, clone from sklearn.model_selection import StratifiedShuffleSplit @@ -446,7 +446,6 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): replace=False, ) self.honest_indices_ = np.setdiff1d(nonzero_indices, self.structure_indices_) - _sample_weight[self.honest_indices_] = 0 self.estimator_.partial_fit( @@ -458,46 +457,8 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): ) self._inherit_estimator_attributes() - # update the number of classes, unsplit - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) - check_classification_targets(y) - y = np.copy(y) # .astype(int) - - # Normally called by super - X = self.estimator_._validate_X_predict(X, True) - - # Fit leaves using other subsample - honest_leaves = self.tree_.apply(X[self.honest_indices_]) - - # preserve from underlying tree - self._tree_classes_ = self.classes_ - self._tree_n_classes_ = self.n_classes_ - self.classes_ = [] - self.n_classes_ = [] - self.empirical_prior_ = [] - - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - self.empirical_prior_.append( - np.bincount(y_encoded[:, k], minlength=classes_k.shape[0]) / y.shape[0] - ) - y = y_encoded - - # y-encoded ensures that y values match the indices of the classes - self._set_leaf_nodes(honest_leaves, y) - - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if self.n_outputs_ == 1: - self.n_classes_ = self.n_classes_[0] - self.classes_ = self.classes_[0] - self.empirical_prior_ = self.empirical_prior_[0] - y = y[:, 0] + # set leaf nodes + self._fit_leaves(X, y, sample_weight=_sample_weight) return self @@ -511,7 +472,6 @@ def _partition_honest_indices(self, y, sample_weight): _sample_weight = np.array(sample_weight) nonzero_indices = np.where(_sample_weight > 0)[0] - # sample the structure indices if self.stratify: ss = StratifiedShuffleSplit( @@ -663,15 +623,16 @@ def _fit( else: sample_weight_leaves = np.array(sample_weight) sample_weight_leaves[not_honest_mask] = 0 - self._fit_leaves(X, y, sample_weight=sample_weight_leaves) - return self - - def _fit_leaves(self, X, y, sample_weight): - nonzero_indices = np.where(sample_weight > 0)[0] + # determine the honest indices using the sample weight + nonzero_indices = np.where(sample_weight_leaves > 0)[0] # sample the structure indices self.honest_indices_ = nonzero_indices + self._fit_leaves(X, y, sample_weight=sample_weight_leaves) + return self + + def _fit_leaves(self, X, y, sample_weight): # update the number of classes, unsplit if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -683,9 +644,6 @@ def _fit_leaves(self, X, y, sample_weight): # Normally called by super X = self.estimator_._validate_X_predict(X, True) - # Fit leaves using other subsample - honest_leaves = self.tree_.apply(X[self.honest_indices_]) - # preserve from underlying tree # https://github.com/scikit-learn/scikit-learn/blob/1.0.X/sklearn/tree/_classes.py#L202 self._tree_classes_ = self.classes_ @@ -703,18 +661,26 @@ def _fit_leaves(self, X, y, sample_weight): np.bincount(y_encoded[:, k], minlength=classes_k.shape[0]) / y.shape[0] ) y = y_encoded + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - # y-encoded ensures that y values match the indices of the classes - self._set_leaf_nodes(honest_leaves, y) + # XXX: implement honest pruning + honest_method = "apply" + if honest_method == "apply": + # Fit leaves using other subsample + honest_leaves = self.tree_.apply(X[self.honest_indices_]) + + # y-encoded ensures that y values match the indices of the classes + self._set_leaf_nodes(honest_leaves, y, sample_weight) + elif honest_method == "prune": + raise NotImplementedError("Pruning is not yet implemented.") - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] self.empirical_prior_ = self.empirical_prior_[0] y = y[:, 0] - def _set_leaf_nodes(self, leaf_ids, y): + def _set_leaf_nodes(self, leaf_ids, y, sample_weight): """Traverse the already built tree with X and set leaf nodes with y. tree_.value has shape (n_nodes, n_outputs, max_n_classes), where @@ -725,8 +691,12 @@ def _set_leaf_nodes(self, leaf_ids, y): classes are ordered by their index in the tree_.value array. """ self.tree_.value[:, :, :] = 0 - for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, :]): - self.tree_.value[leaf_id][:, yval] += 1 + + # apply sample-weight to the leaf nodes + for leaf_id, yval, y_weight in zip( + leaf_ids, y[self.honest_indices_, :], sample_weight[self.honest_indices_] + ): + self.tree_.value[leaf_id][:, yval] += y_weight def _inherit_estimator_attributes(self): """Initialize necessary attributes from the provided tree estimator""" @@ -750,6 +720,8 @@ def _inherit_estimator_attributes(self): def _empty_leaf_correction(self, proba, pos=0): """Leaves with empty posteriors are assigned values. + This is called only during prediction. + The posteriors are corrected according to the honest prior. In multi-output cases, the posterior corrections only correspond to the respective y dimension, indicated by the position param pos. @@ -764,8 +736,6 @@ def _empty_leaf_correction(self, proba, pos=0): proba[zero_mask] = 1 / self.n_classes_[pos] elif self.honest_prior == "ignore": proba[zero_mask] = np.nan - else: - raise ValueError(f"honest_prior {self.honest_prior} not a valid input.") else: if self.honest_prior == "empirical": proba[zero_mask] = self.empirical_prior_ @@ -773,8 +743,6 @@ def _empty_leaf_correction(self, proba, pos=0): proba[zero_mask] = 1 / self.n_classes_ elif self.honest_prior == "ignore": proba[zero_mask] = np.nan - else: - raise ValueError(f"honest_prior {self.honest_prior} not a valid input.") return proba def predict_proba(self, X, check_input=True): diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py index 907c386f1..4815b3cb2 100644 --- a/sktree/tree/tests/test_honest_tree.py +++ b/sktree/tree/tests/test_honest_tree.py @@ -115,6 +115,7 @@ def test_honest_tree_with_tree_estimator_params(tree, tree_kwargs): ], ) def test_impute_posteriors(honest_prior, val): + # XXX: test could be improved as we don't actually test proper behavior of the imputation np.random.seed(0) X = np.random.normal(0, 1, (100, 2)) y = [0] * 75 + [1] * 25 @@ -123,9 +124,14 @@ def test_impute_posteriors(honest_prior, val): y_proba = clf.predict_proba(X) if np.isnan(val): - assert len(np.where(np.isnan(y_proba[:, 0]))[0]) > 50, f"Failed with {honest_prior}" + assert ( + len(np.where(np.isnan(y_proba[:, 0]))[0]) > 50 + ), f"Failed with {honest_prior}, {len(np.where(np.isnan(y_proba[:, 0]))[0])}" + # assert len(np.where(np.isnan(y_proba[:, 0]))[0]) < 60, f"Failed with {honest_prior}, {len(np.where(np.isnan(y_proba[:, 0]))[0])}" else: - assert len(np.where(y_proba[:, 0] == val)[0]) > 50, f"Failed with {honest_prior}" + assert ( + len(np.where(y_proba[:, 0] == val)[0]) > 50 + ), f"Failed with {honest_prior}, {len(np.where(y_proba[:, 0] == val)[0])}" def test_increasing_leaves():