diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index e9b5f5105..aa63cda74 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,23 +1,30 @@
-<!-- To ensure we can review your pull request promptly please complete this template entirely. -->
+<!--
+Thanks for contributing a pull request! Please ensure you have taken a look at
+the contribution guidelines from scikit-learn: https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md
+-->
 
-<!-- Please reference the issue number here. You can replace "Fixes" with "Closes" if it makes more sense. -->
-Fixes #
+#### Reference Issues/PRs
+<!--
+Example: Fixes #1234. See also #3456.
+Please use keywords (e.g., Fixes) to create link to the issues or pull requests
+you resolved, so that they will automatically be closed when your pull request
+is merged. See https://github.com/blog/1506-closing-issues-via-pull-requests
+-->
 
-Changes proposed in this pull request:
-<!-- Please list all changes/additions here. -->
--
 
-## Before submitting
+#### What does this implement/fix? Explain your changes.
 
-<!-- Please complete this checklist BEFORE submitting your PR to speed along the review process. -->
-- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/py-why/pywhy-graphs/blob/main/CONTRIBUTING.md#making-a-pull-request)
-    section of the `CONTRIBUTING` docs.
-- [ ] I've updated or added any relevant docstrings following the syntax described in the
-    [Writing docstrings](https://github.com/py-why/pywhy-graphs/blob/main/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs.
-- [ ] If this PR fixes a bug, I've added a test that will fail without my fix.
-- [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality.
 
-## After submitting
+#### Any other comments?
 
-<!-- Please complete this checklist AFTER submitting your PR to speed along the review process. -->
-- [ ] All GitHub Actions jobs for my pull request have passed.
+
+<!--
+Please be aware that we are a loose team of volunteers so patience is
+necessary; assistance handling other issues is very welcome. We value
+all user contributions, no matter how minor they are.
+
+See https://github.com/neurodata/scikit-tree/blob/main/CONTRIBUTING.md for more
+information on contributing.
+
+Thanks for contributing!
+-->
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 22194c830..d378f05a3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/psf/black
-    rev: 24.1.1
+    rev: 24.4.2
     hooks:
       - id: black
         args: [--quiet]
@@ -15,14 +15,14 @@ repos:
         types: [cython]
 
   - repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.0
+    rev: v0.16.2
     hooks:
       - id: cython-lint
       - id: double-quote-cython-strings
 
   # Ruff sktree
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.2.0
+    rev: v0.4.10
     hooks:
       - id: ruff
         name: ruff sktree
@@ -31,7 +31,7 @@ repos:
 
   # Ruff tutorials and examples
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.2.0
+    rev: v0.4.10
     hooks:
       - id: ruff
         name: ruff tutorials and examples
@@ -42,7 +42,7 @@ repos:
 
   # Codespell
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         additional_dependencies:
@@ -52,7 +52,7 @@ repos:
 
   # yamllint
   - repo: https://github.com/adrienverge/yamllint.git
-    rev: v1.33.0
+    rev: v1.35.1
     hooks:
       - id: yamllint
         args: [--strict, -c, .yamllint.yml]
@@ -67,7 +67,7 @@ repos:
 
   # mypy
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.8.0
+    rev: v1.10.0
     hooks:
       - id: mypy
         # Avoid the conflict between mne/__init__.py and mne/__init__.pyi by ignoring the former
@@ -84,4 +84,4 @@ repos:
         files: ^(?!doc/use\.rst$).*\.(rst|inc)$
 
 ci:
-  autofix_prs: false
+  autofix_prs: true
diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst
index 7b31bea3f..224f9fbc8 100644
--- a/doc/whats_new/v0.8.rst
+++ b/doc/whats_new/v0.8.rst
@@ -25,6 +25,10 @@ Changelog
     argument. By `Adam Li`_ (:pr:`#274`)
 - |API| Removed all instances of ``FeatureImportanceForestClassifier`` and outdated
     MIGHT code. By `Adam Li`_ (:pr:`#274`)
+- |Fix| Fixed a bug in the ``sktree.HonestForestClassifier`` where posteriors
+    estimated on oob samples were biased when there was a low number of samples
+    due to imbalance in the classes when ``bootstrap=True``.
+    By `Adam Li`_ (:pr:`#283`)
 
 Code and Documentation Contributors
 -----------------------------------
diff --git a/sktree/cv.py b/sktree/cv.py
deleted file mode 100644
index cd38e7793..000000000
--- a/sktree/cv.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from typing import Any, List, Tuple
-
-from numpy.typing import ArrayLike
-from sklearn.base import clone
-from sklearn.model_selection import BaseCrossValidator, StratifiedKFold
-
-from sktree._lib.sklearn.ensemble._forest import BaseForest
-
-
-def cross_fit_forest(
-    forest: BaseForest,
-    X: ArrayLike,
-    y: ArrayLike,
-    cv: BaseCrossValidator = None,
-    n_splits: int = 5,
-    random_state=None,
-) -> Tuple[List[Any], List[ArrayLike]]:
-    """Perform cross-fitting of a forest estimator.
-
-    Parameters
-    ----------
-    forest : BaseForest
-        Forest.
-    X : ArrayLike of shape (n_samples, n_features)
-        Input data.
-    y : ArrayLike of shape (n_samples, [n_outputs])
-        Target data.
-    cv : BaseCrossValidator, optional
-        Cross validation object, by default None, which defaults to
-        :class:`sklearn.model_selection.StratifiedKFold`.
-    n_splits : int, optional
-        Number of folds to generate, by default 5.
-    random_state : int, optional
-        Random seed.
-
-    Returns
-    -------
-    fitted_forests : List[BaseForest]
-        List of fitted forests.
-    test_indices : List[ArrayLike]
-        List of test indices over ``n_samples``.
-    """
-    if cv is None:
-        cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
-
-    fitted_forests = []
-    test_indices = []
-    for train_index, test_index in cv.split(X, y):
-        new_forest = clone(forest)
-        new_forest.fit(X[train_index], y[train_index])
-
-        fitted_forests.append(new_forest)
-        test_indices.append(test_index)
-
-    return fitted_forests, test_indices
diff --git a/sktree/datasets/hyppo.py b/sktree/datasets/hyppo.py
index 1fcaa1d1d..3da38a44b 100644
--- a/sktree/datasets/hyppo.py
+++ b/sktree/datasets/hyppo.py
@@ -388,13 +388,11 @@ def make_trunk_mixture_classification(
     mixture_idx = rng.choice(2, n_samples // 2, replace=True, shuffle=True, p=[mix, 1 - mix])  # type: ignore
 
     norm_params = [[mu_0_vec, cov], [mu_1_vec, cov]]
-    X_mixture = np.fromiter(
-        (
-            rng_children.multivariate_normal(*(norm_params[i]), size=1, method=method)
-            for i, rng_children in zip(mixture_idx, rng.spawn(n_samples // 2))
-        ),
-        dtype=np.dtype((float, n_informative)),
-    )
+    dim_sample = len(norm_params[0][0])  # Dimensionality of the samples
+    X_mixture = np.empty((n_samples // 2, dim_sample))  # Pre-allocate array for samples
+    for idx, (i, rng_child) in enumerate(zip(mixture_idx, rng.spawn(n_samples // 2))):
+        mean, cov = norm_params[i]
+        X_mixture[idx, :] = rng_child.multivariate_normal(mean, cov, size=1, method=method)
 
     # create new generator instance to ensure reproducibility with multiple runs with the same seed
     rng_F = np.random.default_rng(seed=seed)
@@ -474,6 +472,7 @@ def make_trunk_classification(
         Either 'ma', or 'ar'.
     return_params : bool, optional
         Whether or not to return the distribution parameters of the classes normal distributions.
+        Default false.
     scaling_factor : float, optional
         The scaling factor for the covariance matrix. By default 1.
     seed : int, optional
diff --git a/sktree/ensemble/_honest_forest.py b/sktree/ensemble/_honest_forest.py
index 484fca881..2ea11ea93 100644
--- a/sktree/ensemble/_honest_forest.py
+++ b/sktree/ensemble/_honest_forest.py
@@ -9,8 +9,8 @@
 from joblib import Parallel, delayed
 from sklearn.base import _fit_context, clone
 from sklearn.ensemble._base import _partition_estimators, _set_random_states
-from sklearn.utils import compute_sample_weight
-from sklearn.utils._param_validation import Interval, RealNotInt
+from sklearn.utils import compute_sample_weight, resample
+from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions
 from sklearn.utils.validation import check_is_fitted
 
 from .._lib.sklearn.ensemble._forest import ForestClassifier
@@ -31,41 +31,67 @@ def _parallel_build_trees(
     n_samples_bootstrap=None,
     missing_values_in_feature_mask=None,
     classes=None,
+    stratify=False,
 ):
     """
     Private function used to fit a single tree in parallel.
 
-    XXX: this is copied over from scikit-learn and modified to allow sampling with
+    XXX:
+    1. this is copied over from scikit-learn and modified to allow sampling with
     and without replacement given ``bootstrap``.
+
+    2. Overrides the scikit-learn implementation to allow for stratification during bootstrapping
+    via the `stratify` parameter.
     """
     if verbose > 1:
         print("building tree %d of %d" % (tree_idx + 1, n_trees))
 
-    n_samples = X.shape[0]
-    if sample_weight is None:
-        curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
+    if bootstrap:
+        n_samples = X.shape[0]
+        if sample_weight is None:
+            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
+        else:
+            curr_sample_weight = sample_weight.copy()
+
+        if stratify:
+            indices = resample(
+                np.arange(n_samples),
+                n_samples=n_samples_bootstrap,
+                stratify=y,
+                replace=True,
+                random_state=tree.random_state,
+            )
+        else:
+            indices = _generate_sample_indices(
+                tree.random_state, n_samples, n_samples_bootstrap, bootstrap=bootstrap
+            )
+        sample_counts = np.bincount(indices, minlength=n_samples)
+        curr_sample_weight *= sample_counts
+
+        if class_weight == "subsample":
+            with catch_warnings():
+                simplefilter("ignore", DeprecationWarning)
+                curr_sample_weight *= compute_sample_weight("auto", y, indices=indices)
+        elif class_weight == "balanced_subsample":
+            curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices)
+
+        tree._fit(
+            X,
+            y,
+            sample_weight=curr_sample_weight,
+            check_input=False,
+            missing_values_in_feature_mask=missing_values_in_feature_mask,
+            classes=classes,
+        )
     else:
-        curr_sample_weight = sample_weight.copy()
-
-    indices = _generate_sample_indices(tree.random_state, n_samples, n_samples_bootstrap, bootstrap)
-    sample_counts = np.bincount(indices, minlength=n_samples)
-    curr_sample_weight *= sample_counts
-
-    if class_weight == "subsample":
-        with catch_warnings():
-            simplefilter("ignore", DeprecationWarning)
-            curr_sample_weight *= compute_sample_weight("auto", y, indices=indices)
-    elif class_weight == "balanced_subsample":
-        curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices)
-
-    tree._fit(
-        X,
-        y,
-        sample_weight=curr_sample_weight,
-        check_input=False,
-        missing_values_in_feature_mask=missing_values_in_feature_mask,
-        classes=classes,
-    )
+        tree._fit(
+            X,
+            y,
+            sample_weight=sample_weight,
+            check_input=False,
+            missing_values_in_feature_mask=missing_values_in_feature_mask,
+            classes=classes,
+        )
 
     return tree
 
@@ -254,6 +280,8 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin):
 
     stratify : bool
         Whether or not to stratify sample when considering structure and leaf indices.
+        This will also stratify samples when bootstrap sampling is used. For more
+        information, see :func:`sklearn.utils.resample`.
         By default False.
 
     **tree_estimator_params : dict
@@ -389,6 +417,11 @@ class labels (multi-output problem).
         Interval(RealNotInt, 0.0, None, closed="right"),
         Interval(Integral, 1, None, closed="left"),
     ]
+    _parameter_constraints["honest_fraction"] = [Interval(RealNotInt, 0.0, 1.0, closed="both")]
+    _parameter_constraints["honest_prior"] = [
+        StrOptions({"empirical", "uniform", "ignore"}),
+    ]
+    _parameter_constraints["stratify"] = ["boolean"]
 
     def __init__(
         self,
@@ -515,6 +548,55 @@ def fit(self, X, y, sample_weight=None, classes=None, **fit_params):
 
         return self
 
+    def _construct_trees(
+        self,
+        X,
+        y,
+        sample_weight,
+        random_state,
+        n_samples_bootstrap,
+        missing_values_in_feature_mask,
+        classes,
+        n_more_estimators,
+    ):
+        """Override construction of trees to allow stratification during bootstrapping."""
+        trees = [
+            self._make_estimator(append=False, random_state=random_state)
+            for i in range(n_more_estimators)
+        ]
+
+        # Parallel loop: we prefer the threading backend as the Cython code
+        # for fitting the trees is internally releasing the Python GIL
+        # making threading more efficient than multiprocessing in
+        # that case. However, for joblib 0.12+ we respect any
+        # parallel_backend contexts set at a higher level,
+        # since correctness does not rely on using threads.
+        trees = Parallel(
+            n_jobs=self.n_jobs,
+            verbose=self.verbose,
+            prefer="threads",
+        )(
+            delayed(_parallel_build_trees)(
+                t,
+                self.bootstrap,
+                X,
+                y,
+                sample_weight,
+                i,
+                len(trees),
+                verbose=self.verbose,
+                class_weight=self.class_weight,
+                n_samples_bootstrap=n_samples_bootstrap,
+                missing_values_in_feature_mask=missing_values_in_feature_mask,
+                classes=classes,
+                stratify=self.stratify,
+            )
+            for i, t in enumerate(trees)
+        )
+
+        # Collect newly grown trees
+        self.estimators_.extend(trees)
+
     def _make_estimator(self, append=True, random_state=None):
         """Make and configure a copy of the `estimator_` attribute.
 
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index a03a7dd73..bbc20d032 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -307,6 +307,9 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
     ----------
     est : Forest
         The type of forest to use. Must be enabled with ``bootstrap=True``.
+        The forest should have either ``oob_samples_`` or ``estimators_samples_``
+        property defined, which will be used to compute the out of bag samples
+        per tree.
     X : ArrayLike of shape (n_samples, n_features)
         Data.
     y : ArrayLike of shape (n_samples, n_outputs)
@@ -314,7 +317,7 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
     verbose : bool, optional
         Verbosity, by default False.
     **est_kwargs : dict, optional
-        Additional keyword arguments to pass to the forest estimator.
+        Additional keyword arguments to pass to the forest estimator ``fit`` function.
 
     Returns
     -------
@@ -351,10 +354,22 @@ def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
     all_proba = np.full(
         (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
     )
-    Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
-        delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
-        for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
-    )
+    if hasattr(est, "oob_samples_"):
+        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
+            delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
+            for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
+        )
+    else:
+        inbag_samples = est.estimators_samples_
+        all_samples = np.arange(X.shape[0])
+        oob_samples_list = [
+            np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
+        ]
+        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
+            delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
+            for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
+        )
+
     return est, all_proba
 
 
@@ -365,6 +380,7 @@ def build_cv_forest(
     cv=5,
     test_size=0.2,
     verbose=False,
+    return_indices=False,
     seed=None,
 ):
     """Build a hypothesis testing forest using using cross-validation.
@@ -383,6 +399,8 @@ def build_cv_forest(
         Proportion of samples per tree to use for the test set, by default 0.2.
     verbose : bool, optional
         Verbosity, by default False.
+    return_indices : bool, optional
+        Whether or not to return the train and test indices, by default False.
     seed : int, optional
         Random seed, by default None.
 
@@ -393,6 +411,7 @@ def build_cv_forest(
     all_proba_list : list of ArrayLike of shape (n_estimators, n_samples, n_outputs)
         The predicted posterior probabilities for each estimator on their
         out of bag samples. Length of list is equal to the number of splits.
+    tr
     """
     X = X.astype(np.float32)
     if cv is not None:
@@ -415,6 +434,9 @@ def build_cv_forest(
     est_list = []
     all_proba_list = []
     for isplit in range(n_splits):
+        # clone the estimator to remove all fitted attributes
+        est = clone(est)
+
         X_train, y_train = X[train_idx_list[isplit], :], y[train_idx_list[isplit]]
         # X_test = X[test_idx_list[isplit], :]
 
@@ -447,4 +469,7 @@ def build_cv_forest(
         all_proba_list.append(posterior_arr)
         est_list.append(est)
 
+    if return_indices:
+        return est_list, all_proba_list, train_idx_list, test_idx_list
+
     return est_list, all_proba_list
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index e20de8ea8..f4b397ffc 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -4,10 +4,12 @@
 from numpy.testing import assert_almost_equal, assert_array_equal
 from sklearn import datasets
 
-from sktree import HonestForestClassifier
+from sktree import HonestForestClassifier, RandomForestClassifier
 from sktree.stats import (
     PermutationHonestForestClassifier,
     build_coleman_forest,
+    build_cv_forest,
+    build_oob_forest,
     build_permutation_forest,
 )
 from sktree.tree import MultiViewDecisionTreeClassifier
@@ -32,7 +34,7 @@ def test_small_dataset_independent(seed):
     # XXX: unit test interestingly does not work for MI, possibly due to bias
     bootstrap = True
     n_samples = 100
-    n_features = 1000
+    n_features = 500
     n_estimators = 100
 
     rng = np.random.default_rng(seed)
@@ -65,8 +67,8 @@ def test_small_dataset_independent(seed):
         clf, perm_clf, X, y, n_repeats=1000, metric="s@98", return_posteriors=False, seed=seed
     )
     print(result.observe_stat, result.permuted_stat, result.pvalue, result.observe_test_stat)
-    assert_almost_equal(np.abs(result.observe_test_stat), 0.0, decimal=1)
     assert result.pvalue > 0.05, f"{result.pvalue}"
+    assert_almost_equal(np.abs(result.observe_test_stat), 0.0, decimal=1)
 
     # now permute only some of the features
     feature_set_ends = [3, n_features]
@@ -392,3 +394,111 @@ def test_build_permutation_forest():
     )
     assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}"
     assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}"
+
+
+def test_build_oob_honest_forest():
+    bootstrap = True
+    max_samples = 1.6
+
+    n_estimators = 100
+    est = HonestForestClassifier(
+        n_estimators=n_estimators,
+        random_state=0,
+        bootstrap=bootstrap,
+        max_samples=max_samples,
+        honest_fraction=0.5,
+        stratify=True,
+    )
+    X = rng.normal(0, 1, (100, 2))
+    X[:50] *= -1
+    y = np.array([0, 1] * 50)
+    samples = np.arange(len(y))
+
+    est, proba = build_oob_forest(est, X, y)
+
+    structure_samples = est.structure_indices_
+    leaf_samples = est.honest_indices_
+    oob_samples = est.oob_samples_
+    for tree_idx in range(est.n_estimators):
+        assert len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) + len(
+            oob_samples[tree_idx]
+        ) == len(
+            samples
+        ), f"{tree_idx} {len(structure_samples[tree_idx])} {len(leaf_samples[tree_idx])} {len(samples)}"
+
+
+def test_build_oob_random_forest():
+    """Test building oob random forest."""
+    bootstrap = True
+    max_samples = 1.0
+
+    n_estimators = 100
+    est = RandomForestClassifier(
+        n_estimators=n_estimators, random_state=0, bootstrap=bootstrap, max_samples=max_samples
+    )
+    X = rng.normal(0, 1, (100, 2))
+    X[:50] *= -1
+    y = np.array([0, 1] * 50)
+    samples = np.arange(len(y))
+
+    est, proba = build_oob_forest(est, X, y)
+
+    structure_samples = est.estimators_samples_
+    all_samples = np.arange(X.shape[0])
+    oob_samples_list = [
+        np.setdiff1d(all_samples, structure_samples[i]) for i in range(len(structure_samples))
+    ]
+    for tree_idx in range(est.n_estimators):
+        assert len(np.unique(structure_samples[tree_idx])) + len(oob_samples_list[tree_idx]) == len(
+            samples
+        ), f"{tree_idx} {len(structure_samples[tree_idx])} + {len(oob_samples_list[tree_idx])} != {len(samples)}"
+
+
+@pytest.mark.parametrize("bootstrap, max_samples", [(True, 1.6), (False, None)])
+def test_build_cv_honest_forest(bootstrap, max_samples):
+    n_estimators = 100
+    est = HonestForestClassifier(
+        n_estimators=n_estimators,
+        random_state=0,
+        bootstrap=bootstrap,
+        max_samples=max_samples,
+        honest_fraction=0.5,
+        stratify=True,
+    )
+    X = rng.normal(0, 1, (100, 2))
+    X[:50] *= -1
+    y = np.array([0, 1] * 50)
+    samples = np.arange(len(y))
+
+    est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest(
+        est,
+        X,
+        y,
+        return_indices=True,
+        seed=seed,
+        cv=3,
+    )
+
+    assert isinstance(est_list, list)
+    assert isinstance(proba_list, list)
+
+    for est, proba, train_idx, test_idx in zip(est_list, proba_list, train_idx_list, test_idx_list):
+        assert len(train_idx) + len(test_idx) == len(samples)
+        structure_samples = est.structure_indices_
+        leaf_samples = est.honest_indices_
+
+        if not bootstrap:
+            oob_samples = [[] for _ in range(est.n_estimators)]
+        else:
+            oob_samples = est.oob_samples_
+
+        # compared to oob samples, now the train samples are comprised of the entire dataset
+        # seen over the entire forest. The test dataset is completely disjoint
+        for tree_idx in range(est.n_estimators):
+            n_samples_in_tree = len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx])
+            assert n_samples_in_tree + len(oob_samples[tree_idx]) == len(train_idx), (
+                f"For tree: "
+                f"{tree_idx} {len(structure_samples[tree_idx])} + "
+                f"{len(leaf_samples[tree_idx])} + {len(oob_samples[tree_idx])} "
+                f"!= {len(train_idx)} {len(test_idx)}"
+            )
diff --git a/sktree/tests/test_honest_forest.py b/sktree/tests/test_honest_forest.py
index 31232a38e..3610475d6 100644
--- a/sktree/tests/test_honest_forest.py
+++ b/sktree/tests/test_honest_forest.py
@@ -9,7 +9,7 @@
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
 from sktree._lib.sklearn.tree import DecisionTreeClassifier
-from sktree.datasets import make_quadratic_classification
+from sktree.datasets import make_quadratic_classification, make_trunk_classification
 from sktree.ensemble import HonestForestClassifier
 from sktree.stats.utils import _mutual_information
 from sktree.tree import (
@@ -20,6 +20,8 @@
 
 CLF_CRITERIONS = ("gini", "entropy")
 
+seed = 12345
+
 # also load the iris dataset
 # and randomly permute it
 iris = datasets.load_iris()
@@ -51,7 +53,7 @@ def test_toy_accuracy():
 
 @pytest.mark.parametrize("criterion", ["gini", "entropy"])
 @pytest.mark.parametrize("max_features", [None, 2])
-@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
+@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"])
 @pytest.mark.parametrize(
     "estimator",
     [
@@ -71,26 +73,22 @@ def test_iris(criterion, max_features, honest_prior, estimator):
         honest_prior=honest_prior,
         tree_estimator=estimator,
     )
-    if honest_prior == "error":
-        with pytest.raises(ValueError, match="honest_prior error not a valid input."):
-            clf.fit(iris.data, iris.target)
-    else:
-        clf.fit(iris.data, iris.target)
-        score = accuracy_score(clf.predict(iris.data), iris.target)
+    clf.fit(iris.data, iris.target)
+    score = accuracy_score(clf.predict(iris.data), iris.target)
 
-        assert (
-            score > 0.5 and score < 1.0
-        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
+    assert score > 0.5 and score < 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
+        "HForest", criterion, score
+    )
 
-        score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
-        assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
-            "HForest", criterion, score
-        )
+    score = accuracy_score(clf.predict(iris.data), clf.predict_proba(iris.data).argmax(1))
+    assert score == 1.0, "Failed with {0}, criterion = {1} and score = {2}".format(
+        "HForest", criterion, score
+    )
 
 
 @pytest.mark.parametrize("criterion", ["gini", "entropy"])
 @pytest.mark.parametrize("max_features", [None, 2])
-@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore", "error"])
+@pytest.mark.parametrize("honest_prior", ["empirical", "uniform", "ignore"])
 @pytest.mark.parametrize(
     "estimator",
     [
@@ -120,24 +118,16 @@ def test_iris_multi(criterion, max_features, honest_prior, estimator):
 
     X = iris.data
     y = np.stack((iris.target, second_y[perm])).T
-    if honest_prior == "error":
-        with pytest.raises(ValueError, match="honest_prior error not a valid input."):
-            clf.fit(X, y)
+    clf.fit(X, y)
+    score = r2_score(clf.predict(X), y)
+    if honest_prior == "ignore":
+        assert (
+            score > 0.4 and score < 1.0
+        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
     else:
-        clf.fit(X, y)
-        score = r2_score(clf.predict(X), y)
-        if honest_prior == "ignore":
-            assert (
-                score > 0.4 and score < 1.0
-            ), "Failed with {0}, criterion = {1} and score = {2}".format(
-                "HForest", criterion, score
-            )
-        else:
-            assert (
-                score > 0.9 and score < 1.0
-            ), "Failed with {0}, criterion = {1} and score = {2}".format(
-                "HForest", criterion, score
-            )
+        assert (
+            score > 0.9 and score < 1.0
+        ), "Failed with {0}, criterion = {1} and score = {2}".format("HForest", criterion, score)
 
 
 def test_max_samples():
@@ -163,6 +153,37 @@ def test_max_samples():
     uf = uf.fit(X, y)
 
 
+def test_honest_forest_samples():
+    bootstrap = True
+    max_samples = 1.6
+
+    n_estimators = 5
+    est = HonestForestClassifier(
+        n_estimators=n_estimators,
+        random_state=0,
+        bootstrap=bootstrap,
+        max_samples=max_samples,
+        honest_fraction=0.5,
+        stratify=True,
+    )
+    X = rng.normal(0, 1, (100, 2))
+    X[:50] *= -1
+    y = [0, 1] * 50
+    samples = np.arange(len(y))
+
+    est.fit(X, y)
+
+    structure_samples = est.structure_indices_
+    leaf_samples = est.honest_indices_
+    oob_samples = est.oob_samples_
+    for tree_idx in range(est.n_estimators):
+        assert len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) + len(
+            oob_samples[tree_idx]
+        ) == len(
+            samples
+        ), f"{tree_idx} {len(structure_samples[tree_idx])} {len(leaf_samples[tree_idx])} {len(samples)}"
+
+
 @pytest.mark.parametrize("max_samples", [0.75, 1.0])
 def test_honest_forest_has_deterministic_sampling_for_oob_structure_and_leaves(max_samples):
     """Test that honest forest can produce the oob, structure and leaf-node samples.
@@ -489,3 +510,35 @@ def test_honest_forest_with_tree_estimator_params(tree, tree_kwargs):
                 )
             else:
                 assert getattr(clf, attr_name) == getattr(clf.estimators_[0], attr_name)
+
+
+def test_honest_forest_posteriors_on_independent():
+    """Test regression from :gh:`283`.
+
+    Posteriors were biased when the classes were independent and using the bootstrap and oob sample
+    technique to estimate the final population test statistic. This resulted in a biased estimate
+    of the AUC score. Stratification of the bootstrapping samples was the solution to this problem.
+    """
+    scores = []
+    for idx in range(5):
+        # create a dataset with overlapping classes
+        X, y = make_trunk_classification(
+            n_samples=128, n_dim=4096, n_informative=1, mu_0=0.0, mu_1=0.0, seed=idx
+        )
+        clf = HonestForestClassifier(
+            n_estimators=100,
+            random_state=idx,
+            bootstrap=True,
+            max_samples=1.6,
+            n_jobs=-1,
+            honest_prior="ignore",
+            stratify=True,
+        )
+        clf.fit(X, y)
+
+        oob_posteriors = clf.predict_proba_per_tree(X, clf.oob_samples_)
+        auc_score = roc_auc_score(y, np.nanmean(oob_posteriors, axis=0)[:, 1])
+        scores.append(auc_score)
+
+    # Without stratification, this test should fail
+    assert np.mean(scores) > 0.49 and np.mean(scores) < 0.51, f"{np.mean(scores)} {scores}"
diff --git a/sktree/tree/_honest_tree.py b/sktree/tree/_honest_tree.py
index a3b6d0813..61ae1da31 100644
--- a/sktree/tree/_honest_tree.py
+++ b/sktree/tree/_honest_tree.py
@@ -1,6 +1,6 @@
-# Authors: Ronan Perry, Sambit Panda, Haoyin Xu
 # Adopted from: https://github.com/neurodata/honest-forests
 
+
 import numpy as np
 from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context, clone
 from sklearn.model_selection import StratifiedShuffleSplit
@@ -446,7 +446,6 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None):
             replace=False,
         )
         self.honest_indices_ = np.setdiff1d(nonzero_indices, self.structure_indices_)
-
         _sample_weight[self.honest_indices_] = 0
 
         self.estimator_.partial_fit(
@@ -458,46 +457,8 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None):
         )
         self._inherit_estimator_attributes()
 
-        # update the number of classes, unsplit
-        if y.ndim == 1:
-            # reshape is necessary to preserve the data contiguity against vs
-            # [:, np.newaxis] that does not.
-            y = np.reshape(y, (-1, 1))
-        check_classification_targets(y)
-        y = np.copy(y)  # .astype(int)
-
-        # Normally called by super
-        X = self.estimator_._validate_X_predict(X, True)
-
-        # Fit leaves using other subsample
-        honest_leaves = self.tree_.apply(X[self.honest_indices_])
-
-        # preserve from underlying tree
-        self._tree_classes_ = self.classes_
-        self._tree_n_classes_ = self.n_classes_
-        self.classes_ = []
-        self.n_classes_ = []
-        self.empirical_prior_ = []
-
-        y_encoded = np.zeros(y.shape, dtype=int)
-        for k in range(self.n_outputs_):
-            classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True)
-            self.classes_.append(classes_k)
-            self.n_classes_.append(classes_k.shape[0])
-            self.empirical_prior_.append(
-                np.bincount(y_encoded[:, k], minlength=classes_k.shape[0]) / y.shape[0]
-            )
-        y = y_encoded
-
-        # y-encoded ensures that y values match the indices of the classes
-        self._set_leaf_nodes(honest_leaves, y)
-
-        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
-        if self.n_outputs_ == 1:
-            self.n_classes_ = self.n_classes_[0]
-            self.classes_ = self.classes_[0]
-            self.empirical_prior_ = self.empirical_prior_[0]
-            y = y[:, 0]
+        # set leaf nodes
+        self._fit_leaves(X, y, sample_weight=_sample_weight)
 
         return self
 
@@ -511,7 +472,6 @@ def _partition_honest_indices(self, y, sample_weight):
             _sample_weight = np.array(sample_weight)
 
         nonzero_indices = np.where(_sample_weight > 0)[0]
-
         # sample the structure indices
         if self.stratify:
             ss = StratifiedShuffleSplit(
@@ -663,15 +623,16 @@ def _fit(
         else:
             sample_weight_leaves = np.array(sample_weight)
         sample_weight_leaves[not_honest_mask] = 0
-        self._fit_leaves(X, y, sample_weight=sample_weight_leaves)
-        return self
-
-    def _fit_leaves(self, X, y, sample_weight):
-        nonzero_indices = np.where(sample_weight > 0)[0]
 
+        # determine the honest indices using the sample weight
+        nonzero_indices = np.where(sample_weight_leaves > 0)[0]
         # sample the structure indices
         self.honest_indices_ = nonzero_indices
 
+        self._fit_leaves(X, y, sample_weight=sample_weight_leaves)
+        return self
+
+    def _fit_leaves(self, X, y, sample_weight):
         # update the number of classes, unsplit
         if y.ndim == 1:
             # reshape is necessary to preserve the data contiguity against vs
@@ -683,9 +644,6 @@ def _fit_leaves(self, X, y, sample_weight):
         # Normally called by super
         X = self.estimator_._validate_X_predict(X, True)
 
-        # Fit leaves using other subsample
-        honest_leaves = self.tree_.apply(X[self.honest_indices_])
-
         # preserve from underlying tree
         # https://github.com/scikit-learn/scikit-learn/blob/1.0.X/sklearn/tree/_classes.py#L202
         self._tree_classes_ = self.classes_
@@ -703,18 +661,26 @@ def _fit_leaves(self, X, y, sample_weight):
                 np.bincount(y_encoded[:, k], minlength=classes_k.shape[0]) / y.shape[0]
             )
         y = y_encoded
+        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
 
-        # y-encoded ensures that y values match the indices of the classes
-        self._set_leaf_nodes(honest_leaves, y)
+        # XXX: implement honest pruning
+        honest_method = "apply"
+        if honest_method == "apply":
+            # Fit leaves using other subsample
+            honest_leaves = self.tree_.apply(X[self.honest_indices_])
+
+            # y-encoded ensures that y values match the indices of the classes
+            self._set_leaf_nodes(honest_leaves, y, sample_weight)
+        elif honest_method == "prune":
+            raise NotImplementedError("Pruning is not yet implemented.")
 
-        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
         if self.n_outputs_ == 1:
             self.n_classes_ = self.n_classes_[0]
             self.classes_ = self.classes_[0]
             self.empirical_prior_ = self.empirical_prior_[0]
             y = y[:, 0]
 
-    def _set_leaf_nodes(self, leaf_ids, y):
+    def _set_leaf_nodes(self, leaf_ids, y, sample_weight):
         """Traverse the already built tree with X and set leaf nodes with y.
 
         tree_.value has shape (n_nodes, n_outputs, max_n_classes), where
@@ -725,8 +691,12 @@ def _set_leaf_nodes(self, leaf_ids, y):
         classes are ordered by their index in the tree_.value array.
         """
         self.tree_.value[:, :, :] = 0
-        for leaf_id, yval in zip(leaf_ids, y[self.honest_indices_, :]):
-            self.tree_.value[leaf_id][:, yval] += 1
+
+        # apply sample-weight to the leaf nodes
+        for leaf_id, yval, y_weight in zip(
+            leaf_ids, y[self.honest_indices_, :], sample_weight[self.honest_indices_]
+        ):
+            self.tree_.value[leaf_id][:, yval] += y_weight
 
     def _inherit_estimator_attributes(self):
         """Initialize necessary attributes from the provided tree estimator"""
@@ -750,6 +720,8 @@ def _inherit_estimator_attributes(self):
     def _empty_leaf_correction(self, proba, pos=0):
         """Leaves with empty posteriors are assigned values.
 
+        This is called only during prediction.
+
         The posteriors are corrected according to the honest prior.
         In multi-output cases, the posterior corrections only correspond
         to the respective y dimension, indicated by the position param pos.
@@ -764,8 +736,6 @@ def _empty_leaf_correction(self, proba, pos=0):
                 proba[zero_mask] = 1 / self.n_classes_[pos]
             elif self.honest_prior == "ignore":
                 proba[zero_mask] = np.nan
-            else:
-                raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
         else:
             if self.honest_prior == "empirical":
                 proba[zero_mask] = self.empirical_prior_
@@ -773,8 +743,6 @@ def _empty_leaf_correction(self, proba, pos=0):
                 proba[zero_mask] = 1 / self.n_classes_
             elif self.honest_prior == "ignore":
                 proba[zero_mask] = np.nan
-            else:
-                raise ValueError(f"honest_prior {self.honest_prior} not a valid input.")
         return proba
 
     def predict_proba(self, X, check_input=True):
diff --git a/sktree/tree/tests/test_honest_tree.py b/sktree/tree/tests/test_honest_tree.py
index 907c386f1..4815b3cb2 100644
--- a/sktree/tree/tests/test_honest_tree.py
+++ b/sktree/tree/tests/test_honest_tree.py
@@ -115,6 +115,7 @@ def test_honest_tree_with_tree_estimator_params(tree, tree_kwargs):
     ],
 )
 def test_impute_posteriors(honest_prior, val):
+    # XXX: test could be improved as we don't actually test proper behavior of the imputation
     np.random.seed(0)
     X = np.random.normal(0, 1, (100, 2))
     y = [0] * 75 + [1] * 25
@@ -123,9 +124,14 @@ def test_impute_posteriors(honest_prior, val):
 
     y_proba = clf.predict_proba(X)
     if np.isnan(val):
-        assert len(np.where(np.isnan(y_proba[:, 0]))[0]) > 50, f"Failed with {honest_prior}"
+        assert (
+            len(np.where(np.isnan(y_proba[:, 0]))[0]) > 50
+        ), f"Failed with {honest_prior}, {len(np.where(np.isnan(y_proba[:, 0]))[0])}"
+        # assert len(np.where(np.isnan(y_proba[:, 0]))[0]) < 60, f"Failed with {honest_prior}, {len(np.where(np.isnan(y_proba[:, 0]))[0])}"
     else:
-        assert len(np.where(y_proba[:, 0] == val)[0]) > 50, f"Failed with {honest_prior}"
+        assert (
+            len(np.where(y_proba[:, 0] == val)[0]) > 50
+        ), f"Failed with {honest_prior}, {len(np.where(y_proba[:, 0] == val)[0])}"
 
 
 def test_increasing_leaves():