From 16eb0c86ee529774c4800b9518b0ad284df24119 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:50:54 -0500
Subject: [PATCH 01/14] Initial

Signed-off-by: Adam Li
---
 sktree/experimental/monte_carlo.py            | 218 ++++++++++++++++++
 sktree/experimental/tests/test_monte_carlo.py | 196 ++++++++++++++++
 2 files changed, 414 insertions(+)
 create mode 100644 sktree/experimental/monte_carlo.py
 create mode 100644 sktree/experimental/tests/test_monte_carlo.py

diff --git a/sktree/experimental/monte_carlo.py b/sktree/experimental/monte_carlo.py
new file mode 100644
index 000000000..aab6a25cd
--- /dev/null
+++ b/sktree/experimental/monte_carlo.py
@@ -0,0 +1,218 @@
+from typing import Optional
+
+import numpy as np
+from numpy.typing import ArrayLike
+from scipy.sparse import issparse
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import _approximate_mode, _safe_indexing, check_array, check_consistent_length
+
+
+def _conditional_shuffle(nbrs: ArrayLike, replace: bool = False, seed=None) -> ArrayLike:
+    """Compute a permutation of neighbors with restrictions.
+
+    Parameters
+    ----------
+    nbrs : ArrayLike of shape (n_samples, k)
+        The k-nearest-neighbors for each sample index. Each row corresponds to the
+        original sample. Each element is another sample index that is deemed one of
+        the k-nearest neighbors of the original sample.
+    replace : bool, optional
+        Whether or not to allow replacement of samples, by default False.
+    seed : int, optional
+        Random seed, by default None.
+
+    Returns
+    -------
+    restricted_perm : ArrayLike of shape (n_samples,)
+        The final permutation order of the sample indices. There may be
+        repeating samples. See Notes for details.
+
+    Notes
+    -----
+    Restricted permutation goes through random samples and looks at the k-nearest
+    neighbors (columns of ``nbrs``) and shuffles the closest neighbor index only
+    if it has not been used to permute another sample. If it has been, then the
+    algorithm looks at the next nearest-neighbor and so on. If all k-nearest
+    neighbors of a sample have been checked, then a random neighbor is chosen. In this
+    manner, the algorithm tries to perform permutation without replacement, but
+    if necessary, will choose a repeating neighbor sample.
+ """ + n_samples, k_dims = nbrs.shape + rng = np.random.default_rng(seed=seed) + + # initialize the final permutation order + restricted_perm = np.zeros((n_samples,), dtype=np.intp) + + # generate a random order of samples to go through + random_order = rng.permutation(n_samples) + + # keep track of values we have already used + used = set() + + # go through the random order + for idx in random_order: + if replace: + possible_nbrs = nbrs[idx, :] + restricted_perm[idx] = rng.choice(possible_nbrs, size=1).squeeze() + else: + m = 0 + use_idx = nbrs[idx, m] + + # if the current nbr is already used, continue incrementing + # until we have either found a new sample to use, or if + # we have reach the maximum number of shuffles to consider + while (use_idx in used) and (m < k_dims - 1): + m += 1 + use_idx = nbrs[idx, m] + + # check whether or not we have exhaustively checked all kNN + if use_idx in used and m == k_dims: + # XXX: Note this step is not in the original paper + # choose a random neighbor to permute + restricted_perm[idx] = rng.choice(nbrs[idx, :], size=1) + else: + # permute with the existing neighbor + restricted_perm[idx] = use_idx + used.add(use_idx) + return restricted_perm + + +def conditional_resample( + conditional_array: ArrayLike, + *arrays, + nn_estimator=None, + replace: bool = True, + replace_nbrs: bool = True, + n_samples: Optional[int] = None, + random_state: Optional[int] = None, + stratify: Optional[ArrayLike] = None, +): + """Conditionally resample arrays or sparse matrices in a consistent way. + + The default strategy implements one step of the bootstrapping + procedure. Conditional resampling is a modification of the bootstrap + technique that preserves the conditional distribution of the data. This + is done by fitting a nearest neighbors estimator on the conditional array + and then resampling the nearest neighbors of each sample. + + Parameters + ---------- + conditional_array : array-like of shape (n_samples, n_features) + The array, which we preserve the conditional distribution of. + + *arrays : sequence of array-like of shape (n_samples,) or \ + (n_samples, n_outputs) + Indexable data-structures can be arrays, lists, dataframes or scipy + sparse matrices with consistent first dimension. + + nn_estimator : estimator object, default=None + The nearest neighbors estimator to use. If None, then a + :class:`sklearn.neighbors.NearestNeighbors` instance is used. + + replace : bool, default=True + Implements resampling with replacement. If False, this will implement + (sliced) random permutations. The replacement will take place at the level + of the sample index. + + replace_nbrs : bool, default=True + Implements resampling with replacement at the level of the nearest neighbors. + + n_samples : int, default=None + Number of samples to generate. If left to None this is + automatically set to the first dimension of the arrays. + If replace is False it should not be larger than the length of + arrays. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for shuffling + the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + If not None, data is split in a stratified fashion, using this as + the class labels. + + Returns + ------- + resampled_arrays : sequence of array-like of shape (n_samples,) or \ + (n_samples, n_outputs) + Sequence of resampled copies of the collections. 
diff --git a/sktree/experimental/tests/test_monte_carlo.py b/sktree/experimental/tests/test_monte_carlo.py
new file mode 100644
index 000000000..b76d59fb4
--- /dev/null
+++ b/sktree/experimental/tests/test_monte_carlo.py
@@ -0,0 +1,196 @@
+import numpy as np
+import pytest
+from scipy.sparse import csr_matrix
+from sklearn.datasets import make_classification
+from sklearn.neighbors import NearestNeighbors
+
+from sktree.experimental import conditional_resample
+
+
+def test_conditional_resample_with_default_params():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+    # Test conditional resampling with default parameters
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors()
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_without_replacement():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling without replacement
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), replace=False
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+    # Check if the samples are unique (no replacement)
+    assert len(np.unique(resampled_arrays[1])) == len(
+        np.unique(y)
+    ), f"{len(np.unique(resampled_arrays[1]))} != {len(np.unique(y))}"
+
+
+def test_conditional_resample_with_sparse_matrix():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    X_sparse = csr_matrix(X)  # Convert X to a sparse matrix
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling with a sparse matrix
+    resampled_arrays = conditional_resample(
+        conditional_array, X_sparse, y, nn_estimator=NearestNeighbors()
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert resampled_arrays[0].shape[0] == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_with_stratify():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Define a custom stratify function
+    def custom_stratify(y, category):
+        # Create an array where each entry is True if it belongs to the specified category,
+        # False otherwise
+        stratify_array = y == category
+        return stratify_array
+
+    category_to_stratify = 1  # Change this to the category you want to stratify
+
+    # Get the distribution of the specified category before resampling
+    category_distribution_before = np.sum(y == category_to_stratify)
+
+    # Test conditional resampling with the custom stratify function
+    stratify = custom_stratify(y, category_to_stratify)
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), stratify=stratify, random_state=0
+    )
+
+    # Get the distribution of the specified category after resampling
+    category_distribution_after = np.sum(resampled_arrays[1] == category_to_stratify)
+
+    # Check if the distribution of the specified category is preserved
+    assert category_distribution_before == category_distribution_after, (
+        f"Expected {category_distribution_before} samples, got "
+        f"{category_distribution_after} samples"
+    )
+
+
+def test_conditional_resample_with_replace_nbrs():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling with replace_nbrs=False
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), replace_nbrs=False
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2, f"Expected 2 arrays, got {len(resampled_arrays)} arrays"
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_errors():
+    # 01: Test with invalid number of samples
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Test conditional resampling with too many requested samples (should raise an error)
+    with pytest.raises(ValueError, match="Cannot sample"):
+        conditional_resample(X, y, nn_estimator=NearestNeighbors(), replace=False, n_samples=1000)
+
+    # 02: Test inconsistent_length
+    # Create an additional array with a different number of samples
+    additional_array = np.random.rand(80, 5)
+
+    # Test conditional resampling with inconsistent length of input arrays (should raise an error)
+    with pytest.raises(ValueError):
+        conditional_resample(X, y, additional_array, nn_estimator=NearestNeighbors())
+
+    # 03: Test with invalid sample size when replace=False
+    # Test conditional resampling with n_samples larger than the input arrays
+    # (should raise an error)
+    with pytest.raises(ValueError):
+        conditional_resample(X, y, nn_estimator=NearestNeighbors(), n_samples=200, replace=False)
+
+
+def test_conditional_resample():
+    # Generate synthetic data
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Convert X to sparse matrix
+    X_sparse = csr_matrix(X)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Perform conditional resampling
+    resampled_X = conditional_resample(conditional_array, X, replace=False, replace_nbrs=False)
+    resampled_X_sparse = conditional_resample(
+        conditional_array, X_sparse, replace=False, replace_nbrs=False
+    )
+
+    # Check that the resampled arrays have the correct shape
+    assert resampled_X.shape == X.shape
+    assert resampled_X_sparse.shape == X_sparse.shape
+
+    # Check that the resampled arrays have the correct number of unique samples
+    assert len(np.unique(resampled_X, axis=0)) == X.shape[0]
+    assert len(np.unique(resampled_X_sparse.toarray(), axis=0)) == X_sparse.shape[0]
+
+    # Check that each feature's marginal distribution of values is preserved
+    for i in range(X.shape[1]):
+        unique_values, counts = np.unique(resampled_X[:, i], return_counts=True)
+        original_values, original_counts = np.unique(X[:, i], return_counts=True)
+
+        assert np.all(unique_values == original_values)
+        assert np.all(counts == original_counts)
+
+        unique_values_sparse, counts_sparse = np.unique(
+            resampled_X_sparse[:, i].toarray(), return_counts=True
+        )
+        original_values_sparse, original_counts_sparse = np.unique(
+            X_sparse[:, i].toarray(), return_counts=True
+        )
+
+        assert np.all(unique_values_sparse == original_values_sparse)
+        assert np.all(counts_sparse == original_counts_sparse)
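
A toy sketch of the restricted shuffle that underlies the resampler added in this
patch (``_conditional_shuffle`` is private, so this import may break without
notice; the 2-NN table below is made up):

    import numpy as np

    from sktree.experimental.monte_carlo import _conditional_shuffle

    # hypothetical 2-NN table: row i lists the two nearest neighbors of sample i
    nbrs = np.array([[1, 2], [0, 3], [3, 0], [2, 1]])
    perm = _conditional_shuffle(nbrs, replace=False, seed=0)

    # every permuted index comes from the corresponding row of ``nbrs``; without
    # replacement, an already-used neighbor is skipped while unused ones remain
    assert all(perm[i] in nbrs[i] for i in range(len(nbrs)))
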
From f1bc2554ad09e7f10ab1ae618f6c096f08e18fb3 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:51:23 -0500
Subject: [PATCH 02/14] Initial

Signed-off-by: Adam Li
---
 sktree/experimental/meson.build       | 1 +
 sktree/experimental/tests/meson.build | 1 +
 2 files changed, 2 insertions(+)

diff --git a/sktree/experimental/meson.build b/sktree/experimental/meson.build
index 2be7ae9e8..2d1dde609 100644
--- a/sktree/experimental/meson.build
+++ b/sktree/experimental/meson.build
@@ -3,6 +3,7 @@ python_sources = [
     'mutual_info.py',
     'simulate.py',
     'sdf.py',
+    'monte_carlo.py',
 ]
 
 py3.install_sources(
diff --git a/sktree/experimental/tests/meson.build b/sktree/experimental/tests/meson.build
index b5d1ef79c..c3fdd07c4 100644
--- a/sktree/experimental/tests/meson.build
+++ b/sktree/experimental/tests/meson.build
@@ -3,6 +3,7 @@ python_sources = [
     'test_mutual_info.py',
     'test_simulate.py',
     'test_sdf.py',
+    'test_monte_carlo.py',
 ]
 
 py3.install_sources(

From cd9ea37a5e9ab87fa790a26b123735bcba6dc484 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:52:38 -0500
Subject: [PATCH 03/14] Initial

Signed-off-by: Adam Li
---
 doc/api.rst                     | 14 ++++++++++++++
 doc/conf.py                     |  1 +
 doc/whats_new/v0.4.rst          |  1 +
 sktree/experimental/__init__.py |  1 +
 4 files changed, 17 insertions(+)

diff --git a/doc/api.rst b/doc/api.rst
index 59238d77a..12182469e 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -150,6 +150,19 @@ tree models.
    PermutationForestClassifier
    PermutationForestRegressor
 
+Datasets
+------------------------------
+We provide some convenience functions for simulating datasets beyond
+those offered in scikit-learn.
+
+.. currentmodule:: sktree.datasets
+.. autosummary::
+   :toctree: generated/
+
+   make_gaussian_mixture
+   make_joint_factor_model
+   make_quadratic_classification
+
 Experimental Functionality
 --------------------------
 
@@ -160,6 +173,7 @@ We also include experimental functionality that is a work in progress.
    :toctree: generated/
 
    mutual_info_ksg
+   conditional_resample
 
 We also include functions that help simulate and evaluate mutual information (MI)
 and conditional mutual information (CMI) estimators. Specifically, functions that
diff --git a/doc/conf.py b/doc/conf.py
index ac752d66a..6e2a78d23 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -244,6 +244,7 @@
     "fit",
     "apply",
     "TreeBuilder",
+    "joint_rank",
 }
 
 # validation
diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst
index 239a1d824..e1beeba8f 100644
--- a/doc/whats_new/v0.4.rst
+++ b/doc/whats_new/v0.4.rst
@@ -15,6 +15,7 @@ Changelog
 
 - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)
 - |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`)
+- |Enhancement| Add :func:`sktree.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`152`)
 
 Code and Documentation Contributors
-----------------------------------
diff --git a/sktree/experimental/__init__.py b/sktree/experimental/__init__.py
index cdf4b4295..940195147 100644
--- a/sktree/experimental/__init__.py
+++ b/sktree/experimental/__init__.py
@@ -10,3 +10,4 @@
     mutual_info_ksg,
 )
 from .sdf import StreamDecisionForest
+from .monte_carlo import conditional_resample
\ No newline at end of file
From 876d1943a44bd176874c85e9490f4a541bc2056b Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:55:12 -0500
Subject: [PATCH 04/14] Update PR number

Signed-off-by: Adam Li
---
 doc/whats_new/v0.4.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst
index e1beeba8f..6a83962d7 100644
--- a/doc/whats_new/v0.4.rst
+++ b/doc/whats_new/v0.4.rst
@@ -15,7 +15,7 @@ Changelog
 
 - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)
 - |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`)
-- |Enhancement| Add :func:`sktree.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`152`)
+- |Enhancement| Add :func:`sktree.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`170`)
 
 Code and Documentation Contributors
 -----------------------------------

From 230e1c6ea22a0d22cce8ebd7ebf55eea0336d4bc Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:55:31 -0500
Subject: [PATCH 05/14] Lint

Signed-off-by: Adam Li
---
 sktree/experimental/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sktree/experimental/__init__.py b/sktree/experimental/__init__.py
index 940195147..2934888fc 100644
--- a/sktree/experimental/__init__.py
+++ b/sktree/experimental/__init__.py
@@ -1,4 +1,5 @@
 from . import mutual_info, sdf, simulate
+from .monte_carlo import conditional_resample
 from .mutual_info import (
     cmi_from_entropy,
     cmi_gaussian,
@@ -10,4 +11,3 @@
     mutual_info_ksg,
 )
 from .sdf import StreamDecisionForest
-from .monte_carlo import conditional_resample

From 25d724057305b80fdf5e1a8eabbf4044665e7380 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 14:09:07 -0500
Subject: [PATCH 06/14] Up tree count

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index aa918191c..3476c416c 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -201,7 +201,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
             "estimator": RandomForestClassifier(
                 max_features="sqrt",
                 # random_state=seed,
-                n_estimators=100,
+                n_estimators=200,
                 n_jobs=-1,
             ),
             "permute_forest_fraction": 0.5,
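
The tree-count bumps in this and the later test_coleman patches are variance
control: the hypothesis-test statistic averages over trees, so its Monte Carlo
spread shrinks roughly like 1/sqrt(n_estimators). A self-contained toy
illustration of that effect (nothing here touches sktree):

    import numpy as np

    rng = np.random.default_rng(0)
    for n_estimators in (100, 200):
        # 1000 replicates of a mean over n_estimators i.i.d. per-tree outputs
        stats = rng.normal(size=(1000, n_estimators)).mean(axis=1)
        print(n_estimators, stats.std())  # spread shrinks ~ 1 / sqrt(n_estimators)
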
From 1ec41904eff370c9c0205b90558e5576bdb7a29d Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 10:03:31 -0500
Subject: [PATCH 07/14] Update unit-test

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py            | 31 ++++++++++++++-------------
 sktree/stats/tests/test_forestht.py | 29 ++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 86a5bb32f..caac5b8ba 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -402,20 +402,23 @@ def statistic(
         # and then setting the internal meta data structures
         # first run a dummy fit on the samples to initialize the
         # internal data structure of the forest
-        if not _is_fitted(estimator) and is_classifier(estimator):
-            _unique_y = []
-            for axis in range(y.shape[1]):
-                _unique_y.append(np.unique(y[:, axis]))
-            unique_y = np.hstack(_unique_y)
-            if unique_y.ndim > 1 and unique_y.shape[1] == 1:
-                unique_y = unique_y.ravel()
-            X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
-            estimator.fit(X_dummy, unique_y)
-        elif not _is_fitted(estimator):
-            if y.ndim > 1 and y.shape[1] == 1:
-                estimator.fit(X[:2], y[:2].ravel())
-            else:
-                estimator.fit(X[:2], y[:2])
+        if not hasattr(self, "estimator_") or self.estimator_ is None:
+            self.estimator_ = self._get_estimator()
+
+            if not _is_fitted(estimator) and is_classifier(estimator):
+                _unique_y = []
+                for axis in range(y.shape[1]):
+                    _unique_y.append(np.unique(y[:, axis]))
+                unique_y = np.hstack(_unique_y)
+                if unique_y.ndim > 1 and unique_y.shape[1] == 1:
+                    unique_y = unique_y.ravel()
+                X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
+                estimator.fit(X_dummy, unique_y)
+            elif not _is_fitted(estimator):
+                if y.ndim > 1 and y.shape[1] == 1:
+                    estimator.fit(X[:2], y[:2].ravel())
+                else:
+                    estimator.fit(X[:2], y[:2])
 
         # sampling a separate train/test per tree
         if self.sample_dataset_per_tree:
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index a242193b1..338e096e6 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -58,14 +58,19 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
     ), f"{len(est.train_test_samples_[0][1])} {n_samples * est.test_size}"
     assert len(est.train_test_samples_[0][0]) == est._n_samples_ - n_samples * est.test_size
 
+    # covariate index should work with mse
+    est.reset()
+    est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse")
     with pytest.raises(RuntimeError, match="Metric must be"):
-        est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")
+        est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi")
 
     # covariate index must be an iterable
+    est.reset()
     with pytest.raises(RuntimeError, match="covariate_index must be an iterable"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi")
 
     # covariate index must be an iterable of ints
+    est.reset()
     with pytest.raises(RuntimeError, match="Not all covariate_index"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi")
 
@@ -588,3 +593,25 @@ def test_no_traintest_split():
     assert ~np.isnan(pvalue)
     assert ~np.isnan(stat)
     assert pvalue > 0.05, f"{pvalue}"
+
+
+@pytest.mark.parametrize("covariate_index", [None, [0, 1]])
+def test_featureimportance_forest_statistic_with_covariate_index(covariate_index):
+    """Tests that calling `est.statistic` with covariate_index defined works.
+    There should be no issue calling `est.statistic` with covariate_index defined.
+    """
+    n_estimators = 10
+    n_samples = 10
+
+    est = FeatureImportanceForestClassifier(
+        estimator=RandomForestClassifier(
+            n_estimators=n_estimators,
+            random_state=seed,
+        ),
+        permute_forest_fraction=-1.0 / n_estimators * 5,
+        test_size=0.7,
+        random_state=seed,
+    )
+    est.statistic(
+        iris_X[:n_samples], iris_y[:n_samples], covariate_index=covariate_index, metric="mi"
+    )
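
The "dummy fit" retained in the hunk above exists to force scikit-learn to
allocate the forest's internal state (``classes_``, ``estimators_``, and
friends) before the permutation bookkeeping uses it. A standalone sketch of the
trick (the estimator choice and shapes are illustrative, not sktree's
internals):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    y = np.array([0, 0, 1, 2])                  # observed labels
    unique_y = np.unique(y)                     # one row per class suffices
    X_dummy = np.zeros((unique_y.shape[0], 5))  # 5 stands in for n_features

    est = RandomForestClassifier(n_estimators=10)
    est.fit(X_dummy, unique_y)                  # cheap fit: initializes classes_ etc.
    assert np.array_equal(est.classes_, unique_y)
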
From e59d4aee263b62e540c9c410665a3eacee4c6570 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 10:04:25 -0500
Subject: [PATCH 08/14] Fix

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_forestht.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index 338e096e6..a8c4bed3f 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -596,6 +596,8 @@ def test_no_traintest_split():
 
 
 @pytest.mark.parametrize("covariate_index", [None, [0, 1]])
+
+
 def test_featureimportance_forest_statistic_with_covariate_index(covariate_index):
     """Tests that calling `est.statistic` with covariate_index defined works.
     There should be no issue calling `est.statistic` with covariate_index defined.
@@ -608,7 +610,7 @@ def test_featureimportance_forest_statistic_with_covariate_index(covariate_index
             n_estimators=n_estimators,
             random_state=seed,
         ),
-        permute_forest_fraction=-1.0 / n_estimators * 5,
+        permute_forest_fraction=1.0 / n_estimators * 5,
         test_size=0.7,
         random_state=seed,
     )
From ad49c88d79e8edcdd5c8edb6f2669f063abc1018 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 13:03:39 -0500
Subject: [PATCH 09/14] Fix unit-tests

Signed-off-by: Adam Li
---
 pyproject.toml           |  2 +-
 sktree/stats/forestht.py | 60 +++++++++++++---------------------------
 2 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e03c45e1a..a5c2b6fdd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -218,7 +218,7 @@ profile = 'black'
 multi_line_output = 3
 line_length = 100
 py_version = 38
-extend_skip_glob = ['sktree/__init__.py', 'sktree/_lib/*', '.asv/*', 'env/*']
+extend_skip_glob = ['sktree/__init__.py', 'sktree/_lib/*', '.asv/*', 'env/*', 'build-install/*']
 
 [tool.pydocstyle]
 convention = 'numpy'
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6b2e370b4..aa613ae16 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -382,50 +382,28 @@ def statistic(
             self.permuted_estimator_ = self._get_estimator()
             estimator = self.permuted_estimator_
 
-        if not hasattr(self, "estimator_") or self.estimator_ is None:
-            self.estimator_ = self._get_estimator()
-
-            # Ensure that the estimator_ is fitted at least
-            if not _is_fitted(self.estimator_) and is_classifier(self.estimator_):
-                _unique_y = []
-                for axis in range(y.shape[1]):
-                    _unique_y.append(np.unique(y[:, axis]))
-                unique_y = np.hstack(_unique_y)
-                if unique_y.ndim > 1 and unique_y.shape[1] == 1:
-                    unique_y = unique_y.ravel()
-                X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
-                self.estimator_.fit(X_dummy, unique_y)
-            elif not _is_fitted(estimator):
-                if y.ndim > 1 and y.shape[1] == 1:
-                    self.estimator_.fit(X[:2], y[:2].ravel())
-                else:
-                    self.estimator_.fit(X[:2], y[:2])
-
         # Store a cache of the y variable
-        if is_classifier(self._get_estimator()):
+        if is_classifier(estimator):
             self._y = y.copy()
 
-        # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking
-        # and then setting the internal meta data structures
-        # first run a dummy fit on the samples to initialize the
-        # internal data structure of the forest
-        if not hasattr(self, "estimator_") or self.estimator_ is None:
-            self.estimator_ = self._get_estimator()
-
-            if not _is_fitted(estimator) and is_classifier(estimator):
-                _unique_y = []
-                for axis in range(y.shape[1]):
-                    _unique_y.append(np.unique(y[:, axis]))
-                unique_y = np.hstack(_unique_y)
-                if unique_y.ndim > 1 and unique_y.shape[1] == 1:
-                    unique_y = unique_y.ravel()
-                X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
-                estimator.fit(X_dummy, unique_y)
-            elif not _is_fitted(estimator):
-                if y.ndim > 1 and y.shape[1] == 1:
-                    estimator.fit(X[:2], y[:2].ravel())
-                else:
-                    estimator.fit(X[:2], y[:2])
+        # # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking
+        # # and then setting the internal meta data structures
+        # # first run a dummy fit on the samples to initialize the
+        # # internal data structure of the forest
+        if not _is_fitted(estimator) and is_classifier(estimator):
+            _unique_y = []
+            for axis in range(y.shape[1]):
+                _unique_y.append(np.unique(y[:, axis]))
+            unique_y = np.hstack(_unique_y)
+            if unique_y.ndim > 1 and unique_y.shape[1] == 1:
+                unique_y = unique_y.ravel()
+            X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
+            estimator.fit(X_dummy, unique_y)
+        elif not _is_fitted(estimator):
+            if y.ndim > 1 and y.shape[1] == 1:
+                estimator.fit(X[:2], y[:2].ravel())
+            else:
+                estimator.fit(X[:2], y[:2])
 
         # sampling a separate train/test per tree
         if self.sample_dataset_per_tree:

From da2863d19463a28f81c6499a3c3794fcbb5a0c93 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 13:35:09 -0500
Subject: [PATCH 10/14] Fix unit-tests

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index aa613ae16..abf2a4bad 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -382,6 +382,29 @@ def statistic(
             self.permuted_estimator_ = self._get_estimator()
             estimator = self.permuted_estimator_
 
+            if not hasattr(self, "estimator_"):
+                self.estimator_ = self._get_estimator()
+
+                # XXX: this can be improved as an extra fit can be avoided, by
+                # just doing error-checking
+                # and then setting the internal meta data structures
+                # first run a dummy fit on the samples to initialize the
+                # internal data structure of the forest
+                if is_classifier(self.estimator_):
+                    _unique_y = []
+                    for axis in range(y.shape[1]):
+                        _unique_y.append(np.unique(y[:, axis]))
+                    unique_y = np.hstack(_unique_y)
+                    if unique_y.ndim > 1 and unique_y.shape[1] == 1:
+                        unique_y = unique_y.ravel()
+                    X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
+                    self.estimator_.fit(X_dummy, unique_y)
+                else:
+                    if y.ndim > 1 and y.shape[1] == 1:
+                        self.estimator_.fit(X[:2], y[:2].ravel())
+                    else:
+                        self.estimator_.fit(X[:2], y[:2])
+
         # Store a cache of the y variable
         if is_classifier(estimator):
             self._y = y.copy()

From 473c540f456764c92fc651df540bcac9339a8fae Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 15:08:49 -0500
Subject: [PATCH 11/14] Up tree count

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 843db417d..936a12068 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -79,7 +79,7 @@
         {
             "estimator": RandomForestRegressor(
                 max_features="sqrt",
-                n_estimators=125,
+                n_estimators=200,
                 n_jobs=-1,
             ),
             # "random_state": seed,

From 3078be1af37b8ff623a2f44c9c7aa2fb3fe9fc60 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 16:45:19 -0500
Subject: [PATCH 12/14] Fix coleman unit-test simulations

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 936a12068..271a131a5 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -82,7 +82,7 @@
                 n_estimators=200,
                 n_jobs=-1,
             ),
-            # "random_state": seed,
+            "random_state": seed,
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
         },
@@ -167,6 +167,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 n_jobs=-1,
             ),
             "sample_dataset_per_tree": False,
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats
@@ -200,6 +201,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
             ),
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats
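
Patches 12 and 13 go back and forth between a shared ``seed`` and per-estimator
``rng.integers(0, 1000)`` draws. One conventional alternative (not what this
series does) is to spawn independent child seeds from a single
``numpy.random.SeedSequence``, which avoids accidental stream overlap:

    import numpy as np

    root = np.random.SeedSequence(12345)
    child_forest, child_test = root.spawn(2)               # independent child sequences
    forest_seed = int(child_forest.generate_state(1)[0])   # plain ints, usable as
    test_seed = int(child_test.generate_state(1)[0])       # sklearn random_state values
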
From b2b64d339915c3aaf2f66e5efa99976e8c2ccb4a Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 20:25:12 -0500
Subject: [PATCH 13/14] Fix unit test

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 271a131a5..54e81ce0f 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -79,8 +79,9 @@
         {
             "estimator": RandomForestRegressor(
                 max_features="sqrt",
-                n_estimators=200,
+                n_estimators=250,
                 n_jobs=-1,
+                random_state=rng.integers(0, 1000),
             ),
             "random_state": seed,
             "permute_forest_fraction": 0.5,
@@ -198,10 +199,11 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 max_features="sqrt",
                 n_estimators=200,
                 n_jobs=-1,
+                random_state=rng.integers(0, 1000),
             ),
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
-            "random_state": seed,
+            "random_state": rng.integers(0, 1000),
         },
         600,  # n_samples
         1000,  # n_repeats

From 2d9b71806a1d5db1803846f7ab2f1785798f9952 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 21:37:48 -0500
Subject: [PATCH 14/14] Try again

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 54e81ce0f..23904f0d5 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -199,11 +199,11 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 max_features="sqrt",
                 n_estimators=200,
                 n_jobs=-1,
-                random_state=rng.integers(0, 1000),
+                random_state=seed,
             ),
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
-            "random_state": rng.integers(0, 1000),
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats