From 16eb0c86ee529774c4800b9518b0ad284df24119 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:50:54 -0500
Subject: [PATCH 01/14] Initial

Signed-off-by: Adam Li
---
 sktree/experimental/monte_carlo.py            | 218 ++++++++++++++++++
 sktree/experimental/tests/test_monte_carlo.py | 196 ++++++++++++++++
 2 files changed, 414 insertions(+)
 create mode 100644 sktree/experimental/monte_carlo.py
 create mode 100644 sktree/experimental/tests/test_monte_carlo.py

diff --git a/sktree/experimental/monte_carlo.py b/sktree/experimental/monte_carlo.py
new file mode 100644
index 000000000..aab6a25cd
--- /dev/null
+++ b/sktree/experimental/monte_carlo.py
@@ -0,0 +1,218 @@
+from typing import Optional
+
+import numpy as np
+from numpy.typing import ArrayLike
+from scipy.sparse import issparse
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import _approximate_mode, _safe_indexing, check_array, check_consistent_length
+
+
+def _conditional_shuffle(nbrs: ArrayLike, replace: bool = False, seed=None) -> ArrayLike:
+    """Compute a permutation of neighbors with restrictions.
+
+    Parameters
+    ----------
+    nbrs : ArrayLike of shape (n_samples, k)
+        The k-nearest-neighbors for each sample index. Each row corresponds to the
+        original sample. Each element is another sample index that is deemed one of
+        the k-nearest neighbors of the original sample.
+    replace : bool, optional
+        Whether or not to allow replacement of samples, by default False.
+    seed : int, optional
+        Random seed, by default None.
+
+    Returns
+    -------
+    restricted_perm : ArrayLike of shape (n_samples,)
+        The final permutation order of the sample indices. There may be
+        repeating samples. See Notes for details.
+
+    Notes
+    -----
+    Restricted permutation goes through random samples and looks at the k-nearest
+    neighbors (columns of ``nbrs``) and shuffles the closest neighbor index only
+    if it has not been used to permute another sample. If it has been, then the
+    algorithm looks at the next nearest-neighbor and so on. If all k-nearest
+    neighbors of a sample have been checked, then a random neighbor is chosen. In this
+    manner, the algorithm tries to perform permutation without replacement, but
+    if necessary, will choose a repeating neighbor sample.
+ """ + n_samples, k_dims = nbrs.shape + rng = np.random.default_rng(seed=seed) + + # initialize the final permutation order + restricted_perm = np.zeros((n_samples,), dtype=np.intp) + + # generate a random order of samples to go through + random_order = rng.permutation(n_samples) + + # keep track of values we have already used + used = set() + + # go through the random order + for idx in random_order: + if replace: + possible_nbrs = nbrs[idx, :] + restricted_perm[idx] = rng.choice(possible_nbrs, size=1).squeeze() + else: + m = 0 + use_idx = nbrs[idx, m] + + # if the current nbr is already used, continue incrementing + # until we have either found a new sample to use, or if + # we have reach the maximum number of shuffles to consider + while (use_idx in used) and (m < k_dims - 1): + m += 1 + use_idx = nbrs[idx, m] + + # check whether or not we have exhaustively checked all kNN + if use_idx in used and m == k_dims: + # XXX: Note this step is not in the original paper + # choose a random neighbor to permute + restricted_perm[idx] = rng.choice(nbrs[idx, :], size=1) + else: + # permute with the existing neighbor + restricted_perm[idx] = use_idx + used.add(use_idx) + return restricted_perm + + +def conditional_resample( + conditional_array: ArrayLike, + *arrays, + nn_estimator=None, + replace: bool = True, + replace_nbrs: bool = True, + n_samples: Optional[int] = None, + random_state: Optional[int] = None, + stratify: Optional[ArrayLike] = None, +): + """Conditionally resample arrays or sparse matrices in a consistent way. + + The default strategy implements one step of the bootstrapping + procedure. Conditional resampling is a modification of the bootstrap + technique that preserves the conditional distribution of the data. This + is done by fitting a nearest neighbors estimator on the conditional array + and then resampling the nearest neighbors of each sample. + + Parameters + ---------- + conditional_array : array-like of shape (n_samples, n_features) + The array, which we preserve the conditional distribution of. + + *arrays : sequence of array-like of shape (n_samples,) or \ + (n_samples, n_outputs) + Indexable data-structures can be arrays, lists, dataframes or scipy + sparse matrices with consistent first dimension. + + nn_estimator : estimator object, default=None + The nearest neighbors estimator to use. If None, then a + :class:`sklearn.neighbors.NearestNeighbors` instance is used. + + replace : bool, default=True + Implements resampling with replacement. If False, this will implement + (sliced) random permutations. The replacement will take place at the level + of the sample index. + + replace_nbrs : bool, default=True + Implements resampling with replacement at the level of the nearest neighbors. + + n_samples : int, default=None + Number of samples to generate. If left to None this is + automatically set to the first dimension of the arrays. + If replace is False it should not be larger than the length of + arrays. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for shuffling + the data. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + + stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + If not None, data is split in a stratified fashion, using this as + the class labels. + + Returns + ------- + resampled_arrays : sequence of array-like of shape (n_samples,) or \ + (n_samples, n_outputs) + Sequence of resampled copies of the collections. 
diff --git a/sktree/experimental/tests/test_monte_carlo.py b/sktree/experimental/tests/test_monte_carlo.py
new file mode 100644
index 000000000..b76d59fb4
--- /dev/null
+++ b/sktree/experimental/tests/test_monte_carlo.py
@@ -0,0 +1,196 @@
+import numpy as np
+import pytest
+from scipy.sparse import csr_matrix
+from sklearn.datasets import make_classification
+from sklearn.neighbors import NearestNeighbors
+
+from sktree.experimental import conditional_resample
+
+
+def test_conditional_resample_with_default_params():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+    # Test conditional resampling with default parameters
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors()
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_without_replacement():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling without replacement
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), replace=False
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+    # Check if the samples are unique (no replacement)
+    assert len(np.unique(resampled_arrays[1])) == len(
+        np.unique(y)
+    ), f"{len(np.unique(resampled_arrays[1]))} != {len(np.unique(y))}"
+
+
+def test_conditional_resample_with_sparse_matrix():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    X_sparse = csr_matrix(X)  # Convert X to a sparse matrix
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling with a sparse matrix
+    resampled_arrays = conditional_resample(
+        conditional_array, X_sparse, y, nn_estimator=NearestNeighbors()
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert resampled_arrays[0].shape[0] == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_with_stratify():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Define a custom stratify function
+    def custom_stratify(y, category):
+        # Create an array where each entry is True if it belongs to the specified category,
+        # False otherwise
+        stratify_array = y == category
+        return stratify_array
+
+    category_to_stratify = 1  # Change this to the category you want to stratify
+
+    # Get the distribution of the specified category before resampling
+    category_distribution_before = np.sum(y == category_to_stratify)
+
+    # Test conditional resampling with the custom stratify function
+    stratify = custom_stratify(y, category_to_stratify)
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), stratify=stratify, random_state=0
+    )
+
+    # Get the distribution of the specified category after resampling
+    category_distribution_after = np.sum(resampled_arrays[1] == category_to_stratify)
+
+    # Check if the distribution of the specified category is preserved
+    assert category_distribution_before == category_distribution_after, (
+        f"Expected {category_distribution_before} samples, got "
+        f"{category_distribution_after} samples"
+    )
+
+
+def test_conditional_resample_with_replace_nbrs():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling with replace_nbrs=False
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), replace_nbrs=False
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2, f"Expected 2 arrays, got {len(resampled_arrays)} arrays"
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_errors():
+    # 01: Test with invalid number of samples
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Test conditional resampling with too many requested samples (should raise an error)
+    with pytest.raises(ValueError, match="Cannot sample"):
+        conditional_resample(X, y, nn_estimator=NearestNeighbors(), replace=False, n_samples=1000)
+
+    # 02: Test inconsistent_length
+    # Create an additional array with a different number of samples
+    additional_array = np.random.rand(80, 5)
+
+    # Test conditional resampling with inconsistent length of input arrays (should raise an error)
+    with pytest.raises(ValueError):
+        conditional_resample(X, y, additional_array, nn_estimator=NearestNeighbors())
+
+    # 03: Test with invalid sample size when replace=False
+    # Test conditional resampling with n_samples larger than the input arrays
+    # (should raise an error)
+    with pytest.raises(ValueError):
+        conditional_resample(X, y, nn_estimator=NearestNeighbors(), n_samples=200, replace=False)
+
+
+def test_conditional_resample():
+    # Generate synthetic data
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Convert X to sparse matrix
+    X_sparse = csr_matrix(X)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Perform conditional resampling
+    resampled_X = conditional_resample(conditional_array, X, replace=False, replace_nbrs=False)
+    resampled_X_sparse = conditional_resample(
+        conditional_array, X_sparse, replace=False, replace_nbrs=False
+    )
+
+    # Check that the resampled arrays have the correct shape
+    assert resampled_X.shape == X.shape
+    assert resampled_X_sparse.shape == X_sparse.shape
+
+    # Check that the resampled arrays have the correct number of unique samples
+    assert len(np.unique(resampled_X, axis=0)) == X.shape[0]
+    assert len(np.unique(resampled_X_sparse.toarray(), axis=0)) == X_sparse.shape[0]
+
+    # Check that each feature's marginal distribution of values is preserved
+    for i in range(X.shape[1]):
+        unique_values, counts = np.unique(resampled_X[:, i], return_counts=True)
+        original_values, original_counts = np.unique(X[:, i], return_counts=True)
+
+        assert np.all(unique_values == original_values)
+        assert np.all(counts == original_counts)
+
+        unique_values_sparse, counts_sparse = np.unique(
+            resampled_X_sparse[:, i].toarray(), return_counts=True
+        )
+        original_values_sparse, original_counts_sparse = np.unique(
+            X_sparse[:, i].toarray(), return_counts=True
+        )
+
+        assert np.all(unique_values_sparse == original_values_sparse)
+        assert np.all(counts_sparse == original_counts_sparse)
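
A toy sketch of the restricted shuffle that underlies the resampler added in this
patch (``_conditional_shuffle`` is private, so this import may break without
notice; the 2-NN table below is made up):

    import numpy as np

    from sktree.experimental.monte_carlo import _conditional_shuffle

    # hypothetical 2-NN table: row i lists the two nearest neighbors of sample i
    nbrs = np.array([[1, 2], [0, 3], [3, 0], [2, 1]])
    perm = _conditional_shuffle(nbrs, replace=False, seed=0)

    # every permuted index comes from the corresponding row of ``nbrs``; without
    # replacement, an already-used neighbor is skipped while unused ones remain
    assert all(perm[i] in nbrs[i] for i in range(len(nbrs)))
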
From f1bc2554ad09e7f10ab1ae618f6c096f08e18fb3 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:51:23 -0500
Subject: [PATCH 02/14] Initial

Signed-off-by: Adam Li
---
 sktree/experimental/meson.build       | 1 +
 sktree/experimental/tests/meson.build | 1 +
 2 files changed, 2 insertions(+)

diff --git a/sktree/experimental/meson.build b/sktree/experimental/meson.build
index 2be7ae9e8..2d1dde609 100644
--- a/sktree/experimental/meson.build
+++ b/sktree/experimental/meson.build
@@ -3,6 +3,7 @@ python_sources = [
     'mutual_info.py',
     'simulate.py',
     'sdf.py',
+    'monte_carlo.py',
 ]
 
 py3.install_sources(
diff --git a/sktree/experimental/tests/meson.build b/sktree/experimental/tests/meson.build
index b5d1ef79c..c3fdd07c4 100644
--- a/sktree/experimental/tests/meson.build
+++ b/sktree/experimental/tests/meson.build
@@ -3,6 +3,7 @@ python_sources = [
     'test_mutual_info.py',
     'test_simulate.py',
     'test_sdf.py',
+    'test_monte_carlo.py',
 ]
 
 py3.install_sources(

From cd9ea37a5e9ab87fa790a26b123735bcba6dc484 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:52:38 -0500
Subject: [PATCH 03/14] Initial

Signed-off-by: Adam Li
---
 doc/api.rst                     | 14 ++++++++++++++
 doc/conf.py                     |  1 +
 doc/whats_new/v0.4.rst          |  1 +
 sktree/experimental/__init__.py |  1 +
 4 files changed, 17 insertions(+)

diff --git a/doc/api.rst b/doc/api.rst
index 59238d77a..12182469e 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -150,6 +150,19 @@ tree models.
    PermutationForestClassifier
    PermutationForestRegressor
 
+Datasets
+------------------------------
+We provide some convenience functions for simulating datasets beyond
+those offered in scikit-learn.
+
+.. currentmodule:: sktree.datasets
+.. autosummary::
+   :toctree: generated/
+
+   make_gaussian_mixture
+   make_joint_factor_model
+   make_quadratic_classification
+
 Experimental Functionality
 --------------------------
 
@@ -160,6 +173,7 @@ We also include experimental functionality that is a work in progress.
    :toctree: generated/
 
    mutual_info_ksg
+   conditional_resample
 
 We also include functions that help simulate and evaluate mutual information (MI)
 and conditional mutual information (CMI) estimators. Specifically, functions that
diff --git a/doc/conf.py b/doc/conf.py
index ac752d66a..6e2a78d23 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -244,6 +244,7 @@
     "fit",
     "apply",
     "TreeBuilder",
+    "joint_rank",
 }
 
 # validation
diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst
index 239a1d824..e1beeba8f 100644
--- a/doc/whats_new/v0.4.rst
+++ b/doc/whats_new/v0.4.rst
@@ -15,6 +15,7 @@ Changelog
 
 - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)
 - |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`)
+- |Enhancement| Add :func:`sktree.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`152`)
 
 Code and Documentation Contributors
-----------------------------------
diff --git a/sktree/experimental/__init__.py b/sktree/experimental/__init__.py
index cdf4b4295..940195147 100644
--- a/sktree/experimental/__init__.py
+++ b/sktree/experimental/__init__.py
@@ -10,3 +10,4 @@
     mutual_info_ksg,
 )
 from .sdf import StreamDecisionForest
+from .monte_carlo import conditional_resample
\ No newline at end of file
From 876d1943a44bd176874c85e9490f4a541bc2056b Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:55:12 -0500
Subject: [PATCH 04/14] Update PR number

Signed-off-by: Adam Li
---
 doc/whats_new/v0.4.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst
index e1beeba8f..6a83962d7 100644
--- a/doc/whats_new/v0.4.rst
+++ b/doc/whats_new/v0.4.rst
@@ -15,7 +15,7 @@ Changelog
 
 - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)
 - |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`)
-- |Enhancement| Add :func:`sktree.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`152`)
+- |Enhancement| Add :func:`sktree.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`170`)
 
 Code and Documentation Contributors
 -----------------------------------

From 230e1c6ea22a0d22cce8ebd7ebf55eea0336d4bc Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 12:55:31 -0500
Subject: [PATCH 05/14] Lint

Signed-off-by: Adam Li
---
 sktree/experimental/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sktree/experimental/__init__.py b/sktree/experimental/__init__.py
index 940195147..2934888fc 100644
--- a/sktree/experimental/__init__.py
+++ b/sktree/experimental/__init__.py
@@ -1,4 +1,5 @@
 from . import mutual_info, sdf, simulate
+from .monte_carlo import conditional_resample
 from .mutual_info import (
     cmi_from_entropy,
     cmi_gaussian,
@@ -10,4 +11,3 @@
     mutual_info_ksg,
 )
 from .sdf import StreamDecisionForest
-from .monte_carlo import conditional_resample

From 25d724057305b80fdf5e1a8eabbf4044665e7380 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Mon, 13 Nov 2023 14:09:07 -0500
Subject: [PATCH 06/14] Up tree count

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index aa918191c..3476c416c 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -201,7 +201,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
             "estimator": RandomForestClassifier(
                 max_features="sqrt",
                 # random_state=seed,
-                n_estimators=100,
+                n_estimators=200,
                 n_jobs=-1,
             ),
             "permute_forest_fraction": 0.5,
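
The tree-count bumps in this and the later test_coleman patches are variance
control: the hypothesis-test statistic averages over trees, so its Monte Carlo
spread shrinks roughly like 1/sqrt(n_estimators). A self-contained toy
illustration of that effect (nothing here touches sktree):

    import numpy as np

    rng = np.random.default_rng(0)
    for n_estimators in (100, 200):
        # 1000 replicates of a mean over n_estimators i.i.d. per-tree outputs
        stats = rng.normal(size=(1000, n_estimators)).mean(axis=1)
        print(n_estimators, stats.std())  # spread shrinks ~ 1 / sqrt(n_estimators)
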
From 1ec41904eff370c9c0205b90558e5576bdb7a29d Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 10:03:31 -0500
Subject: [PATCH 07/14] Update unit-test

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py            | 31 ++++++++++++++-------------
 sktree/stats/tests/test_forestht.py | 29 ++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 86a5bb32f..caac5b8ba 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -402,20 +402,23 @@ def statistic(
         # and then setting the internal meta data structures
         # first run a dummy fit on the samples to initialize the
         # internal data structure of the forest
-        if not _is_fitted(estimator) and is_classifier(estimator):
-            _unique_y = []
-            for axis in range(y.shape[1]):
-                _unique_y.append(np.unique(y[:, axis]))
-            unique_y = np.hstack(_unique_y)
-            if unique_y.ndim > 1 and unique_y.shape[1] == 1:
-                unique_y = unique_y.ravel()
-            X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
-            estimator.fit(X_dummy, unique_y)
-        elif not _is_fitted(estimator):
-            if y.ndim > 1 and y.shape[1] == 1:
-                estimator.fit(X[:2], y[:2].ravel())
-            else:
-                estimator.fit(X[:2], y[:2])
+        if not hasattr(self, "estimator_") or self.estimator_ is None:
+            self.estimator_ = self._get_estimator()
+
+            if not _is_fitted(estimator) and is_classifier(estimator):
+                _unique_y = []
+                for axis in range(y.shape[1]):
+                    _unique_y.append(np.unique(y[:, axis]))
+                unique_y = np.hstack(_unique_y)
+                if unique_y.ndim > 1 and unique_y.shape[1] == 1:
+                    unique_y = unique_y.ravel()
+                X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
+                estimator.fit(X_dummy, unique_y)
+            elif not _is_fitted(estimator):
+                if y.ndim > 1 and y.shape[1] == 1:
+                    estimator.fit(X[:2], y[:2].ravel())
+                else:
+                    estimator.fit(X[:2], y[:2])
 
         # sampling a separate train/test per tree
         if self.sample_dataset_per_tree:
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index a242193b1..338e096e6 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -58,14 +58,19 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
     ), f"{len(est.train_test_samples_[0][1])} {n_samples * est.test_size}"
     assert len(est.train_test_samples_[0][0]) == est._n_samples_ - n_samples * est.test_size
 
+    # covariate index should work with mse
+    est.reset()
+    est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse")
     with pytest.raises(RuntimeError, match="Metric must be"):
-        est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")
+        est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi")
 
     # covariate index must be an iterable
+    est.reset()
     with pytest.raises(RuntimeError, match="covariate_index must be an iterable"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi")
 
     # covariate index must be an iterable of ints
+    est.reset()
     with pytest.raises(RuntimeError, match="Not all covariate_index"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi")
 
@@ -588,3 +593,25 @@ def test_no_traintest_split():
     assert ~np.isnan(pvalue)
     assert ~np.isnan(stat)
     assert pvalue > 0.05, f"{pvalue}"
+
+
+@pytest.mark.parametrize("covariate_index", [None, [0, 1]])
+def test_featureimportance_forest_statistic_with_covariate_index(covariate_index):
+    """Tests that calling `est.statistic` with covariate_index defined works.
+    There should be no issue calling `est.statistic` with covariate_index defined.
+    """
+    n_estimators = 10
+    n_samples = 10
+
+    est = FeatureImportanceForestClassifier(
+        estimator=RandomForestClassifier(
+            n_estimators=n_estimators,
+            random_state=seed,
+        ),
+        permute_forest_fraction=-1.0 / n_estimators * 5,
+        test_size=0.7,
+        random_state=seed,
+    )
+    est.statistic(
+        iris_X[:n_samples], iris_y[:n_samples], covariate_index=covariate_index, metric="mi"
+    )
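
The "dummy fit" retained in the hunk above exists to force scikit-learn to
allocate the forest's internal state (``classes_``, ``estimators_``, and
friends) before the permutation bookkeeping uses it. A standalone sketch of the
trick (the estimator choice and shapes are illustrative, not sktree's
internals):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    y = np.array([0, 0, 1, 2])                  # observed labels
    unique_y = np.unique(y)                     # one row per class suffices
    X_dummy = np.zeros((unique_y.shape[0], 5))  # 5 stands in for n_features

    est = RandomForestClassifier(n_estimators=10)
    est.fit(X_dummy, unique_y)                  # cheap fit: initializes classes_ etc.
    assert np.array_equal(est.classes_, unique_y)
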
From e59d4aee263b62e540c9c410665a3eacee4c6570 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 10:04:25 -0500
Subject: [PATCH 08/14] Fix

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_forestht.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index 338e096e6..a8c4bed3f 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -596,6 +596,8 @@ def test_no_traintest_split():
 
 
 @pytest.mark.parametrize("covariate_index", [None, [0, 1]])
+
+
 def test_featureimportance_forest_statistic_with_covariate_index(covariate_index):
     """Tests that calling `est.statistic` with covariate_index defined works.
     There should be no issue calling `est.statistic` with covariate_index defined.
@@ -608,7 +610,7 @@ def test_featureimportance_forest_statistic_with_covariate_index(covariate_index
             n_estimators=n_estimators,
             random_state=seed,
         ),
-        permute_forest_fraction=-1.0 / n_estimators * 5,
+        permute_forest_fraction=1.0 / n_estimators * 5,
         test_size=0.7,
         random_state=seed,
     )
From ad49c88d79e8edcdd5c8edb6f2669f063abc1018 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 13:03:39 -0500
Subject: [PATCH 09/14] Fix unit-tests

Signed-off-by: Adam Li
---
 pyproject.toml           |  2 +-
 sktree/stats/forestht.py | 60 +++++++++++++---------------------------
 2 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e03c45e1a..a5c2b6fdd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -218,7 +218,7 @@ profile = 'black'
 multi_line_output = 3
 line_length = 100
 py_version = 38
-extend_skip_glob = ['sktree/__init__.py', 'sktree/_lib/*', '.asv/*', 'env/*']
+extend_skip_glob = ['sktree/__init__.py', 'sktree/_lib/*', '.asv/*', 'env/*', 'build-install/*']
 
 [tool.pydocstyle]
 convention = 'numpy'
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 6b2e370b4..aa613ae16 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -382,50 +382,28 @@ def statistic(
             self.permuted_estimator_ = self._get_estimator()
             estimator = self.permuted_estimator_
 
-        if not hasattr(self, "estimator_") or self.estimator_ is None:
-            self.estimator_ = self._get_estimator()
-
-            # Ensure that the estimator_ is fitted at least
-            if not _is_fitted(self.estimator_) and is_classifier(self.estimator_):
-                _unique_y = []
-                for axis in range(y.shape[1]):
-                    _unique_y.append(np.unique(y[:, axis]))
-                unique_y = np.hstack(_unique_y)
-                if unique_y.ndim > 1 and unique_y.shape[1] == 1:
-                    unique_y = unique_y.ravel()
-                X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
-                self.estimator_.fit(X_dummy, unique_y)
-            elif not _is_fitted(estimator):
-                if y.ndim > 1 and y.shape[1] == 1:
-                    self.estimator_.fit(X[:2], y[:2].ravel())
-                else:
-                    self.estimator_.fit(X[:2], y[:2])
-
         # Store a cache of the y variable
-        if is_classifier(self._get_estimator()):
+        if is_classifier(estimator):
             self._y = y.copy()
 
-        # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking
-        # and then setting the internal meta data structures
-        # first run a dummy fit on the samples to initialize the
-        # internal data structure of the forest
-        if not hasattr(self, "estimator_") or self.estimator_ is None:
-            self.estimator_ = self._get_estimator()
-
-            if not _is_fitted(estimator) and is_classifier(estimator):
-                _unique_y = []
-                for axis in range(y.shape[1]):
-                    _unique_y.append(np.unique(y[:, axis]))
-                unique_y = np.hstack(_unique_y)
-                if unique_y.ndim > 1 and unique_y.shape[1] == 1:
-                    unique_y = unique_y.ravel()
-                X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
-                estimator.fit(X_dummy, unique_y)
-            elif not _is_fitted(estimator):
-                if y.ndim > 1 and y.shape[1] == 1:
-                    estimator.fit(X[:2], y[:2].ravel())
-                else:
-                    estimator.fit(X[:2], y[:2])
+        # # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking
+        # # and then setting the internal meta data structures
+        # # first run a dummy fit on the samples to initialize the
+        # # internal data structure of the forest
+        if not _is_fitted(estimator) and is_classifier(estimator):
+            _unique_y = []
+            for axis in range(y.shape[1]):
+                _unique_y.append(np.unique(y[:, axis]))
+            unique_y = np.hstack(_unique_y)
+            if unique_y.ndim > 1 and unique_y.shape[1] == 1:
+                unique_y = unique_y.ravel()
+            X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
+            estimator.fit(X_dummy, unique_y)
+        elif not _is_fitted(estimator):
+            if y.ndim > 1 and y.shape[1] == 1:
+                estimator.fit(X[:2], y[:2].ravel())
+            else:
+                estimator.fit(X[:2], y[:2])
 
         # sampling a separate train/test per tree
         if self.sample_dataset_per_tree:

From da2863d19463a28f81c6499a3c3794fcbb5a0c93 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 13:35:09 -0500
Subject: [PATCH 10/14] Fix unit-tests

Signed-off-by: Adam Li
---
 sktree/stats/forestht.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index aa613ae16..abf2a4bad 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -382,6 +382,29 @@ def statistic(
             self.permuted_estimator_ = self._get_estimator()
             estimator = self.permuted_estimator_
 
+            if not hasattr(self, "estimator_"):
+                self.estimator_ = self._get_estimator()
+
+                # XXX: this can be improved as an extra fit can be avoided, by
+                # just doing error-checking
+                # and then setting the internal meta data structures
+                # first run a dummy fit on the samples to initialize the
+                # internal data structure of the forest
+                if is_classifier(self.estimator_):
+                    _unique_y = []
+                    for axis in range(y.shape[1]):
+                        _unique_y.append(np.unique(y[:, axis]))
+                    unique_y = np.hstack(_unique_y)
+                    if unique_y.ndim > 1 and unique_y.shape[1] == 1:
+                        unique_y = unique_y.ravel()
+                    X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
+                    self.estimator_.fit(X_dummy, unique_y)
+                else:
+                    if y.ndim > 1 and y.shape[1] == 1:
+                        self.estimator_.fit(X[:2], y[:2].ravel())
+                    else:
+                        self.estimator_.fit(X[:2], y[:2])
+
         # Store a cache of the y variable
         if is_classifier(estimator):
             self._y = y.copy()

From 473c540f456764c92fc651df540bcac9339a8fae Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 15:08:49 -0500
Subject: [PATCH 11/14] Up tree count

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 843db417d..936a12068 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -79,7 +79,7 @@
         {
             "estimator": RandomForestRegressor(
                 max_features="sqrt",
-                n_estimators=125,
+                n_estimators=200,
                 n_jobs=-1,
             ),
             # "random_state": seed,

From 3078be1af37b8ff623a2f44c9c7aa2fb3fe9fc60 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 16:45:19 -0500
Subject: [PATCH 12/14] Fix coleman unit-test simulations

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 936a12068..271a131a5 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -82,7 +82,7 @@
                 n_estimators=200,
                 n_jobs=-1,
             ),
-            # "random_state": seed,
+            "random_state": seed,
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
         },
@@ -167,6 +167,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 n_jobs=-1,
             ),
             "sample_dataset_per_tree": False,
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats
@@ -200,6 +201,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
             ),
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats
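
Patches 12 and 13 go back and forth between a shared ``seed`` and per-estimator
``rng.integers(0, 1000)`` draws. One conventional alternative (not what this
series does) is to spawn independent child seeds from a single
``numpy.random.SeedSequence``, which avoids accidental stream overlap:

    import numpy as np

    root = np.random.SeedSequence(12345)
    child_forest, child_test = root.spawn(2)               # independent child sequences
    forest_seed = int(child_forest.generate_state(1)[0])   # plain ints, usable as
    test_seed = int(child_test.generate_state(1)[0])       # sklearn random_state values
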
From b2b64d339915c3aaf2f66e5efa99976e8c2ccb4a Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 20:25:12 -0500
Subject: [PATCH 13/14] Fix unit test

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 271a131a5..54e81ce0f 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -79,8 +79,9 @@
         {
             "estimator": RandomForestRegressor(
                 max_features="sqrt",
-                n_estimators=200,
+                n_estimators=250,
                 n_jobs=-1,
+                random_state=rng.integers(0, 1000),
             ),
             "random_state": seed,
             "permute_forest_fraction": 0.5,
@@ -198,10 +199,11 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 max_features="sqrt",
                 n_estimators=200,
                 n_jobs=-1,
+                random_state=rng.integers(0, 1000),
             ),
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
-            "random_state": seed,
+            "random_state": rng.integers(0, 1000),
         },
         600,  # n_samples
         1000,  # n_repeats

From 2d9b71806a1d5db1803846f7ab2f1785798f9952 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Tue, 14 Nov 2023 21:37:48 -0500
Subject: [PATCH 14/14] Try again

Signed-off-by: Adam Li
---
 sktree/stats/tests/test_coleman.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 54e81ce0f..23904f0d5 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -199,11 +199,11 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 max_features="sqrt",
                 n_estimators=200,
                 n_jobs=-1,
-                random_state=rng.integers(0, 1000),
+                random_state=seed,
             ),
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
-            "random_state": rng.integers(0, 1000),
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats