diff --git a/doc/api.rst b/doc/api.rst
index 59238d77a..12182469e 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -150,6 +150,19 @@ tree models.
    PermutationForestClassifier
    PermutationForestRegressor
 
+Datasets
+------------------------------
+We provide some convenience functions for simulating datasets beyond
+those offered in scikit-learn.
+
+.. currentmodule:: sktree.datasets
+.. autosummary::
+   :toctree: generated/
+
+   make_gaussian_mixture
+   make_joint_factor_model
+   make_quadratic_classification
+
 Experimental Functionality
 --------------------------
 
@@ -160,6 +173,7 @@ We also include experimental functionality that is works in progress.
    :toctree: generated/
 
    mutual_info_ksg
+   conditional_resample
 
 We also include functions that help simulate and evaluate mutual information (MI)
 and conditional mutual information (CMI) estimators. Specifically, functions that
diff --git a/doc/conf.py b/doc/conf.py
index ac752d66a..6e2a78d23 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -244,6 +244,7 @@
     "fit",
     "apply",
     "TreeBuilder",
+    "joint_rank",
 }
 
 # validation
diff --git a/doc/whats_new/v0.4.rst b/doc/whats_new/v0.4.rst
index b7d489029..8e359a89f 100644
--- a/doc/whats_new/v0.4.rst
+++ b/doc/whats_new/v0.4.rst
@@ -16,6 +16,7 @@ Changelog
 - |API| ``FeatureImportanceForest*`` now has a hyperparameter to control the number of permutations is done per forest ``permute_per_forest_fraction``, by `Adam Li`_ (:pr:`145`)
 - |Enhancement| Add dataset generators for regression and classification and hypothesis testing, by `Adam Li`_ (:pr:`169`)
 - |Fix| Fixes a bug where ``FeatureImportanceForest*`` was unable to be run when calling ``statistic`` with ``covariate_index`` defined for MI, AUC metrics, by `Adam Li`_ (:pr:`164`)
+- |Enhancement| Add :func:`sktree.experimental.conditional_resample`, which allows conditional resampling of rows based on nearest-neighbors defined via a feature set, by `Adam Li`_ (:pr:`170`)
 
 Code and Documentation Contributors
 -----------------------------------
diff --git a/pyproject.toml b/pyproject.toml
index e03c45e1a..a5c2b6fdd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -218,7 +218,7 @@ profile = 'black'
 multi_line_output = 3
 line_length = 100
 py_version = 38
-extend_skip_glob = ['sktree/__init__.py', 'sktree/_lib/*', '.asv/*', 'env/*']
+extend_skip_glob = ['sktree/__init__.py', 'sktree/_lib/*', '.asv/*', 'env/*', 'build-install/*']
 
 [tool.pydocstyle]
 convention = 'numpy'
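As context for the ``conditional_resample`` entries added to the API docs and changelog above, here is a minimal usage sketch. The toy data, the choice of the first two columns as the conditioning set, and the 5-neighbor setting are illustrative assumptions rather than part of the changeset::

    from sklearn.datasets import make_classification
    from sklearn.neighbors import NearestNeighbors

    from sktree.experimental import conditional_resample

    # toy data: X/y are the arrays to resample, Z is the conditioning feature set
    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    Z = X[:, :2]

    # rows are resampled among their nearest neighbors in Z, so the conditional
    # structure of (X, y) given Z is approximately preserved in the bootstrap
    X_boot, y_boot = conditional_resample(
        Z, X, y, nn_estimator=NearestNeighbors(n_neighbors=5), random_state=0
    )
    print(X_boot.shape, y_boot.shape)  # (200, 5) (200,)
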
diff --git a/sktree/experimental/__init__.py b/sktree/experimental/__init__.py
index cdf4b4295..2934888fc 100644
--- a/sktree/experimental/__init__.py
+++ b/sktree/experimental/__init__.py
@@ -1,4 +1,5 @@
 from . import mutual_info, sdf, simulate
+from .monte_carlo import conditional_resample
 from .mutual_info import (
     cmi_from_entropy,
     cmi_gaussian,
diff --git a/sktree/experimental/meson.build b/sktree/experimental/meson.build
index 2be7ae9e8..2d1dde609 100644
--- a/sktree/experimental/meson.build
+++ b/sktree/experimental/meson.build
@@ -3,6 +3,7 @@ python_sources = [
     'mutual_info.py',
     'simulate.py',
     'sdf.py',
+    'monte_carlo.py',
 ]
 
 py3.install_sources(
diff --git a/sktree/experimental/monte_carlo.py b/sktree/experimental/monte_carlo.py
new file mode 100644
index 000000000..aab6a25cd
--- /dev/null
+++ b/sktree/experimental/monte_carlo.py
@@ -0,0 +1,218 @@
+from typing import Optional
+
+import numpy as np
+from numpy.typing import ArrayLike
+from scipy.sparse import issparse
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import _approximate_mode, _safe_indexing, check_array, check_consistent_length
+
+
+def _conditional_shuffle(nbrs: ArrayLike, replace: bool = False, seed=None) -> ArrayLike:
+    """Compute a permutation of neighbors with restrictions.
+
+    Parameters
+    ----------
+    nbrs : ArrayLike of shape (n_samples, k)
+        The k-nearest-neighbors for each sample index. Each row corresponds to the
+        original sample. Each element corresponds to another sample index that is deemed
+        as the k-nearest neighbors with respect to the original sample.
+    replace : bool, optional
+        Whether or not to allow replacement of samples, by default False.
+    seed : int, optional
+        Random seed, by default None.
+
+    Returns
+    -------
+    restricted_perm : ArrayLike of shape (n_samples)
+        The final permutation order of the sample indices. There may be
+        repeating samples. See Notes for details.
+
+    Notes
+    -----
+    Restricted permutation goes through random samples and looks at the k-nearest
+    neighbors (columns of ``nbrs``) and shuffles the closest neighbor index only
+    if it has not been used to permute another sample. If it has been, then the
+    algorithm looks at the next nearest-neighbor and so on. If all k-nearest
+    neighbors of a sample have been checked, then a random neighbor is chosen. In this
+    manner, the algorithm tries to perform permutation without replacement, but
+    if necessary, will choose a repeating neighbor sample.
+    """
+    n_samples, k_dims = nbrs.shape
+    rng = np.random.default_rng(seed=seed)
+
+    # initialize the final permutation order
+    restricted_perm = np.zeros((n_samples,), dtype=np.intp)
+
+    # generate a random order of samples to go through
+    random_order = rng.permutation(n_samples)
+
+    # keep track of values we have already used
+    used = set()
+
+    # go through the random order
+    for idx in random_order:
+        if replace:
+            possible_nbrs = nbrs[idx, :]
+            restricted_perm[idx] = rng.choice(possible_nbrs, size=1).squeeze()
+        else:
+            m = 0
+            use_idx = nbrs[idx, m]
+
+            # if the current nbr is already used, continue incrementing
+            # until we have either found a new sample to use, or if
+            # we have reached the maximum number of shuffles to consider
+            while (use_idx in used) and (m < k_dims - 1):
+                m += 1
+                use_idx = nbrs[idx, m]
+
+            # check whether or not we have exhaustively checked all kNN
+            if use_idx in used and m == k_dims - 1:
+                # XXX: Note this step is not in the original paper
+                # choose a random neighbor to permute
+                restricted_perm[idx] = rng.choice(nbrs[idx, :], size=1).squeeze()
+            else:
+                # permute with the existing neighbor
+                restricted_perm[idx] = use_idx
+                used.add(use_idx)
+    return restricted_perm
+
+
+def conditional_resample(
+    conditional_array: ArrayLike,
+    *arrays,
+    nn_estimator=None,
+    replace: bool = True,
+    replace_nbrs: bool = True,
+    n_samples: Optional[int] = None,
+    random_state: Optional[int] = None,
+    stratify: Optional[ArrayLike] = None,
+):
+    """Conditionally resample arrays or sparse matrices in a consistent way.
+
+    The default strategy implements one step of the bootstrapping
+    procedure. Conditional resampling is a modification of the bootstrap
+    technique that preserves the conditional distribution of the data. This
+    is done by fitting a nearest neighbors estimator on the conditional array
+    and then resampling the nearest neighbors of each sample.
+
+    Parameters
+    ----------
+    conditional_array : array-like of shape (n_samples, n_features)
+        The array whose conditional distribution is preserved.
+
+    *arrays : sequence of array-like of shape (n_samples,) or \
+            (n_samples, n_outputs)
+        Indexable data-structures can be arrays, lists, dataframes or scipy
+        sparse matrices with consistent first dimension.
+
+    nn_estimator : estimator object, default=None
+        The nearest neighbors estimator to use. If None, then a
+        :class:`sklearn.neighbors.NearestNeighbors` instance is used.
+
+    replace : bool, default=True
+        Implements resampling with replacement. If False, this will implement
+        (sliced) random permutations. The replacement will take place at the level
+        of the sample index.
+
+    replace_nbrs : bool, default=True
+        Implements resampling with replacement at the level of the nearest neighbors.
+
+    n_samples : int, default=None
+        Number of samples to generate. If left to None this is
+        automatically set to the first dimension of the arrays.
+        If replace is False it should not be larger than the length of
+        arrays.
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for shuffling
+        the data.
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \
+            default=None
+        If not None, data is split in a stratified fashion, using this as
+        the class labels.
+
+    Returns
+    -------
+    resampled_arrays : sequence of array-like of shape (n_samples,) or \
+            (n_samples, n_outputs)
+        Sequence of resampled copies of the collections.
+        The original arrays are not impacted.
+    """
+    max_n_samples = n_samples
+    rng = np.random.default_rng(random_state)
+
+    if len(arrays) == 0:
+        return None
+
+    first = arrays[0]
+    n_samples = first.shape[0] if hasattr(first, "shape") else len(first)
+
+    if max_n_samples is None:
+        max_n_samples = n_samples
+    elif (max_n_samples > n_samples) and (not replace):
+        raise ValueError(
+            f"Cannot sample {max_n_samples} out of arrays with dim "
+            f"{n_samples} when replace is False"
+        )
+
+    check_consistent_length(conditional_array, *arrays)
+
+    # fit nearest neighbors onto the conditional array
+    if nn_estimator is None:
+        nn_estimator = NearestNeighbors()
+    nn_estimator.fit(conditional_array)
+
+    if stratify is None:
+        if replace:
+            indices = rng.integers(0, n_samples, size=(max_n_samples,))
+        else:
+            indices = np.arange(n_samples)
+            rng.shuffle(indices)
+            indices = indices[:max_n_samples]
+    else:
+        # Code adapted from StratifiedShuffleSplit()
+        y = check_array(stratify, ensure_2d=False, dtype=None)
+        if y.ndim == 2:
+            # for multi-label y, map each distinct row to a string repr
+            # using join because str(row) uses an ellipsis if len(row) > 1000
+            y = np.array([" ".join(row.astype("str")) for row in y])
+
+        classes, y_indices = np.unique(y, return_inverse=True)
+        n_classes = classes.shape[0]
+
+        class_counts = np.bincount(y_indices)
+
+        # Find the sorted list of instances for each class:
+        # (np.unique above performs a sort, so code is O(n logn) already)
+        class_indices = np.split(
+            np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
+        )
+
+        n_i = _approximate_mode(class_counts, max_n_samples, random_state)
+
+        indices = []
+
+        for i in range(n_classes):
+            indices_i = rng.choice(class_indices[i], n_i[i], replace=replace)
+            indices.extend(indices_i)
+
+        indices = rng.permutation(indices)
+
+    # now get the kNN indices for each sample (n_samples, n_neighbors)
+    sample_nbrs = nn_estimator.kneighbors(X=conditional_array[indices, :], return_distance=False)
+
+    # actually sample the indices using a conditional permutation
+    indices = _conditional_shuffle(sample_nbrs, replace=replace_nbrs, seed=rng)
+
+    # convert sparse matrices to CSR for row-based indexing
+    arrays_ = [a.tocsr() if issparse(a) else a for a in arrays]
+    resampled_arrays = [_safe_indexing(a, indices) for a in arrays_]
+
+    if len(resampled_arrays) == 1:
+        # syntactic sugar for the unit argument case
+        return resampled_arrays[0]
+    else:
+        return resampled_arrays
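To make the restricted-permutation behaviour documented in ``_conditional_shuffle`` concrete, here is a small sketch on made-up data. ``_conditional_shuffle`` is a private helper, so this is purely expository; the cluster layout and the 3-neighbor setting are invented for illustration::

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    from sktree.experimental.monte_carlo import _conditional_shuffle

    # toy conditioning data: two well-separated clusters of three samples each
    Z = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [5.2]])

    # each row lists a sample's 3 nearest neighbors (including itself); this is
    # the `nbrs` matrix that the restricted permutation operates on
    nbrs = NearestNeighbors(n_neighbors=3).fit(Z).kneighbors(Z, return_distance=False)

    # every permuted index stays within its sample's own cluster, so structure
    # encoded by the conditioning features is preserved
    perm = _conditional_shuffle(nbrs, replace=False, seed=0)
    print(perm)  # a permutation of 0-2 in the first three slots, 3-5 in the last three
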
diff --git a/sktree/experimental/tests/meson.build b/sktree/experimental/tests/meson.build
index b5d1ef79c..c3fdd07c4 100644
--- a/sktree/experimental/tests/meson.build
+++ b/sktree/experimental/tests/meson.build
@@ -3,6 +3,7 @@ python_sources = [
     'test_mutual_info.py',
     'test_simulate.py',
     'test_sdf.py',
+    'test_monte_carlo.py',
 ]
 
 py3.install_sources(
diff --git a/sktree/experimental/tests/test_monte_carlo.py b/sktree/experimental/tests/test_monte_carlo.py
new file mode 100644
index 000000000..b76d59fb4
--- /dev/null
+++ b/sktree/experimental/tests/test_monte_carlo.py
@@ -0,0 +1,196 @@
+import numpy as np
+import pytest
+from scipy.sparse import csr_matrix
+from sklearn.datasets import make_classification
+from sklearn.neighbors import NearestNeighbors
+
+from sktree.experimental import conditional_resample
+
+
+def test_conditional_resample_with_default_params():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling with default parameters
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors()
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_without_replacement():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling without replacement
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), replace=False
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+    # Check if the samples are unique (no replacement)
+    assert len(np.unique(resampled_arrays[1])) == len(
+        np.unique(y)
+    ), f"{len(np.unique(resampled_arrays[1]))} != {len(y)}"
+
+
+def test_conditional_resample_with_sparse_matrix():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+    X_sparse = csr_matrix(X)  # Convert X to a sparse matrix
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling with a sparse matrix
+    resampled_arrays = conditional_resample(
+        conditional_array, X_sparse, y, nn_estimator=NearestNeighbors()
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2
+    assert resampled_arrays[0].shape[0] == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_with_stratify():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Define a custom stratify function
+    def custom_stratify(y, category):
+        # Create an array where each entry is True if it belongs to the specified category,
+        # False otherwise
+        stratify_array = y == category
+        return stratify_array
+
+    category_to_stratify = 1  # Change this to the category you want to stratify
+
+    # Get the distribution of the specified category before resampling
+    category_distribution_before = np.sum(y == category_to_stratify)
+
+    # Test conditional resampling with the custom stratify function
+    stratify = custom_stratify(y, category_to_stratify)
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), stratify=stratify, random_state=0
+    )
+
+    # Get the distribution of the specified category after resampling
+    category_distribution_after = np.sum(resampled_arrays[1] == category_to_stratify)
+
+    # Check if the distribution of the specified category is preserved
+    assert category_distribution_before == category_distribution_after, (
+        f"Expected {category_distribution_before} samples, got "
+        f"{category_distribution_after} samples"
+    )
+
+
+def test_conditional_resample_with_replace_nbrs():
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Test conditional resampling with replace_nbrs=False
+    resampled_arrays = conditional_resample(
+        conditional_array, X, y, nn_estimator=NearestNeighbors(), replace_nbrs=False
+    )
+
+    # Check if the number of samples in resampled_arrays is the same as the input arrays
+    assert len(resampled_arrays) == 2, f"Expected 2 arrays, got {len(resampled_arrays)} arrays"
+    assert len(resampled_arrays[0]) == len(X)
+    assert len(resampled_arrays[1]) == len(y)
+
+
+def test_conditional_resample_errors():
+    # 01: Test with invalid number of samples
+    # Create a simple example dataset for testing
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Test conditional resampling with an invalid stratify array (should raise an error)
+    with pytest.raises(ValueError, match="Cannot sample"):
+        conditional_resample(X, y, nn_estimator=NearestNeighbors(), replace=False, n_samples=1000)
+
+    # 02: Test inconsistent_length
+    # Create an additional array with a different number of samples
+    additional_array = np.random.rand(80, 5)
+
+    # Test conditional resampling with inconsistent length of input arrays (should raise an error)
+    with pytest.raises(ValueError):
+        conditional_resample(X, y, additional_array, nn_estimator=NearestNeighbors())
+
+    # 03: Test with invalid sample size when replace=False
+    # Test conditional resampling with n_samples larger than the input arrays
+    # (should raise an error)
+    with pytest.raises(ValueError):
+        conditional_resample(X, y, nn_estimator=NearestNeighbors(), n_samples=200, replace=False)
+
+
+def test_conditional_resample():
+    # Generate synthetic data
+    X, y = make_classification(n_samples=100, n_features=5, random_state=42)
+
+    # Convert X to sparse matrix
+    X_sparse = csr_matrix(X)
+
+    # Create conditional array
+    nn = NearestNeighbors(n_neighbors=5)
+    nn.fit(X)
+    conditional_array = nn.kneighbors_graph(X).toarray()
+
+    # Perform conditional resampling
+    resampled_X = conditional_resample(conditional_array, X, replace=False, replace_nbrs=False)
+    resampled_X_sparse = conditional_resample(
+        conditional_array, X_sparse, replace=False, replace_nbrs=False
+    )
+
+    # Check that the resampled arrays have the correct shape
+    assert resampled_X.shape == X.shape
+    assert resampled_X_sparse.shape == X_sparse.shape
+
+    # Check that the resampled arrays have the correct number of unique samples
+    assert len(np.unique(resampled_X, axis=0)) == X.shape[0]
+    assert len(np.unique(resampled_X_sparse.toarray(), axis=0)) == X_sparse.shape[0]
+
+    # Check that the conditional distribution is preserved
+    for i in range(X.shape[1]):
+        unique_values, counts = np.unique(resampled_X[:, i], return_counts=True)
+        original_values, original_counts = np.unique(X[:, i], return_counts=True)
+
+        assert np.all(unique_values == original_values)
+        assert np.all(counts == original_counts)
+
+        unique_values_sparse, counts_sparse = np.unique(
+            resampled_X_sparse[:, i].toarray(), return_counts=True
+        )
+        original_values_sparse, original_counts_sparse = np.unique(
+            X_sparse[:, i].toarray(), return_counts=True
+        )
+
+        assert np.all(unique_values_sparse == original_values_sparse)
+        assert np.all(counts_sparse == original_counts_sparse)
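The final test above leans on the fact that with ``replace=False`` and ``replace_nbrs=False`` the output is a row permutation of the input, so per-column value counts are unchanged. A standalone sketch of that invariant, using toy data independent of sktree::

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.normal(size=(10, 3))

    # a pure row permutation leaves every column's multiset of values unchanged,
    # which is what the per-column np.unique comparisons in the test verify
    X_perm = X[rng.permutation(10)]

    for j in range(X.shape[1]):
        vals_p, counts_p = np.unique(X_perm[:, j], return_counts=True)
        vals_o, counts_o = np.unique(X[:, j], return_counts=True)
        assert np.array_equal(vals_p, vals_o) and np.array_equal(counts_p, counts_o)
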
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index b21ebd3aa..abf2a4bad 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -382,11 +382,15 @@ def statistic(
             self.permuted_estimator_ = self._get_estimator()
             estimator = self.permuted_estimator_
 
-            if not hasattr(self, "estimator_") or self.estimator_ is None:
+            if not hasattr(self, "estimator_"):
                 self.estimator_ = self._get_estimator()
 
-                # Ensure that the estimator_ is fitted at least
-                if not _is_fitted(self.estimator_) and is_classifier(self.estimator_):
+                # XXX: this can be improved as an extra fit can be avoided, by
+                # just doing error-checking
+                # and then setting the internal meta data structures
+                # first run a dummy fit on the samples to initialize the
+                # internal data structure of the forest
+                if is_classifier(self.estimator_):
                     _unique_y = []
                     for axis in range(y.shape[1]):
                         _unique_y.append(np.unique(y[:, axis]))
@@ -395,20 +399,20 @@
                     unique_y = unique_y.ravel()
                     X_dummy = np.zeros((unique_y.shape[0], X.shape[1]))
                     self.estimator_.fit(X_dummy, unique_y)
-                elif not _is_fitted(estimator):
+                else:
                     if y.ndim > 1 and y.shape[1] == 1:
                         self.estimator_.fit(X[:2], y[:2].ravel())
                     else:
                         self.estimator_.fit(X[:2], y[:2])
 
         # Store a cache of the y variable
-        if is_classifier(self._get_estimator()):
+        if is_classifier(estimator):
            self._y = y.copy()
 
-        # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking
-        # and then setting the internal meta data structures
-        # first run a dummy fit on the samples to initialize the
-        # internal data structure of the forest
+        # # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking
+        # # and then setting the internal meta data structures
+        # # first run a dummy fit on the samples to initialize the
+        # # internal data structure of the forest
         if not _is_fitted(estimator) and is_classifier(estimator):
             _unique_y = []
             for axis in range(y.shape[1]):
diff --git a/sktree/stats/tests/test_coleman.py b/sktree/stats/tests/test_coleman.py
index 843db417d..23904f0d5 100644
--- a/sktree/stats/tests/test_coleman.py
+++ b/sktree/stats/tests/test_coleman.py
@@ -79,10 +79,11 @@
         {
             "estimator": RandomForestRegressor(
                 max_features="sqrt",
-                n_estimators=125,
+                n_estimators=250,
                 n_jobs=-1,
+                random_state=rng.integers(0, 1000),
             ),
-            # "random_state": seed,
+            "random_state": seed,
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
         },
@@ -167,6 +168,7 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 n_jobs=-1,
             ),
             "sample_dataset_per_tree": False,
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats
@@ -197,9 +199,11 @@ def test_linear_model(hypotester, model_kwargs, n_samples, n_repeats, test_size)
                 max_features="sqrt",
                 n_estimators=200,
                 n_jobs=-1,
+                random_state=seed,
             ),
             "permute_forest_fraction": 0.5,
             "sample_dataset_per_tree": False,
+            "random_state": seed,
         },
         600,  # n_samples
         1000,  # n_repeats
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index da5285685..74e446cf3 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -58,8 +58,11 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
     ), f"{len(est.train_test_samples_[0][1])} {n_samples * est.test_size}"
     assert len(est.train_test_samples_[0][0]) == est._n_samples_ - n_samples * est.test_size
 
+    # covariate index should work with mse
+    est.reset()
+    est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mse")
     with pytest.raises(RuntimeError, match="Metric must be"):
-        est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")
+        est.statistic(iris_X[:n_samples], iris_y[:n_samples], covariate_index=[1], metric="mi")
 
     # covariate index should work with mse
     est.reset()
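For context on the test change above, this is roughly the calling pattern it exercises. The estimator choice and its default construction are assumptions made for illustration; only the ``statistic``/``reset`` calls with ``covariate_index`` and the expected ``RuntimeError`` mirror the diff::

    import pytest
    from sklearn.datasets import load_iris

    from sktree.stats import FeatureImportanceForestRegressor  # assumed estimator for this sketch

    iris_X, iris_y = load_iris(return_X_y=True)
    est = FeatureImportanceForestRegressor()  # default hyperparameters, illustrative only

    # restricting the permutation to covariate index 1 works with an "mse" metric
    est.statistic(iris_X, iris_y, covariate_index=[1], metric="mse")

    # after a reset, an "mi" metric is rejected for this estimator,
    # which is what the updated test asserts
    est.reset()
    with pytest.raises(RuntimeError, match="Metric must be"):
        est.statistic(iris_X, iris_y, covariate_index=[1], metric="mi")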