diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index eb441d66d..8119221ef 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -27,3 +27,4 @@ .. _Ronan Perry : https://rflperry.github.io/ .. _Haoyin Xu : https://github.com/PSSF23 .. _Yuxin Bai : https://github.com/YuxinB +.. _Ryan Hausen : https://ryanhausen.github.io diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst index 1a5b75573..2c23edf4f 100644 --- a/doc/whats_new/v0.10.rst +++ b/doc/whats_new/v0.10.rst @@ -13,6 +13,9 @@ Version 0.10 Changelog --------- +- |Feature| Calculations involving nans in ``treeple.stats.utils`` now use the + ``bottleneck`` library for faster computation. By `Ryan Hausen`_ (:pr:`#306`) + Code and Documentation Contributors ----------------------------------- @@ -21,3 +24,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version inception, including: * `Adam Li`_ +* `Ryan Hausen`_ diff --git a/pyproject.toml b/pyproject.toml index 3007df5b8..bc2149219 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,8 @@ all = [ 'treeple[build]', 'treeple[doc]', 'treeple[style]', - 'treeple[test]' + 'treeple[test]', + 'treeple[extra]' ] build = [ 'build', @@ -123,6 +124,9 @@ test = [ 'flaky', 'tqdm' ] +extra = [ + 'bottleneck' +] [tool.bandit] exclude_dirs = ["treeple/tests", "treeple/**/tests/*", 'treeple/_build_utils/*', 'treeple/_lib/*'] diff --git a/test_requirements.txt b/test_requirements.txt index 2693a46a8..305654872 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -4,4 +4,5 @@ pytest pytest-cov memory_profiler flaky -tqdm \ No newline at end of file +tqdm +bottleneck diff --git a/treeple/stats/tests/test_forestht.py b/treeple/stats/tests/test_forestht.py index 0eea2c257..6504a440f 100644 --- a/treeple/stats/tests/test_forestht.py +++ b/treeple/stats/tests/test_forestht.py @@ -1,9 +1,14 @@ +import importlib +import os + import numpy as np import pytest from flaky import flaky from numpy.testing import assert_almost_equal, assert_array_equal from sklearn import datasets +import treeple.stats as stats +import treeple.stats.utils as utils from treeple import HonestForestClassifier, RandomForestClassifier from treeple.stats import ( PermutationHonestForestClassifier, @@ -236,11 +241,21 @@ def test_comight_repeated_feature_sets(seed): assert result.pvalue > 0.05, f"{result.pvalue}" -def test_build_coleman_forest(): +@pytest.mark.parametrize("use_bottleneck", [True, False]) +def test_build_coleman_forest(use_bottleneck: bool): """Simple test for building a Coleman forest. Test the function under alternative and null hypothesis for a very simple dataset. """ + if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ: + del os.environ[utils.DISABLE_BN_ENV_VAR] + importlib.reload(utils) + importlib.reload(stats) + else: + os.environ[utils.DISABLE_BN_ENV_VAR] = "1" + importlib.reload(utils) + importlib.reload(stats) + n_estimators = 100 n_samples = 30 n_features = 5 @@ -273,10 +288,10 @@ def test_build_coleman_forest(): with pytest.raises( RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" ): - build_coleman_forest(clf, clf, X, y) + stats.build_coleman_forest(clf, clf, X, y) forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = ( - build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) + stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) ) assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) @@ -287,7 +302,7 @@ def test_build_coleman_forest(): assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) X = np.vstack([_X, _X]) - forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest( + forest_result, _, _, clf_fitted, perm_clf_fitted = stats.build_coleman_forest( clf, perm_clf, X, y, metric="s@98" ) assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" diff --git a/treeple/stats/tests/test_utils.py b/treeple/stats/tests/test_utils.py index 6e23fb4cd..95698c968 100644 --- a/treeple/stats/tests/test_utils.py +++ b/treeple/stats/tests/test_utils.py @@ -1,7 +1,11 @@ +import importlib +import os + import numpy as np import pytest from numpy.testing import assert_array_equal +import treeple.stats.utils as utils from treeple import HonestForestClassifier from treeple.stats.utils import get_per_tree_oob_samples @@ -32,3 +36,35 @@ def test_get_per_tree_oob_samples(bootstrap): else: with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"): get_per_tree_oob_samples(est) + + +@pytest.mark.parametrize("use_bottleneck", [True, False]) +def test_non_nan_samples(use_bottleneck: bool): + + if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ: + del os.environ[utils.DISABLE_BN_ENV_VAR] + importlib.reload(utils) + else: + os.environ[utils.DISABLE_BN_ENV_VAR] = "1" + importlib.reload(utils) + + posterior_array = np.array( + [ + # tree 1 + [ + [0, 1], + [np.nan, np.nan], + [np.nan, np.nan], + ], + # tree 2 + [ + [0, 1], + [np.nan, np.nan], + [1, 0], + ], + ] + ) # [2, 3, 2] + + expected = np.array([0, 2]) + actual = utils._non_nan_samples(posterior_array) + np.testing.assert_array_equal(expected, actual) diff --git a/treeple/stats/utils.py b/treeple/stats/utils.py index 98ee132c0..57f67787c 100644 --- a/treeple/stats/utils.py +++ b/treeple/stats/utils.py @@ -1,3 +1,6 @@ +import os +import sys +import warnings from typing import Optional, Tuple import numpy as np @@ -16,6 +19,24 @@ from treeple._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier +BOTTLENECK_AVAILABLE = False +if "bottleneck" in sys.modules: + import bottleneck as bn + + BOTTLENECK_AVAILABLE = True + +DISABLE_BN_ENV_VAR = "TREEPLE_NO_BOTTLENECK" + +if BOTTLENECK_AVAILABLE and DISABLE_BN_ENV_VAR not in os.environ: + nanmean_f = bn.nanmean + anynan_f = lambda arr: bn.anynan(arr, axis=2) +else: + warnings.warn( + "Not using bottleneck for calculations involvings nans. Expect slower performance." + ) + nanmean_f = np.nanmean + anynan_f = lambda arr: np.isnan(arr).any(axis=2) + def _mutual_information(y_true: ArrayLike, y_pred_proba: ArrayLike) -> float: """Compute estimate of mutual information for supervised classification setting. @@ -131,7 +152,7 @@ def _non_nan_samples(posterior_arr: ArrayLike) -> ArrayLike: along axis=1. """ # Find the row indices with NaN values along the specified axis - nan_indices = np.isnan(posterior_arr).any(axis=2).all(axis=0) + nan_indices = anynan_f(posterior_arr).all(axis=0) # Invert the boolean mask to get indices without NaN values nonnan_indices = np.where(~nan_indices)[0] @@ -320,8 +341,8 @@ def _parallel_build_null_forests( # first_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_first_half) # second_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_second_half) - y_pred_first_half = np.nanmean(first_forest_pred[:, first_forest_samples, :], axis=0) - y_pred_second_half = np.nanmean(second_forest_pred[:, second_forest_samples, :], axis=0) + y_pred_first_half = nanmean_f(first_forest_pred[:, first_forest_samples, :], axis=0) + y_pred_second_half = nanmean_f(second_forest_pred[:, second_forest_samples, :], axis=0) # compute two instances of the metric from the sampled trees first_half_metric = metric_func(