Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added bottleneck for nan calculations #306

Merged
merged 13 commits into from
Aug 2, 2024
Merged
1 change: 1 addition & 0 deletions doc/whats_new/_contributors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@
.. _Ronan Perry : https://rflperry.github.io/
.. _Haoyin Xu : https://github.com/PSSF23
.. _Yuxin Bai : https://github.com/YuxinB
.. _Ryan Hausen : https://ryanhausen.github.io
4 changes: 4 additions & 0 deletions doc/whats_new/v0.10.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ Version 0.10
Changelog
---------

- |Feature| Calculations involving NaNs in ``treeple.stats.utils`` now use the
optional ``bottleneck`` library, when it is available, for faster computation
(falling back to NumPy otherwise). By `Ryan Hausen`_ (:pr:`#306`)


Code and Documentation Contributors
-----------------------------------
Expand All @@ -21,3 +24,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `Ryan Hausen`_
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ all = [
'treeple[build]',
'treeple[doc]',
'treeple[style]',
'treeple[test]'
'treeple[test]',
'treeple[extra]'
]
build = [
'build',
Expand Down Expand Up @@ -123,6 +124,9 @@ test = [
'flaky',
'tqdm'
]
extra = [
'bottleneck'
]

[tool.bandit]
exclude_dirs = ["treeple/tests", "treeple/**/tests/*", 'treeple/_build_utils/*', 'treeple/_lib/*']
Expand Down
3 changes: 2 additions & 1 deletion test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pytest
pytest-cov
memory_profiler
flaky
tqdm
tqdm
bottleneck
23 changes: 19 additions & 4 deletions treeple/stats/tests/test_forestht.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import importlib
import os

import numpy as np
import pytest
from flaky import flaky
from numpy.testing import assert_almost_equal, assert_array_equal
from sklearn import datasets

import treeple.stats as stats
import treeple.stats.utils as utils
from treeple import HonestForestClassifier, RandomForestClassifier
from treeple.stats import (
PermutationHonestForestClassifier,
Expand Down Expand Up @@ -236,11 +241,21 @@ def test_comight_repeated_feature_sets(seed):
assert result.pvalue > 0.05, f"{result.pvalue}"


def test_build_coleman_forest():
@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_build_coleman_forest(use_bottleneck: bool):
"""Simple test for building a Coleman forest.

Test the function under alternative and null hypothesis for a very simple dataset.
"""
if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
importlib.reload(stats)
else:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)
importlib.reload(stats)

n_estimators = 100
n_samples = 30
n_features = 5
Expand Down Expand Up @@ -273,10 +288,10 @@ def test_build_coleman_forest():
with pytest.raises(
RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier"
):
build_coleman_forest(clf, clf, X, y)
stats.build_coleman_forest(clf, clf, X, y)

forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = (
build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
)
assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
Expand All @@ -287,7 +302,7 @@ def test_build_coleman_forest():
assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape)

X = np.vstack([_X, _X])
forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest(
forest_result, _, _, clf_fitted, perm_clf_fitted = stats.build_coleman_forest(
clf, perm_clf, X, y, metric="s@98"
)
assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}"
Expand Down
36 changes: 36 additions & 0 deletions treeple/stats/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import importlib
import os

import numpy as np
import pytest
from numpy.testing import assert_array_equal

import treeple.stats.utils as utils
from treeple import HonestForestClassifier
from treeple.stats.utils import get_per_tree_oob_samples

Expand Down Expand Up @@ -32,3 +36,35 @@ def test_get_per_tree_oob_samples(bootstrap):
else:
with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"):
get_per_tree_oob_samples(est)


@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_non_nan_samples(use_bottleneck: bool):
    """Check ``_non_nan_samples`` with and without the bottleneck backend.

    The backend is chosen when ``treeple.stats.utils`` is imported, so the test
    sets or clears the opt-out environment variable and reloads the module.
    The previous environment and module state are restored afterwards so that
    the choice made here does not leak into other tests.
    """
    env_var = utils.DISABLE_BN_ENV_VAR

    if use_bottleneck:
        # Skip rather than silently testing the NumPy fallback twice when
        # bottleneck is not installed.
        pytest.importorskip("bottleneck")

    previous_value = os.environ.get(env_var)
    try:
        # Branch on ``use_bottleneck`` alone: the opt-out variable must be
        # absent for the bottleneck case regardless of its initial state.
        if use_bottleneck:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = "1"
        importlib.reload(utils)

        # Make sure the parametrization really selected the intended backend.
        assert (utils.nanmean_f is np.nanmean) is (not use_bottleneck)

        posterior_array = np.array(
            [
                # tree 1
                [
                    [0, 1],
                    [np.nan, np.nan],
                    [np.nan, np.nan],
                ],
                # tree 2
                [
                    [0, 1],
                    [np.nan, np.nan],
                    [1, 0],
                ],
            ]
        )  # [2, 3, 2]

        # Sample 1 is NaN in every tree and is dropped; samples 0 and 2 have a
        # non-NaN posterior in at least one tree and are kept.
        expected = np.array([0, 2])
        actual = utils._non_nan_samples(posterior_array)
        np.testing.assert_array_equal(expected, actual)
    finally:
        # Put the environment back exactly as it was and re-select the backend.
        if previous_value is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = previous_value
        importlib.reload(utils)
27 changes: 24 additions & 3 deletions treeple/stats/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys
import warnings
from typing import Optional, Tuple

import numpy as np
Expand All @@ -16,6 +19,24 @@

from treeple._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier

# ``bottleneck`` is an optional dependency used to speed up NaN-aware
# reductions. Probe for it by actually importing it: looking it up in
# ``sys.modules`` only finds a copy that some other module has already
# imported, so an installed but not-yet-imported bottleneck would be ignored.
try:
    import bottleneck as bn
except ImportError:
    bn = None

BOTTLENECK_AVAILABLE = bn is not None

# Environment variable that opts out of bottleneck. Only its presence is
# checked, so any value (including an empty string) disables bottleneck.
DISABLE_BN_ENV_VAR = "TREEPLE_NO_BOTTLENECK"

if BOTTLENECK_AVAILABLE and DISABLE_BN_ENV_VAR not in os.environ:
    nanmean_f = bn.nanmean

    def anynan_f(arr):
        """Return a mask that is True wherever ``arr`` holds a NaN along axis 2."""
        return bn.anynan(arr, axis=2)

else:
    warnings.warn(
        "Not using bottleneck for calculations involving nans. Expect slower performance."
    )
    nanmean_f = np.nanmean

    def anynan_f(arr):
        """Return a mask that is True wherever ``arr`` holds a NaN along axis 2."""
        return np.isnan(arr).any(axis=2)


def _mutual_information(y_true: ArrayLike, y_pred_proba: ArrayLike) -> float:
"""Compute estimate of mutual information for supervised classification setting.
Expand Down Expand Up @@ -131,7 +152,7 @@ def _non_nan_samples(posterior_arr: ArrayLike) -> ArrayLike:
along axis=1.
"""
# Find the row indices with NaN values along the specified axis
nan_indices = np.isnan(posterior_arr).any(axis=2).all(axis=0)
nan_indices = anynan_f(posterior_arr).all(axis=0)

# Invert the boolean mask to get indices without NaN values
nonnan_indices = np.where(~nan_indices)[0]
Expand Down Expand Up @@ -320,8 +341,8 @@ def _parallel_build_null_forests(
# first_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_first_half)
# second_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_second_half)

y_pred_first_half = np.nanmean(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = np.nanmean(second_forest_pred[:, second_forest_samples, :], axis=0)
y_pred_first_half = nanmean_f(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = nanmean_f(second_forest_pred[:, second_forest_samples, :], axis=0)

# compute two instances of the metric from the sampled trees
first_half_metric = metric_func(
Expand Down
Loading