Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added bottleneck for nan calculations #306

Merged
merged 13 commits into from
Aug 2, 2024
Merged
24 changes: 12 additions & 12 deletions .github/FUNDING.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# These are supported funding model platforms

github: [adam2392, PSSF23, sampan501, SUKI-O] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar: # Replace with a single Polar username
buy_me_a_coffee: adam2392 # Replace with a single Buy Me a Coffee username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
github: [adam2392, PSSF23, sampan501, SUKI-O] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar: # Replace with a single Polar username
buy_me_a_coffee: adam2392 # Replace with a single Buy Me a Coffee username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
1 change: 1 addition & 0 deletions doc/whats_new/_contributors.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@
.. _Ronan Perry : https://rflperry.github.io/
.. _Haoyin Xu : https://github.com/PSSF23
.. _Yuxin Bai : https://github.com/YuxinB
.. _Ryan Hausen : https://ryanhausen.github.io
3 changes: 3 additions & 0 deletions doc/whats_new/v0.8.rst
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we're on v0.10 now so you can just move this diff to that file

Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Changelog
estimated on oob samples were biased when there was a low number of samples
due to imbalance in the classes when ``bootstrap=True``.
By `Adam Li`_ (:pr:`#283`)
- |Feature| NaN-aware calculations in ``treeple.stats.utils`` now use the
``bottleneck`` library for faster computation when it is available. By `Ryan Hausen`_ (:pr:`#306`)

Code and Documentation Contributors
-----------------------------------
Expand All @@ -41,3 +43,4 @@ the project since version inception, including:

* `Adam Li`_
* `Sambit Panda`_
* `Ryan Hausen`_
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ all = [
'treeple[build]',
'treeple[doc]',
'treeple[style]',
'treeple[test]'
'treeple[test]',
'treeple[extra]'
]
build = [
'build',
Expand Down Expand Up @@ -123,6 +124,9 @@ test = [
'flaky',
'tqdm'
]
extra = [
'bottleneck'
]

[tool.bandit]
exclude_dirs = ["treeple/tests", "treeple/**/tests/*", 'treeple/_build_utils/*', 'treeple/_lib/*']
Expand Down
3 changes: 2 additions & 1 deletion test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pytest
pytest-cov
memory_profiler
flaky
tqdm
tqdm
bottleneck
23 changes: 19 additions & 4 deletions treeple/stats/tests/test_forestht.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import importlib
import os

import numpy as np
import pytest
from flaky import flaky
from numpy.testing import assert_almost_equal, assert_array_equal
from sklearn import datasets

import treeple.stats as stats
import treeple.stats.utils as utils
from treeple import HonestForestClassifier, RandomForestClassifier
from treeple.stats import (
PermutationHonestForestClassifier,
Expand Down Expand Up @@ -236,11 +241,21 @@ def test_comight_repeated_feature_sets(seed):
assert result.pvalue > 0.05, f"{result.pvalue}"


def test_build_coleman_forest():
@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_build_coleman_forest(use_bottleneck: bool):
"""Simple test for building a Coleman forest.

Test the function under alternative and null hypothesis for a very simple dataset.
"""
if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
del os.environ[utils.DISABLE_BN_ENV_VAR]
importlib.reload(utils)
importlib.reload(stats)
else:
os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
importlib.reload(utils)
importlib.reload(stats)

n_estimators = 100
n_samples = 30
n_features = 5
Expand Down Expand Up @@ -273,10 +288,10 @@ def test_build_coleman_forest():
with pytest.raises(
RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier"
):
build_coleman_forest(clf, clf, X, y)
stats.build_coleman_forest(clf, clf, X, y)

forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = (
build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed)
)
assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6)
Expand All @@ -287,7 +302,7 @@ def test_build_coleman_forest():
assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape)

X = np.vstack([_X, _X])
forest_result, _, _, clf_fitted, perm_clf_fitted = build_coleman_forest(
forest_result, _, _, clf_fitted, perm_clf_fitted = stats.build_coleman_forest(
clf, perm_clf, X, y, metric="s@98"
)
assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}"
Expand Down
36 changes: 36 additions & 0 deletions treeple/stats/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import importlib
import os

import numpy as np
import pytest
from numpy.testing import assert_array_equal

import treeple.stats.utils as utils
from treeple import HonestForestClassifier
from treeple.stats.utils import get_per_tree_oob_samples

Expand Down Expand Up @@ -32,3 +36,35 @@ def test_get_per_tree_oob_samples(bootstrap):
else:
with pytest.raises(RuntimeError, match="Cannot extract out-of-bag samples"):
get_per_tree_oob_samples(est)


@pytest.mark.parametrize("use_bottleneck", [True, False])
def test_non_nan_samples(use_bottleneck: bool):
    """Check ``_non_nan_samples`` with the bottleneck opt-out unset and set.

    The opt-out environment variable is read when ``treeple.stats.utils`` is
    imported, so the module is reloaded after the variable is changed. The
    previous value of the variable is restored afterwards so that this test
    does not leak state into other tests.

    Parameters
    ----------
    use_bottleneck : bool
        If True, the opt-out environment variable is removed before the
        reload; if False, it is set.
    """
    env_var = utils.DISABLE_BN_ENV_VAR
    previous_value = os.environ.get(env_var)

    # Branch on ``use_bottleneck`` alone. Also testing whether the variable is
    # currently set sends the ``use_bottleneck=True`` case into the "disable"
    # branch whenever the variable is absent.
    if use_bottleneck:
        os.environ.pop(env_var, None)
    else:
        os.environ[env_var] = "1"
    importlib.reload(utils)

    try:
        posterior_array = np.array(
            [
                # tree 1
                [
                    [0, 1],
                    [np.nan, np.nan],
                    [np.nan, np.nan],
                ],
                # tree 2
                [
                    [0, 1],
                    [np.nan, np.nan],
                    [1, 0],
                ],
            ]
        )  # [2, 3, 2]

        # Only sample 1 is NaN in every tree, so samples 0 and 2 are kept.
        expected = np.array([0, 2])
        actual = utils._non_nan_samples(posterior_array)
        np.testing.assert_array_equal(expected, actual)
    finally:
        # Put the environment back and reload so the module state matches it.
        if previous_value is None:
            os.environ.pop(env_var, None)
        else:
            os.environ[env_var] = previous_value
        importlib.reload(utils)
27 changes: 24 additions & 3 deletions treeple/stats/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys
import warnings
from typing import Optional, Tuple

import numpy as np
Expand All @@ -16,6 +19,24 @@

from treeple._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier

# Name of the environment variable that, when present (with any value),
# forces the NumPy implementations even if bottleneck can be imported.
DISABLE_BN_ENV_VAR = "TREEPLE_NO_BOTTLENECK"

# Detect the optional dependency by actually importing it. Looking it up in
# ``sys.modules`` only finds a bottleneck that some other module has already
# imported, so an installed but not-yet-imported bottleneck would be ignored.
try:
    import bottleneck as bn
except ImportError:
    BOTTLENECK_AVAILABLE = False
else:
    BOTTLENECK_AVAILABLE = True

if BOTTLENECK_AVAILABLE and DISABLE_BN_ENV_VAR not in os.environ:
    nanmean_f = bn.nanmean

    def anynan_f(arr):
        """Return a boolean array marking entries with any NaN along axis 2."""
        return bn.anynan(arr, axis=2)

else:
    warnings.warn(
        "Not using bottleneck for calculations involvings nans. Expect slower performance."
    )
    nanmean_f = np.nanmean

    def anynan_f(arr):
        """Return a boolean array marking entries with any NaN along axis 2."""
        return np.isnan(arr).any(axis=2)


def _mutual_information(y_true: ArrayLike, y_pred_proba: ArrayLike) -> float:
"""Compute estimate of mutual information for supervised classification setting.
Expand Down Expand Up @@ -131,7 +152,7 @@ def _non_nan_samples(posterior_arr: ArrayLike) -> ArrayLike:
along axis=1.
"""
# Find the row indices with NaN values along the specified axis
nan_indices = np.isnan(posterior_arr).any(axis=2).all(axis=0)
nan_indices = anynan_f(posterior_arr).all(axis=0)

# Invert the boolean mask to get indices without NaN values
nonnan_indices = np.where(~nan_indices)[0]
Expand Down Expand Up @@ -320,8 +341,8 @@ def _parallel_build_null_forests(
# first_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_first_half)
# second_half_metric = metric_func(y_test[non_nan_samples, :], y_pred_second_half)

y_pred_first_half = np.nanmean(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = np.nanmean(second_forest_pred[:, second_forest_samples, :], axis=0)
y_pred_first_half = nanmean_f(first_forest_pred[:, first_forest_samples, :], axis=0)
y_pred_second_half = nanmean_f(second_forest_pred[:, second_forest_samples, :], axis=0)

# compute two instances of the metric from the sampled trees
first_half_metric = metric_func(
Expand Down
Loading