
Commit

MAINT Remove irrelevant files (#312)
* Remove irrelevant files related to permutation testing
* Fix up CI to use scikit-learn nightly wheels

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 authored Aug 9, 2024
1 parent 08c1607 commit 1d970b0
Showing 22 changed files with 396 additions and 560 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -84,6 +84,7 @@ jobs:
pip install --upgrade pip spin
spin setup-submodule
pip install .[build,doc]
+pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- run:
name: build treeple
7 changes: 7 additions & 0 deletions .github/workflows/main.yml
@@ -84,6 +84,11 @@ jobs:
pip install -r build_requirements.txt
pip install -r test_requirements.txt
+- name: Install nightly wheels for scikit-learn (only on ubuntu-latest with Python 3.12)
+  if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
+  run: |
+    pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- name: Prepare compiler cache
id: prep-ccache
shell: bash
@@ -184,6 +189,7 @@ jobs:
pip install compilers
pip install -r build_requirements.txt
pip install -r test_requirements.txt
+pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- name: Prepare compiler cache
id: prep-ccache
@@ -279,6 +285,7 @@ jobs:
pip install spin
pip install -r build_requirements.txt
pip install -r test_requirements.txt
+pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- name: Build
run: |
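The CI steps above force-reinstall scikit-learn from the scientific-python nightly index. A quick interpreter check (illustrative; not part of the workflows) confirms the dev build is the one in use:

    # Hedged sketch: nightly scikit-learn wheels carry a ".devN" version suffix.
    import sklearn
    print(sklearn.__version__)  # e.g. "1.6.dev0" for a nightly build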
2 changes: 1 addition & 1 deletion DEVELOPING.md
@@ -18,7 +18,7 @@
- Python 3.9+
- numpy>=1.25.0
- scipy>=1.5.0
-- scikit-learn>=1.4.1
+- scikit-learn>=1.5.0

For the other requirements, inspect the ``pyproject.toml`` file.

2 changes: 1 addition & 1 deletion README.md
@@ -46,7 +46,7 @@ We minimally require:
* Python (>=3.9)
* numpy
* scipy
-* scikit-learn >= 1.3
+* scikit-learn

Installation with Pip (<https://pypi.org/project/treeple/>)
-------------------------------------------------------------
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -10,7 +10,7 @@ requires = [
"setuptools<=65.5",
"packaging",
"Cython>=3.0.10",
"scikit-learn>=1.4.1",
"scikit-learn>=1.5.0",
"scipy>=1.5.0",
"numpy>=1.25; python_version>='3.9'"
]
@@ -52,7 +52,7 @@ include = [
dependencies = [
'numpy>=1.25.0',
'scipy>=1.5.0',
-'scikit-learn>=1.4.1'
+'scikit-learn>=1.5.0'
]

[project.optional-dependencies]
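If a downstream environment wants to mirror the raised floor at runtime, a minimal guard might look like this (illustrative sketch; not part of the repo):

    # Assumes the "packaging" helper already listed in the build requires.
    import sklearn
    from packaging.version import Version

    # Mirror the new pin from pyproject.toml / DEVELOPING.md.
    assert Version(sklearn.__version__) >= Version("1.5.0"), sklearn.__version__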
3 changes: 3 additions & 0 deletions treeple/_lib/meson.build
@@ -8,6 +8,9 @@ tree_extension_metadata = {
'_tree':
{'sources': ['./sklearn/tree/' + '_tree.pyx'],
'override_options': ['cython_language=cpp', 'optimization=3']},
+'_partitioner':
+{'sources': ['./sklearn/tree/' + '_partitioner.pyx'],
+'override_options': ['cython_language=cpp', 'optimization=3']},
'_splitter':
{'sources': ['./sklearn/tree/' + '_splitter.pyx'],
'override_options': ['cython_language=cpp', 'optimization=3']},
2 changes: 1 addition & 1 deletion treeple/_lib/sklearn_fork
Submodule sklearn_fork updated 587 files
10 changes: 2 additions & 8 deletions treeple/stats/__init__.py
@@ -1,14 +1,8 @@
-from .forestht import (
-build_coleman_forest,
-build_cv_forest,
-build_oob_forest,
-build_permutation_forest,
-)
-from .monte_carlo import PermutationTest
+from .baseline import build_cv_forest, build_permutation_forest
+from .forest import build_coleman_forest, build_oob_forest
from .permuteforest import PermutationHonestForestClassifier
from .permuteforest import PermutationHonestForestClassifier

__all__ = [
"PermutationTest",
"build_cv_forest",
"build_oob_forest",
"build_coleman_forest",
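The public names survive the reshuffle; only their home modules change, and the Monte Carlo ``PermutationTest`` export is dropped. Downstream imports still resolve, for example:

    # Same public API, new homes (per the diff above).
    from treeple.stats import (
        PermutationHonestForestClassifier,  # treeple/stats/permuteforest.py
        build_coleman_forest,  # now in treeple/stats/forest.py
        build_cv_forest,       # now in treeple/stats/baseline.py
        build_oob_forest,      # now in treeple/stats/forest.py
        build_permutation_forest,  # now in treeple/stats/baseline.py
    )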
230 changes: 2 additions & 228 deletions treeple/stats/forestht.py → treeple/stats/baseline.py
@@ -1,5 +1,3 @@
-import threading
-from collections import namedtuple
from typing import Callable

import numpy as np
@@ -8,12 +6,11 @@
from sklearn.base import clone
from sklearn.ensemble._base import _partition_estimators
from sklearn.model_selection import StratifiedKFold, train_test_split
-from sklearn.utils.multiclass import type_of_target

from .._lib.sklearn.ensemble._forest import ForestClassifier
from ..tree._classes import DTYPE
+from .forest import ForestTestResult, build_oob_forest
from .permuteforest import PermutationHonestForestClassifier
-from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman
+from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS


def _parallel_predict_proba(predict_proba, X, indices_test):
@@ -28,156 +25,6 @@ def _parallel_predict_proba(predict_proba, X, indices_test):
return prediction


def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
"""
This is a utility function for joblib's Parallel.
It can't go locally in ForestClassifier or ForestRegressor, because joblib
complains that it cannot pickle it when placed there.
"""
# each tree predicts proba with a list of output (n_samples, n_classes[i])
prediction = predict_proba(X, check_input=False)

indices = np.zeros(X.shape[0], dtype=bool)
indices[test_idx] = True
with lock:
out[idx, test_idx, :] = prediction[test_idx, :]
return prediction


ForestTestResult = namedtuple(
"ForestTestResult",
["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"],
)


def build_coleman_forest(
est,
perm_est,
X,
y,
covariate_index=None,
metric="s@98",
n_repeats=10_000,
verbose=False,
seed=None,
return_posteriors=True,
**metric_kwargs,
):
"""Build a hypothesis testing forest using a two-forest approach.
The two-forest approach stems from the Coleman et al. 2022 paper, where
two forests are trained: one on the original dataset, and one on the
permuted dataset. The dataset is either permuted once, or independently for
each tree in the permuted forest. The original test statistic is computed by
comparing the metric on both forests ``(metric_forest - metric_perm_forest)``.
For full details, see :footcite:`coleman2022scalable`.

Parameters
----------
est : Forest
The type of forest to use. Must be enabled with ``bootstrap=True``.
perm_est : Forest
The forest to use for the permuted dataset.
X : ArrayLike of shape (n_samples, n_features)
Data.
y : ArrayLike of shape (n_samples, n_outputs)
Binary target, so ``n_outputs`` should be at most 1.
covariate_index : ArrayLike, optional of shape (n_covariates,)
The index array of covariates to shuffle, by default None, which
defaults to all covariates.
metric : str, optional
The metric to compute, by default "s@98", for sensitivity at
98% specificity.
n_repeats : int, optional
Number of times to bootstrap sample the two forests to construct
the null distribution, by default 10000. The construction of the
null forests will be parallelized according to the ``n_jobs``
argument of the ``est`` forest.
verbose : bool, optional
Verbosity, by default False.
seed : int, optional
Random seed, by default None.
return_posteriors : bool, optional
Whether or not to return the posteriors, by default True.
**metric_kwargs : dict, optional
Additional keyword arguments to pass to the metric function.

Returns
-------
observe_stat : float
The observed statistic from the non-permuted forest. The test
statistic is ``observe_stat`` minus ``permute_stat``.
pvalue : float
The p-value of the test statistic.
orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each estimator on their
out of bag samples.
perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each of the permuted estimators
on their out of bag samples.
null_dist : ArrayLike of shape (n_repeats,)
The null statistic differences from permuted forests.

References
----------
.. footbibliography::
"""
metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

# build two sets of forests
est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)

if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
)
perm_est, perm_forest_proba = build_oob_forest(
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
)

# get the number of jobs
n_jobs = est.n_jobs

if y.ndim == 1:
y = y.reshape(-1, 1)
metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)

# metric - metric^\pi = observed test statistic, which under the
# null is normally distributed around 0
observe_test_stat = observe_stat - permute_stat

# metric^\pi_j - metric_j, which is centered at 0
null_dist = metric_star_pi - metric_star

# compute pvalue
if metric in POSITIVE_METRICS:
pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)
else:
pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)
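# e.g. with n_repeats = 10_000 null draws and an observed difference of
# 0.25: pvalue = (1 + (null_dist >= 0.25).sum()) / 10_001; the +1 in the
# numerator and denominator is the usual permutation-test smoothing that
# keeps the p-value strictly positive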

forest_result = ForestTestResult(
observe_test_stat, permute_stat, observe_stat, pvalue, null_dist
)
if return_posteriors:
return forest_result, orig_forest_proba, perm_forest_proba, est, perm_est
else:
return forest_result
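For orientation, a minimal usage sketch of ``build_coleman_forest`` as relocated by this commit (the ``HonestForestClassifier`` import path is an assumption; everything else follows the signature above):

    import numpy as np
    from treeple import HonestForestClassifier  # assumed import path
    from treeple.stats import PermutationHonestForestClassifier, build_coleman_forest

    rng = np.random.default_rng(0)
    X = rng.standard_normal((300, 5))
    y = (X[:, 0] + 0.5 * rng.standard_normal(300) > 0).astype(int)  # binary target

    # Both forests must be built with bootstrap=True so OOB samples exist.
    est = HonestForestClassifier(n_estimators=100, bootstrap=True, random_state=0)
    perm_est = PermutationHonestForestClassifier(n_estimators=100, bootstrap=True, random_state=0)

    # Permute only feature 0 in the permutation forest and test its effect.
    result, orig_proba, perm_proba, est, perm_est = build_coleman_forest(
        est, perm_est, X, y, covariate_index=[0], n_repeats=1000, seed=0
    )
    print(result.observe_test_stat, result.pvalue)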


def build_permutation_forest(
est,
perm_est,
@@ -300,79 +147,6 @@ def build_permutation_forest(
return forest_result


def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
"""Build a hypothesis testing forest using oob samples.
Parameters
----------
est : Forest
The type of forest to use. Must be enabled with ``bootstrap=True``.
The forest should have either ``oob_samples_`` or ``estimators_samples_``
property defined, which will be used to compute the out of bag samples
per tree.
X : ArrayLike of shape (n_samples, n_features)
Data.
y : ArrayLike of shape (n_samples, n_outputs)
Binary target, so ``n_outputs`` should be at most 1.
verbose : bool, optional
Verbosity, by default False.
**est_kwargs : dict, optional
Additional keyword arguments to pass to the forest estimator ``fit`` function.

Returns
-------
est : Forest
Fitted forest.
all_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each estimator on their
out of bag samples.
"""
assert est.bootstrap
assert type_of_target(y) in ("binary",)
est = clone(est)

# build forest
est.fit(X, y.ravel(), **est_kwargs)

# now evaluate
X = est._validate_X_predict(X)

# if we trained a binning tree, then we should re-bin the data
# XXX: this is inefficient and should be improved to be in line with what
# the Histogram Gradient Boosting Tree does, where the binning thresholds
# are passed into the tree itself, thus allowing us to set the node feature
# value thresholds within the tree itself.
if est.max_bins is not None:
X = est._bin_data(X, is_training_data=False).astype(DTYPE)

# Assign chunk of trees to jobs
n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)

# avoid storing the output of every estimator by summing them here
lock = threading.Lock()
# accumulate the predictions across all trees
all_proba = np.full(
(len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
)
if hasattr(est, "oob_samples_"):
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
)
else:
inbag_samples = est.estimators_samples_
all_samples = np.arange(X.shape[0])
oob_samples_list = [
np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
]
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
)

return est, all_proba
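A hedged sketch of how the returned OOB posteriors are typically consumed; the ``np.nanmean`` collapse mirrors what ``build_coleman_forest`` does above:

    # all_proba has shape (n_estimators, n_samples, n_classes); in-bag
    # entries were initialized to NaN, so nanmean averages only OOB votes.
    est, all_proba = build_oob_forest(est, X, y)
    y_proba_oob = np.nanmean(all_proba, axis=0)  # per-sample OOB posterior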


def build_cv_forest(
est,
X,

