
Commit

MAINT Remove irrelevant files (#312)
* Remove irrelevant files related to permutation testing
* Fix up CI to use scikit-learn nightly wheels

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 authored Aug 9, 2024
1 parent 08c1607 commit 1d970b0
Showing 22 changed files with 396 additions and 560 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -84,6 +84,7 @@ jobs:
pip install --upgrade pip spin
spin setup-submodule
pip install .[build,doc]
+pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- run:
name: build treeple
7 changes: 7 additions & 0 deletions .github/workflows/main.yml
@@ -84,6 +84,11 @@ jobs:
pip install -r build_requirements.txt
pip install -r test_requirements.txt
+- name: Install nightly wheels for scikit-learn (only on ubuntu-latest with Python 3.12)
+  if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
+  run: |
+    pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- name: Prepare compiler cache
id: prep-ccache
shell: bash
@@ -184,6 +189,7 @@ jobs:
pip install compilers
pip install -r build_requirements.txt
pip install -r test_requirements.txt
+pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- name: Prepare compiler cache
id: prep-ccache
@@ -279,6 +285,7 @@ jobs:
pip install spin
pip install -r build_requirements.txt
pip install -r test_requirements.txt
+pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force
- name: Build
run: |
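The CI steps above force-reinstall scikit-learn from the scientific-python nightly index. A quick interpreter check (illustrative; not part of the workflows) confirms the dev build is the one in use:

    # Hedged sketch: nightly scikit-learn wheels carry a ".devN" version suffix.
    import sklearn
    print(sklearn.__version__)  # e.g. "1.6.dev0" for a nightly build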
2 changes: 1 addition & 1 deletion DEVELOPING.md
@@ -18,7 +18,7 @@
- Python 3.9+
- numpy>=1.25.0
- scipy>=1.5.0
-- scikit-learn>=1.4.1
+- scikit-learn>=1.5.0

For the other requirements, inspect the ``pyproject.toml`` file.

2 changes: 1 addition & 1 deletion README.md
@@ -46,7 +46,7 @@ We minimally require:
* Python (>=3.9)
* numpy
* scipy
-* scikit-learn >= 1.3
+* scikit-learn

Installation with Pip (<https://pypi.org/project/treeple/>)
-------------------------------------------------------------
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -10,7 +10,7 @@ requires = [
"setuptools<=65.5",
"packaging",
"Cython>=3.0.10",
"scikit-learn>=1.4.1",
"scikit-learn>=1.5.0",
"scipy>=1.5.0",
"numpy>=1.25; python_version>='3.9'"
]
@@ -52,7 +52,7 @@ include = [
dependencies = [
'numpy>=1.25.0',
'scipy>=1.5.0',
-'scikit-learn>=1.4.1'
+'scikit-learn>=1.5.0'
]

[project.optional-dependencies]
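If a downstream environment wants to mirror the raised floor at runtime, a minimal guard might look like this (illustrative sketch; not part of the repo):

    # Assumes the "packaging" helper already listed in the build requires.
    import sklearn
    from packaging.version import Version

    # Mirror the new pin from pyproject.toml / DEVELOPING.md.
    assert Version(sklearn.__version__) >= Version("1.5.0"), sklearn.__version__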
3 changes: 3 additions & 0 deletions treeple/_lib/meson.build
@@ -8,6 +8,9 @@ tree_extension_metadata = {
'_tree':
{'sources': ['./sklearn/tree/' + '_tree.pyx'],
'override_options': ['cython_language=cpp', 'optimization=3']},
+'_partitioner':
+{'sources': ['./sklearn/tree/' + '_partitioner.pyx'],
+'override_options': ['cython_language=cpp', 'optimization=3']},
'_splitter':
{'sources': ['./sklearn/tree/' + '_splitter.pyx'],
'override_options': ['cython_language=cpp', 'optimization=3']},
2 changes: 1 addition & 1 deletion treeple/_lib/sklearn_fork
Submodule sklearn_fork updated 587 files
10 changes: 2 additions & 8 deletions treeple/stats/__init__.py
@@ -1,14 +1,8 @@
-from .forestht import (
-build_coleman_forest,
-build_cv_forest,
-build_oob_forest,
-build_permutation_forest,
-)
-from .monte_carlo import PermutationTest
+from .baseline import build_cv_forest, build_permutation_forest
+from .forest import build_coleman_forest, build_oob_forest
from .permuteforest import PermutationHonestForestClassifier
from .permuteforest import PermutationHonestForestClassifier

__all__ = [
"PermutationTest",
"build_cv_forest",
"build_oob_forest",
"build_coleman_forest",
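The public names survive the reshuffle; only their home modules change, and the Monte Carlo ``PermutationTest`` export is dropped. Downstream imports still resolve, for example:

    # Same public API, new homes (per the diff above).
    from treeple.stats import (
        PermutationHonestForestClassifier,  # treeple/stats/permuteforest.py
        build_coleman_forest,  # now in treeple/stats/forest.py
        build_cv_forest,       # now in treeple/stats/baseline.py
        build_oob_forest,      # now in treeple/stats/forest.py
        build_permutation_forest,  # now in treeple/stats/baseline.py
    )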
230 changes: 2 additions & 228 deletions treeple/stats/forestht.py → treeple/stats/baseline.py
@@ -1,5 +1,3 @@
-import threading
-from collections import namedtuple
from typing import Callable

import numpy as np
@@ -8,12 +6,11 @@
from sklearn.base import clone
from sklearn.ensemble._base import _partition_estimators
from sklearn.model_selection import StratifiedKFold, train_test_split
-from sklearn.utils.multiclass import type_of_target

from .._lib.sklearn.ensemble._forest import ForestClassifier
from ..tree._classes import DTYPE
+from .forest import ForestTestResult, build_oob_forest
from .permuteforest import PermutationHonestForestClassifier
-from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman
+from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS


def _parallel_predict_proba(predict_proba, X, indices_test):
@@ -28,156 +25,6 @@ def _parallel_predict_proba(predict_proba, X, indices_test):
return prediction


def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock):
"""
This is a utility function for joblib's Parallel.
It can't go locally in ForestClassifier or ForestRegressor, because joblib
complains that it cannot pickle it when placed there.
"""
# each tree predicts proba with a list of output (n_samples, n_classes[i])
prediction = predict_proba(X, check_input=False)

indices = np.zeros(X.shape[0], dtype=bool)
indices[test_idx] = True
with lock:
out[idx, test_idx, :] = prediction[test_idx, :]
return prediction


ForestTestResult = namedtuple(
"ForestTestResult",
["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"],
)


def build_coleman_forest(
est,
perm_est,
X,
y,
covariate_index=None,
metric="s@98",
n_repeats=10_000,
verbose=False,
seed=None,
return_posteriors=True,
**metric_kwargs,
):
"""Build a hypothesis testing forest using a two-forest approach.
The two-forest approach stems from the Coleman et al. 2022 paper, where
two forests are trained: one on the original dataset, and one on the
permuted dataset. The dataset is either permuted once, or independently for
each tree in the permuted forest. The original test statistic is computed by
comparing the metric on both forests ``(metric_forest - metric_perm_forest)``.
For full details, see :footcite:`coleman2022scalable`.

Parameters
----------
est : Forest
The type of forest to use. Must be enabled with ``bootstrap=True``.
perm_est : Forest
The forest to use for the permuted dataset.
X : ArrayLike of shape (n_samples, n_features)
Data.
y : ArrayLike of shape (n_samples, n_outputs)
Binary target, so ``n_outputs`` should be at most 1.
covariate_index : ArrayLike, optional of shape (n_covariates,)
The index array of covariates to shuffle, by default None, which
defaults to all covariates.
metric : str, optional
The metric to compute, by default "s@98", for sensitivity at
98% specificity.
n_repeats : int, optional
Number of times to bootstrap sample the two forests to construct
the null distribution, by default 10000. The construction of the
null forests will be parallelized according to the ``n_jobs``
argument of the ``est`` forest.
verbose : bool, optional
Verbosity, by default False.
seed : int, optional
Random seed, by default None.
return_posteriors : bool, optional
Whether or not to return the posteriors, by default True.
**metric_kwargs : dict, optional
Additional keyword arguments to pass to the metric function.

Returns
-------
observe_stat : float
The observed statistic from the non-permuted forest. The test
statistic is ``observe_stat`` minus ``permute_stat``.
pvalue : float
The p-value of the test statistic.
orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each estimator on their
out of bag samples.
perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each of the permuted estimators
on their out of bag samples.
null_dist : ArrayLike of shape (n_repeats,)
The null statistic differences from permuted forests.

References
----------
.. footbibliography::
"""
metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]

# build two sets of forests
est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)

if not isinstance(perm_est, PermutationHonestForestClassifier):
raise RuntimeError(
f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
)
perm_est, perm_forest_proba = build_oob_forest(
perm_est, X, y, verbose=verbose, covariate_index=covariate_index
)

# get the number of jobs
n_jobs = est.n_jobs

if y.ndim == 1:
y = y.reshape(-1, 1)
metric_star, metric_star_pi = _compute_null_distribution_coleman(
y,
orig_forest_proba,
perm_forest_proba,
metric,
n_repeats=n_repeats,
seed=seed,
n_jobs=n_jobs,
**metric_kwargs,
)

y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)

# metric - metric^\pi = observed test statistic, which under the
# null is normally distributed around 0
observe_test_stat = observe_stat - permute_stat

# metric^\pi_j - metric_j, which is centered at 0
null_dist = metric_star_pi - metric_star

# compute pvalue
if metric in POSITIVE_METRICS:
pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)
else:
pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)
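# e.g. with n_repeats = 10_000 null draws and an observed difference of
# 0.25: pvalue = (1 + (null_dist >= 0.25).sum()) / 10_001; the +1 in the
# numerator and denominator is the usual permutation-test smoothing that
# keeps the p-value strictly positive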

forest_result = ForestTestResult(
observe_test_stat, permute_stat, observe_stat, pvalue, null_dist
)
if return_posteriors:
return forest_result, orig_forest_proba, perm_forest_proba, est, perm_est
else:
return forest_result
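For orientation, a minimal usage sketch of ``build_coleman_forest`` as relocated by this commit (the ``HonestForestClassifier`` import path is an assumption; everything else follows the signature above):

    import numpy as np
    from treeple import HonestForestClassifier  # assumed import path
    from treeple.stats import PermutationHonestForestClassifier, build_coleman_forest

    rng = np.random.default_rng(0)
    X = rng.standard_normal((300, 5))
    y = (X[:, 0] + 0.5 * rng.standard_normal(300) > 0).astype(int)  # binary target

    # Both forests must be built with bootstrap=True so OOB samples exist.
    est = HonestForestClassifier(n_estimators=100, bootstrap=True, random_state=0)
    perm_est = PermutationHonestForestClassifier(n_estimators=100, bootstrap=True, random_state=0)

    # Permute only feature 0 in the permutation forest and test its effect.
    result, orig_proba, perm_proba, est, perm_est = build_coleman_forest(
        est, perm_est, X, y, covariate_index=[0], n_repeats=1000, seed=0
    )
    print(result.observe_test_stat, result.pvalue)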


def build_permutation_forest(
est,
perm_est,
@@ -300,79 +147,6 @@ def build_permutation_forest(
return forest_result


def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
"""Build a hypothesis testing forest using oob samples.
Parameters
----------
est : Forest
The type of forest to use. Must be enabled with ``bootstrap=True``.
The forest should have either ``oob_samples_`` or ``estimators_samples_``
property defined, which will be used to compute the out of bag samples
per tree.
X : ArrayLike of shape (n_samples, n_features)
Data.
y : ArrayLike of shape (n_samples, n_outputs)
Binary target, so ``n_outputs`` should be at most 1.
verbose : bool, optional
Verbosity, by default False.
**est_kwargs : dict, optional
Additional keyword arguments to pass to the forest estimator ``fit`` function.

Returns
-------
est : Forest
Fitted forest.
all_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
The predicted posterior probabilities for each estimator on their
out of bag samples.
"""
assert est.bootstrap
assert type_of_target(y) in ("binary",)
est = clone(est)

# build forest
est.fit(X, y.ravel(), **est_kwargs)

# now evaluate
X = est._validate_X_predict(X)

# if we trained a binning tree, then we should re-bin the data
# XXX: this is inefficient and should be improved to be in line with what
# the Histogram Gradient Boosting Tree does, where the binning thresholds
# are passed into the tree itself, thus allowing us to set the node feature
# value thresholds within the tree itself.
if est.max_bins is not None:
X = est._bin_data(X, is_training_data=False).astype(DTYPE)

# Assign chunk of trees to jobs
n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)

# avoid storing the output of every estimator by summing them here
lock = threading.Lock()
# accumulate the predictions across all trees
all_proba = np.full(
(len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
)
if hasattr(est, "oob_samples_"):
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
)
else:
inbag_samples = est.estimators_samples_
all_samples = np.arange(X.shape[0])
oob_samples_list = [
np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
]
Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
)

return est, all_proba
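A hedged sketch of how the returned OOB posteriors are typically consumed; the ``np.nanmean`` collapse mirrors what ``build_coleman_forest`` does above:

    # all_proba has shape (n_estimators, n_samples, n_classes); in-bag
    # entries were initialized to NaN, so nanmean averages only OOB votes.
    est, all_proba = build_oob_forest(est, X, y)
    y_proba_oob = np.nanmean(all_proba, axis=0)  # per-sample OOB posterior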


def build_cv_forest(
est,
X,

