From 766c6c4d220e2379ca2a4a5e7b0c541eefe379c8 Mon Sep 17 00:00:00 2001
From: Ryan Hausen <ryan.hausen@gmail.com>
Date: Wed, 21 Aug 2024 16:15:25 -0400
Subject: [PATCH] wip PR revisions. updated docs, revised bottleneck function
 selection logic in test

---
 treeple/stats/forest.py            | 19 ++++++++++++++++---
 treeple/stats/tests/test_forest.py | 19 +++++++++++--------
 treeple/stats/utils.py             | 22 +++++++++++++++++-----
 3 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/treeple/stats/forest.py b/treeple/stats/forest.py
index 8a2dcee8..8c4de9b3 100644
--- a/treeple/stats/forest.py
+++ b/treeple/stats/forest.py
@@ -107,7 +107,8 @@ def build_coleman_forest(
     return_posteriors : bool, optional
         Whether or not to return the posteriors, by default True.
     use_sparse : bool, optional
-        Whether or not to use a sparse representation for the posteriors.
+        Whether or not to use a sparse for the calculation of the permutation
+        statistics, by default False. Doesn't affect return values.
     **metric_kwargs : dict, optional
         Additional keyword arguments to pass to the metric function.
 
@@ -173,11 +174,23 @@ def build_coleman_forest(
         # sparse indices and values with an array
         if return_posteriors:
             n_trees = y_pred_proba_orig_perm.shape[0] // 2
+            n_samples = y_pred_proba_orig_perm.shape[1]
             # slicing a csc matrix this way is not efficient, but it is
             # it is only done once, so I am not sure if it is worth it to
             # optimize this.
-            orig_forest_proba = y_pred_proba_orig_perm[:n_trees, :]
-            perm_forest_proba = y_pred_proba_orig_perm[n_trees:, :]
+            to_coords_data = lambda x: (x.row.astype(int), x.col.astype(int), x.data)
+
+            row, col, data = to_coords_data(y_pred_proba_orig_perm[:n_trees, :].tocoo())
+            orig_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
+            orig_forest_proba[row, col] = data
+
+            row, col, data = to_coords_data(y_pred_proba_orig_perm[n_trees:, :].tocoo())
+            perm_forest_proba = np.full((n_trees, n_samples), np.nan, dtype=np.float64)
+            perm_forest_proba[row, col] = data
+
+            if y.shape[1] == 2:
+                orig_forest_proba = np.column_stack((orig_forest_proba, 1 - orig_forest_proba))
+                perm_forest_proba = np.column_stack((perm_forest_proba, 1 - perm_forest_proba))
     else:
         metric_star, metric_star_pi = _compute_null_distribution_coleman(
             y,
diff --git a/treeple/stats/tests/test_forest.py b/treeple/stats/tests/test_forest.py
index 9ae363d2..43757d47 100644
--- a/treeple/stats/tests/test_forest.py
+++ b/treeple/stats/tests/test_forest.py
@@ -238,21 +238,20 @@ def test_comight_repeated_feature_sets(seed):
 
 @pytest.mark.parametrize(
     ("use_bottleneck", "use_sparse"),
-    itertools.product([True, False], [True, False]),
+    itertools.product([False, True], [False, True]),
 )
 def test_build_coleman_forest(use_bottleneck: bool, use_sparse: bool):
     """Simple test for building a Coleman forest.
 
     Test the function under alternative and null hypothesis for a very simple dataset.
     """
-    if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
-        del os.environ[utils.DISABLE_BN_ENV_VAR]
-        importlib.reload(utils)
-        importlib.reload(stats)
-    else:
+    if not use_bottleneck and utils.DISABLE_BN_ENV_VAR not in os.environ:
         os.environ[utils.DISABLE_BN_ENV_VAR] = "1"
-        importlib.reload(utils)
-        importlib.reload(stats)
+    elif use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ:
+        del os.environ[utils.DISABLE_BN_ENV_VAR]
+
+    importlib.reload(utils)
+    importlib.reload(stats)
 
     n_estimators = 100
     n_samples = 30
@@ -436,3 +435,7 @@ def test_build_oob_random_forest():
         assert len(np.unique(structure_samples[tree_idx])) + len(oob_samples_list[tree_idx]) == len(
             samples
         ), f"{tree_idx} {len(structure_samples[tree_idx])} + {len(oob_samples_list[tree_idx])} != {len(samples)}"
+
+
+if __name__ == "__main__":
+    test_build_coleman_forest(False, False)
diff --git a/treeple/stats/utils.py b/treeple/stats/utils.py
index abafc88c..bd700187 100644
--- a/treeple/stats/utils.py
+++ b/treeple/stats/utils.py
@@ -214,9 +214,6 @@ def _compute_null_distribution_coleman(
     metric_star_pi : ArrayLike of shape (n_samples,)
         An array of the metrics computed on the other half of the trees.
     """
-    if not BOTTLENECK_AVAILABLE:
-        warnings.warn(BOTTLENECK_WARNING)
-
     # sample two sets of equal number of trees from the combined forest these are the posteriors
     # (n_estimators * 2, n_samples, n_outputs)
     all_y_pred = np.concatenate((y_pred_proba_normal, y_pred_proba_perm), axis=0)
@@ -337,9 +334,24 @@ def get_per_tree_oob_samples(est: BaseForest):
 def _get_forest_preds_sparse(
     all_y_pred: sp.csc_matrix,  # (n_trees, n_samples)
     all_y_indicator: sp.csc_matrix,  # (n_trees, n_samples)
-    forest_indices: ArrayLike,  # (n_trees,)
+    forest_indices: ArrayLike,  # (n_trees/2,)
 ) -> ArrayLike:
-    """Get the forest predictions for a set of trees using sparse matrices."""
+    """Get the forest predictions for a set of trees using sparse matrices.
+
+    Parameters
+    ----------
+    all_y_pred : sp.csc_matrix of shape (n_trees, n_samples)
+        The predicted posteriors from the forest.
+    all_y_indicator : sp.csc_matrix of shape (n_trees, n_samples)
+        The indicator matrix for the predictions.
+    forest_indices : ArrayLike of shape (n_trees/2,)
+        The indices of the trees in the forest that we are evaluating.
+
+    Returns
+    -------
+    ArrayLike of shape (n_samples,)
+        The averaged predictions for the forest.
+    """
     forest_indicator = np.zeros(len(forest_indices) * 2, dtype=np.uint8)
     forest_indicator[forest_indices] = 1