
Commit 58a2717

CMI analysis update

Signed-off-by: Adam Li <adam2392@gmail.com>

adam2392 committed Jun 30, 2023
1 parent 6dccbfa
Showing 9 changed files with 1,292 additions and 240 deletions.


500 changes: 500 additions & 0 deletions experiments/plotting_cmi_supervised.ipynb

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions sktree/ensemble/_honest_forest.py
@@ -479,9 +479,81 @@ def honest_indices_(self):
        check_is_fitted(self)
        return [tree.honest_indices_ for tree in self.estimators_]

    @property
    def feature_importances_(self):
        return self.estimator_.feature_importances_

    def _more_tags(self):
        return {"multioutput": False}

    def apply(self, X):
        """
        Apply trees in the forest to X, return leaf indices.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        X_leaves : ndarray of shape (n_samples, n_estimators)
            For each datapoint x in X and for each tree in the forest,
            return the index of the leaf x ends up in.
        """
        return self.estimator_.apply(X)

    def decision_path(self, X):
        """
        Return the decision path in the forest.

        .. versionadded:: 0.18

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        indicator : sparse matrix of shape (n_samples, n_nodes)
            Return a node indicator matrix where nonzero elements indicate
            that the samples go through the nodes. The matrix is in CSR
            format.
        n_nodes_ptr : ndarray of shape (n_estimators + 1,)
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator values for the i-th estimator.
        """
        return self.estimator_.decision_path(X)

    def predict_quantiles(self, X, quantiles=0.5, method="nearest"):
        """Predict class or regression value for X at given quantiles.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.
        quantiles : float, optional
            The quantiles at which to evaluate, by default 0.5 (median).
        method : str, optional
            The interpolation method, by default 'nearest'. Can be any
            ``method`` keyword accepted by :func:`~np.quantile`.

        Returns
        -------
        y : ndarray of shape (n_samples, n_quantiles, [n_outputs])
            The predicted values. The ``n_outputs`` dimension is present only
            for multi-output regressors.
        """
        return self.estimator_.predict_quantiles(X, quantiles, method)

    def get_leaf_node_samples(self, X):
        """Return the leaf-node samples for each sample in X (delegated to the fitted estimator)."""
        return self.estimator_.get_leaf_node_samples(X)


def _accumulate_prediction(tree, X, out, lock, indices=None):
    """
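A minimal usage sketch for the new pass-through methods above; the import path, dataset, and parameter values are illustrative assumptions, not part of the diff:

import numpy as np
from sktree import HonestForestClassifier  # assumed export path

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
y = (X[:, 0] > 0).astype(int)

est = HonestForestClassifier(n_estimators=10, random_state=0).fit(X, y)

X_leaves = est.apply(X)  # (n_samples, n_estimators) leaf indices
indicator, n_nodes_ptr = est.decision_path(X)  # CSR node-indicator matrix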
221 changes: 215 additions & 6 deletions sktree/experimental/forest.py
@@ -1,24 +1,51 @@
from copy import copy

import numpy as np
from numpy.typing import ArrayLike
from scipy.stats import entropy
from sklearn.base import BaseEstimator, MetaEstimatorMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble._forest import BaseForest

from sktree.utils import check_is_forest

from .ksg import entropy_continuous


class SupervisedInfoForest(BaseEstimator, MetaEstimatorMixin):
    def __init__(self, estimator, y_categorical: bool = False, n_jobs=None, random_state=None):
"""Meta estimator for mutual information.
This supervised forest estimator uses two supervised forests to estimate
the (conditional) mutual information between X and Y given Z.
Parameters
----------
estimator : _type_
_description_
y_categorical : bool, optional
_description_, by default False
n_jobs : _type_, optional
_description_, by default None
random_state : _type_, optional
_description_, by default None
"""
self.estimator = estimator
self.y_categorical = y_categorical
self.n_jobs = n_jobs
self.random_state = random_state

    def fit(self, X, y, Z=None):
        X, y = self._validate_data(X, y, accept_sparse="csc")
        check_is_forest(self.estimator, allow_tree=False, ensure_fitted=False)

        self.estimator_yxz_ = copy(self.estimator)
        self.estimator_yz_ = copy(self.estimator)
        # self.estimator_yxz_ = CalibratedClassifierCV(
        #     copy(self.estimator), method='isotonic', cv=5,
        # )
        # self.estimator_yz_ = CalibratedClassifierCV(
        #     copy(self.estimator), method='isotonic', cv=5,
        # )

        if Z is not None:
            XZ = np.hstack((X, Z))
@@ -40,9 +67,10 @@ def fit(self, X, y, Z=None):

        return self

    def predict_cmi(self, X, y, Z=None):
        X, y = self._validate_data(X, y, accept_sparse="csc")
        if Z is not None:
            Z = self._validate_data(Z, accept_sparse="csc")

@@ -51,8 +79,16 @@ def predict_cmi(self, X, Z=None):
        else:
            XZ = X

        # if not fitted yet
        if not hasattr(self, "estimator_yz_"):
            self.fit(X, y, Z)

        # compute the entropy H(Y | X, Z); assumes the fitted forest exposes
        # its per-tree bootstrap indices via ``estimators_samples_``
        H_yxz = cond_entropy(
            self.estimator_yxz_,
            XZ,
            y,
            self.estimator_yxz_.estimators_samples_,
            kappa=3,
            seed=self.random_state,
        )

        # compute the entropy H(Y | Z)
        if Z is None:
@@ -62,6 +98,179 @@ def predict_cmi(self, X, Z=None):
            else:
                H_yz = self.H_yz_
        else:
            H_yz = cond_entropy(
                self.estimator_yz_,
                Z,
                y,
                self.estimator_yz_.estimators_samples_,
                kappa=3,
                seed=self.random_state,
            )

        # I(X; Y | Z) = H(Y | Z) - H(Y | X, Z)
        return H_yz - H_yxz
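For context, a hedged end-to-end sketch of the meta-estimator; the base forest and synthetic data are illustrative assumptions:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sktree.experimental.forest import SupervisedInfoForest

rng = np.random.default_rng(0)
Z = rng.normal(size=(500, 2))
X = Z[:, :1] + 0.1 * rng.normal(size=(500, 1))
y = (Z[:, 0] + X[:, 0] > 0).astype(int)

meta = SupervisedInfoForest(
    RandomForestClassifier(n_estimators=100, random_state=0),
    y_categorical=True,
    random_state=0,
)
meta.fit(X, y, Z=Z)
cmi = meta.predict_cmi(X, y, Z=Z)  # approximate I(X; Y | Z) in nats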


def approx_marginal(est: BaseForest, X: ArrayLike, weighted: bool = False):
    """Compute the approximate marginal distribution.

    A fitted tree/forest can be used to approximate the marginal distribution
    by counting the (weighted) number of training samples that fall into the
    leaf node reached by each sample in X.

    Parameters
    ----------
    est : Forest
        Fitted forest or tree estimator.
    X : ArrayLike of shape (n_samples, n_features_in_)
        Samples to compute the approximate marginal distribution for.
    weighted : bool, default=False
        Whether to use the weighted number of samples in each leaf node.
    """
    check_is_forest(est)
    n_samples = X.shape[0]
    p_leaves = np.zeros((n_samples, est.n_estimators))

    # leaf indices (n_samples, n_estimators)
    X_leaves = est.apply(X)

    for idx, tree in enumerate(est.estimators_):
        tree_ = tree.tree_

        # compute the empirical distribution of the leaf nodes
        # (n_nodes,) array
        if weighted:
            n_node_samples = tree_.weighted_n_node_samples
        else:
            n_node_samples = tree_.n_node_samples

        # total (weighted) number of samples used in this tree, i.e. the
        # count at the root node
        total_samples_in_tree = n_node_samples[0]

        # compute the empirical distribution of the leaf nodes for each sample in X
        tree_leaves = X_leaves[:, idx]
        p_leaves[:, idx] = n_node_samples[tree_leaves] / total_samples_in_tree

    # return the average over all trees
    return p_leaves.mean(axis=1)
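A short sketch of this helper in isolation; the forest and data are illustrative, and any fitted sklearn-style forest that passes check_is_forest should work:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 3))
y = (X[:, 0] > 0).astype(int)

forest = RandomForestClassifier(n_estimators=25, random_state=1).fit(X, y)
p_x = approx_marginal(forest, X)  # (n_samples,) leaf-mass estimate per sample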


def cond_entropy(
    est: BaseForest, X: ArrayLike, y: ArrayLike, estimators_samples_: ArrayLike, kappa=3, seed=None
):
    """Estimate the conditional entropy H(y | X) from a fitted forest.

    For each tree, the samples not used to build that tree are split at
    random into a voting set, which fills in per-leaf class counts, and an
    evaluation set, whose posterior entropies are averaged.
    """
    rng = np.random.default_rng(seed)

    check_is_forest(est)
    # if "multioutput" in est._get_tags():
    #     raise ValueError("Multioutput is not supported.")

    # get leaves of each tree for each sample (n_samples, n_estimators)
    X_leaves = est.apply(X)
    n_classes = est.n_classes_
    n_samples = X.shape[0]

    total_entropy = 0.0
    for tree_idx, tree in enumerate(est.estimators_):
        # (n_nodes,) array of number of samples
        node_counts = tree.tree_.n_node_samples
        n_nodes = len(node_counts)

        # (n_nodes, n_classes_)
        class_counts = np.zeros((n_nodes, n_classes))

        # Find the indices of the training set used for partition.
        sampled_indices = estimators_samples_[tree_idx]
        unsampled_indices = np.delete(np.arange(0, n_samples), sampled_indices)

        # Randomly split the rest into voting and evaluation.
        total_unsampled = len(unsampled_indices)
        rng.shuffle(unsampled_indices)
        vote_indices = unsampled_indices[: total_unsampled // 2]
        eval_indices = unsampled_indices[total_unsampled // 2 :]

        # true classes of the voting points
        true_classes = y[vote_indices]

        # leaf node reached by each voting point
        tree_vote_leaves = X_leaves[vote_indices, tree_idx]
        n_vote_samples = len(tree_vote_leaves)
        for i in range(n_vote_samples):
            class_counts[tree_vote_leaves[i], true_classes[i]] += 1

        # compute the total number of samples in each leaf node,
        # then compute probabilities per class for each node
        # (n_nodes, n_classes_)
        n_node_leaves = class_counts.sum(axis=1)
        class_probs = _finite_sample_correction(class_counts, n_node_leaves, n_classes, kappa=kappa)

        # Place evaluation points in their corresponding leaf node.
        # Store evaluation posterior in a num_eval-by-num_class matrix.
        tree_eval_leaves = tree.apply(X[eval_indices, :])
        eval_class_probs = class_probs[tree_eval_leaves, :]

        # compute the entropy per sample
        eval_entropies = [entropy(posterior) for posterior in eval_class_probs]
        total_entropy += np.mean(eval_entropies)
    return total_entropy / est.n_estimators
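predict_cmi combines two such estimates via I(X; Y | Z) = H(Y | Z) - H(Y | X, Z). A hedged direct call looks like the following; the data and forest are illustrative, and sklearn forests are assumed to expose per-tree bootstrap indices via estimators_samples_:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(2)
X = rng.normal(size=(300, 2))
y = (X[:, 0] + 0.5 * rng.normal(size=300) > 0).astype(int)

forest = RandomForestClassifier(n_estimators=50, random_state=2).fit(X, y)
H_y_given_x = cond_entropy(forest, X, y, forest.estimators_samples_, kappa=3, seed=2)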


def _finite_sample_correction(class_counts, n_samples_per_node, n_classes, kappa=3):
    """Perform finite sample correction on class probabilities.

    This applies the following procedure: if a leaf (i.e. node) has zero
    observed samples for a certain class, then the probability of that class
    is replaced with ``1 / (kappa * n_node_samples)``, where ``kappa`` is a
    corrective-factor hyperparameter and ``n_node_samples`` is the number of
    samples in that node; the rows are then renormalized.

    Parameters
    ----------
    class_counts : array-like of shape (n_nodes, n_classes)
        Class counts in each node.
    n_samples_per_node : array-like of shape (n_nodes,)
        Total number of samples in each node.
    n_classes : int
        Number of classes.
    kappa : float, default=3
        Corrective factor hyperparameter.
    """
    # Avoid divide by zero.
    n_samples_per_node[n_samples_per_node == 0] = 1

    class_probs = np.divide(class_counts, n_samples_per_node[:, np.newaxis])

    # Make the nodes that have no estimation indices uniform.
    # This includes non-leaf nodes, but that will not affect the estimate.
    class_probs[np.argwhere(class_probs.sum(axis=1) == 0)] = 1.0 / n_classes

    # Apply finite sample correction and renormalize.
    where_0 = np.argwhere(class_probs == 0)
    for row, col in where_0:
        class_probs[row, col] = 1.0 / (kappa * class_counts[row].sum())
    row_sums = class_probs.sum(axis=1)
    class_probs = class_probs / row_sums[:, None]

    return class_probs
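A worked toy example of the correction, with values computed by hand from the procedure above:

import numpy as np

# two leaves, two classes; leaf 0 never saw class 1 among its 5 voting samples
class_counts = np.array([[5.0, 0.0], [2.0, 2.0]])
n_per_node = class_counts.sum(axis=1)

probs = _finite_sample_correction(class_counts, n_per_node, n_classes=2, kappa=3)
# leaf 0: the zero entry becomes 1 / (3 * 5) = 1/15, then the row renormalizes
# to [15/16, 1/16] = [0.9375, 0.0625]; leaf 1 stays [0.5, 0.5]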


def approx_joint(est: BaseForest, X: ArrayLike, P_X: ArrayLike):
    """Compute the approximate joint distribution.

    A fitted tree/forest can be used to compute the approximate joint
    distribution P(Y, X) by multiplying the conditional distribution
    P(Y | X), estimated by the forest's class posteriors, with the
    approximate marginal P(X).

    Parameters
    ----------
    est : Forest
        The fitted supervised forest.
    X : ArrayLike of shape (n_samples, n_features_in_)
        Input data.
    P_X : ArrayLike of shape (n_samples,)
        The approximate marginal distribution of X.
    """
    check_is_forest(est)
    if est._get_tags().get("multioutput", False):
        raise ValueError("Multioutput is not supported.")

    # P(Y | X) as an (n_samples, n_classes) posterior per sample
    y_proba = est.predict_proba(X)

    # P(Y, X) = P(Y | X) * P(X)
    for iclass in range(est.n_classes_):
        y_proba[:, iclass] *= P_X

    return y_proba
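Finally, a hedged sketch chaining the helpers to obtain a per-sample joint mass P(y, x) = P(y | x) P(x). The honest forest is used here since its multioutput tag is False; the import path and data are assumptions:

import numpy as np
from sktree import HonestForestClassifier  # assumed export path

rng = np.random.default_rng(3)
X = rng.normal(size=(200, 3))
y = (X[:, 0] > 0).astype(int)

forest = HonestForestClassifier(n_estimators=25, random_state=3).fit(X, y)
p_x = approx_marginal(forest, X)      # (n_samples,)
p_yx = approx_joint(forest, X, p_x)   # (n_samples, n_classes)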
