New kcd PR

Signed-off-by: Adam Li <adam2392@gmail.com>
py-why · Aug 30, 2023 · b91d4f5 · b91d4f5
1 parent 256d8c9
commit b91d4f5
Show file tree

Hide file tree

Showing 11 changed files with 912 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@ Minimally, pywhy-stats requires:
     * Python (>=3.8)
     * numpy
     * scipy
+    * scikit-learn
 
 ## User Installation
 

diff --git a/doc/api.rst b/doc/api.rst
@@ -64,3 +64,17 @@ of many data analysis procedures.
    fisherz
    kci
    power_divergence
+
+(Conditional) K-Sample Testing
+==============================
+
+Testing for invariances among conditional distributions is a core part
+of many data analysis procedures. Currently, we only support conditional
+2-sample testing among two distributions.
+
+.. currentmodule:: pywhy_stats.conditional_ksample
+.. autosummary::
+   :toctree: generated/
+
+   bregman
+   kcd
diff --git a/doc/conditional_independence.rst b/doc/conditional_independence.rst
@@ -177,6 +177,35 @@ indices of the distribution, one can convert the CD test:
 :math:`P_{i=j}(y|x) =? P_{i=k}(y|x)` into the CI test :math:`P(y|x,i) = P(y|x)`, which can
 be tested with the Chi-square CI tests.
 
+:mod:`pywhy_stats.conditional_ksample.kcd` Kernel-Approaches
+------------------------------------------------------------
+Kernel-based tests are attractive since they are semi-parametric and use kernel-based ideas
+that have been shown to be robust in the machine-learning field. The Kernel CD test is a test
+that computes a test statistic from kernels of the data and uses a weighted permutation testing
+based on the estimated propensity scores to generate samples from the null distribution
+:footcite:`Park2021conditional`, which are then used to estimate a pvalue.
+
+.. currentmodule:: pywhy_stats.conditional_ksample
+.. autosummary::
+   :toctree: generated/
+
+    kcd
+
+
+:mod:`pywhy_stats.conditional_ksample.bregman` Bregman-Divergences
+------------------------------------------------------------------
+The Bregman CD test is a divergence-based test
+that computes a test statistic from estimated Von-Neumann divergences of the data and uses a
+weighted permutation testing based on the estimated propensity scores to generate samples from the null distribution
+:footcite:`Yu2020Bregman`, which are then used to estimate a pvalue.
+
+
+.. currentmodule:: pywhy_stats.conditional_ksample
+.. autosummary::
+   :toctree: generated/
+
+    bregman
+
 ==========
 References
 ==========

diff --git a/doc/whats_new/v0.1.rst b/doc/whats_new/v0.1.rst
@@ -29,7 +29,7 @@ Changelog
 - |Feature| Implement partial correlation test :func:`pywhy_stats.independence.fisherz`, by `Adam Li`_ (:pr:`7`)
 - |Feature| Add (un)conditional kernel independence test by `Patrick Blöbaum`_, co-authored by `Adam Li`_ (:pr:`14`)
 - |Feature| Add categorical independence tests by `Adam Li`_, (:pr:`18`)
-
+- |Feature| Add conditional kernel and Bregman discrepancy tests, `pywhy_stats.kcd` and `pywhy_stats.bregman` by `Adam Li`_ (:pr:`19`)
 
 Code and Documentation Contributors
 -----------------------------------
@@ -38,4 +38,4 @@ Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version inception, including:
 
 * `Adam Li`_
-
+* `Patrick Blöbaum`_
diff --git a/pywhy_stats/__init__.py b/pywhy_stats/__init__.py
@@ -1,4 +1,5 @@
 from ._version import __version__  # noqa: F401
 from .api import Methods, independence_test
+from .conditional_ksample import bregman, kcd
 from .independence import fisherz, kci, power_divergence
 from .pvalue_result import PValueResult
diff --git a/pywhy_stats/conditional_ksample/__init__.py b/pywhy_stats/conditional_ksample/__init__.py
@@ -0,0 +1 @@
+from . import bregman, kcd
diff --git a/pywhy_stats/conditional_ksample/base.py b/pywhy_stats/conditional_ksample/base.py
@@ -0,0 +1,131 @@
+from typing import Callable, Optional
+
+import numpy as np
+from joblib import Parallel, delayed
+from numpy.typing import ArrayLike
+from sklearn.base import BaseEstimator
+from sklearn.linear_model import LogisticRegression
+
+from pywhy_stats.kernel_utils import _default_regularization
+
+
+def _preprocess_propensity_data(
+    group_ind: ArrayLike,
+    propensity_model: Optional[BaseEstimator],
+    propensity_weights: Optional[ArrayLike],
+):
+    if group_ind.ndim != 1:
+        raise RuntimeError("group_ind must be a 1d array.")
+    if len(np.unique(group_ind)) != 2:
+        raise RuntimeError(
+            f"There should only be two groups. Found {len(np.unique(group_ind))} groups."
+        )
+    if propensity_model is not None and propensity_weights is not None:
+        raise ValueError(
+            "Both propensity model and propensity estimates are specified. Only one is allowed."
+        )
+    if propensity_weights is not None:
+        if propensity_weights.shape[0] != len(group_ind):
+            raise ValueError(
+                f"There are {propensity_weights.shape[0]} pre-defined estimates, while "
+                f"there are {len(group_ind)} samples."
+            )
+        if propensity_weights.shape[1] != len(np.unique(group_ind.squeeze())):
+            raise ValueError(
+                f"There are {propensity_weights.shape[1]} group pre-defined estimates, while "
+                f"there are {len(np.unique(group_ind))} unique groups."
+            )
+
+
+def _compute_propensity_scores(
+    group_ind: ArrayLike,
+    propensity_model: Optional[BaseEstimator] = None,
+    propensity_weights: Optional[ArrayLike] = None,
+    n_jobs: Optional[int] = None,
+    random_state: Optional[int] = None,
+    **kwargs,
+):
+    if propensity_model is None:
+        K: ArrayLike = kwargs.get("K")
+
+        # compute a default penalty term if using a kernel matrix
+        # C is the inverse of the regularization parameter
+        if K.shape[0] == K.shape[1]:
+            # default regularization is 1 / (2 * K)
+            propensity_penalty_ = _default_regularization(K)
+            C = 1 / (2 * propensity_penalty_)
+        else:
+            # defaults to no regularization
+            propensity_penalty_ = 0.0
+            C = 1.0
+
+        # default model is logistic regression
+        propensity_model_ = LogisticRegression(
+            penalty="l2",
+            n_jobs=n_jobs,
+            warm_start=True,
+            solver="lbfgs",
+            random_state=random_state,
+            C=C,
+        )
+    else:
+        propensity_model_ = propensity_model
+
+    # either use pre-defined propensity weights, or estimate them
+    if propensity_weights is None:
+        K = kwargs.get("K")
+        # fit and then obtain the probabilities of treatment
+        # for each sample (i.e. the propensity scores)
+        propensity_weights = propensity_model_.fit(K, group_ind.ravel()).predict_proba(K)[:, 1]
+    else:
+        propensity_weights = propensity_weights[:, 1]
+    return propensity_weights
+
+
+def compute_null(
+    func: Callable,
+    e_hat: ArrayLike,
+    X: ArrayLike,
+    Y: ArrayLike,
+    null_reps: int = 1000,
+    n_jobs=None,
+    seed=None,
+    **kwargs,
+) -> ArrayLike:
+    """Estimate null distribution using propensity weights.
+
+    Parameters
+    ----------
+    func : Callable
+        The function to compute the test statistic.
+    e_hat : Array-like of shape (n_samples,)
+        The predicted propensity score for ``group_ind == 1``.
+    X : Array-Like of shape (n_samples, n_features_x)
+        The X (covariates) array.
+    Y : Array-Like of shape (n_samples, n_features_y)
+        The Y (outcomes) array.
+    null_reps : int, optional
+        Number of times to sample null, by default 1000.
+    n_jobs : int, optional
+        Number of jobs to run in parallel, by default None.
+    seed : int, optional
+        Random generator, or random seed, by default None.
+
+    Returns
+    -------
+    null_dist : Array-like of shape (n_samples,)
+        The null distribution of test statistics.
+    """
+    rng = np.random.default_rng(seed)
+    n_samps = X.shape[0]
+
+    # compute the test statistic on the conditionally permuted
+    # dataset, where each group label is resampled for each sample
+    # according to its propensity score
+    null_dist = Parallel(n_jobs=n_jobs)(
+        [
+            delayed(func)(X, Y, group_ind=rng.binomial(1, e_hat, size=n_samps), **kwargs)
+            for _ in range(null_reps)
+        ]
+    )
+    return np.asarray(null_dist)