Add new method to estimate KL divergence using classifier
This should work better with multivariate data and mixed data types. However, it is slower than the kNN approach.

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
bloebp committed Oct 20, 2023
1 parent 1540584 commit 716e1f5
Showing 5 changed files with 153 additions and 19 deletions.
80 changes: 78 additions & 2 deletions dowhy/gcm/divergence.py
@@ -1,12 +1,16 @@
"""Functions in this module should be considered experimental, meaning there might be breaking API changes in the
future.
"""
from typing import Callable, Union

import numpy as np
from scipy.stats import entropy
from sklearn.model_selection import KFold
from sklearn.neighbors import NearestNeighbors

from dowhy.gcm.auto import AssignmentQuality, select_model
from dowhy.gcm.constant import EPS
from dowhy.gcm.ml.classification import ClassificationModel
from dowhy.gcm.util.general import is_categorical, setdiff2d, shape_into_2d


@@ -16,14 +20,20 @@ def auto_estimate_kl_divergence(X: np.ndarray, Y: np.ndarray) -> float:
elif is_probability_matrix(X):
return estimate_kl_divergence_of_probabilities(X, Y)
else:
return estimate_kl_divergence_continuous(X, Y)
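# The classifier-based estimator tends to handle multivariate (and mixed-type) data better,
# while the faster kNN estimator is kept for univariate data.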
if X.ndim == 2 and X.shape[1] > 1:
return estimate_kl_divergence_continuous_clf(X, Y)
else:
return estimate_kl_divergence_continuous_knn(X, Y)


def estimate_kl_divergence_continuous(
def estimate_kl_divergence_continuous_knn(
X: np.ndarray, Y: np.ndarray, k: int = 1, remove_common_elements: bool = True
) -> float:
"""Estimates KL-Divergence using k-nearest neighbours (Wang et al., 2009).
While, in theory, this handles multidimensional inputs, consider using estimate_kl_divergence_continuous_clf
for data with more than one dimension.
Q. Wang, S. R. Kulkarni, and S. Verdú,
"Divergence estimation for multidimensional densities via k-nearest-neighbor distances",
IEEE Transactions on Information Theory, vol. 55, no. 5, pp. 2392-2405, May 2009.
@@ -82,6 +92,70 @@ def estimate_kl_divergence_continuous(
return result


def estimate_kl_divergence_continuous_clf(
samples_P: np.ndarray,
samples_Q: np.ndarray,
n_splits: int = 3,
classifier_model: Union[AssignmentQuality, Callable[[], ClassificationModel]] = AssignmentQuality.GOOD,
epsilon: float = EPS,
) -> float:
"""Estimates KL-Divergence based on probabilities given by classifier. This is:
D_f(P || Q) = \int f(p(x)/q(x)) q(x) dx ~= -1/N \sum_x log(p(Y = 1 | x) / (1 - p(Y = 1 | x)))
Here, the KL divergence can be approximated using the log ratios of probabilities to predict whether a sample
comes from distribution P or Q.
:param samples_P: Samples drawn from P. Can have a different number of samples than Q.
:param samples_Q: Samples drawn from Q. Can have a different number of samples than P.
:param n_splits: Number of cross-fitting splits. In each split, the classifier is trained on the training
folds and evaluated on the held-out test fold to obtain the probabilities.
:param classifier_model: Used to estimate the probabilities for the log ratio. This can either be a
ClassificationModel or an AssignmentQuality. In the latter case, a model is automatically
selected based on its performance on a training set.
:param epsilon: If a predicted probability is exactly 0 or 1, it is clipped to avoid an undefined log ratio,
i.e., 0 becomes epsilon and 1 becomes 1 - epsilon.
:return: Estimated value of the KL divergence D(P||Q).
"""
samples_P, samples_Q = shape_into_2d(samples_P, samples_Q)

if samples_P.shape[1] != samples_Q.shape[1]:
raise ValueError("X and Y need to have the same number of features!")

all_probs = []

splits_p = list(KFold(n_splits=n_splits, shuffle=True).split(samples_P))
splits_q = list(KFold(n_splits=n_splits, shuffle=True).split(samples_Q))

if isinstance(classifier_model, AssignmentQuality):
classifier_model = select_model(
np.vstack([samples_P, samples_Q]),
np.concatenate([np.zeros(samples_P.shape[0]), np.ones(samples_Q.shape[0])]).astype(str),
classifier_model,
)[0]
else:
classifier_model = classifier_model()

for k in range(n_splits):
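# Cross-fitting: train on the training folds of P and Q, then evaluate on the held-out fold of P,
# so the probabilities used for the log ratio are out-of-sample. Column 1 of the predicted
# probabilities corresponds to the class labeled "1", i.e., the probability of being drawn from Q.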
classifier_model.fit(
np.vstack([samples_P[splits_p[k][0]], samples_Q[splits_q[k][0]]]),
np.concatenate([np.zeros(len(splits_p[k][0])), np.ones(len(splits_q[k][0]))]).astype(str),
)

probs_P = classifier_model.predict_probabilities(samples_P[splits_p[k][1]])[:, 1]
probs_P[probs_P == 0] = epsilon
probs_P[probs_P == 1] = 1 - epsilon
all_probs.append(probs_P)

all_probs = np.concatenate(all_probs)
kl_divergence = -np.mean(np.log(all_probs / (1 - all_probs)))

if kl_divergence < 0:
kl_divergence = 0

return kl_divergence


def estimate_kl_divergence_categorical(X: np.ndarray, Y: np.ndarray) -> float:
X, Y = shape_into_2d(X, Y)

@@ -116,5 +190,7 @@ def estimate_kl_divergence_of_probabilities(X: np.ndarray, Y: np.ndarray) -> flo
def is_probability_matrix(X: np.ndarray) -> bool:
if X.ndim == 1:
return np.all(np.isclose(np.sum(abs(X.astype(np.float64)), axis=0), 1))
elif X.shape[1] == 1:
return False
else:
return np.all(np.isclose(np.sum(abs(X.astype(np.float64)), axis=1), 1))
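For illustration, here is a minimal usage sketch of the two continuous estimators above (the toy data and print statements are not part of the commit; the classifier-based estimate depends on the selected model and sample size):

import numpy as np

from dowhy.gcm.divergence import (
    estimate_kl_divergence_continuous_clf,
    estimate_kl_divergence_continuous_knn,
)

# Two-dimensional Gaussians shifted by 1 in each coordinate; the analytic KL divergence is 0.5 + 0.5 = 1.
X = np.random.normal(0, 1, (2000, 2))
Y = np.random.normal(1, 1, (2000, 2))

# Classifier-based estimate: cross-fitted log-odds of a sample being classified as drawn from Q rather than P.
print(estimate_kl_divergence_continuous_clf(X, Y))

# kNN-based estimate (Wang et al., 2009) for comparison; faster, but per the commit message less suited
# to multivariate or mixed data types.
print(estimate_kl_divergence_continuous_knn(X, Y))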
5 changes: 3 additions & 2 deletions dowhy/gcm/stochastic_models.py
@@ -14,7 +14,6 @@
from sklearn.mixture import BayesianGaussianMixture

from dowhy.gcm.causal_mechanisms import StochasticModel
from dowhy.gcm.divergence import estimate_kl_divergence_continuous
from dowhy.gcm.util.general import shape_into_2d

_CONTINUOUS_DISTRIBUTIONS = [
@@ -127,7 +126,9 @@ def find_suitable_continuous_distribution(
generated_samples = distribution.rvs(size=distribution_samples.shape[0], loc=loc, scale=scale, *arg)

# Check the KL divergence between the given samples and samples from the fitted distribution.
divergence = estimate_kl_divergence_continuous(distribution_samples, generated_samples)
from dowhy.gcm.divergence import estimate_kl_divergence_continuous_knn

divergence = estimate_kl_divergence_continuous_knn(distribution_samples, generated_samples)
if divergence < divergence_threshold:
currently_best_distribution = distribution
currently_best_parameters = params
25 changes: 18 additions & 7 deletions tests/gcm/test_arrow_strength.py
@@ -17,9 +17,13 @@
fit,
)
from dowhy.gcm.auto import assign_causal_mechanisms
from dowhy.gcm.divergence import estimate_kl_divergence_continuous
from dowhy.gcm.divergence import estimate_kl_divergence_continuous_knn
from dowhy.gcm.influence import arrow_strength_of_model
from dowhy.gcm.ml import create_linear_regressor, create_logistic_regression_classifier
from dowhy.gcm.ml import (
create_linear_regressor,
create_linear_regressor_with_given_parameters,
create_logistic_regression_classifier,
)


@pytest.fixture
@@ -34,10 +38,11 @@ def preserve_random_generator_state():
@flaky(max_runs=5)
def test_given_kl_divergence_attribution_func_when_estimate_arrow_strength_then_returns_expected_results():
causal_strengths = arrow_strength(
_create_causal_model(), "X2", difference_estimation_func=estimate_kl_divergence_continuous
_create_causal_model(), "X2", difference_estimation_func=estimate_kl_divergence_continuous_knn
)
assert causal_strengths[("X0", "X2")] == approx(2.76, abs=0.4)
assert causal_strengths[("X1", "X2")] == approx(1.6, abs=0.4)

assert causal_strengths[("X0", "X2")] == approx(1.2, abs=0.2)
assert causal_strengths[("X1", "X2")] == approx(0.3, abs=0.1)


@flaky(max_runs=5)
@@ -199,12 +204,18 @@ def _create_causal_model():
causal_model = ProbabilisticCausalModel(nx.DiGraph([("X1", "X2"), ("X0", "X2")]))
causal_model.set_causal_mechanism("X1", ScipyDistribution(stats.norm, loc=0, scale=1))
causal_model.set_causal_mechanism("X0", ScipyDistribution(stats.norm, loc=0, scale=1))
causal_model.set_causal_mechanism("X2", AdditiveNoiseModel(prediction_model=create_linear_regressor()))
causal_model.set_causal_mechanism(
"X2",
AdditiveNoiseModel(
prediction_model=create_linear_regressor_with_given_parameters([3, 1]),
noise_model=ScipyDistribution(stats.norm, loc=0, scale=1),
),
)

X0 = np.random.normal(0, 1, 1000)
X1 = np.random.normal(0, 1, 1000)

test_data = pd.DataFrame({"X0": X0, "X1": X1, "X2": 3 * X0 + X1 + np.random.normal(0, 0.2, X0.shape[0])})
test_data = pd.DataFrame({"X0": X0, "X1": X1, "X2": 3 * X0 + X1 + np.random.normal(0, 1, X0.shape[0])})
fit(causal_model, test_data)

return causal_model
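As a rough closed-form check on the updated expectations (1.2 and 0.3), assuming arrow_strength with a KL difference function compares the conditional distribution of X2 given its parents with the one obtained when the edge's parent is replaced by an independent copy of its marginal (an assumption about the semantics, not stated in this diff), the mechanism X2 = 3*X0 + X1 + N(0, 1) gives:

import numpy as np

# Cutting X0 -> X2 turns N(3*x0 + x1, 1) into N(x1, 1 + 9 * Var(X0)) = N(x1, 10); averaging
# KL(original || modified) over x0 ~ N(0, 1):
kl_x0 = 0.5 * np.log(10) + (1 + 9) / (2 * 10) - 0.5  # ~1.15, consistent with approx(1.2, abs=0.2)

# Cutting X1 -> X2 turns N(3*x0 + x1, 1) into N(3*x0, 1 + Var(X1)) = N(3*x0, 2); averaging over x1 ~ N(0, 1):
kl_x1 = 0.5 * np.log(2) + (1 + 1) / (2 * 2) - 0.5  # ~0.35, consistent with approx(0.3, abs=0.1)

print(kl_x0, kl_x1)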
56 changes: 51 additions & 5 deletions tests/gcm/test_divergence.py
@@ -5,7 +5,8 @@
from dowhy.gcm.divergence import (
auto_estimate_kl_divergence,
estimate_kl_divergence_categorical,
estimate_kl_divergence_continuous,
estimate_kl_divergence_continuous_clf,
estimate_kl_divergence_continuous_knn,
estimate_kl_divergence_of_probabilities,
is_probability_matrix,
)
@@ -16,8 +17,8 @@ def test_given_simple_gaussian_data_when_estimate_kl_divergence_continuous_then_
X = np.random.normal(0, 1, 2000)
Y = np.random.normal(1, 1, 2000)

assert estimate_kl_divergence_continuous(X, X) == approx(0, abs=0.001)
assert estimate_kl_divergence_continuous(X, Y) == approx(0.5, abs=0.15)
assert estimate_kl_divergence_continuous_knn(X, X) == approx(0, abs=0.001)
assert estimate_kl_divergence_continuous_knn(X, Y) == approx(0.5, abs=0.15)


@flaky(max_runs=3)
@@ -81,5 +82,50 @@ def test_given_simple_gaussian_data_with_overlap_when_estimate_kl_divergence_con

Y[:10] = X[:10]

assert estimate_kl_divergence_continuous(X, X) == approx(0, abs=0.001)
assert estimate_kl_divergence_continuous(X, Y) == approx(0.5, abs=0.15)
assert estimate_kl_divergence_continuous_knn(X, X) == approx(0, abs=0.001)
assert estimate_kl_divergence_continuous_knn(X, Y) == approx(0.5, abs=0.15)


@flaky(max_runs=3)
def test_given_simple_gaussian_data_when_estimate_kl_divergence_continuous_clf_then_returns_correct_result():
X = np.random.normal(0, 1, 2000)
Y = np.random.normal(1, 1, 2000)

assert estimate_kl_divergence_continuous_clf(X, X) == approx(0, abs=0.001)
assert estimate_kl_divergence_continuous_clf(X, Y) == approx(0.5, abs=0.15)


@flaky(max_runs=3)
def test_given_multi_dim_simple_gaussian_data_when_estimate_kl_divergence_continuous_clf_then_returns_correct_result():
X = np.random.normal(0, 1, (2000, 2))
Y = np.random.normal(1, 1, (2000, 2))

assert estimate_kl_divergence_continuous_clf(X, X) == approx(0, abs=0.001)
assert estimate_kl_divergence_continuous_clf(X, Y) == approx(1, abs=0.2)


@flaky(max_runs=3)
def test_given_multi_dim_gaussian_and_categorical_data_when_estimate_kl_divergence_continuous_clf_then_returns_correct_result():
X0 = np.random.normal(0, 1, 2000)
X1 = np.random.choice(3, 2000, replace=True).astype(str)

X0_other = np.random.normal(1, 1, 2000)
X1_other = (np.random.choice(4, 2000, replace=True)).astype(str)

X = np.array([X0, X1], dtype=object).T

assert estimate_kl_divergence_continuous_clf(X, X) == approx(0, abs=0.001)
# Only Gaussian component changed
assert estimate_kl_divergence_continuous_clf(X, np.array([X0_other, X1], dtype=object).T) == approx(0.5, abs=0.15)
assert estimate_kl_divergence_continuous_clf(X, np.array([X0_other, X1_other], dtype=object).T) == approx(
0.78, abs=0.15
)


@flaky(max_runs=3)
def test_given_exponential_data_when_estimate_kl_divergence_continuous_then_returns_correct_result():
X = np.random.exponential(1, 2000)
Y = np.random.exponential(0.5, 2000)

assert estimate_kl_divergence_continuous_clf(X, Y) == approx(0.31, abs=0.1)
assert estimate_kl_divergence_continuous_knn(X, Y) == approx(0.31, abs=0.1)
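The expected values in these tests match closed-form KL divergences; a quick reference computation (not part of the commit):

import numpy as np

# Unit-variance Gaussians shifted by 1: KL(N(0, 1) || N(1, 1)) = (mu_1 - mu_2)^2 / 2 = 0.5.
# Two independent such dimensions add up to 1. In the mixed-type test, the categorical part adds
# sum_k (1/3) * log((1/3) / (1/4)) = log(4/3) ~= 0.29, giving 0.5 + 0.29 ~= 0.79 (asserted as 0.78).
kl_gaussian_shift = 0.5

# np.random.exponential(scale) has rate 1/scale, so the exponential test compares rates 1 and 2:
# KL(Exp(1) || Exp(2)) = log(1/2) + 2/1 - 1 ~= 0.31.
kl_exponential = np.log(1 / 2) + 2 - 1

print(kl_gaussian_shift, kl_exponential)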
6 changes: 3 additions & 3 deletions tests/gcm/test_fcms.py
@@ -21,7 +21,7 @@
fit,
)
from dowhy.gcm.auto import assign_causal_mechanisms
from dowhy.gcm.divergence import estimate_kl_divergence_continuous
from dowhy.gcm.divergence import estimate_kl_divergence_continuous_clf
from dowhy.gcm.ml import (
SklearnRegressionModel,
create_linear_regressor,
@@ -65,7 +65,7 @@ def test_given_linear_data_when_draw_samples_from_fitted_anm_then_generates_corr
generated_samples = scm.causal_mechanism("X1").draw_samples(np.array([2] * 1000))
assert np.mean(generated_samples) == approx(6, abs=0.05)
assert np.std(generated_samples) == approx(0.1, abs=0.05)
assert estimate_kl_divergence_continuous(
assert estimate_kl_divergence_continuous_clf(
test_data["X1"].to_numpy(), draw_samples(scm, 10000)["X1"].to_numpy()
) == approx(0, abs=0.05)

@@ -102,7 +102,7 @@ def test_given_categorical_input_data_when_draw_from_fitted_causal_graph_with_li
test_data[:, 2].astype(float).reshape(-1, 1)
)

assert estimate_kl_divergence_continuous(test_data[:, 2], draw_samples(scm, 1000)["X2"].to_numpy()) == approx(
assert estimate_kl_divergence_continuous_clf(test_data[:, 2], draw_samples(scm, 1000)["X2"].to_numpy()) == approx(
0, abs=0.05
)

