diff --git a/dowhy/gcm/anomaly_scorers.py b/dowhy/gcm/anomaly_scorers.py index ff9fbc8eae..c5c41f35d8 100644 --- a/dowhy/gcm/anomaly_scorers.py +++ b/dowhy/gcm/anomaly_scorers.py @@ -13,7 +13,9 @@ class MedianCDFQuantileScorer(AnomalyScorer): """Given an anomalous observation x and samples from the distribution of X, this score represents: - score(x) = 1 - 2 * min[P(X >= x), P(X <= x)] + score(x) = 1 - 2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2] + + Two NaN values are considered equal here. It scores the observation based on the quantile of x with respect to the distribution of X. Here, if the sample x lies in the tail of the distribution, we want to have a large score. Since we apriori don't know @@ -27,7 +29,7 @@ class MedianCDFQuantileScorer(AnomalyScorer): p(X >= x) = 1 / 7 P(X <= x) = 6 / 7 With the end score of: - 1 - 2 * min[P(X >= x), P(X <= x)] = 1 - 2 / 7 = 0.71 + 1 - 2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2] = 1 - 2 / 7 = 0.71 Note: For equal samples, we contribute half of the count to the left and half of the count the right side. @@ -49,7 +51,7 @@ def score(self, X: np.ndarray) -> np.ndarray: X = shape_into_2d(X) - equal_samples = np.sum(X == self._distribution_samples, axis=1) + equal_samples = np.sum(np.isclose(X, self._distribution_samples, rtol=0, atol=0, equal_nan=True), axis=1) greater_samples = np.sum(X > self._distribution_samples, axis=1) + equal_samples / 2 smaller_samples = np.sum(X < self._distribution_samples, axis=1) + equal_samples / 2 @@ -60,7 +62,9 @@ def score(self, X: np.ndarray) -> np.ndarray: class RescaledMedianCDFQuantileScorer(AnomalyScorer): """Given an anomalous observation x and samples from the distribution of X, this score represents: - score(x) = -log(2 * min[P(X >= x), P(X <= x)]) + score(x) = -log(2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2]) + + Two NaN values are considered equal here. 
This is a rescaled version of the score s obtained by the :class:`~dowhy.gcm.anomaly_scorers.MedianCDFQuantileScorer` by calculating the negative log-probability -log(1 - s). This has the advantage that small differences in the diff --git a/tests/gcm/test_anomaly_scorers.py b/tests/gcm/test_anomaly_scorers.py index 8c408588f4..c7f45e01fc 100644 --- a/tests/gcm/test_anomaly_scorers.py +++ b/tests/gcm/test_anomaly_scorers.py @@ -1,7 +1,7 @@ import numpy as np from pytest import approx -from dowhy.gcm import MedianCDFQuantileScorer, MedianDeviationScorer +from dowhy.gcm import MedianCDFQuantileScorer, MedianDeviationScorer, RescaledMedianCDFQuantileScorer def test_given_simple_toy_data_when_using_MedianCDFQuantileScorer_then_returns_expected_scores(): @@ -18,3 +18,14 @@ def test_given_simple_toy_data_when_using_MedianDeviationScorer_then_returns_exp anomaly_scorer = MedianDeviationScorer() anomaly_scorer.fit(np.array(range(0, 20)) / 10) assert anomaly_scorer.score(np.array([0.8, 1.7])).reshape(-1) == approx(np.array([0.2, 1]), abs=0.1) + + +def test_given_data_with_nans_when_using_median_quantile_scorer_with_nan_support_then_returns_expected_scores(): + training_data = np.array([1, 2, 3, 4, 5, 6, 7, 8, np.nan, np.nan]) + + scorer = RescaledMedianCDFQuantileScorer() + scorer.fit(training_data) + + assert scorer.score(np.array([1, 4, 8, np.nan])) == approx( + [-np.log(2 * 0.5 / 10), -np.log(2 * 3.5 / 10), -np.log(2 * 0.5 / 10), -np.log(2 * 1 / 10)] + )