Skip to content

Commit 489b5e3

Browse files
committed
Fix handling of NaN values in MedianCDFQuantileScorer
NaN values are now correctly counted when estimating the anomaly score. Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
1 parent b2e75a7 commit 489b5e3

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

dowhy/gcm/anomaly_scorers.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@
1313

1414
class MedianCDFQuantileScorer(AnomalyScorer):
1515
"""Given an anomalous observation x and samples from the distribution of X, this score represents:
16-
score(x) = 1 - 2 * min[P(X >= x), P(X <= x)]
16+
score(x) = 1 - 2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2]
17+
18+
Two NaN values are considered equal when compared here.
1719
1820
It scores the observation based on the quantile of x with respect to the distribution of X. Here, if the
1921
sample x lies in the tail of the distribution, we want to have a large score. Since we a priori don't know
@@ -27,7 +29,7 @@ class MedianCDFQuantileScorer(AnomalyScorer):
2729
P(X >= x) = 1 / 7
2830
P(X <= x) = 6 / 7
2931
With the end score of:
30-
1 - 2 * min[P(X >= x), P(X <= x)] = 1 - 2 / 7 = 0.71
32+
1 - 2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2] = 1 - 2 / 7 = 0.71
3133
3234
Note: For equal samples, we contribute half of the count to the left side and half of the count to the right side.
3335
@@ -49,7 +51,7 @@ def score(self, X: np.ndarray) -> np.ndarray:
4951

5052
X = shape_into_2d(X)
5153

52-
equal_samples = np.sum(X == self._distribution_samples, axis=1)
54+
equal_samples = np.sum(np.isclose(X, self._distribution_samples, rtol=0, atol=0, equal_nan=True), axis=1)
5355
greater_samples = np.sum(X > self._distribution_samples, axis=1) + equal_samples / 2
5456
smaller_samples = np.sum(X < self._distribution_samples, axis=1) + equal_samples / 2
5557

@@ -60,7 +62,9 @@ def score(self, X: np.ndarray) -> np.ndarray:
6062

6163
class RescaledMedianCDFQuantileScorer(AnomalyScorer):
6264
"""Given an anomalous observation x and samples from the distribution of X, this score represents:
63-
score(x) = -log(2 * min[P(X >= x), P(X <= x)])
65+
score(x) = -log(2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2])
66+
67+
Two NaN values are considered equal when compared here.
6468
6569
This is a rescaled version of the score s obtained by the :class:`~dowhy.gcm.anomaly_scorers.MedianCDFQuantileScorer`
6670
by calculating the negative log-probability -log(1 - s). This has the advantage that small differences in the

tests/gcm/test_anomaly_scorers.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22
from pytest import approx
33

4-
from dowhy.gcm import MedianCDFQuantileScorer, MedianDeviationScorer
4+
from dowhy.gcm import MedianCDFQuantileScorer, MedianDeviationScorer, RescaledMedianCDFQuantileScorer
55

66

77
def test_given_simple_toy_data_when_using_MedianCDFQuantileScorer_then_returns_expected_scores():
@@ -18,3 +18,14 @@ def test_given_simple_toy_data_when_using_MedianDeviationScorer_then_returns_exp
1818
anomaly_scorer = MedianDeviationScorer()
1919
anomaly_scorer.fit(np.array(range(0, 20)) / 10)
2020
assert anomaly_scorer.score(np.array([0.8, 1.7])).reshape(-1) == approx(np.array([0.2, 1]), abs=0.1)
21+
22+
23+
def test_given_data_with_nans_when_using_median_quantile_scorer_with_nan_support_then_returns_expected_scores():
24+
training_data = np.array([1, 2, 3, 4, 5, 6, 7, 8, np.nan, np.nan])
25+
26+
scorer = RescaledMedianCDFQuantileScorer()
27+
scorer.fit(training_data)
28+
29+
assert scorer.score(np.array([1, 4, 8, np.nan])) == approx(
30+
[-np.log(2 * 0.5 / 10), -np.log(2 * 3.5 / 10), -np.log(2 * 0.5 / 10), -np.log(2 * 1 / 10)]
31+
)

0 commit comments

Comments
 (0)