diff --git a/dowhy/gcm/anomaly_scorers.py b/dowhy/gcm/anomaly_scorers.py index 9a4735fa02..6fd800c8d1 100644 --- a/dowhy/gcm/anomaly_scorers.py +++ b/dowhy/gcm/anomaly_scorers.py @@ -14,6 +14,7 @@ class MedianCDFQuantileScorer(AnomalyScorer): """Given an anomalous observation x and samples from the distribution of X, this score represents: score(x) = 1 - 2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2] + Here, the value x is considered as part of X for the computation. Comparing two NaN values are considered equal here. @@ -26,14 +27,15 @@ class MedianCDFQuantileScorer(AnomalyScorer): X = [-3, -2, -1, 0, 1, 2, 3] x = 2.5 Then, x falls in the right sided-quantile and only one sample in X is larger than x. Therefore, we get - p(X >= x) = 1 / 7 - P(X <= x) = 6 / 7 - With the end score of: - 1 - 2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2] = 1 - 2 / 7 = 0.71 + P(X > x) = 1 / 8 + P(X < x) = 6 / 8 + P(X = x) = 1 / 8 + We divide by 8 here, because we consider x itself. + This gives us a score of: + 1 - 2 * min[P(X > x) + P(X = x) / 2, P(X < x) + P(X = x) / 2] = 1 - 3 / 8 = 0.625 Note: For equal samples, we contribute half of the count to the left and half of the count the right side. - - The higher the score, the less likely the sample comes from the distribution of X. + Note: For a statistically more rigorous, but also more conservative version, see RankBasedAnomalyScorer. """ def __init__(self): @@ -51,12 +53,12 @@ def score(self, X: np.ndarray) -> np.ndarray: X = shape_into_2d(X.astype(float)) - equal_samples = np.sum(np.isclose(X, self._distribution_samples, rtol=0, atol=0, equal_nan=True), axis=1) + equal_samples = np.sum(np.isclose(X, self._distribution_samples, rtol=0, atol=0, equal_nan=True), axis=1) + 1 greater_samples = np.sum(X > self._distribution_samples, axis=1) + equal_samples / 2 smaller_samples = np.sum(X < self._distribution_samples, axis=1) + equal_samples / 2 - return ( - 1 - 2 * np.amin(np.vstack([greater_samples, smaller_samples]), axis=0) / self._distribution_samples.shape[0] + return 1 - 2 * np.amin(np.vstack([greater_samples, smaller_samples]), axis=0) / ( + self._distribution_samples.shape[0] + 1 ) @@ -88,6 +90,75 @@ def score(self, X: np.ndarray) -> np.ndarray: return -np.log(scores) +class RankBasedAnomalyScorer(AnomalyScorer): + """Similar to the RescaledMedianCDFQuantileScorer, but this scorer is more directly based on ranks and the + assumption of exchangeability. + + This scorer computes anomaly scores for test samples by evaluating their ranks within the training samples (and a + given sample). For each test sample, the scorer computes its rank from above (number of samples greater than or + equal to it) and rank from below (number of samples less than or equal to it). It then calculates a p-value based + on these ranks, under the assumption of exchangeability. The p-value then represents the probability of observing + a rank as extreme as the observed rank or more extreme. + + Specifically, the p-value is computed as the minimum of: + 1. Twice the rank from above divided by the total number of samples. + 2. Twice the rank from below divided by the total number of samples. + 3. 1 (to ensure the p-value is at most 1). + + This method is non-parametric and makes no assumptions about the underlying distribution of the data. + + The anomaly score is then calculated as the negative log of this p-value (i.e. it is an information-theoretic + (IT) score). Higher anomaly scores indicate a lower probability, consequently, a higher likelihood of being an anomaly. + + For example: + X = [-3, -2, -1, 0, 1, 2, 3] + x = 2.5 + Then, + p(x >= X) = 7 / 8 + P(x <= X) = 1 / 8 + Which gives the p-value: + -log(min[1, 2 * 7 / 8, 2 * 1 / 8]) = -log(2 / 8) = + + """ + + def __init__(self): + self._distribution_samples = None + + def fit(self, X: np.ndarray) -> None: + if (X.ndim == 2 and X.shape[1] > 1) or X.ndim > 2: + raise ValueError("The RankBasedAnomalyScorer currently only supports one-dimensional data!") + + self._distribution_samples = X.reshape(-1) + + def score(self, X: np.ndarray) -> np.ndarray: + if self._distribution_samples is None: + raise ValueError("Scorer has not been fitted!") + + X = shape_into_2d(X) + + # Compute rank of every single test point in the union of the training and the respective test point. + # + 1 here to count the test sample itself. + equal_samples = np.sum(np.isclose(X, self._distribution_samples, rtol=0, atol=0, equal_nan=True), axis=1) + 1 + ranks_from_above = np.sum(X > self._distribution_samples, axis=1) + equal_samples + ranks_from_below = np.sum(X < self._distribution_samples, axis=1) + equal_samples + + # The probability to get at most rank k from above is k divided by the total number of samples. Similar for + # the case of below. Therefore, to get at most rank k either from above or below is + # min(2*k/total_num_samples, 1). We then get a p-value for exchangeability: + p_values = np.amin( + np.vstack( + [ + 2 * ranks_from_above / (self._distribution_samples.shape[0] + 1), + 2 * ranks_from_below / (self._distribution_samples.shape[0] + 1), + np.ones(X.shape[0]), + ] + ), + axis=0, + ) + + return -np.log(p_values) + + class ITAnomalyScorer(AnomalyScorer): """Transforms any anomaly scorer into an information theoretic (IT) score. This means, given a scorer S(x), an anomalous observation x and samples from the distribution of X, this scorer class represents: diff --git a/tests/gcm/test_anomaly.py b/tests/gcm/test_anomaly.py index 41bc943412..548e2b75e6 100644 --- a/tests/gcm/test_anomaly.py +++ b/tests/gcm/test_anomaly.py @@ -284,7 +284,7 @@ def test_given_multivariate_inputs_when_estimate_anomaly_scores_then_does_not_ra fit(causal_model, data) scores = anomaly_scores(causal_model, data_anomaly) - assert scores["X1"][0] == approx(-np.log(EPS), abs=3) + assert scores["X1"][0] == approx(-np.log(1 / (data.shape[0] + 1)), abs=3) @flaky(max_runs=3) diff --git a/tests/gcm/test_anomaly_scorers.py b/tests/gcm/test_anomaly_scorers.py index e28fc42c90..718cf14380 100644 --- a/tests/gcm/test_anomaly_scorers.py +++ b/tests/gcm/test_anomaly_scorers.py @@ -2,13 +2,14 @@ from pytest import approx from dowhy.gcm import MedianCDFQuantileScorer, MedianDeviationScorer, RescaledMedianCDFQuantileScorer +from dowhy.gcm.anomaly_scorers import RankBasedAnomalyScorer def test_given_simple_toy_data_when_using_MedianCDFQuantileScorer_then_returns_expected_scores(): anomaly_scorer = MedianCDFQuantileScorer() anomaly_scorer.fit(np.array(range(0, 20))) - assert anomaly_scorer.score(np.array([8, 17]))[0] == approx(1 - 17 / 20, abs=0.01) - assert anomaly_scorer.score(np.array([0])) == approx(1 - 1 / 20, abs=0.01) + assert anomaly_scorer.score(np.array([8, 17]))[0] == approx(1 - 18 / 21, abs=0.01) + assert anomaly_scorer.score(np.array([0])) == approx(1 - 2 / 21, abs=0.01) anomaly_scorer.fit(np.array([4.0 for i in range(200)])) assert anomaly_scorer.score(np.array([4])) == approx(0, abs=0.01) @@ -27,7 +28,7 @@ def test_given_data_with_nans_when_using_median_quantile_scorer_with_nan_support scorer.fit(training_data) assert scorer.score(np.array([1, 4, 8, np.nan])) == approx( - [-np.log(2 * 0.5 / 10), -np.log(2 * 3.5 / 10), -np.log(2 * 0.5 / 10), -np.log(2 * 1 / 10)] + [-np.log(2 * 1 / 11), -np.log(2 * 4 / 11), -np.log(2 * 1 / 11), -np.log(2 * 1.5 / 11)] ) @@ -38,5 +39,21 @@ def test_given_numpy_arrays_with_object_type_when_using_median_quantile_scorer_t scorer.fit(training_data) assert scorer.score(np.array([1, 4, 8, np.nan], dtype=object)) == approx( - [-np.log(2 * 0.5 / 10), -np.log(2 * 3.5 / 10), -np.log(2 * 0.5 / 10), -np.log(2 * 1 / 10)] + [-np.log(2 * 1 / 11), -np.log(2 * 4 / 11), -np.log(2 * 1 / 11), -np.log(2 * 1.5 / 11)] ) + + +def test_when_using_ranked_based_anomaly_scorer_then_returns_expected_scores(): + training_data = np.array([1, 2, 3, 4, 5, 6, 7, 8, np.nan, np.nan]) + + scorer = RankBasedAnomalyScorer() + scorer.fit(training_data) + + assert scorer.score(np.array([1, 4, 9, np.nan])) == approx( + [-np.log(2 * 2 / 11), -np.log(2 * 5 / 11), -np.log(2 * 1 / 11), -np.log(2 * 3 / 11)] + ) + + training_data = np.array([np.nan, np.nan, np.nan, np.nan]) + scorer.fit(training_data) + + assert scorer.score(np.array([1, np.nan])) == approx([-np.log(2 * 1 / 5), -np.log(1)]) diff --git a/tests/gcm/test_ml.py b/tests/gcm/test_ml.py index e3b856b132..43163f6cb1 100644 --- a/tests/gcm/test_ml.py +++ b/tests/gcm/test_ml.py @@ -65,11 +65,8 @@ def test_given_categorical_input_and_output_features_when_fit_classification_mod mdl = create_logistic_regression_classifier() mdl.fit(inputs, X2) - X2["True" == 1] = 1 - X2["False" == 0] = 0 - assert mdl.predict_probabilities(np.array([[2, "1"]], dtype=object)) == approx(np.array([[0, 1]]), abs=0.01) - assert np.sum(np.argmax(mdl.predict_probabilities(inputs), axis=1) != X2) < 20 + assert np.sum(mdl.predict(inputs).reshape(-1) != X2) < 20 _, counts = np.unique(mdl.predict(inputs), return_counts=True) assert counts / 1000 == approx(np.array([0.5, 0.5]), abs=0.05)