Skip to content

Commit

Permalink
Fix issue with MedianCDFQuantileScorer
Browse files Browse the repository at this point in the history
Previously, the scorer could not handle NumPy arrays with object dtype directly. However, GCM often uses the object dtype to support mixing categorical and float values. This commit fixes the handling of object-dtype arrays by explicitly converting them to floats first.

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
  • Loading branch information
bloebp committed Nov 21, 2023
1 parent 5a6ce23 commit cae656a
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
4 changes: 2 additions & 2 deletions dowhy/gcm/anomaly_scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ def fit(self, X: np.ndarray) -> None:
if (X.ndim == 2 and X.shape[1] > 1) or X.ndim > 2:
raise ValueError("The MedianCDFQuantileScorer currently only supports one-dimensional data!")

self._distribution_samples = X.reshape(-1)
self._distribution_samples = X.reshape(-1).astype(float)

def score(self, X: np.ndarray) -> np.ndarray:
if self._distribution_samples is None:
raise ValueError("Scorer has not been fitted!")

X = shape_into_2d(X)
X = shape_into_2d(X.astype(float))

equal_samples = np.sum(np.isclose(X, self._distribution_samples, rtol=0, atol=0, equal_nan=True), axis=1)
greater_samples = np.sum(X > self._distribution_samples, axis=1) + equal_samples / 2
Expand Down
11 changes: 11 additions & 0 deletions tests/gcm/test_anomaly_scorers.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,14 @@ def test_given_data_with_nans_when_using_median_quantile_scorer_with_nan_support
assert scorer.score(np.array([1, 4, 8, np.nan])) == approx(
[-np.log(2 * 0.5 / 10), -np.log(2 * 3.5 / 10), -np.log(2 * 0.5 / 10), -np.log(2 * 1 / 10)]
)


def test_given_numpy_arrays_with_object_type_when_using_median_quantile_scorer_then_does_not_raise_error():
    """Check that object-dtype arrays are accepted by both ``fit`` and ``score``.

    The scorer is fitted on ten samples (eight numbers and two NaNs) stored
    in an array with ``dtype=object``, then asked to score four values that
    are also stored with ``dtype=object``. The resulting scores are compared
    against fixed expected values.
    """
    fitted_scorer = RescaledMedianCDFQuantileScorer()
    fitted_scorer.fit(np.array([1, 2, 3, 4, 5, 6, 7, 8, np.nan, np.nan], dtype=object))

    observations = np.array([1, 4, 8, np.nan], dtype=object)
    # Each expected score has the form -log(2 * c / 10); 10 matches the
    # number of training samples used above.
    expected_scores = [-np.log(2 * c / 10) for c in (0.5, 3.5, 0.5, 1)]

    assert fitted_scorer.score(observations) == approx(expected_scores)

0 comments on commit cae656a

Please sign in to comment.