From cae656a3f2078859a6504822dbe7eaab357ee9b7 Mon Sep 17 00:00:00 2001
From: Patrick Bloebaum <bloebp@amazon.com>
Date: Mon, 20 Nov 2023 08:16:35 -0800
Subject: [PATCH] Fix issue with MedianCDFQuantileScorer

Previously, the scorer could not handle NumPy arrays with object dtype directly. However, GCM often uses the object dtype to support data that mixes categorical and float values. This fixes the handling of object dtypes by explicitly converting the inputs of both fit and score to float first.

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
---
 dowhy/gcm/anomaly_scorers.py      |  4 ++--
 tests/gcm/test_anomaly_scorers.py | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/dowhy/gcm/anomaly_scorers.py b/dowhy/gcm/anomaly_scorers.py
index c5c41f35d8..9a4735fa02 100644
--- a/dowhy/gcm/anomaly_scorers.py
+++ b/dowhy/gcm/anomaly_scorers.py
@@ -43,13 +43,13 @@ def fit(self, X: np.ndarray) -> None:
         if (X.ndim == 2 and X.shape[1] > 1) or X.ndim > 2:
             raise ValueError("The MedianCDFQuantileScorer currently only supports one-dimensional data!")
 
-        self._distribution_samples = X.reshape(-1)
+        self._distribution_samples = X.reshape(-1).astype(float)
 
     def score(self, X: np.ndarray) -> np.ndarray:
         if self._distribution_samples is None:
             raise ValueError("Scorer has not been fitted!")
 
-        X = shape_into_2d(X)
+        X = shape_into_2d(X.astype(float))
 
         equal_samples = np.sum(np.isclose(X, self._distribution_samples, rtol=0, atol=0, equal_nan=True), axis=1)
         greater_samples = np.sum(X > self._distribution_samples, axis=1) + equal_samples / 2
diff --git a/tests/gcm/test_anomaly_scorers.py b/tests/gcm/test_anomaly_scorers.py
index c7f45e01fc..e28fc42c90 100644
--- a/tests/gcm/test_anomaly_scorers.py
+++ b/tests/gcm/test_anomaly_scorers.py
@@ -29,3 +29,14 @@ def test_given_data_with_nans_when_using_median_quantile_scorer_with_nan_support
     assert scorer.score(np.array([1, 4, 8, np.nan])) == approx(
         [-np.log(2 * 0.5 / 10), -np.log(2 * 3.5 / 10), -np.log(2 * 0.5 / 10), -np.log(2 * 1 / 10)]
     )
+
+
+def test_given_numpy_arrays_with_object_type_when_using_median_quantile_scorer_then_does_not_raise_error():
+    training_data = np.array([1, 2, 3, 4, 5, 6, 7, 8, np.nan, np.nan], dtype=object)
+
+    scorer = RescaledMedianCDFQuantileScorer()
+    scorer.fit(training_data)
+
+    assert scorer.score(np.array([1, 4, 8, np.nan], dtype=object)) == approx(
+        [-np.log(2 * 0.5 / 10), -np.log(2 * 3.5 / 10), -np.log(2 * 0.5 / 10), -np.log(2 * 1 / 10)]
+    )