From b9ae10b00f2fc76e4e2b009f8da56305962f3e3b Mon Sep 17 00:00:00 2001
From: Patrick Bloebaum <bloebp@amazon.com>
Date: Thu, 9 Nov 2023 15:33:14 -0800
Subject: [PATCH] Fix issue with auto assignment with imbalanced classes

Signed-off-by: Patrick Bloebaum <bloebp@amazon.com>
---
 dowhy/gcm/auto.py      | 26 ++++++++++++++++++++------
 tests/gcm/test_auto.py | 17 ++++++++++++++++-
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/dowhy/gcm/auto.py b/dowhy/gcm/auto.py
index 9144f00cac..72f81adccc 100644
--- a/dowhy/gcm/auto.py
+++ b/dowhy/gcm/auto.py
@@ -10,7 +10,7 @@
 from sklearn import metrics
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.linear_model import LinearRegression, LogisticRegression
-from sklearn.model_selection import KFold, train_test_split
+from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
 from sklearn.preprocessing import MultiLabelBinarizer
 
 from dowhy.gcm import config
@@ -343,7 +343,7 @@ def find_best_model(
     X: np.ndarray,
     Y: np.ndarray,
     metric: Optional[Callable[[np.ndarray, np.ndarray], float]] = None,
-    max_samples_per_split: int = 10000,
+    max_samples_per_split: int = 20000,
     model_selection_splits: int = 5,
     n_jobs: Optional[int] = None,
 ) -> Tuple[Callable[[], PredictionModel], List[Tuple[Callable[[], PredictionModel], float, str]]]:
@@ -370,16 +370,27 @@ def find_best_model(
         labelBinarizer = MultiLabelBinarizer()
         labelBinarizer.fit(Y)
 
-    kfolds = list(KFold(n_splits=model_selection_splits, shuffle=True).split(range(X.shape[0])))
+    if is_classification_problem:
+        if len(np.unique(Y)) == 1:
+            raise ValueError(
+                "The given target samples have only one class! To fit a classification model, there "
+                "should be at least two classes."
+            )
+        kfolds = list(StratifiedKFold(n_splits=model_selection_splits, shuffle=True).split(X, Y))
+    else:
+        kfolds = list(KFold(n_splits=model_selection_splits, shuffle=True).split(range(X.shape[0])))
 
     def estimate_average_score(prediction_model_factory: Callable[[], PredictionModel], random_seed: int) -> float:
         set_random_seed(random_seed)
 
-        average_result = 0
+        average_result = []
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=ConvergenceWarning)
             for train_indices, test_indices in kfolds:
+                if is_classification_problem and len(np.unique(Y[train_indices[:max_samples_per_split]])) == 1:
+                    continue
+
                 model_instance = prediction_model_factory()
                 model_instance.fit(X[train_indices[:max_samples_per_split]], Y[train_indices[:max_samples_per_split]])
 
@@ -389,9 +400,12 @@ def estimate_average_score(prediction_model_factory: Callable[[], PredictionMode
                     y_true = labelBinarizer.transform(y_true)
                     y_pred = labelBinarizer.transform(y_pred)
 
-                average_result += metric(y_true, y_pred)
+                average_result.append(metric(y_true, y_pred))
 
-        return average_result / model_selection_splits
+        if len(average_result) == 0:
+            return float("inf")
+        else:
+            return float(np.mean(average_result))
 
     random_seeds = np.random.randint(np.iinfo(np.int32).max, size=len(prediction_model_factories))
     average_metric_scores = Parallel(n_jobs=n_jobs)(
diff --git a/tests/gcm/test_auto.py b/tests/gcm/test_auto.py
index 956323b10c..cc4c8fcc63 100644
--- a/tests/gcm/test_auto.py
+++ b/tests/gcm/test_auto.py
@@ -1,6 +1,7 @@
 import networkx as nx
 import numpy as np
 import pandas as pd
+import pytest
 from _pytest.python_api import approx
 from flaky import flaky
 from pytest import mark
@@ -9,7 +10,7 @@
 from sklearn.naive_bayes import GaussianNB
 from sklearn.pipeline import Pipeline
 
-from dowhy.gcm import ProbabilisticCausalModel, draw_samples, fit
+from dowhy.gcm import ProbabilisticCausalModel, StructuralCausalModel, draw_samples, fit
 from dowhy.gcm.auto import AssignmentQuality, assign_causal_mechanisms, has_linear_relationship
 
 
@@ -431,3 +432,17 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo
         "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice."
         in summary_string
     )
+
+
+def test_given_imbalanced_classes_when_auto_assign_mechanism_then_handles_as_expected():
+    X = np.random.normal(0, 1, 1000)
+    Y = np.array(["OneClass"] * 1000)
+
+    with pytest.raises(ValueError):
+        assign_causal_mechanisms(StructuralCausalModel(nx.DiGraph([("X", "Y")])), pd.DataFrame({"X": X, "Y": Y}))
+
+    # A single sample of a second class must not raise; folds whose training split has one class are just skipped.
+    X = np.append(X, 0)
+    Y = np.append(Y, "RareClass")
+
+    assign_causal_mechanisms(StructuralCausalModel(nx.DiGraph([("X", "Y")])), pd.DataFrame({"X": X, "Y": Y}))