From b9ae10b00f2fc76e4e2b009f8da56305962f3e3b Mon Sep 17 00:00:00 2001 From: Patrick Bloebaum Date: Thu, 9 Nov 2023 15:33:14 -0800 Subject: [PATCH] Fix issue with auto assignment with imbalanced classes Signed-off-by: Patrick Bloebaum --- dowhy/gcm/auto.py | 26 ++++++++++++++++++++------ tests/gcm/test_auto.py | 17 ++++++++++++++++- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/dowhy/gcm/auto.py b/dowhy/gcm/auto.py index 9144f00cac..72f81adccc 100644 --- a/dowhy/gcm/auto.py +++ b/dowhy/gcm/auto.py @@ -10,7 +10,7 @@ from sklearn import metrics from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import LinearRegression, LogisticRegression -from sklearn.model_selection import KFold, train_test_split +from sklearn.model_selection import KFold, StratifiedKFold, train_test_split from sklearn.preprocessing import MultiLabelBinarizer from dowhy.gcm import config @@ -343,7 +343,7 @@ def find_best_model( X: np.ndarray, Y: np.ndarray, metric: Optional[Callable[[np.ndarray, np.ndarray], float]] = None, - max_samples_per_split: int = 10000, + max_samples_per_split: int = 20000, model_selection_splits: int = 5, n_jobs: Optional[int] = None, ) -> Tuple[Callable[[], PredictionModel], List[Tuple[Callable[[], PredictionModel], float, str]]]: @@ -370,16 +370,27 @@ def find_best_model( labelBinarizer = MultiLabelBinarizer() labelBinarizer.fit(Y) - kfolds = list(KFold(n_splits=model_selection_splits, shuffle=True).split(range(X.shape[0]))) + if is_classification_problem: + if len(np.unique(Y)) == 1: + raise ValueError( + "The given target samples have only one class! To fit a classification model, there " + "should be at least two classes." + ) + kfolds = list(StratifiedKFold(n_splits=model_selection_splits, shuffle=True).split(X, Y)) + else: + kfolds = list(KFold(n_splits=model_selection_splits, shuffle=True).split(range(X.shape[0]))) def estimate_average_score(prediction_model_factory: Callable[[], PredictionModel], random_seed: int) -> float: set_random_seed(random_seed) - average_result = 0 + average_result = [] with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=ConvergenceWarning) for train_indices, test_indices in kfolds: + if is_classification_problem and len(np.unique(Y[train_indices[:max_samples_per_split]])) == 1: + continue + model_instance = prediction_model_factory() model_instance.fit(X[train_indices[:max_samples_per_split]], Y[train_indices[:max_samples_per_split]]) @@ -389,9 +400,12 @@ def estimate_average_score(prediction_model_factory: Callable[[], PredictionMode y_true = labelBinarizer.transform(y_true) y_pred = labelBinarizer.transform(y_pred) - average_result += metric(y_true, y_pred) + average_result.append(metric(y_true, y_pred)) - return average_result / model_selection_splits + if len(average_result) == 0: + return float("inf") + else: + return float(np.mean(average_result)) random_seeds = np.random.randint(np.iinfo(np.int32).max, size=len(prediction_model_factories)) average_metric_scores = Parallel(n_jobs=n_jobs)( diff --git a/tests/gcm/test_auto.py b/tests/gcm/test_auto.py index 956323b10c..cc4c8fcc63 100644 --- a/tests/gcm/test_auto.py +++ b/tests/gcm/test_auto.py @@ -1,6 +1,7 @@ import networkx as nx import numpy as np import pandas as pd +import pytest from _pytest.python_api import approx from flaky import flaky from pytest import mark @@ -9,7 +10,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.pipeline import Pipeline -from dowhy.gcm import ProbabilisticCausalModel, draw_samples, fit +from dowhy.gcm import ProbabilisticCausalModel, StructuralCausalModel, draw_samples, fit from dowhy.gcm.auto import AssignmentQuality, assign_causal_mechanisms, has_linear_relationship @@ -431,3 +432,17 @@ def test_given_categorical_data_when_print_auto_summary_then_returns_expected_fo "Based on the type of causal mechanism, the model with the lowest metric value represents the best choice." in summary_string ) + + +def test_given_imbalanced_classes_when_auto_assign_mechanism_then_handles_as_expected(): + X = np.random.normal(0, 1, 1000) + Y = np.array(["OneClass"] * 1000) + + with pytest.raises(ValueError): + assign_causal_mechanisms(StructuralCausalModel(nx.DiGraph([("X", "Y")])), pd.DataFrame({"X": X, "Y": Y})) + + # Having at least one sample from the second class should not raise an error. + X = np.append(X, 0) + Y = np.append(Y, "RareClass") + + assign_causal_mechanisms(StructuralCausalModel(nx.DiGraph([("X", "Y")])), pd.DataFrame({"X": X, "Y": Y}))