Replace all occurrences of Pandas' get_dummies() with sklearn's OneHotEncoder #1134

Closed
37 changes: 35 additions & 2 deletions dowhy/causal_estimator.py
@@ -11,6 +11,7 @@
import dowhy.interpreters as interpreters
from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
from dowhy.utils.api import parse_state
from dowhy.utils.encoding import Encoders

logger = logging.getLogger(__name__)

@@ -112,6 +113,35 @@ def __init__(
self._bootstrap_estimates = None
self._bootstrap_null_estimates = None

self._encoders = Encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.

It's important that data is consistently encoded; otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.

Encoders are reset when `fit()` is called again, as the data is assumed to have changed.

A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders.reset()

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged; categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.

:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
return self._encoders.encode(data, encoder_name)

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
@@ -124,7 +154,7 @@ def _set_effect_modifiers(data: pd.DataFrame, effect_modifier_names: Optio
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
@@ -234,7 +264,10 @@ def _estimate_conditional_effects(
effect_modifier_names[i] = prefix + str(em)
# Grouping by effect modifiers and computing effect separately
by_effect_mods = data.groupby(effect_modifier_names)
cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)

def cond_est_fn(x):
return self._do(self._treatment_value, x) - self._do(self._control_value, x)

conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
# Deleting the temporary categorical columns
for em in effect_modifier_names:
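The new `_encode()`/`reset_encoders()` methods delegate to `dowhy.utils.encoding.Encoders`, whose implementation is not part of this diff. As a minimal sketch, a registry with the same `reset()`/`encode(data, name)` surface might wrap sklearn's `OneHotEncoder` as below; the class name, column handling, and parameters here are illustrative assumptions, not the actual dowhy code:

    # Hypothetical sketch, not the actual dowhy.utils.encoding.Encoders.
    from typing import Dict

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder


    class EncodersSketch:
        """Keeps one fitted OneHotEncoder per named subset of variables."""

        def __init__(self):
            self._encoders: Dict[str, OneHotEncoder] = {}

        def reset(self):
            # Forget fitted encoders; they are re-created on the next encode().
            self._encoders = {}

        def encode(self, data: pd.DataFrame, encoder_name: str) -> pd.DataFrame:
            categorical = data.select_dtypes(include=["object", "category"]).columns
            if len(categorical) == 0:
                return data  # numerical data passes through unchanged
            encoder = self._encoders.get(encoder_name)
            if encoder is None:
                # drop="first" mirrors pd.get_dummies(drop_first=True); combining it
                # with handle_unknown="ignore" needs scikit-learn >= 1.0, and the
                # sparse_output argument needs scikit-learn >= 1.2.
                encoder = OneHotEncoder(
                    drop="first", sparse_output=False, handle_unknown="ignore"
                ).fit(data[categorical])
                self._encoders[encoder_name] = encoder
            encoded = pd.DataFrame(
                encoder.transform(data[categorical]),
                columns=encoder.get_feature_names_out(categorical),
                index=data.index,
            )
            return pd.concat([data.drop(columns=categorical), encoded], axis=1)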
5 changes: 3 additions & 2 deletions dowhy/causal_estimators/causalml.py
@@ -116,6 +116,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check the backdoor variables being used
@@ -127,7 +128,7 @@
# Get the data of the unobserved confounders
self._observed_common_causes = data[self._observed_common_causes_names]
# One hot encode the data if they are categorical
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = []

@@ -138,7 +139,7 @@
self._instrumental_variable_names = self._target_estimand.instrumental_variables
if self._instrumental_variable_names:
self._instrumental_variables = data[self._instrumental_variable_names]
self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True)
self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables")
else:
self._instrumental_variables = []

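For context on why a named, reusable encoder matters here: `pd.get_dummies` derives its output columns from whatever values happen to be present, so a category absent at inference time silently changes the column layout, while a fitted `OneHotEncoder` reproduces the training layout on any input. A small illustration with made-up data (`sparse_output` needs scikit-learn >= 1.2):

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder

    train = pd.DataFrame({"city": ["NY", "SF", "LA"]})
    score = pd.DataFrame({"city": ["NY", "NY"]})  # "SF" and "LA" absent at inference

    # get_dummies: column layout depends on the data it sees.
    print(pd.get_dummies(train, drop_first=True).columns.tolist())  # ['city_NY', 'city_SF']
    print(pd.get_dummies(score, drop_first=True).columns.tolist())  # [] -- layout changed

    # A fitted OneHotEncoder keeps the training layout.
    enc = OneHotEncoder(drop="first", sparse_output=False).fit(train)
    print(enc.transform(score).shape)  # (2, 2) -- same two columns as at fit time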
3 changes: 2 additions & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
@@ -122,6 +122,7 @@ def fit(
"""
self.exact_match_cols = exact_match_cols

self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check if the treatment is one-dimensional
@@ -146,7 +147,7 @@
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Distance matching methods are not applicable"
7 changes: 4 additions & 3 deletions dowhy/causal_estimators/econml.py
@@ -120,6 +120,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)
# Save parameters for later refutter fitting
self._econml_fit_params = kwargs
@@ -148,12 +149,12 @@
# Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
# the latter can be used by other estimator methods later
self._effect_modifiers = data[effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self._effect_modifier_names = effect_modifier_names
self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
if self._observed_common_causes_names:
self._observed_common_causes = data[self._observed_common_causes_names]
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
@@ -165,7 +166,7 @@
self.estimating_instrument_names = parse_state(self.iv_instrument_name)
if self.estimating_instrument_names:
self._estimating_instruments = data[self.estimating_instrument_names]
self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
else:
self._estimating_instruments = None

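Note how each variable subset (effect modifiers, observed common causes, estimating instruments) gets its own named encoder: the same category value can appear in more than one subset, and each subset must keep its own fitted layout. A tiny illustration, again using the hypothetical `EncodersSketch` from the earlier sketch:

    import pandas as pd

    reg = EncodersSketch()
    effect_mods = pd.DataFrame({"income": ["high", "low"]})
    common_causes = pd.DataFrame({"education": ["high", "low"]})

    # Each named encoder is fitted independently on its own subset.
    print(reg.encode(effect_mods, "effect_modifiers").columns.tolist())
    # ['income_low']  ("high" is the dropped baseline)
    print(reg.encode(common_causes, "observed_common_causes").columns.tolist())
    # ['education_low']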
1 change: 1 addition & 0 deletions dowhy/causal_estimators/instrumental_variable_estimator.py
@@ -92,6 +92,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.estimating_instrument_names = self._target_estimand.instrumental_variables
4 changes: 3 additions & 1 deletion dowhy/causal_estimators/propensity_score_estimator.py
@@ -93,6 +93,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables()))
@@ -103,7 +104,8 @@
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")

else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Propensity score based methods are not applicable"
dowhy/causal_estimators/regression_discontinuity_estimator.py
@@ -98,6 +98,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.rd_variable = data[self.rd_variable_name]
73 changes: 0 additions & 73 deletions dowhy/causal_estimators/regression_estimator.py
@@ -5,7 +5,6 @@
import statsmodels.api as sm

from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
from dowhy.utils.encoding import one_hot_encode


class RegressionEstimator(CausalEstimator):
@@ -71,53 +70,6 @@ def __init__(

self.model = None

# Data encoders
# encoder_drop_first will not encode the first category value with a bit in 1-hot encoding.
# It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
# Set to False to include a bit for each value of every categorical variable.
self.encoder_drop_first = True
self.reset_encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.

It's important that data is consistently encoded otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.

Encoders are reset when `fit()` is called again, as the data is assumed to have changed.

A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders = {
"treatment": None,
"observed_common_causes": None,
"effect_modifiers": None,
}

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged, categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.

:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
existing_encoder = self._encoders.get(encoder_name)
encoded_variables, encoder = one_hot_encode(
data,
drop_first=self.encoder_drop_first,
encoder=existing_encoder,
)

# Remember encoder
self._encoders[encoder_name] = encoder
return encoded_variables

def fit(
self,
data: pd.DataFrame,
@@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df):
est = self.estimate_effect(data=data_df, need_conditional_estimates=False)
return est.value

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
:param effect_modifiers: Variables on which to compute separate
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self._effect_modifiers = effect_modifier_names
if effect_modifier_names is not None:
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
else:
self._effect_modifier_names = []

self.need_conditional_estimates = (
self.need_conditional_estimates
if self.need_conditional_estimates != "auto"
else (self._effect_modifier_names and len(self._effect_modifier_names) > 0)
)

def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment")

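The deletions above show the previous shape of this mechanism: a per-estimator dict of encoders plus the `one_hot_encode` helper, both private to `RegressionEstimator`. The PR hoists this into `CausalEstimator`, so every estimator shares one implementation and the same pattern: reset on `fit()`, reuse at estimation time. A runnable toy version of that pattern (hypothetical estimator, reusing `EncodersSketch` from the first sketch):

    import pandas as pd

    class TinyEstimator:
        """Hypothetical estimator showing the reset-on-fit / reuse-on-estimate pattern."""

        def __init__(self):
            self._encoders = EncodersSketch()

        def reset_encoders(self):
            self._encoders.reset()

        def _encode(self, data, name):
            return self._encoders.encode(data, name)

        def fit(self, data):
            self.reset_encoders()  # data is assumed to have changed
            X = self._encode(data, "observed_common_causes")
            self._columns = list(X.columns)  # the layout the model is trained on
            return self

        def estimate(self, data):
            X = self._encode(data, "observed_common_causes")  # reuses fitted encoder
            assert list(X.columns) == self._columns  # layout is stable across calls
            return X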
7 changes: 5 additions & 2 deletions dowhy/causal_estimators/two_stage_regression_estimator.py
@@ -167,6 +167,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

if len(self._target_estimand.treatment_variable) > 1:
@@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame):
treatment_vals = data_df[self._target_estimand.treatment_variable]
if len(self._observed_common_causes_names) > 0:
observed_common_causes_vals = data_df[self._observed_common_causes_names]
observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True)
observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes")

if self._effect_modifier_names:
effect_modifiers_vals = data_df[self._effect_modifier_names]
effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True)
effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers")

if type(treatment_vals) is not np.ndarray:
treatment_vals = treatment_vals.to_numpy()
if treatment_vals.shape[0] != data_df.shape[0]:
66 changes: 42 additions & 24 deletions dowhy/causal_refuters/add_unobserved_common_cause.py
@@ -19,6 +19,7 @@
from dowhy.causal_refuter import CausalRefutation, CausalRefuter, choose_variables
from dowhy.causal_refuters.evalue_sensitivity_analyzer import EValueSensitivityAnalyzer
from dowhy.causal_refuters.linear_sensitivity_analyzer import LinearSensitivityAnalyzer
from dowhy.utils.encoding import Encoders

logger = logging.getLogger(__name__)

@@ -201,6 +202,41 @@ def include_simulated_confounder(
)


def preprocess_observed_common_causes(
data: pd.DataFrame,
target_estimand: IdentifiedEstimand,
no_common_causes_error_message: str,
):
"""
Preprocesses backdoor variables (observed common causes) and returns the pre-processed matrix.

At least one backdoor (common cause) variable is required; a ValueError is raised if none are present.

Preprocessing has two steps:
1. Categorical encoding.
2. Standardization.

:param data: All data, some of which needs preprocessing.
:param target_estimand: Estimand for desired effect including definition of backdoor variables.
:param no_common_causes_error_message: Message to be displayed with ValueError if no backdoor variable present.
:return: Array containing the pre-processed (encoded and standardized) data.
"""

# 1. Categorical encoding of relevant variables
observed_common_causes_names = target_estimand.get_backdoor_variables()
if len(observed_common_causes_names) > 0:
# The encoded data is only used to calculate a parameter, so the encoder can be discarded.
observed_common_causes = data[observed_common_causes_names]
encoders = Encoders()
observed_common_causes = encoders.encode(observed_common_causes, "observed_common_causes")
else:
raise ValueError(no_common_causes_error_message)

# 2. Standardizing the data
observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
return observed_common_causes


def _infer_default_kappa_t(
data: pd.DataFrame,
target_estimand: IdentifiedEstimand,
@@ -210,19 +246,10 @@ def _infer_default_kappa_t(
len_kappa_t: int = 10,
):
"""Infer default effect strength of simulated confounder on treatment."""
observed_common_causes_names = target_estimand.get_backdoor_variables()
if len(observed_common_causes_names) > 0:
observed_common_causes = data[observed_common_causes_names]
observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
else:
raise ValueError(
"There needs to be at least one common cause to"
+ "automatically compute the default value of kappa_t."
+ " Provide a value for kappa_t"
)
t = data[treatment_name]
# Standardizing the data
observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_t. Provide a value for kappa_t"
observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)

if effect_on_t == "binary_flip":
# Fit a model containing all confounders and compare predictions
# using all features compared to all features except a given
Expand Down Expand Up @@ -272,19 +299,10 @@ def _infer_default_kappa_y(
len_kappa_y: int = 10,
):
"""Infer default effect strength of simulated confounder on treatment."""
observed_common_causes_names = target_estimand.get_backdoor_variables()
if len(observed_common_causes_names) > 0:
observed_common_causes = data[observed_common_causes_names]
observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
else:
raise ValueError(
"There needs to be at least one common cause to"
+ "automatically compute the default value of kappa_y."
+ " Provide a value for kappa_y"
)
y = data[outcome_name]
# Standardizing the data
observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_y. Provide a value for kappa_y"
observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)

if effect_on_y == "binary_flip":
# Fit a model containing all confounders and compare predictions
# using all features compared to all features except a given
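The extracted `preprocess_observed_common_causes` makes the two steps explicit: encode the categorical backdoor variables, then standardize everything before inferring the default kappa strength. A compact sketch of those same two steps on made-up data (the encoder is throwaway here, exactly as the comment in the diff notes):

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    df = pd.DataFrame({"age": [30, 40, 50], "city": ["NY", "SF", "NY"]})

    # 1. Categorical encoding of the backdoor variables.
    enc = OneHotEncoder(drop="first", sparse_output=False).fit(df[["city"]])
    encoded = pd.concat(
        [
            df[["age"]],
            pd.DataFrame(
                enc.transform(df[["city"]]),
                columns=enc.get_feature_names_out(["city"]),
                index=df.index,
            ),
        ],
        axis=1,
    )

    # 2. Standardization to zero mean and unit variance.
    standardized = StandardScaler().fit_transform(encoded)
    print(standardized.mean(axis=0).round(6))  # ~[0. 0.]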