diff --git a/dowhy/causal_estimators/regression_estimator.py b/dowhy/causal_estimators/regression_estimator.py
index d0394c86c6..f7c86fb357 100644
--- a/dowhy/causal_estimators/regression_estimator.py
+++ b/dowhy/causal_estimators/regression_estimator.py
@@ -5,6 +5,7 @@
 import statsmodels.api as sm
 
 from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
+from dowhy.utils.encoding import one_hot_encode
 
 
 class RegressionEstimator(CausalEstimator):
@@ -70,6 +71,53 @@ def __init__(
 
         self.model = None
 
+        # Data encoders
+        # If encoder_drop_first is True, the first category of each categorical variable does not get
+        # its own indicator column in the one-hot encoding; it is represented implicitly by zeros in
+        # the columns for the remaining categories.
+        # Set it to False to include an indicator column for every category of every categorical variable.
+        self.encoder_drop_first = True
+        self.reset_encoders()
+
+    def reset_encoders(self):
+        """
+        Removes any reference to data encoders, causing them to be re-created on the next `fit()`.
+
+        It is important that data is encoded consistently; otherwise, models will produce
+        inconsistent output. In particular, categorical variables are one-hot encoded, and the
+        mapping of original data values to indicator columns must be identical between model
+        training/fitting and inference time.
+
+        Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
+
+        A separate encoder is used for each subset of variables (treatment, common causes and
+        effect modifiers).
+        """
+        self._encoders = {
+            "treatment": None,
+            "observed_common_causes": None,
+            "effect_modifiers": None,
+        }
+
+    def _encode(self, data: pd.DataFrame, encoder_name: str):
+        """
+        Encodes categorical columns in the given data, returning a new dataframe in which numerical
+        columns are unchanged and categorical columns are replaced by their one-hot encodings.
+        `encoder_name` identifies the specific encoder to use; it is reused if available and
+        created if not, so that subsequent calls encode data consistently.
+
+        :param data: Data to encode.
+        :param encoder_name: The name of the encoder to use.
+        :returns: The encoded data.
+        """
+        existing_encoder = self._encoders.get(encoder_name)
+        encoded_variables, encoder = one_hot_encode(
+            data,
+            drop_first=self.encoder_drop_first,
+            encoder=existing_encoder,
+        )
+
+        # Remember the encoder for reuse
+        self._encoders[encoder_name] = encoder
+        return encoded_variables
+
     def fit(
         self,
         data: pd.DataFrame,
@@ -84,13 +132,14 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
""" + self.reset_encoders() # Forget any existing encoders self._set_effect_modifiers(data, effect_modifier_names) self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables())) self._observed_common_causes_names = self._target_estimand.get_backdoor_variables() if len(self._observed_common_causes_names) > 0: self._observed_common_causes = data[self._observed_common_causes_names] - self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True) + self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes") else: self._observed_common_causes = None @@ -148,14 +197,42 @@ def _estimate_effect_fn(self, data_df): est = self.estimate_effect(data=data_df, need_conditional_estimates=False) return est.value + def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None): + """Sets the effect modifiers for the estimator + Modifies need_conditional_estimates accordingly to effect modifiers value + :param effect_modifiers: Variables on which to compute separate + effects, or return a heterogeneous effect function. Not all + methods support this currently. + """ + self._effect_modifiers = effect_modifier_names + if effect_modifier_names is not None: + self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns] + if len(self._effect_modifier_names) > 0: + self._effect_modifiers = data[self._effect_modifier_names] + self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers") + self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names)) + else: + self._effect_modifier_names = [] + else: + self._effect_modifier_names = [] + + self.need_conditional_estimates = ( + self.need_conditional_estimates + if self.need_conditional_estimates != "auto" + else (self._effect_modifier_names and len(self._effect_modifier_names) > 0) + ) + def _build_features(self, data_df: pd.DataFrame, treatment_values=None): - treatment_vals = pd.get_dummies(data_df[self._target_estimand.treatment_variable], drop_first=True) + treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment") + if len(self._observed_common_causes_names) > 0: observed_common_causes_vals = data_df[self._observed_common_causes_names] - observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True) + observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes") + if self._effect_modifier_names: effect_modifiers_vals = data_df[self._effect_modifier_names] - effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True) + effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers") + # Fixing treatment value to the specified value, if provided if treatment_values is not None: treatment_vals = treatment_values @@ -164,6 +241,7 @@ def _build_features(self, data_df: pd.DataFrame, treatment_values=None): # treatment_vals and data_df should have same number of rows if treatment_vals.shape[0] != data_df.shape[0]: raise ValueError("Provided treatment values and dataframe should have the same length.") + # Bulding the feature matrix n_treatment_cols = 1 if len(treatment_vals.shape) == 1 else treatment_vals.shape[1] n_samples = treatment_vals.shape[0] @@ -195,32 +273,25 @@ def interventional_outcomes(self, data_df: pd.DataFrame, treatment_val): """ if data_df is None: - data_df = self._data + data_df = 
+        else:
+            data_df = data_df.copy()  # don't modify the caller's argument
+
+        # Replace treatment values with the value supplied; note: don't change the column datatype!
+        original_type = data_df[self._target_estimand.treatment_variable].dtypes
+        data_df[self._target_estimand.treatment_variable] = treatment_val
+        data_df[self._target_estimand.treatment_variable] = data_df[self._target_estimand.treatment_variable].astype(
+            original_type, copy=False
+        )
+
+        return self.predict(data_df)
+
+    def predict(self, data_df):
         if not self.model:
             # The model is always built on the entire data
-            _, self.model = self._build_model(data_df)
-        # Replacing treatment values by given x
-        # First, create interventional tensor in original space
-        interventional_treatment_values = np.full(
-            (data_df.shape[0], len(self._target_estimand.treatment_variable)), treatment_val
-        )
-        # Then, use pandas to ensure that the dummies are assigned correctly for a categorical treatment
-        interventional_treatment_2d = pd.concat(
-            [
-                data_df[self._target_estimand.treatment_variable].copy(),
-                pd.DataFrame(
-                    data=interventional_treatment_values,
-                    columns=data_df[self._target_estimand.treatment_variable].columns,
-                ),
-            ],
-            axis=0,
-        ).astype(data_df[self._target_estimand.treatment_variable].dtypes, copy=False)
-        interventional_treatment_2d = pd.get_dummies(interventional_treatment_2d, drop_first=True)
-        interventional_treatment_2d = interventional_treatment_2d[
-            data_df[self._target_estimand.treatment_variable].shape[0] :
-        ]
-
-        new_features = self._build_features(data_df, treatment_values=interventional_treatment_2d)
+            _, self.model = self._build_model()
+
+        new_features = self._build_features(data_df=data_df)
         interventional_outcomes = self.predict_fn(data_df, self.model, new_features)
         return interventional_outcomes
diff --git a/dowhy/utils/encoding.py b/dowhy/utils/encoding.py
new file mode 100644
index 0000000000..722e1473ae
--- /dev/null
+++ b/dowhy/utils/encoding.py
@@ -0,0 +1,62 @@
+import pandas as pd
+from pandas.core.dtypes.common import is_list_like
+from sklearn.preprocessing import OneHotEncoder
+
+
+def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, encoder: OneHotEncoder = None):
+    """
+    Replaces pandas' get_dummies with an implementation based on sklearn.preprocessing.OneHotEncoder.
+
+    The purpose of the replacement is to allow new data to be encoded with the same encoder,
+    which ensures that the resulting encodings are consistent.
+
+    If encoder is None, a new instance of sklearn.preprocessing.OneHotEncoder will be created and
+    fitted via `fit_transform()`. Otherwise, the existing encoder is reused via `transform()`.
+
+    For compatibility with get_dummies, the encoded data will be transformed into a DataFrame.
+
+    In all cases, the return value will be the encoded data and the encoder object (even if it was
+    passed in). If `data` contains columns other than the dummy-coded one(s), these will be
+    prepended, unaltered, to the result.
+
+    :param data: Data of which to get dummy indicators.
+    :param columns: List-like structure containing specific columns to encode.
+    :param drop_first: Whether to get k-1 dummies out of k categorical levels by removing the first level.
+    :param encoder: Optional pre-fitted OneHotEncoder to reuse; if None, a new encoder is created and fitted.
+    :return: DataFrame, OneHotEncoder
+    """
+
+    # Determine columns being encoded
+    if columns is None:
+        dtypes_to_encode = ["object", "string", "category"]
+        data_to_encode = data.select_dtypes(include=dtypes_to_encode)
+    elif not is_list_like(columns):
+        raise TypeError("Input must be a list-like for parameter `columns`")
+    else:
+        data_to_encode = data[columns]
+
+    # If all columns are already numerical, there may be nothing to encode.
+    # In this case, return original data.
+    if len(data_to_encode.columns) == 0:
+        return data, encoder  # Encoder may be None
+
+    # Columns to keep in the result - not encoded.
+    columns_to_keep = data.columns.difference(data_to_encode.columns)
+    df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)
+
+    if encoder is None:  # Create new encoder
+        drop = None
+        if drop_first:
+            drop = "first"
+        encoder = OneHotEncoder(drop=drop, sparse=False)  # NB sparse renamed to sparse_output in sklearn 1.2+
+
+        encoded_data = encoder.fit_transform(data_to_encode)
+
+    else:  # Use existing encoder
+        encoded_data = encoder.transform(data_to_encode)
+
+    # Convert the encoded data to a DataFrame
+    columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)
+
+    df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True)  # drop index from original
+
+    # Concatenate the encoded DataFrame with the original non-categorical columns
+    df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)
+
+    return df_result, encoder
diff --git a/tests/utils/test_encoding.py b/tests/utils/test_encoding.py
new file mode 100644
index 0000000000..3fb69062f7
--- /dev/null
+++ b/tests/utils/test_encoding.py
@@ -0,0 +1,87 @@
+import pandas as pd
+from pytest import approx
+
+from dowhy.utils.encoding import one_hot_encode
+
+
+def test_one_hot_encode_equivalent_to_get_dummies():
+
+    # Use a mix of already-numeric columns and columns that require encoding:
+    data = {
+        "C": ["X", "Y", "Z", "X", "Y", "Z"],
+        "N": [1, 2, 3, 4, 5, 6],
+    }
+    df = pd.DataFrame(data)
+
+    # NB: There may be small differences in dtype, but since all values will be used in models
+    # as floats, the comparison is done as float.
+    df_dummies = pd.get_dummies(df, drop_first=True)
+    df_dummies = df_dummies.astype(float)
+
+    df_sklearn, _ = one_hot_encode(df, drop_first=True)
+    df_sklearn = df_sklearn.astype(float)
+
+    # Check same number of rows
+    len1 = len(df_dummies)
+    len2 = len(df_sklearn)
+    assert len1 == len2
+
+    # Check same number of cols
+    len1 = len(df_dummies.columns)
+    len2 = len(df_sklearn.columns)
+    assert len1 == len2
+
+    # Check values
+    # Calculate the sum of absolute differences between the two DataFrames
+    # - should be zero (excl. floating point error)
+    sum_abs_diff = (df_dummies - df_sklearn).abs().sum().sum()
+    assert sum_abs_diff == approx(0.0)
+
+
+def test_one_hot_encode_consistent_with_new_data():
+
+    # Use a mix of already-numeric columns and columns that require encoding:
+    data1 = {
+        "C": ["X", "Y", "Z", "X", "Y", "Z"],
+        "N": [1, 2, 3, 4, 5, 6],
+    }
+    df1 = pd.DataFrame(data1)
+
+    # Initial encode
+    df_encoded1, encoder = one_hot_encode(df1, drop_first=True)
+    df_encoded1 = df_encoded1.astype(float)
+
+    # Create new data with permuted rows.
+    # Output shape should be unchanged.
+    data2 = {
+        "C": ["Y", "Z", "X", "X", "Y", "Z"],
+        "N": [1, 2, 3, 4, 5, 6],
+    }
+    df2 = pd.DataFrame(data2)
+
+    # Encode this new data with the same encoder.
+    df_encoded2, _ = one_hot_encode(df2, encoder=encoder, drop_first=True)
+    df_encoded2 = df_encoded2.astype(float)
+
+    # Check same number of rows
+    len1 = len(df_encoded1)
+    len2 = len(df_encoded2)
+    assert len1 == len2
+
+    # Check same number of cols
+    len1 = len(df_encoded1.columns)
+    len2 = len(df_encoded2.columns)
+    assert len1 == len2
+
+    # Check permuted values are consistent
+    c_y1 = df_encoded1["C_Y"]
+    c_y2 = df_encoded2["C_Y"]
+    assert c_y1[1] == c_y2[0]
+    assert c_y1[4] == c_y2[4]
+
+    c_z1 = df_encoded1["C_Z"]
+    c_z2 = df_encoded2["C_Z"]
+    assert c_z1[2] == c_z2[1]
+    assert c_z1[5] == c_z2[5]
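
Usage note (not part of the diff): a minimal sketch of the fit-then-reuse pattern that `one_hot_encode` enables. The dataframes and column names below are illustrative only, and the sketch assumes an sklearn version that still accepts the `sparse=` argument noted in the comment above.

import pandas as pd

from dowhy.utils.encoding import one_hot_encode

# Fit time: encode the training data and keep the fitted encoder.
train = pd.DataFrame({"C": ["X", "Y", "Z"], "N": [1.0, 2.0, 3.0]})
train_encoded, encoder = one_hot_encode(train, drop_first=True)

# Inference time: reuse the same encoder so that the category-to-column
# mapping (and column order) match the training-time encoding exactly.
new = pd.DataFrame({"C": ["Z", "X", "Y"], "N": [4.0, 5.0, 6.0]})
new_encoded, _ = one_hot_encode(new, encoder=encoder, drop_first=True)

assert list(train_encoded.columns) == list(new_encoded.columns)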