
Commit 65f3031

drawlinson and DAVID RAWLINSON authored
Replace all occurrences of Pandas' get_dummies() with sklearn's OneHotEncoder (#1135)
* For consistency and to avoid future issues, replace all occurrences of Pandas' get_dummies with sklearn's OneHotEncoder. Encoder lifespan: encoders are reused across estimate_effect() calls and replaced on CausalEstimator.fit(). Additional uses of get_dummies without side effects or encoding-consistency issues, in the do-sampler and propensity-score utilities, are also replaced for consistency.
* Add categorical-encoding consistency tests for CausalEstimators. Fix a bug in the argument order of RegressionEstimator._do().

Signed-off-by: DAVID RAWLINSON <dave@causalwizard.app>
Co-authored-by: DAVID RAWLINSON <dave@causalwizard.app>
1 parent dfbbbca commit 65f3031

14 files changed: +409 additions, -113 deletions
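Why the switch matters: pd.get_dummies() derives its output columns from whatever categories happen to be present in the frame it is given, so a frame seen at fit() time and a frame seen at estimate_effect() time can yield different column layouts. A OneHotEncoder fitted once fixes the category-to-column mapping and reuses it. A minimal illustrative sketch (not part of this commit; toy column names):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({"city": ["NY", "LA", "SF"]})
infer = pd.DataFrame({"city": ["NY", "NY"]})  # "LA" and "SF" absent at inference time

# get_dummies re-derives the columns from each frame, so the two layouts disagree.
print(pd.get_dummies(train, drop_first=True).columns.tolist())  # ['city_NY', 'city_SF']
print(pd.get_dummies(infer, drop_first=True).columns.tolist())  # []

# An encoder fitted once keeps the same columns for every later transform.
encoder = OneHotEncoder(drop="first")
encoder.fit(train)
print(encoder.transform(infer).shape)  # (2, 2), same width as the training layout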

dowhy/causal_estimator.py

Lines changed: 35 additions & 2 deletions
@@ -11,6 +11,7 @@
 import dowhy.interpreters as interpreters
 from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
 from dowhy.utils.api import parse_state
+from dowhy.utils.encoding import Encoders
 
 logger = logging.getLogger(__name__)
 
@@ -112,6 +113,35 @@ def __init__(
         self._bootstrap_estimates = None
         self._bootstrap_null_estimates = None
 
+        self._encoders = Encoders()
+
+    def reset_encoders(self):
+        """
+        Removes any reference to data encoders, causing them to be re-created on next `fit()`.
+
+        It's important that data is consistently encoded otherwise models will produce inconsistent output.
+        In particular, categorical variables are one-hot encoded; the mapping of original data values
+        must be identical between model training/fitting and inference time.
+
+        Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
+
+        A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
+        """
+        self._encoders.reset()
+
+    def _encode(self, data: pd.DataFrame, encoder_name: str):
+        """
+        Encodes categorical columns in the given data, returning a new dataframe containing
+        all original data and the encoded columns. Numerical data is unchanged, categorical
+        types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
+        if available, or created if not. The encoder can be reused in subsequent calls.
+
+        :param data: Data to encode.
+        :param encoder_name: The name for the encoder to be used.
+        :returns: The encoded data.
+        """
+        return self._encoders.encode(data, encoder_name)
+
     def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
         """Sets the effect modifiers for the estimator
         Modifies need_conditional_estimates accordingly to effect modifiers value
@@ -124,7 +154,7 @@ def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optio
         self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
         if len(self._effect_modifier_names) > 0:
             self._effect_modifiers = data[self._effect_modifier_names]
-            self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
+            self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
             self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
         else:
             self._effect_modifier_names = []
@@ -234,7 +264,10 @@ def _estimate_conditional_effects(
            effect_modifier_names[i] = prefix + str(em)
         # Grouping by effect modifiers and computing effect separately
         by_effect_mods = data.groupby(effect_modifier_names)
-        cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)
+
+        def cond_est_fn(x):
+            return self._do(self._treatment_value, x) - self._do(self._control_value, x)
+
         conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
         # Deleting the temporary categorical columns
         for em in effect_modifier_names:
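The Encoders container imported above lives in dowhy/utils/encoding.py, which is not shown in this view. As a rough sketch of the pattern the base class now relies on -- one named, reusable OneHotEncoder per variable subset, reset on every fit() -- something along these lines would behave similarly (the class and helper names here are hypothetical, and sparse_output assumes scikit-learn >= 1.2):

from typing import Dict

import pandas as pd
from sklearn.preprocessing import OneHotEncoder


class NamedEncoders:
    """Illustrative registry of one OneHotEncoder per variable subset (not dowhy's actual class)."""

    def __init__(self, drop_first: bool = True):
        self._drop = "first" if drop_first else None
        self._encoders: Dict[str, OneHotEncoder] = {}

    def reset(self) -> None:
        # Forget all fitted encoders; they are re-created on the next encode().
        self._encoders = {}

    def encode(self, data: pd.DataFrame, name: str) -> pd.DataFrame:
        # One-hot encode categorical columns, reusing the encoder fitted under `name`
        # so that later calls (e.g. during estimate_effect) keep the same column layout.
        categorical = data.select_dtypes(include=["object", "category"]).columns
        if len(categorical) == 0:
            return data
        encoder = self._encoders.get(name)
        if encoder is None:
            encoder = OneHotEncoder(drop=self._drop, sparse_output=False)
            encoder.fit(data[categorical])
            self._encoders[name] = encoder
        encoded = pd.DataFrame(
            encoder.transform(data[categorical]),
            columns=encoder.get_feature_names_out(categorical),
            index=data.index,
        )
        return pd.concat([data.drop(columns=categorical), encoded], axis=1)

Whatever the real implementation looks like, the property that matters is the one the reset_encoders() docstring states: the mapping from original values to encoded columns stays identical between fitting and inference, and is only discarded when fit() is called with new data.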

dowhy/causal_estimators/causalml.py

Lines changed: 3 additions & 2 deletions
@@ -116,6 +116,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
 
         # Check the backdoor variables being used
@@ -127,7 +128,7 @@
             # Get the data of the unobserved confounders
             self._observed_common_causes = data[self._observed_common_causes_names]
             # One hot encode the data if they are categorical
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
         else:
             self._observed_common_causes = []
 
@@ -138,7 +139,7 @@
         self._instrumental_variable_names = self._target_estimand.instrumental_variables
         if self._instrumental_variable_names:
             self._instrumental_variables = data[self._instrumental_variable_names]
-            self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True)
+            self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables")
         else:
             self._instrumental_variables = []
 

dowhy/causal_estimators/distance_matching_estimator.py

Lines changed: 2 additions & 1 deletion
@@ -122,6 +122,7 @@ def fit(
         """
         self.exact_match_cols = exact_match_cols
 
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
 
         # Check if the treatment is one-dimensional
@@ -146,7 +147,7 @@
             # Convert the categorical variables into dummy/indicator variables
             # Basically, this gives a one hot encoding for each category
             # The first category is taken to be the base line.
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
         else:
             self._observed_common_causes = None
             error_msg = "No common causes/confounders present. Distance matching methods are not applicable"

dowhy/causal_estimators/econml.py

Lines changed: 5 additions & 4 deletions
@@ -120,6 +120,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
         # Save parameters for later refutter fitting
         self._econml_fit_params = kwargs
@@ -148,12 +149,12 @@
             # Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
             # the latter can be used by other estimator methods later
             self._effect_modifiers = data[effect_modifier_names]
-            self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
+            self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
             self._effect_modifier_names = effect_modifier_names
             self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
         if self._observed_common_causes_names:
             self._observed_common_causes = data[self._observed_common_causes_names]
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
         else:
             self._observed_common_causes = None
         self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
@@ -165,7 +166,7 @@
         self.estimating_instrument_names = parse_state(self.iv_instrument_name)
         if self.estimating_instrument_names:
             self._estimating_instruments = data[self.estimating_instrument_names]
-            self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
+            self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
         else:
             self._estimating_instruments = None
 
@@ -277,7 +278,7 @@ def _estimate_confidence_intervals(self, confidence_level=None, method=None):
         """Returns None if the confidence interval has not been calculated."""
         return self.effect_intervals
 
-    def _do(self, x):
+    def _do(self, x, data_df=None):
         raise NotImplementedError
 
     def construct_symbolic_estimator(self, estimand):

dowhy/causal_estimators/instrumental_variable_estimator.py

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
 
         self.estimating_instrument_names = self._target_estimand.instrumental_variables

dowhy/causal_estimators/propensity_score_estimator.py

Lines changed: 3 additions & 1 deletion
@@ -93,6 +93,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
 
         self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables()))
@@ -103,7 +104,8 @@
             # Convert the categorical variables into dummy/indicator variables
             # Basically, this gives a one hot encoding for each category
             # The first category is taken to be the base line.
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
+
         else:
             self._observed_common_causes = None
             error_msg = "No common causes/confounders present. Propensity score based methods are not applicable"

dowhy/causal_estimators/regression_discontinuity_estimator.py

Lines changed: 1 addition & 0 deletions
@@ -98,6 +98,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
 
         self.rd_variable = data[self.rd_variable_name]

dowhy/causal_estimators/regression_estimator.py

Lines changed: 6 additions & 75 deletions
@@ -5,7 +5,6 @@
 import statsmodels.api as sm
 
 from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
-from dowhy.utils.encoding import one_hot_encode
 
 
 class RegressionEstimator(CausalEstimator):
@@ -71,53 +70,6 @@ def __init__(
 
         self.model = None
 
-        # Data encoders
-        # encoder_drop_first will not encode the first category value with a bit in 1-hot encoding.
-        # It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
-        # Set to False to include a bit for each value of every categorical variable.
-        self.encoder_drop_first = True
-        self.reset_encoders()
-
-    def reset_encoders(self):
-        """
-        Removes any reference to data encoders, causing them to be re-created on next `fit()`.
-
-        It's important that data is consistently encoded otherwise models will produce inconsistent output.
-        In particular, categorical variables are one-hot encoded; the mapping of original data values
-        must be identical between model training/fitting and inference time.
-
-        Encoders are reset when `fit()` is called again, as the data is assumed to have changed.
-
-        A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
-        """
-        self._encoders = {
-            "treatment": None,
-            "observed_common_causes": None,
-            "effect_modifiers": None,
-        }
-
-    def _encode(self, data: pd.DataFrame, encoder_name: str):
-        """
-        Encodes categorical columns in the given data, returning a new dataframe containing
-        all original data and the encoded columns. Numerical data is unchanged, categorical
-        types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
-        if available, or created if not. The encoder can be reused in subsequent calls.
-
-        :param data: Data to encode.
-        :param encoder_name: The name for the encoder to be used.
-        :returns: The encoded data.
-        """
-        existing_encoder = self._encoders.get(encoder_name)
-        encoded_variables, encoder = one_hot_encode(
-            data,
-            drop_first=self.encoder_drop_first,
-            encoder=existing_encoder,
-        )
-
-        # Remember encoder
-        self._encoders[encoder_name] = encoder
-        return encoded_variables
-
     def fit(
         self,
         data: pd.DataFrame,
@@ -170,7 +122,7 @@ def estimate_effect(
         need_conditional_estimates = self.need_conditional_estimates
         # TODO make treatment_value and control value also as local parameters
         # All treatments are set to the same constant value
-        effect_estimate = self._do(data, treatment_value) - self._do(data, control_value)
+        effect_estimate = self._do(treatment_value, data) - self._do(control_value, data)
         conditional_effect_estimates = None
         if need_conditional_estimates:
             conditional_effect_estimates = self._estimate_conditional_effects(
@@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df):
         est = self.estimate_effect(data=data_df, need_conditional_estimates=False)
         return est.value
 
-    def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
-        """Sets the effect modifiers for the estimator
-        Modifies need_conditional_estimates accordingly to effect modifiers value
-        :param effect_modifiers: Variables on which to compute separate
-        effects, or return a heterogeneous effect function. Not all
-        methods support this currently.
-        """
-        self._effect_modifiers = effect_modifier_names
-        if effect_modifier_names is not None:
-            self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
-            if len(self._effect_modifier_names) > 0:
-                self._effect_modifiers = data[self._effect_modifier_names]
-                self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
-                self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
-            else:
-                self._effect_modifier_names = []
-        else:
-            self._effect_modifier_names = []
-
-        self.need_conditional_estimates = (
-            self.need_conditional_estimates
-            if self.need_conditional_estimates != "auto"
-            else (self._effect_modifier_names and len(self._effect_modifier_names) > 0)
-        )
-
     def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
         treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment")
 
@@ -295,6 +222,10 @@ def predict(self, data_df):
         interventional_outcomes = self.predict_fn(data_df, self.model, new_features)
         return interventional_outcomes
 
-    def _do(self, data_df: pd.DataFrame, treatment_val):
+    def _do(
+        self,
+        treatment_val,
+        data_df: pd.DataFrame,
+    ):
         interventional_outcomes = self.interventional_outcomes(data_df, treatment_val)
         return interventional_outcomes.mean()
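Besides moving the encoder bookkeeping into the base class, this file fixes the argument order in estimate_effect(): _do() now takes the treatment value first and the data frame second, matching the other call sites. As a toy illustration of the semantics (not dowhy code; simulated data and a hypothetical helper name), the interventional mean under a fixed treatment value is the average model prediction with the treatment column overridden, and the effect estimate is the difference of two such calls:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
df = pd.DataFrame({"t": rng.integers(0, 2, 500), "w": rng.normal(size=500)})
df["y"] = 2.0 * df["t"] + df["w"] + rng.normal(scale=0.1, size=500)

model = sm.OLS(df["y"], sm.add_constant(df[["t", "w"]])).fit()

def do(treatment_val, data_df):
    # Override the treatment column, keep the covariates, and average the predictions.
    features = sm.add_constant(data_df[["t", "w"]].assign(t=treatment_val), has_constant="add")
    return model.predict(features).mean()

print(do(1, df) - do(0, df))  # close to 2.0, the simulated effect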

dowhy/causal_estimators/two_stage_regression_estimator.py

Lines changed: 5 additions & 2 deletions
@@ -167,6 +167,7 @@ def fit(
             effects, or return a heterogeneous effect function. Not all
             methods support this currently.
         """
+        self.reset_encoders()  # Forget any existing encoders
         self._set_effect_modifiers(data, effect_modifier_names)
 
         if len(self._target_estimand.treatment_variable) > 1:
@@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame):
         treatment_vals = data_df[self._target_estimand.treatment_variable]
         if len(self._observed_common_causes_names) > 0:
             observed_common_causes_vals = data_df[self._observed_common_causes_names]
-            observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True)
+            observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes")
+
         if self._effect_modifier_names:
             effect_modifiers_vals = data_df[self._effect_modifier_names]
-            effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True)
+            effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers")
+
         if type(treatment_vals) is not np.ndarray:
             treatment_vals = treatment_vals.to_numpy()
         if treatment_vals.shape[0] != data_df.shape[0]:
