Replace all occurrences of Pandas' get_dummies() with sklearn's OneHotEncoder #1134

Closed
37 changes: 35 additions & 2 deletions dowhy/causal_estimator.py
@@ -11,6 +11,7 @@
import dowhy.interpreters as interpreters
from dowhy.causal_identifier.identified_estimand import IdentifiedEstimand
from dowhy.utils.api import parse_state
from dowhy.utils.encoding import Encoders

logger = logging.getLogger(__name__)

@@ -112,6 +113,35 @@ def __init__(
self._bootstrap_estimates = None
self._bootstrap_null_estimates = None

self._encoders = Encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.

It's important that data is consistently encoded; otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.

Encoders are reset when `fit()` is called again, as the data is assumed to have changed.

A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders.reset()

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged; categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.

:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
return self._encoders.encode(data, encoder_name)

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
@@ -124,7 +154,7 @@ def _set_effect_modifiers(data: pd.DataFrame, effect_modifier_names: Optio
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
@@ -234,7 +264,10 @@ def _estimate_conditional_effects(
effect_modifier_names[i] = prefix + str(em)
# Grouping by effect modifiers and computing effect separately
by_effect_mods = data.groupby(effect_modifier_names)
cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)

def cond_est_fn(x):
return self._do(self._treatment_value, x) - self._do(self._control_value, x)

conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
# Deleting the temporary categorical columns
for em in effect_modifier_names:
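The new `_encode()`/`reset_encoders()` methods delegate to `dowhy.utils.encoding.Encoders`, whose implementation is not part of this diff. As a minimal sketch, a registry with the same `reset()`/`encode(data, name)` surface might wrap sklearn's `OneHotEncoder` as below; the class name, column handling, and parameters here are illustrative assumptions, not the actual dowhy code:

    # Hypothetical sketch, not the actual dowhy.utils.encoding.Encoders.
    from typing import Dict

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder


    class EncodersSketch:
        """Keeps one fitted OneHotEncoder per named subset of variables."""

        def __init__(self):
            self._encoders: Dict[str, OneHotEncoder] = {}

        def reset(self):
            # Forget fitted encoders; they are re-created on the next encode().
            self._encoders = {}

        def encode(self, data: pd.DataFrame, encoder_name: str) -> pd.DataFrame:
            categorical = data.select_dtypes(include=["object", "category"]).columns
            if len(categorical) == 0:
                return data  # numerical data passes through unchanged
            encoder = self._encoders.get(encoder_name)
            if encoder is None:
                # drop="first" mirrors pd.get_dummies(drop_first=True); combining it
                # with handle_unknown="ignore" needs scikit-learn >= 1.0, and the
                # sparse_output argument needs scikit-learn >= 1.2.
                encoder = OneHotEncoder(
                    drop="first", sparse_output=False, handle_unknown="ignore"
                ).fit(data[categorical])
                self._encoders[encoder_name] = encoder
            encoded = pd.DataFrame(
                encoder.transform(data[categorical]),
                columns=encoder.get_feature_names_out(categorical),
                index=data.index,
            )
            return pd.concat([data.drop(columns=categorical), encoded], axis=1)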
5 changes: 3 additions & 2 deletions dowhy/causal_estimators/causalml.py
@@ -116,6 +116,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check the backdoor variables being used
@@ -127,7 +128,7 @@
# Get the data of the unobserved confounders
self._observed_common_causes = data[self._observed_common_causes_names]
# One hot encode the data if they are categorical
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = []

@@ -138,7 +139,7 @@
self._instrumental_variable_names = self._target_estimand.instrumental_variables
if self._instrumental_variable_names:
self._instrumental_variables = data[self._instrumental_variable_names]
self._instrumental_variables = pd.get_dummies(self._instrumental_variables, drop_first=True)
self._instrumental_variables = self._encode(self._instrumental_variables, "instrumental_variables")
else:
self._instrumental_variables = []

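For context on why a named, reusable encoder matters here: `pd.get_dummies` derives its output columns from whatever values happen to be present, so a category absent at inference time silently changes the column layout, while a fitted `OneHotEncoder` reproduces the training layout on any input. A small illustration with made-up data (`sparse_output` needs scikit-learn >= 1.2):

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder

    train = pd.DataFrame({"city": ["NY", "SF", "LA"]})
    score = pd.DataFrame({"city": ["NY", "NY"]})  # "SF" and "LA" absent at inference

    # get_dummies: column layout depends on the data it sees.
    print(pd.get_dummies(train, drop_first=True).columns.tolist())  # ['city_NY', 'city_SF']
    print(pd.get_dummies(score, drop_first=True).columns.tolist())  # [] -- layout changed

    # A fitted OneHotEncoder keeps the training layout.
    enc = OneHotEncoder(drop="first", sparse_output=False).fit(train)
    print(enc.transform(score).shape)  # (2, 2) -- same two columns as at fit time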
3 changes: 2 additions & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
@@ -122,6 +122,7 @@ def fit(
"""
self.exact_match_cols = exact_match_cols

self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

# Check if the treatment is one-dimensional
@@ -146,7 +147,7 @@
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Distance matching methods are not applicable"
7 changes: 4 additions & 3 deletions dowhy/causal_estimators/econml.py
@@ -120,6 +120,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)
# Save parameters for later refutter fitting
self._econml_fit_params = kwargs
@@ -148,12 +149,12 @@
# Also only update self._effect_modifiers, and create a copy of self._effect_modifier_names
# the latter can be used by other estimator methods later
self._effect_modifiers = data[effect_modifier_names]
self._effect_modifiers = pd.get_dummies(self._effect_modifiers, drop_first=True)
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self._effect_modifier_names = effect_modifier_names
self.logger.debug("Effect modifiers: " + ",".join(effect_modifier_names))
if self._observed_common_causes_names:
self._observed_common_causes = data[self._observed_common_causes_names]
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None
self.logger.debug("Back-door variables used:" + ",".join(self._observed_common_causes_names))
@@ -165,7 +166,7 @@
self.estimating_instrument_names = parse_state(self.iv_instrument_name)
if self.estimating_instrument_names:
self._estimating_instruments = data[self.estimating_instrument_names]
self._estimating_instruments = pd.get_dummies(self._estimating_instruments, drop_first=True)
self._estimating_instruments = self._encode(self._estimating_instruments, "estimating_instruments")
else:
self._estimating_instruments = None

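Note how each variable subset (effect modifiers, observed common causes, estimating instruments) gets its own named encoder: the same category value can appear in more than one subset, and each subset must keep its own fitted layout. A tiny illustration, again using the hypothetical `EncodersSketch` from the earlier sketch:

    import pandas as pd

    reg = EncodersSketch()
    effect_mods = pd.DataFrame({"income": ["high", "low"]})
    common_causes = pd.DataFrame({"education": ["high", "low"]})

    # Each named encoder is fitted independently on its own subset.
    print(reg.encode(effect_mods, "effect_modifiers").columns.tolist())
    # ['income_low']  ("high" is the dropped baseline)
    print(reg.encode(common_causes, "observed_common_causes").columns.tolist())
    # ['education_low']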
1 change: 1 addition & 0 deletions dowhy/causal_estimators/instrumental_variable_estimator.py
@@ -92,6 +92,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.estimating_instrument_names = self._target_estimand.instrumental_variables
4 changes: 3 additions & 1 deletion dowhy/causal_estimators/propensity_score_estimator.py
@@ -93,6 +93,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables()))
@@ -103,7 +104,8 @@
# Convert the categorical variables into dummy/indicator variables
# Basically, this gives a one hot encoding for each category
# The first category is taken to be the base line.
self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")

else:
self._observed_common_causes = None
error_msg = "No common causes/confounders present. Propensity score based methods are not applicable"
dowhy/causal_estimators/regression_discontinuity_estimator.py
@@ -98,6 +98,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.rd_variable = data[self.rd_variable_name]
73 changes: 0 additions & 73 deletions dowhy/causal_estimators/regression_estimator.py
@@ -5,7 +5,6 @@
import statsmodels.api as sm

from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
from dowhy.utils.encoding import one_hot_encode


class RegressionEstimator(CausalEstimator):
@@ -71,53 +70,6 @@ def __init__(

self.model = None

# Data encoders
# encoder_drop_first will not encode the first category value with a bit in 1-hot encoding.
# It will be implicit instead, by the absence of any bit representing this value in the relevant columns.
# Set to False to include a bit for each value of every categorical variable.
self.encoder_drop_first = True
self.reset_encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.

It's important that data is consistently encoded otherwise models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.

Encoders are reset when `fit()` is called again, as the data is assumed to have changed.

A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders = {
"treatment": None,
"observed_common_causes": None,
"effect_modifiers": None,
}

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe containing
all original data and the encoded columns. Numerical data is unchanged, categorical
types are one-hot encoded. `encoder_name` identifies a specific encoder to be used
if available, or created if not. The encoder can be reused in subsequent calls.

:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
existing_encoder = self._encoders.get(encoder_name)
encoded_variables, encoder = one_hot_encode(
data,
drop_first=self.encoder_drop_first,
encoder=existing_encoder,
)

# Remember encoder
self._encoders[encoder_name] = encoder
return encoded_variables

def fit(
self,
data: pd.DataFrame,
@@ -197,31 +149,6 @@ def _estimate_effect_fn(self, data_df):
est = self.estimate_effect(data=data_df, need_conditional_estimates=False)
return est.value

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
:param effect_modifiers: Variables on which to compute separate
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self._effect_modifiers = effect_modifier_names
if effect_modifier_names is not None:
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
else:
self._effect_modifier_names = []

self.need_conditional_estimates = (
self.need_conditional_estimates
if self.need_conditional_estimates != "auto"
else (self._effect_modifier_names and len(self._effect_modifier_names) > 0)
)

def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment")

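The deletions above show the previous shape of this mechanism: a per-estimator dict of encoders plus the `one_hot_encode` helper, both private to `RegressionEstimator`. The PR hoists this into `CausalEstimator`, so every estimator shares one implementation and the same pattern: reset on `fit()`, reuse at estimation time. A runnable toy version of that pattern (hypothetical estimator, reusing `EncodersSketch` from the first sketch):

    import pandas as pd

    class TinyEstimator:
        """Hypothetical estimator showing the reset-on-fit / reuse-on-estimate pattern."""

        def __init__(self):
            self._encoders = EncodersSketch()

        def reset_encoders(self):
            self._encoders.reset()

        def _encode(self, data, name):
            return self._encoders.encode(data, name)

        def fit(self, data):
            self.reset_encoders()  # data is assumed to have changed
            X = self._encode(data, "observed_common_causes")
            self._columns = list(X.columns)  # the layout the model is trained on
            return self

        def estimate(self, data):
            X = self._encode(data, "observed_common_causes")  # reuses fitted encoder
            assert list(X.columns) == self._columns  # layout is stable across calls
            return X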
7 changes: 5 additions & 2 deletions dowhy/causal_estimators/two_stage_regression_estimator.py
@@ -167,6 +167,7 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

if len(self._target_estimand.treatment_variable) > 1:
@@ -315,10 +316,12 @@ def build_first_stage_features(self, data_df: pd.DataFrame):
treatment_vals = data_df[self._target_estimand.treatment_variable]
if len(self._observed_common_causes_names) > 0:
observed_common_causes_vals = data_df[self._observed_common_causes_names]
observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True)
observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes")

if self._effect_modifier_names:
effect_modifiers_vals = data_df[self._effect_modifier_names]
effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True)
effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers")

if type(treatment_vals) is not np.ndarray:
treatment_vals = treatment_vals.to_numpy()
if treatment_vals.shape[0] != data_df.shape[0]:
66 changes: 42 additions & 24 deletions dowhy/causal_refuters/add_unobserved_common_cause.py
@@ -19,6 +19,7 @@
from dowhy.causal_refuter import CausalRefutation, CausalRefuter, choose_variables
from dowhy.causal_refuters.evalue_sensitivity_analyzer import EValueSensitivityAnalyzer
from dowhy.causal_refuters.linear_sensitivity_analyzer import LinearSensitivityAnalyzer
from dowhy.utils.encoding import Encoders

logger = logging.getLogger(__name__)

@@ -201,6 +202,41 @@ def include_simulated_confounder(
)


def preprocess_observed_common_causes(
data: pd.DataFrame,
target_estimand: IdentifiedEstimand,
no_common_causes_error_message: str,
):
"""
Preprocesses backdoor variables (observed common causes) and returns the pre-processed matrix.

At least one backdoor (common cause) variable is required; a ValueError is raised if none are present.

Preprocessing has two steps:
1. Categorical encoding.
2. Standardization.

:param data: All data, some of which needs preprocessing.
:param target_estimand: Estimand for desired effect including definition of backdoor variables.
:param no_common_causes_error_message: Message to be displayed with ValueError if no backdoor variable present.
:return: Array containing the pre-processed (encoded and standardized) data.
"""

# 1. Categorical encoding of relevant variables
observed_common_causes_names = target_estimand.get_backdoor_variables()
if len(observed_common_causes_names) > 0:
# The encoded data is only used to calculate a parameter, so the encoder can be discarded.
observed_common_causes = data[observed_common_causes_names]
encoders = Encoders()
observed_common_causes = encoders.encode(observed_common_causes, "observed_common_causes")
else:
raise ValueError(no_common_causes_error_message)

# 2. Standardizing the data
observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
return observed_common_causes


def _infer_default_kappa_t(
data: pd.DataFrame,
target_estimand: IdentifiedEstimand,
@@ -210,19 +246,10 @@ def _infer_default_kappa_t(
len_kappa_t: int = 10,
):
"""Infer default effect strength of simulated confounder on treatment."""
observed_common_causes_names = target_estimand.get_backdoor_variables()
if len(observed_common_causes_names) > 0:
observed_common_causes = data[observed_common_causes_names]
observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
else:
raise ValueError(
"There needs to be at least one common cause to"
+ "automatically compute the default value of kappa_t."
+ " Provide a value for kappa_t"
)
t = data[treatment_name]
# Standardizing the data
observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_t. Provide a value for kappa_t"
observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)

if effect_on_t == "binary_flip":
# Fit a model containing all confounders and compare predictions
# using all features compared to all features except a given
Expand Down Expand Up @@ -272,19 +299,10 @@ def _infer_default_kappa_y(
len_kappa_y: int = 10,
):
"""Infer default effect strength of simulated confounder on treatment."""
observed_common_causes_names = target_estimand.get_backdoor_variables()
if len(observed_common_causes_names) > 0:
observed_common_causes = data[observed_common_causes_names]
observed_common_causes = pd.get_dummies(observed_common_causes, drop_first=True)
else:
raise ValueError(
"There needs to be at least one common cause to"
+ "automatically compute the default value of kappa_y."
+ " Provide a value for kappa_y"
)
y = data[outcome_name]
# Standardizing the data
observed_common_causes = StandardScaler().fit_transform(observed_common_causes)
no_common_causes_error_message = "There needs to be at least one common cause to automatically compute the default value of kappa_y. Provide a value for kappa_y"
observed_common_causes = preprocess_observed_common_causes(data, target_estimand, no_common_causes_error_message)

if effect_on_y == "binary_flip":
# Fit a model containing all confounders and compare predictions
# using all features compared to all features except a given
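The extracted `preprocess_observed_common_causes` makes the two steps explicit: encode the categorical backdoor variables, then standardize everything before inferring the default kappa strength. A compact sketch of those same two steps on made-up data (the encoder is throwaway here, exactly as the comment in the diff notes):

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    df = pd.DataFrame({"age": [30, 40, 50], "city": ["NY", "SF", "NY"]})

    # 1. Categorical encoding of the backdoor variables.
    enc = OneHotEncoder(drop="first", sparse_output=False).fit(df[["city"]])
    encoded = pd.concat(
        [
            df[["age"]],
            pd.DataFrame(
                enc.transform(df[["city"]]),
                columns=enc.get_feature_names_out(["city"]),
                index=df.index,
            ),
        ],
        axis=1,
    )

    # 2. Standardization to zero mean and unit variance.
    standardized = StandardScaler().fit_transform(encoded)
    print(standardized.mean(axis=0).round(6))  # ~[0. 0.]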