Fix bug with consistency of RegressionEstimator one-hot encoding #1112

Merged · 2 commits · Dec 21, 2023
127 changes: 99 additions & 28 deletions dowhy/causal_estimators/regression_estimator.py
@@ -5,6 +5,7 @@
import statsmodels.api as sm

from dowhy.causal_estimator import CausalEstimate, CausalEstimator, IdentifiedEstimand
from dowhy.utils.encoding import one_hot_encode


class RegressionEstimator(CausalEstimator):
@@ -70,6 +71,53 @@ def __init__(

self.model = None

# Data encoders
# When encoder_drop_first is True, the first category level of each variable is not given
# its own bit in the one-hot encoding; it is represented implicitly, by the absence of any
# bit for that level in the relevant columns.
# Set to False to include a bit for every level of each categorical variable.
self.encoder_drop_first = True
self.reset_encoders()

def reset_encoders(self):
"""
Removes any reference to data encoders, causing them to be re-created on next `fit()`.

It's important that data is consistently encoded; otherwise, models will produce inconsistent output.
In particular, categorical variables are one-hot encoded; the mapping of original data values
must be identical between model training/fitting and inference time.

Encoders are reset when `fit()` is called again, as the data is assumed to have changed.

A separate encoder is used for each subset of variables (treatment, common causes and effect modifiers).
"""
self._encoders = {
"treatment": None,
"observed_common_causes": None,
"effect_modifiers": None,
}

def _encode(self, data: pd.DataFrame, encoder_name: str):
"""
Encodes categorical columns in the given data, returning a new dataframe in which
numerical columns are passed through unchanged and categorical columns are replaced
by their one-hot encoding. `encoder_name` identifies a specific encoder, which is
reused if available and created if not, then retained for subsequent calls.

:param data: Data to encode.
:param encoder_name: The name for the encoder to be used.
:returns: The encoded data.
"""
existing_encoder = self._encoders.get(encoder_name)
encoded_variables, encoder = one_hot_encode(
data,
drop_first=self.encoder_drop_first,
encoder=existing_encoder,
)

# Remember encoder
self._encoders[encoder_name] = encoder
return encoded_variables

def fit(
self,
data: pd.DataFrame,
@@ -84,13 +132,14 @@ def fit(
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self.reset_encoders() # Forget any existing encoders
self._set_effect_modifiers(data, effect_modifier_names)

self.logger.debug("Back-door variables used:" + ",".join(self._target_estimand.get_backdoor_variables()))
self._observed_common_causes_names = self._target_estimand.get_backdoor_variables()
if len(self._observed_common_causes_names) > 0:
self._observed_common_causes = data[self._observed_common_causes_names]
-            self._observed_common_causes = pd.get_dummies(self._observed_common_causes, drop_first=True)
+            self._observed_common_causes = self._encode(self._observed_common_causes, "observed_common_causes")
else:
self._observed_common_causes = None

@@ -148,14 +197,42 @@ def _estimate_effect_fn(self, data_df):
est = self.estimate_effect(data=data_df, need_conditional_estimates=False)
return est.value

def _set_effect_modifiers(self, data: pd.DataFrame, effect_modifier_names: Optional[List[str]] = None):
"""Sets the effect modifiers for the estimator
Modifies need_conditional_estimates accordingly to effect modifiers value
:param effect_modifiers: Variables on which to compute separate
effects, or return a heterogeneous effect function. Not all
methods support this currently.
"""
self._effect_modifiers = effect_modifier_names
if effect_modifier_names is not None:
self._effect_modifier_names = [cname for cname in effect_modifier_names if cname in data.columns]
if len(self._effect_modifier_names) > 0:
self._effect_modifiers = data[self._effect_modifier_names]
self._effect_modifiers = self._encode(self._effect_modifiers, "effect_modifiers")
self.logger.debug("Effect modifiers: " + ",".join(self._effect_modifier_names))
else:
self._effect_modifier_names = []
else:
self._effect_modifier_names = []

self.need_conditional_estimates = (
self.need_conditional_estimates
if self.need_conditional_estimates != "auto"
else (self._effect_modifier_names and len(self._effect_modifier_names) > 0)
)

def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
-        treatment_vals = pd.get_dummies(data_df[self._target_estimand.treatment_variable], drop_first=True)
+        treatment_vals = self._encode(data_df[self._target_estimand.treatment_variable], "treatment")

if len(self._observed_common_causes_names) > 0:
observed_common_causes_vals = data_df[self._observed_common_causes_names]
-            observed_common_causes_vals = pd.get_dummies(observed_common_causes_vals, drop_first=True)
+            observed_common_causes_vals = self._encode(observed_common_causes_vals, "observed_common_causes")

if self._effect_modifier_names:
effect_modifiers_vals = data_df[self._effect_modifier_names]
-            effect_modifiers_vals = pd.get_dummies(effect_modifiers_vals, drop_first=True)
+            effect_modifiers_vals = self._encode(effect_modifiers_vals, "effect_modifiers")

# Fixing treatment value to the specified value, if provided
if treatment_values is not None:
treatment_vals = treatment_values
@@ -164,6 +241,7 @@ def _build_features(self, data_df: pd.DataFrame, treatment_values=None):
# treatment_vals and data_df should have same number of rows
if treatment_vals.shape[0] != data_df.shape[0]:
raise ValueError("Provided treatment values and dataframe should have the same length.")

# Building the feature matrix
n_treatment_cols = 1 if len(treatment_vals.shape) == 1 else treatment_vals.shape[1]
n_samples = treatment_vals.shape[0]
@@ -195,32 +273,25 @@ def interventional_outcomes(self, data_df: pd.DataFrame, treatment_val):
"""

         if data_df is None:
-            data_df = self._data
+            data_df = self._data.copy()
+        else:
+            data_df = data_df.copy()  # don't modify arg
+
+        # Replace treatment values with value supplied; note: Don't change column datatype!
+        original_type = data_df[self._target_estimand.treatment_variable].dtypes
+        data_df[self._target_estimand.treatment_variable] = treatment_val
+        data_df[self._target_estimand.treatment_variable] = data_df[self._target_estimand.treatment_variable].astype(
+            original_type, copy=False
+        )
+
+        return self.predict(data_df)
+
+    def predict(self, data_df):
         if not self.model:
             # The model is always built on the entire data
-            _, self.model = self._build_model(data_df)
-        # Replacing treatment values by given x
-        # First, create interventional tensor in original space
-        interventional_treatment_values = np.full(
-            (data_df.shape[0], len(self._target_estimand.treatment_variable)), treatment_val
-        )
-        # Then, use pandas to ensure that the dummies are assigned correctly for a categorical treatment
-        interventional_treatment_2d = pd.concat(
-            [
-                data_df[self._target_estimand.treatment_variable].copy(),
-                pd.DataFrame(
-                    data=interventional_treatment_values,
-                    columns=data_df[self._target_estimand.treatment_variable].columns,
-                ),
-            ],
-            axis=0,
-        ).astype(data_df[self._target_estimand.treatment_variable].dtypes, copy=False)
-        interventional_treatment_2d = pd.get_dummies(interventional_treatment_2d, drop_first=True)
-        interventional_treatment_2d = interventional_treatment_2d[
-            data_df[self._target_estimand.treatment_variable].shape[0] :
-        ]
-
-        new_features = self._build_features(data_df, treatment_values=interventional_treatment_2d)
+            _, self.model = self._build_model()
+
+        new_features = self._build_features(data_df=data_df)
         interventional_outcomes = self.predict_fn(data_df, self.model, new_features)
         return interventional_outcomes

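For context on why the cached encoders matter, here is a minimal sketch (toy data, not from this PR) of the inconsistency that per-call pd.get_dummies can introduce: its output columns are derived from whatever category values happen to be present, so a frame seen at prediction time can produce a feature matrix with different columns than the one the model was fitted on, whereas a OneHotEncoder fitted once keeps the mapping fixed.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({"C": ["X", "Y", "Z"]})
new_data = pd.DataFrame({"C": ["X", "X", "X"]})  # only one category level present

# pd.get_dummies derives columns from the data it is given, so they differ:
print(pd.get_dummies(train, drop_first=True).columns.tolist())     # ['C_Y', 'C_Z']
print(pd.get_dummies(new_data, drop_first=True).columns.tolist())  # []

# An encoder fitted once keeps the column mapping stable:
encoder = OneHotEncoder(drop="first", sparse=False)  # sparse_output=False on sklearn >= 1.2
encoder.fit(train)
print(encoder.transform(new_data))  # shape (3, 2), all zeros: 'X' is the dropped base level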
62 changes: 62 additions & 0 deletions dowhy/utils/encoding.py
@@ -0,0 +1,62 @@
import pandas as pd
from pandas.core.dtypes.common import is_list_like
from sklearn.preprocessing import OneHotEncoder


def one_hot_encode(data: pd.DataFrame, columns=None, drop_first: bool = False, encoder: OneHotEncoder = None):
"""
Replaces pandas' get_dummies with an implementation of sklearn.preprocessing.OneHotEncoder.

The purpose of replacement is to allow encoding of new data using the same encoder, which ensures that the resulting encodings are consistent.

If encoder is None, a new instance of sklearn.preprocessing.OneHotEncoder will be created and fitted via `fit_transform()`. Otherwise, the existing encoder is reused via `transform()`.

For compatibility with get_dummies, the encoded data will be transformed into a DataFrame.

In all cases, the return value is the encoded data and the encoder object (even if one was passed in). If `data` contains columns other than the
dummy-coded one(s), these will be prepended, unaltered, to the result.

:param data: Data of which to get dummy indicators.
:param columns: List-like structure containing specific columns to encode.
:param drop_first: Whether to get k-1 dummies out of k categorical levels by removing the first level.
:param encoder: Optional previously-fitted OneHotEncoder to reuse; if None, a new one is created and fitted.
:return: DataFrame, OneHotEncoder
"""

# Determine columns being encoded
if columns is None:
dtypes_to_encode = ["object", "string", "category"]
data_to_encode = data.select_dtypes(include=dtypes_to_encode)
elif not is_list_like(columns):
raise TypeError("Input must be a list-like for parameter `columns`")
else:
data_to_encode = data[columns]

# If all columns are already numerical, there may be nothing to encode.
# In this case, return original data.
if len(data_to_encode.columns) == 0:
return data, encoder # Encoder may be None

# Columns to keep in the result - not encoded.
columns_to_keep = data.columns.difference(data_to_encode.columns)
df_columns_to_keep = data[columns_to_keep].reset_index(drop=True)

if encoder is None: # Create new encoder
drop = None
if drop_first:
drop = "first"
encoder = OneHotEncoder(drop=drop, sparse=False) # NB sparse renamed to sparse_output in sklearn 1.2+

encoded_data = encoder.fit_transform(data_to_encode)

else: # Use existing encoder
encoded_data = encoder.transform(data_to_encode)

# Convert the encoded data to a DataFrame
columns_encoded = encoder.get_feature_names_out(data_to_encode.columns)

df_encoded = pd.DataFrame(encoded_data, columns=columns_encoded).reset_index(drop=True) # drop index from original

# Concatenate the encoded DataFrame with the original non-categorical columns
df_result = pd.concat([df_columns_to_keep, df_encoded], axis=1)

return df_result, encoder
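A short usage sketch of the helper above (illustrative data only): the first call fits and returns an encoder, and passing that encoder back in guarantees that new data is mapped to identical columns. Note that with sklearn's default handle_unknown="error", category values unseen at fit time will raise during transform, which is the safe behavior for this consistency use case.

import pandas as pd

from dowhy.utils.encoding import one_hot_encode

df_fit = pd.DataFrame({"C": ["X", "Y", "Z"], "N": [1, 2, 3]})
df_new = pd.DataFrame({"C": ["Z", "Z", "Y"], "N": [4, 5, 6]})

# First call: no encoder yet, so one is created, fitted and returned.
encoded_fit, encoder = one_hot_encode(df_fit, drop_first=True)

# Later calls reuse the fitted encoder, so the columns line up exactly.
encoded_new, _ = one_hot_encode(df_new, drop_first=True, encoder=encoder)
assert list(encoded_fit.columns) == list(encoded_new.columns)  # ['N', 'C_Y', 'C_Z']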
87 changes: 87 additions & 0 deletions tests/utils/test_encoding.py
@@ -0,0 +1,87 @@
import pandas as pd
from pytest import approx

from dowhy.utils.encoding import one_hot_encode


def test_one_hot_encode_equivalent_to_get_dummies():

# Use a mix of already-numeric columns and columns requiring encoding:
data = {
"C": ["X", "Y", "Z", "X", "Y", "Z"],
"N": [1, 2, 3, 4, 5, 6],
}
df = pd.DataFrame(data)

# NB: There may be small differences in dtype, but since all values will be used in models
# as float, the comparison is done using that type.
df_dummies = pd.get_dummies(df, drop_first=True)
df_dummies = df_dummies.astype(float)

df_sklearn, _ = one_hot_encode(df, drop_first=True)
df_sklearn = df_sklearn.astype(float)

# Check same rows
len1 = len(df_dummies)
len2 = len(df_sklearn)
assert len1 == len2

# Check same number of cols
len1 = len(df_dummies.columns)
len2 = len(df_sklearn.columns)
assert len1 == len2

# Check values
# Calculate the sum of absolute differences between the two DataFrames
# - should be zero (excl. floating point error)
sum_abs_diff = (df_dummies - df_sklearn).abs().sum().sum()
assert sum_abs_diff == approx(0.0)


def test_one_hot_encode_consistent_with_new_data():

# Use a mix of already-numeric columns and columns requiring encoding:
data1 = {
"C": ["X", "Y", "Z", "X", "Y", "Z"],
"N": [1, 2, 3, 4, 5, 6],
}
df1 = pd.DataFrame(data1)

# Initial encode
df_encoded1, encoder = one_hot_encode(df1, drop_first=True)
df_encoded1 = df_encoded1.astype(float)

# Create new data with permuted rows.
# Output shape should be unchanged.
data2 = {
"C": ["Y", "Z", "X", "X", "Y", "Z"],
"N": [1, 2, 3, 4, 5, 6],
}
df2 = pd.DataFrame(data2)

# Encode this new data.
df_encoded2, _ = one_hot_encode(df2, encoder=encoder, drop_first=True)
df_encoded2 = df_encoded2.astype(float)

# Check same rows
len1 = len(df_encoded1)
len2 = len(df_encoded2)
assert len1 == len2

# Check same number of cols
len1 = len(df_encoded1.columns)
len2 = len(df_encoded2.columns)
assert len1 == len2

# Check permuted values are consistent
c_y1 = df_encoded1["C_Y"]
c_y2 = df_encoded2["C_Y"]
assert c_y1[1] == c_y2[0]
assert c_y1[4] == c_y2[4]

c_z1 = df_encoded1["C_Z"]
c_z2 = df_encoded2["C_Z"]
assert c_z1[2] == c_z2[1]
assert c_z1[5] == c_z2[5]
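As a footnote to the tests, the drop_first=True convention used throughout means the first category level ('X' here) gets no column of its own; it is encoded implicitly as all zeros. This avoids the redundant, perfectly collinear column that a full one-hot encoding would add to a regression design matrix. A quick illustration on the same toy data:

import pandas as pd

from dowhy.utils.encoding import one_hot_encode

df = pd.DataFrame({"C": ["X", "Y", "Z"]})
encoded, _ = one_hot_encode(df, drop_first=True)
print(encoded)
#    C_Y  C_Z
# 0  0.0  0.0   <- 'X' is the implicit base level
# 1  1.0  0.0
# 2  0.0  1.0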