Fix pandas FutureWarnings in calls to .groupby() #1164

Merged (3 commits, Apr 22, 2024)
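The warning this PR silences comes from pandas 2.1+, which deprecates the implicit default of observed=False when grouping on a categorical column: the default will flip to observed=True in a future release. Passing observed=False explicitly keeps every call site's current output and silences the warning. A minimal reproduction, with made-up column names:

    import pandas as pd

    df = pd.DataFrame({
        "v0": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
        "y": [1.0, 2.0, 3.0],
    })

    # pandas >= 2.1 emits a FutureWarning here, because the categorical
    # grouper "v0" relies on the deprecated observed=False default.
    df.groupby("v0").mean()

    # Explicit observed=False: same result (the unobserved category "c"
    # still gets a row, with NaN), no warning.
    df.groupby("v0", observed=False).mean()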
2 changes: 1 addition & 1 deletion README.rst
@@ -251,7 +251,7 @@ you can use the namespace as follows.
 data['df'].causal.do(x='v0', # name of treatment variable
 variable_types={'v0': 'b', 'y': 'c', 'W0': 'c'},
 outcome='y',
-common_causes=['W0']).groupby('v0').mean().plot(y='y', kind='bar')
+common_causes=['W0']).groupby('v0', observed=False).mean().plot(y='y', kind='bar')

 .. image:: https://raw.githubusercontent.com/microsoft/dowhy/main/docs/images/do_barplot.png

2 changes: 1 addition & 1 deletion (file name not rendered)
@@ -154,7 +154,7 @@
 "outputs": [],
 "source": [
 "dataset = dataset[dataset.deposit_type==\"No Deposit\"]\n",
-"dataset.groupby(['deposit_type','is_canceled']).count()"
+"dataset.groupby(['deposit_type','is_canceled'], observed=False).count()"
 ]
 },
 {
4 changes: 2 additions & 2 deletions docs/source/example_notebooks/dowhy_causal_api.ipynb
@@ -55,7 +55,7 @@
 " variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},\n",
 " outcome=outcome,\n",
 " common_causes=[common_cause],\n",
-" proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')"
+" proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')"
 ]
 },
 {
@@ -69,7 +69,7 @@
 " outcome=outcome,\n",
 " method='weighting', \n",
 " common_causes=[common_cause],\n",
-" proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')"
+" proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')"
 ]
 },
 {

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions (file name not rendered)
@@ -101,7 +101,7 @@
 },
 "outputs": [],
 "source": [
-"data.groupby(['week']).mean()[['received']].plot(kind='bar', title='average received', legend=False); "
+"data.groupby(['week'], observed=False).mean()[['received']].plot(kind='bar', title='average received', legend=False); "
 ]
 },
 {
@@ -142,7 +142,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"data.groupby(['week']).mean().plot(kind='bar', title='average', legend=True);"
+"data.groupby(['week'], observed=False).mean().plot(kind='bar', title='average', legend=True);"
 ]
 },
 {
2 changes: 1 addition & 1 deletion dowhy/causal_estimator.py
@@ -234,7 +234,7 @@ def _estimate_conditional_effects(
 data[prefix + str(em)] = pd.qcut(data[em], num_quantiles, duplicates="drop")
 effect_modifier_names[i] = prefix + str(em)
 # Grouping by effect modifiers and computing effect separately
-by_effect_mods = data.groupby(effect_modifier_names)
+by_effect_mods = data.groupby(effect_modifier_names, observed=False)
 cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)
 conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
 # Deleting the temporary categorical columns
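Context for this call site: pd.qcut returns a column of Categorical dtype (interval bins), and categorical group keys are precisely where the observed default is changing. A small sketch of the pattern, with hypothetical names:

    import pandas as pd

    s = pd.Series(range(8))
    df = pd.DataFrame({
        "em_bin": pd.qcut(s, 4, duplicates="drop"),  # CategoricalDtype
        "effect": s * 0.5,
    })

    # observed=False keeps a row for every quantile bin, matching the
    # behavior these estimators have always had.
    df.groupby("em_bin", observed=False).mean()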
2 changes: 1 addition & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
@@ -219,7 +219,7 @@ def estimate_effect(
 for i in range(numtreatedunits):
 self.matched_indices_att[treated_df_index[i]] = control.iloc[indices[i]].index.tolist()
 else:
-grouped = updated_df.groupby(self.exact_match_cols)
+grouped = updated_df.groupby(self.exact_match_cols, observed=False)
 att = 0
 for name, group in grouped:
 treated = group.loc[group[self._target_estimand.treatment_variable[0]] == 1]
8 changes: 4 additions & 4 deletions (file name not rendered)
@@ -140,7 +140,7 @@ def estimate_effect(
 num_strata,
 self.clipping_threshold,
 )
-num_ret_strata = clipped.groupby(["strata"]).count().reset_index()
+num_ret_strata = clipped.groupby(["strata"], observed=False).count().reset_index()
 # At least 90% of the strata should be included in analysis
 if num_ret_strata.shape[0] >= 0.5 * num_strata:
 strata_found = True
@@ -172,7 +172,7 @@
 )

 # sum weighted outcomes over all strata (weight by treated population)
-weighted_outcomes = clipped.groupby("strata").agg(
+weighted_outcomes = clipped.groupby("strata", observed=False).agg(
 {self._target_estimand.treatment_variable[0]: ["sum"], "dbar": ["sum"], "d_y": ["sum"], "dbar_y": ["sum"]}
 )
 weighted_outcomes.columns = ["_".join(x) for x in weighted_outcomes.columns.to_numpy().ravel()]
@@ -233,7 +233,7 @@ def _get_strata(self, data: pd.DataFrame, num_strata, clipping_threshold):
 data[self._target_estimand.treatment_variable[0]] * data[self._target_estimand.outcome_variable[0]]
 )
 data["dbar_y"] = data["dbar"] * data[self._target_estimand.outcome_variable[0]]
-stratified = data.groupby("strata")
+stratified = data.groupby("strata", observed=False)
 clipped = stratified.filter(
 lambda strata: min(
 strata.loc[strata[self._target_estimand.treatment_variable[0]] == 1].shape[0],
@@ -244,7 +244,7 @@ def _get_strata(self, data: pd.DataFrame, num_strata, clipping_threshold):
 self.logger.debug(
 "After using clipping_threshold={0}, here are the number of data points in each strata:\n {1}".format(
 clipping_threshold,
-clipped.groupby(["strata", self._target_estimand.treatment_variable[0]])[
+clipped.groupby(["strata", self._target_estimand.treatment_variable[0]], observed=False)[
 self._target_estimand.outcome_variable
 ].count(),
 )
8 changes: 4 additions & 4 deletions dowhy/causal_refuters/dummy_outcome_refuter.py
@@ -748,7 +748,7 @@ def preprocess_data_by_treatment(
 variable_type = data[treatment_variable_name].dtypes

 if bool == variable_type:
-groups = data.groupby(treatment_variable_name)
+groups = data.groupby(treatment_variable_name, observed=False)
 return groups
 # We use string arguments to account for both 32 and 64 bit variables
 elif "float" in variable_type.name or "int" in variable_type.name:
@@ -757,14 +757,14 @@
 std_dev = data[treatment_variable_name].std()
 num_bins = (data.max() - data.min()) / (bucket_size_scale_factor * std_dev)
 data["bins"] = pd.cut(data[treatment_variable_name], num_bins)
-groups = data.groupby("bins")
+groups = data.groupby("bins", observed=False)
 data.drop("bins", axis=1, inplace=True)
 return groups

 elif "categorical" in variable_type.name:
 # Action for categorical variables
-groups = data.groupby(treatment_variable_name)
-groups = data.groupby("bins")
+groups = data.groupby(treatment_variable_name, observed=False)
+groups = data.groupby("bins", observed=False)
 return groups
 else:
 raise ValueError("Passed {}. Expected bool, float, int or categorical.".format(variable_type.name))
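For the pd.cut branch above, a fixed bin count can leave some buckets empty, and the observed flag decides whether those empty buckets appear in the grouped result. An illustration on made-up data:

    import pandas as pd

    x = pd.Series([1.0, 1.1, 1.2, 9.0])
    df = pd.DataFrame({"bins": pd.cut(x, 4), "x": x})  # middle bins are empty

    df.groupby("bins", observed=False).size()  # empty bins reported with size 0
    df.groupby("bins", observed=True).size()   # empty bins dropped entirely

Keeping observed=False therefore reproduces exactly what these refuters returned before the pandas deprecation.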
4 changes: 2 additions & 2 deletions dowhy/interpreters/confounder_distribution_interpreter.py
@@ -81,10 +81,10 @@ def interpret(self, data: pd.DataFrame):

 # before weights are applied we count the number of rows in each category
 # which is equivalent to summing over weight=1
-barplot_df_before = df.groupby([self.var_name, treated]).size().reset_index(name="count")
+barplot_df_before = df.groupby([self.var_name, treated], observed=False).size().reset_index(name="count")

 # after weights are applied we need to sum over the given weights
-barplot_df_after = df.groupby([self.var_name, treated]).agg({"weight": np.sum}).reset_index()
+barplot_df_after = df.groupby([self.var_name, treated], observed=False).agg({"weight": np.sum}).reset_index()
 barplot_df_after.rename(columns={"weight": "count"}, inplace=True)

 title1 = "Distribution of " + self.var_name + " before applying the weights"
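One behavior worth noting when grouping on several categorical keys, as this interpreter does: observed=False yields the full Cartesian product of category levels, including combinations with no rows. A hedged sketch on invented data:

    import pandas as pd

    df = pd.DataFrame({
        "strata": pd.Categorical(["s0", "s0", "s1"], categories=["s0", "s1", "s2"]),
        "treated": pd.Categorical([0, 1, 1]),
        "weight": [0.2, 0.5, 0.3],
    })

    # All 3 x 2 (strata, treated) combinations appear; empty groups sum to 0.
    # observed=True would keep only the 3 combinations present in the data.
    df.groupby(["strata", "treated"], observed=False)["weight"].sum()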
20 changes: 10 additions & 10 deletions dowhy/interpreters/propensity_balance_interpreter.py
@@ -41,38 +41,38 @@ def interpret(self, data: pd.DataFrame):
 )

 # First, calculating mean differences by strata
-mean_diff = df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"]).agg(
+mean_diff = df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"], observed=False).agg(
 mean_w=("W", np.mean)
 )
 mean_diff = (
-mean_diff.groupby(["common_cause_id", "strata"]).transform(lambda x: x.max() - x.min()).reset_index()
+mean_diff.groupby(["common_cause_id", "strata"], observed=False).transform(lambda x: x.max() - x.min()).reset_index()
 )
 mean_diff = mean_diff.query("v0==True")
 size_by_w_strata = (
-df_long.groupby(["common_cause_id", "strata"]).agg(size=("propensity_score", np.size)).reset_index()
+df_long.groupby(["common_cause_id", "strata"], observed=False).agg(size=("propensity_score", np.size)).reset_index()
 )
-size_by_strata = df_long.groupby(["common_cause_id"]).agg(size=("propensity_score", np.size)).reset_index()
+size_by_strata = df_long.groupby(["common_cause_id"], observed=False).agg(size=("propensity_score", np.size)).reset_index()
 size_by_strata = pd.merge(size_by_w_strata, size_by_strata, on="common_cause_id")
 mean_diff_strata = pd.merge(mean_diff, size_by_strata, on=("common_cause_id", "strata"))

-stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"]).agg(stddev=("W", np.std)).reset_index()
+stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"], observed=False).agg(stddev=("W", np.std)).reset_index()
 mean_diff_strata = pd.merge(mean_diff_strata, stddev_by_w_strata, on=["common_cause_id", "strata"])
 mean_diff_strata["scaled_mean"] = (mean_diff_strata["mean_w"] / mean_diff_strata["stddev"]) * (
 mean_diff_strata["size_x"] / mean_diff_strata["size_y"]
 )
 mean_diff_strata = (
-mean_diff_strata.groupby("common_cause_id").agg(std_mean_diff=("scaled_mean", np.sum)).reset_index()
+mean_diff_strata.groupby("common_cause_id", observed=False).agg(std_mean_diff=("scaled_mean", np.sum)).reset_index()
 )

 # Second, without strata
-mean_diff_overall = df_long.groupby(self.estimate._treatment_name + ["common_cause_id"]).agg(
+mean_diff_overall = df_long.groupby(self.estimate._treatment_name + ["common_cause_id"], observed=False).agg(
 mean_w=("W", np.mean)
 )
 mean_diff_overall = (
-mean_diff_overall.groupby("common_cause_id").transform(lambda x: x.max() - x.min()).reset_index()
+mean_diff_overall.groupby("common_cause_id", observed=False).transform(lambda x: x.max() - x.min()).reset_index()
 )
 mean_diff_overall = mean_diff_overall[mean_diff_overall[self.estimate._treatment_name[0]] == True] # TODO
-stddev_overall = df_long.groupby(["common_cause_id"]).agg(stddev=("W", np.std)).reset_index()
+stddev_overall = df_long.groupby(["common_cause_id"], observed=False).agg(stddev=("W", np.std)).reset_index()
 mean_diff_overall = pd.merge(mean_diff_overall, stddev_overall, on=["common_cause_id"])
 mean_diff_overall["std_mean_diff"] = mean_diff_overall["mean_w"] / mean_diff_overall["stddev"]

@@ -86,7 +86,7 @@

 plt.style.use("seaborn-white")
 fig, ax = plt.subplots(1, 1)
-for label, subdf in plot_df.groupby("common_cause_id"):
+for label, subdf in plot_df.groupby("common_cause_id", observed=False):
 subdf.plot(kind="line", x="sample", y="std_mean_diff", ax=ax, label=label)
 plt.legend(title="Common causes")
 plt.ylabel("Standardized mean difference between treatment and control")
6 changes: 3 additions & 3 deletions tests/do_sampler/test_pandas_do_api.py
@@ -200,7 +200,7 @@ def test_pandas_api_with_full_specification_of_type(self, N, variable_types):

 data["df"].causal.do(
 x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
-).groupby("v0").mean()
+).groupby("v0", observed=False).mean()
 assert True

 @mark.parametrize(
@@ -216,7 +216,7 @@ def test_pandas_api_with_partial_specification_of_type(self, N, variable_types):

 data["df"].causal.do(
 x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
-).groupby("v0").mean()
+).groupby("v0", observed=False).mean()
 assert True

 @mark.parametrize(
@@ -232,7 +232,7 @@ def test_pandas_api_with_no_specification_of_type(self, N, variable_types):

 data["df"].causal.do(
 x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
-).groupby("v0").mean()
+).groupby("v0", observed=False).mean()
 assert True

 @mark.parametrize(
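If regressions were a concern, a test could escalate the warning to an error. A hypothetical guard, not part of this PR:

    import warnings

    import pandas as pd

    def assert_no_groupby_future_warning(df: pd.DataFrame, key: str) -> None:
        # Turn FutureWarning into an error so a groupby call that still
        # relies on the deprecated observed default fails loudly.
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            df.groupby(key, observed=False).mean(numeric_only=True)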