Fix pandas FutureWarnings in calls to .groupby() #1164

Merged (3 commits, Apr 22, 2024)
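The warning this PR silences comes from pandas 2.1+, which deprecates the implicit default of observed=False when grouping on a categorical column: the default will flip to observed=True in a future release. Passing observed=False explicitly keeps every call site's current output and silences the warning. A minimal reproduction, with made-up column names:

    import pandas as pd

    df = pd.DataFrame({
        "v0": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
        "y": [1.0, 2.0, 3.0],
    })

    # pandas >= 2.1 emits a FutureWarning here, because the categorical
    # grouper "v0" relies on the deprecated observed=False default.
    df.groupby("v0").mean()

    # Explicit observed=False: same result (the unobserved category "c"
    # still gets a row, with NaN), no warning.
    df.groupby("v0", observed=False).mean()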
2 changes: 1 addition & 1 deletion README.rst
@@ -251,7 +251,7 @@ you can use the namespace as follows.
 data['df'].causal.do(x='v0', # name of treatment variable
 variable_types={'v0': 'b', 'y': 'c', 'W0': 'c'},
 outcome='y',
-common_causes=['W0']).groupby('v0').mean().plot(y='y', kind='bar')
+common_causes=['W0']).groupby('v0', observed=False).mean().plot(y='y', kind='bar')

 .. image:: https://raw.githubusercontent.com/microsoft/dowhy/main/docs/images/do_barplot.png

2 changes: 1 addition & 1 deletion (file name not rendered)
@@ -154,7 +154,7 @@
 "outputs": [],
 "source": [
 "dataset = dataset[dataset.deposit_type==\"No Deposit\"]\n",
-"dataset.groupby(['deposit_type','is_canceled']).count()"
+"dataset.groupby(['deposit_type','is_canceled'], observed=False).count()"
 ]
 },
 {
4 changes: 2 additions & 2 deletions docs/source/example_notebooks/dowhy_causal_api.ipynb
@@ -55,7 +55,7 @@
 " variable_types={treatment: 'b', outcome: 'c', common_cause: 'c'},\n",
 " outcome=outcome,\n",
 " common_causes=[common_cause],\n",
-" proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')"
+" proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')"
 ]
 },
 {
@@ -69,7 +69,7 @@
 " outcome=outcome,\n",
 " method='weighting', \n",
 " common_causes=[common_cause],\n",
-" proceed_when_unidentifiable=True).groupby(treatment).mean().plot(y=outcome, kind='bar')"
+" proceed_when_unidentifiable=True).groupby(treatment, observed=False).mean().plot(y=outcome, kind='bar')"
 ]
 },
 {

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions (file name not rendered)
@@ -101,7 +101,7 @@
 },
 "outputs": [],
 "source": [
-"data.groupby(['week']).mean()[['received']].plot(kind='bar', title='average received', legend=False); "
+"data.groupby(['week'], observed=False).mean()[['received']].plot(kind='bar', title='average received', legend=False); "
 ]
 },
 {
@@ -142,7 +142,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"data.groupby(['week']).mean().plot(kind='bar', title='average', legend=True);"
+"data.groupby(['week'], observed=False).mean().plot(kind='bar', title='average', legend=True);"
 ]
 },
 {
2 changes: 1 addition & 1 deletion dowhy/causal_estimator.py
@@ -234,7 +234,7 @@ def _estimate_conditional_effects(
 data[prefix + str(em)] = pd.qcut(data[em], num_quantiles, duplicates="drop")
 effect_modifier_names[i] = prefix + str(em)
 # Grouping by effect modifiers and computing effect separately
-by_effect_mods = data.groupby(effect_modifier_names)
+by_effect_mods = data.groupby(effect_modifier_names, observed=False)
 cond_est_fn = lambda x: self._do(self._treatment_value, x) - self._do(self._control_value, x)
 conditional_estimates = by_effect_mods.apply(estimate_effect_fn)
 # Deleting the temporary categorical columns
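Context for this call site: pd.qcut returns a column of Categorical dtype (interval bins), and categorical group keys are precisely where the observed default is changing. A small sketch of the pattern, with hypothetical names:

    import pandas as pd

    s = pd.Series(range(8))
    df = pd.DataFrame({
        "em_bin": pd.qcut(s, 4, duplicates="drop"),  # CategoricalDtype
        "effect": s * 0.5,
    })

    # observed=False keeps a row for every quantile bin, matching the
    # behavior these estimators have always had.
    df.groupby("em_bin", observed=False).mean()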
2 changes: 1 addition & 1 deletion dowhy/causal_estimators/distance_matching_estimator.py
@@ -219,7 +219,7 @@ def estimate_effect(
 for i in range(numtreatedunits):
 self.matched_indices_att[treated_df_index[i]] = control.iloc[indices[i]].index.tolist()
 else:
-grouped = updated_df.groupby(self.exact_match_cols)
+grouped = updated_df.groupby(self.exact_match_cols, observed=False)
 att = 0
 for name, group in grouped:
 treated = group.loc[group[self._target_estimand.treatment_variable[0]] == 1]
8 changes: 4 additions & 4 deletions (file name not rendered)
@@ -140,7 +140,7 @@ def estimate_effect(
 num_strata,
 self.clipping_threshold,
 )
-num_ret_strata = clipped.groupby(["strata"]).count().reset_index()
+num_ret_strata = clipped.groupby(["strata"], observed=False).count().reset_index()
 # At least 90% of the strata should be included in analysis
 if num_ret_strata.shape[0] >= 0.5 * num_strata:
 strata_found = True
@@ -172,7 +172,7 @@
 )

 # sum weighted outcomes over all strata (weight by treated population)
-weighted_outcomes = clipped.groupby("strata").agg(
+weighted_outcomes = clipped.groupby("strata", observed=False).agg(
 {self._target_estimand.treatment_variable[0]: ["sum"], "dbar": ["sum"], "d_y": ["sum"], "dbar_y": ["sum"]}
 )
 weighted_outcomes.columns = ["_".join(x) for x in weighted_outcomes.columns.to_numpy().ravel()]
@@ -233,7 +233,7 @@ def _get_strata(self, data: pd.DataFrame, num_strata, clipping_threshold):
 data[self._target_estimand.treatment_variable[0]] * data[self._target_estimand.outcome_variable[0]]
 )
 data["dbar_y"] = data["dbar"] * data[self._target_estimand.outcome_variable[0]]
-stratified = data.groupby("strata")
+stratified = data.groupby("strata", observed=False)
 clipped = stratified.filter(
 lambda strata: min(
 strata.loc[strata[self._target_estimand.treatment_variable[0]] == 1].shape[0],
@@ -244,7 +244,7 @@ def _get_strata(self, data: pd.DataFrame, num_strata, clipping_threshold):
 self.logger.debug(
 "After using clipping_threshold={0}, here are the number of data points in each strata:\n {1}".format(
 clipping_threshold,
-clipped.groupby(["strata", self._target_estimand.treatment_variable[0]])[
+clipped.groupby(["strata", self._target_estimand.treatment_variable[0]], observed=False)[
 self._target_estimand.outcome_variable
 ].count(),
 )
8 changes: 4 additions & 4 deletions dowhy/causal_refuters/dummy_outcome_refuter.py
@@ -748,7 +748,7 @@ def preprocess_data_by_treatment(
 variable_type = data[treatment_variable_name].dtypes

 if bool == variable_type:
-groups = data.groupby(treatment_variable_name)
+groups = data.groupby(treatment_variable_name, observed=False)
 return groups
 # We use string arguments to account for both 32 and 64 bit variables
 elif "float" in variable_type.name or "int" in variable_type.name:
@@ -757,14 +757,14 @@
 std_dev = data[treatment_variable_name].std()
 num_bins = (data.max() - data.min()) / (bucket_size_scale_factor * std_dev)
 data["bins"] = pd.cut(data[treatment_variable_name], num_bins)
-groups = data.groupby("bins")
+groups = data.groupby("bins", observed=False)
 data.drop("bins", axis=1, inplace=True)
 return groups

 elif "categorical" in variable_type.name:
 # Action for categorical variables
-groups = data.groupby(treatment_variable_name)
-groups = data.groupby("bins")
+groups = data.groupby(treatment_variable_name, observed=False)
+groups = data.groupby("bins", observed=False)
 return groups
 else:
 raise ValueError("Passed {}. Expected bool, float, int or categorical.".format(variable_type.name))
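For the pd.cut branch above, a fixed bin count can leave some buckets empty, and the observed flag decides whether those empty buckets appear in the grouped result. An illustration on made-up data:

    import pandas as pd

    x = pd.Series([1.0, 1.1, 1.2, 9.0])
    df = pd.DataFrame({"bins": pd.cut(x, 4), "x": x})  # middle bins are empty

    df.groupby("bins", observed=False).size()  # empty bins reported with size 0
    df.groupby("bins", observed=True).size()   # empty bins dropped entirely

Keeping observed=False therefore reproduces exactly what these refuters returned before the pandas deprecation.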
4 changes: 2 additions & 2 deletions dowhy/interpreters/confounder_distribution_interpreter.py
@@ -81,10 +81,10 @@ def interpret(self, data: pd.DataFrame):

 # before weights are applied we count the number of rows in each category
 # which is equivalent to summing over weight=1
-barplot_df_before = df.groupby([self.var_name, treated]).size().reset_index(name="count")
+barplot_df_before = df.groupby([self.var_name, treated], observed=False).size().reset_index(name="count")

 # after weights are applied we need to sum over the given weights
-barplot_df_after = df.groupby([self.var_name, treated]).agg({"weight": np.sum}).reset_index()
+barplot_df_after = df.groupby([self.var_name, treated], observed=False).agg({"weight": np.sum}).reset_index()
 barplot_df_after.rename(columns={"weight": "count"}, inplace=True)

 title1 = "Distribution of " + self.var_name + " before applying the weights"
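One behavior worth noting when grouping on several categorical keys, as this interpreter does: observed=False yields the full Cartesian product of category levels, including combinations with no rows. A hedged sketch on invented data:

    import pandas as pd

    df = pd.DataFrame({
        "strata": pd.Categorical(["s0", "s0", "s1"], categories=["s0", "s1", "s2"]),
        "treated": pd.Categorical([0, 1, 1]),
        "weight": [0.2, 0.5, 0.3],
    })

    # All 3 x 2 (strata, treated) combinations appear; empty groups sum to 0.
    # observed=True would keep only the 3 combinations present in the data.
    df.groupby(["strata", "treated"], observed=False)["weight"].sum()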
20 changes: 10 additions & 10 deletions dowhy/interpreters/propensity_balance_interpreter.py
@@ -41,38 +41,38 @@ def interpret(self, data: pd.DataFrame):
 )

 # First, calculating mean differences by strata
-mean_diff = df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"]).agg(
+mean_diff = df_long.groupby(self.estimate._treatment_name + ["common_cause_id", "strata"], observed=False).agg(
 mean_w=("W", np.mean)
 )
 mean_diff = (
-mean_diff.groupby(["common_cause_id", "strata"]).transform(lambda x: x.max() - x.min()).reset_index()
+mean_diff.groupby(["common_cause_id", "strata"], observed=False).transform(lambda x: x.max() - x.min()).reset_index()
 )
 mean_diff = mean_diff.query("v0==True")
 size_by_w_strata = (
-df_long.groupby(["common_cause_id", "strata"]).agg(size=("propensity_score", np.size)).reset_index()
+df_long.groupby(["common_cause_id", "strata"], observed=False).agg(size=("propensity_score", np.size)).reset_index()
 )
-size_by_strata = df_long.groupby(["common_cause_id"]).agg(size=("propensity_score", np.size)).reset_index()
+size_by_strata = df_long.groupby(["common_cause_id"], observed=False).agg(size=("propensity_score", np.size)).reset_index()
 size_by_strata = pd.merge(size_by_w_strata, size_by_strata, on="common_cause_id")
 mean_diff_strata = pd.merge(mean_diff, size_by_strata, on=("common_cause_id", "strata"))

-stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"]).agg(stddev=("W", np.std)).reset_index()
+stddev_by_w_strata = df_long.groupby(["common_cause_id", "strata"], observed=False).agg(stddev=("W", np.std)).reset_index()
 mean_diff_strata = pd.merge(mean_diff_strata, stddev_by_w_strata, on=["common_cause_id", "strata"])
 mean_diff_strata["scaled_mean"] = (mean_diff_strata["mean_w"] / mean_diff_strata["stddev"]) * (
 mean_diff_strata["size_x"] / mean_diff_strata["size_y"]
 )
 mean_diff_strata = (
-mean_diff_strata.groupby("common_cause_id").agg(std_mean_diff=("scaled_mean", np.sum)).reset_index()
+mean_diff_strata.groupby("common_cause_id", observed=False).agg(std_mean_diff=("scaled_mean", np.sum)).reset_index()
 )

 # Second, without strata
-mean_diff_overall = df_long.groupby(self.estimate._treatment_name + ["common_cause_id"]).agg(
+mean_diff_overall = df_long.groupby(self.estimate._treatment_name + ["common_cause_id"], observed=False).agg(
 mean_w=("W", np.mean)
 )
 mean_diff_overall = (
-mean_diff_overall.groupby("common_cause_id").transform(lambda x: x.max() - x.min()).reset_index()
+mean_diff_overall.groupby("common_cause_id", observed=False).transform(lambda x: x.max() - x.min()).reset_index()
 )
 mean_diff_overall = mean_diff_overall[mean_diff_overall[self.estimate._treatment_name[0]] == True] # TODO
-stddev_overall = df_long.groupby(["common_cause_id"]).agg(stddev=("W", np.std)).reset_index()
+stddev_overall = df_long.groupby(["common_cause_id"], observed=False).agg(stddev=("W", np.std)).reset_index()
 mean_diff_overall = pd.merge(mean_diff_overall, stddev_overall, on=["common_cause_id"])
 mean_diff_overall["std_mean_diff"] = mean_diff_overall["mean_w"] / mean_diff_overall["stddev"]

@@ -86,7 +86,7 @@

 plt.style.use("seaborn-white")
 fig, ax = plt.subplots(1, 1)
-for label, subdf in plot_df.groupby("common_cause_id"):
+for label, subdf in plot_df.groupby("common_cause_id", observed=False):
 subdf.plot(kind="line", x="sample", y="std_mean_diff", ax=ax, label=label)
 plt.legend(title="Common causes")
 plt.ylabel("Standardized mean difference between treatment and control")
6 changes: 3 additions & 3 deletions tests/do_sampler/test_pandas_do_api.py
@@ -200,7 +200,7 @@ def test_pandas_api_with_full_specification_of_type(self, N, variable_types):

 data["df"].causal.do(
 x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
-).groupby("v0").mean()
+).groupby("v0", observed=False).mean()
 assert True

 @mark.parametrize(
@@ -216,7 +216,7 @@ def test_pandas_api_with_partial_specification_of_type(self, N, variable_types):

 data["df"].causal.do(
 x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
-).groupby("v0").mean()
+).groupby("v0", observed=False).mean()
 assert True

 @mark.parametrize(
@@ -232,7 +232,7 @@ def test_pandas_api_with_no_specification_of_type(self, N, variable_types):

 data["df"].causal.do(
 x="v0", variable_types=variable_types, outcome="y", proceed_when_unidentifiable=True, common_causes=["W0"]
-).groupby("v0").mean()
+).groupby("v0", observed=False).mean()
 assert True

 @mark.parametrize(
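If regressions were a concern, a test could escalate the warning to an error. A hypothetical guard, not part of this PR:

    import warnings

    import pandas as pd

    def assert_no_groupby_future_warning(df: pd.DataFrame, key: str) -> None:
        # Turn FutureWarning into an error so a groupby call that still
        # relies on the deprecated observed default fails loudly.
        with warnings.catch_warnings():
            warnings.simplefilter("error", FutureWarning)
            df.groupby(key, observed=False).mean(numeric_only=True)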