From e090c212095f05666633531fd4c30915bb7b5a51 Mon Sep 17 00:00:00 2001
From: Andy Wong <andy.wong@strong.io>
Date: Tue, 18 Jun 2024 15:58:49 -0600
Subject: [PATCH 1/6] sklearn 1.x - change argname

---
 tests/preprocessing/sklearn/test_dataframe_transformer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/preprocessing/sklearn/test_dataframe_transformer.py b/tests/preprocessing/sklearn/test_dataframe_transformer.py
index f285379..25b1e7a 100644
--- a/tests/preprocessing/sklearn/test_dataframe_transformer.py
+++ b/tests/preprocessing/sklearn/test_dataframe_transformer.py
@@ -19,7 +19,7 @@ class TestDataFrameTransformer:
             (np.zeros((3, 2)), pd.DataFrame(np.zeros((3, 2)))),
             # convert sparse:
             (
-                    OneHotEncoder(sparse=True).fit_transform([['a'], ['b'], ['c'], ['d']]),
+                    OneHotEncoder(sparse_output=True).fit_transform([['a'], ['b'], ['c'], ['d']]),
                     pd.DataFrame(np.eye(4))
             )
         ]

From fd6326c38b19b3fa4c1044998aff1686c01479db Mon Sep 17 00:00:00 2001
From: Andy Wong <andy.wong@strong.io>
Date: Tue, 18 Jun 2024 15:59:40 -0600
Subject: [PATCH 2/6] fix typehinting and imports

---
 foundry/evaluation/marginal_effects.py    |  2 +-
 tests/evaluation/test_marginal_effects.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py
index b9f36c7..62f022a 100644
--- a/foundry/evaluation/marginal_effects.py
+++ b/foundry/evaluation/marginal_effects.py
@@ -15,7 +15,7 @@
 
 
 class Binned:
-    def __init__(self, col: str, bins: Union[int, Sequence] = 20, **kwargs):
+    def __init__(self, col: str, bins: Union[None, int, Sequence] = 20, **kwargs):
         """
         This class creates an object which can bin a pandas.Series.
         ```
diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py
index d80fc93..6084bd3 100644
--- a/tests/evaluation/test_marginal_effects.py
+++ b/tests/evaluation/test_marginal_effects.py
@@ -1,13 +1,15 @@
 from typing import Callable
-import pandas as pd
+from unittest.mock import create_autospec
+
 import numpy as np
+import pandas as pd
 import pytest
-from unittest.mock import create_autospec
-from pandas.testing import assert_series_equal
+from foundry.evaluation.marginal_effects import (Binned, MarginalEffects,
+                                                 binned, raw)
+from pandas.testing import assert_frame_equal, assert_series_equal
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 
-from foundry.evaluation.marginal_effects import Binned, MarginalEffects, binned, raw
 
 class TestBinned():
     @pytest.mark.parametrize(
@@ -72,7 +74,7 @@ def test_binned_init(self, bins):
                 )
             ),
             (
-                False,
+                None,
                 pd.Series(list(range(20)), name="my_feature")
             )
         ],

From 3cee20eefe852fcb880dd7516dcd673ecd37bb2a Mon Sep 17 00:00:00 2001
From: Andy Wong <andy.wong@strong.io>
Date: Tue, 18 Jun 2024 16:00:12 -0600
Subject: [PATCH 3/6] add test for _get_binned_feature_map

---
 tests/evaluation/test_marginal_effects.py | 57 +++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py
index 6084bd3..8ee6859 100644
--- a/tests/evaluation/test_marginal_effects.py
+++ b/tests/evaluation/test_marginal_effects.py
@@ -135,3 +135,60 @@ def test_feature_names_in(self, col_transformer__columns, expected):
 
         assert isinstance(me.feature_names_in, list)
         assert list(sorted(expected)) == list(sorted(me.feature_names_in))
+
+    binned_col_A = pd.Series(  # expected "binnedA" column shared by the test__get_binned_feature_map cases
+        [
+            pd.Interval(0.999, 2.0),
+            pd.Interval(2.0, 3.0),
+        ],
+        dtype=pd.CategoricalDtype(
+            categories=[
+                pd.Interval(0.999, 2.0),
+                pd.Interval(2.0, 3.0)
+            ],
+            ordered=True
+        ),
+        name="binnedA"
+    )
+
+    @pytest.mark.parametrize(
+        argnames=["aggfun", "expected"],
+        argvalues=[
+            (
+                "mid",  # special-cased aggfun: expect the midpoint of each interval
+                pd.DataFrame({"binnedA": binned_col_A, "colA": [1.4995, 2.5]})
+            ),
+            (
+                "min",  # aggregation passed by name
+                pd.DataFrame({"binnedA": binned_col_A, "colA": [1, 3]})
+            ),
+            (
+                np.median,  # aggregation passed as a callable
+                pd.DataFrame({"binnedA": binned_col_A, "colA": [1.5, 3.0]})
+            ),
+        ]
+    )
+    def test__get_binned_feature_map(self, aggfun, expected):
+        df = (
+            self.x_data  # NOTE(review): defined outside this hunk; presumably 3 rows with colA == [1, 2, 3] -- confirm
+            .assign(
+                **{
+                    "binnedA": [
+                        pd.Interval(0.999, 2.0),
+                        pd.Interval(0.999, 2.0),
+                        pd.Interval(2.0, 3.0),
+                    ],
+                },
+            )
+            .astype({"binnedA": self.binned_col_A.dtype})  # same ordered categorical dtype as the expected column
+        )
+
+        test = MarginalEffects._get_binned_feature_map(  # invoked on the class, no instance needed
+            df,
+            "binnedA",
+            "colA",
+            aggfun=aggfun,
+        )
+
+        print(test.dtypes, expected.dtypes)
+        assert_frame_equal(test, expected)

From 344f459114880c710a6449c0829427b20d117205 Mon Sep 17 00:00:00 2001
From: Andy Wong <andy.wong@strong.io>
Date: Tue, 18 Jun 2024 16:30:00 -0600
Subject: [PATCH 4/6] empty bin test

---
 tests/evaluation/test_marginal_effects.py | 29 +++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py
index 8ee6859..9cd1da1 100644
--- a/tests/evaluation/test_marginal_effects.py
+++ b/tests/evaluation/test_marginal_effects.py
@@ -192,3 +192,32 @@ def test__get_binned_feature_map(self, aggfun, expected):
 
         print(test.dtypes, expected.dtypes)
         assert_frame_equal(test, expected)
+
+    def test__get_binned_feature_map_empty_bins(self):  # an unobserved bin with an infinite edge must be rejected
+        df = (
+            self.x_data
+            .assign(
+                **{
+                    "binnedA": pd.Categorical(
+                        [
+                            pd.Interval(0.999, 2.0),
+                            pd.Interval(0.999, 2.0),
+                            pd.Interval(2.0, 3.0),
+                        ],
+                        categories=[
+                            pd.Interval(-np.inf, 0.999),  # declared category that none of the values above fall in
+                            pd.Interval(0.999, 2.0),
+                            pd.Interval(2.0, 3.0)
+                        ],
+                    )
+                },
+            )
+        )
+
+        with pytest.raises(ValueError):  # the empty bin's midpoint is infinite, so no valid value can be mapped
+            MarginalEffects._get_binned_feature_map(
+                df,
+                "binnedA",
+                "colA",
+                "median",
+            )

From 0de4dd2fe3ced03fa8096d62ce99d59245d41f6f Mon Sep 17 00:00:00 2001
From: Andy Wong <andy.wong@strong.io>
Date: Tue, 18 Jun 2024 16:49:33 -0600
Subject: [PATCH 5/6] refactor function

---
 foundry/evaluation/marginal_effects.py | 57 ++++++++++++++++++--------
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py
index 62f022a..4e9dbe7 100644
--- a/foundry/evaluation/marginal_effects.py
+++ b/foundry/evaluation/marginal_effects.py
@@ -504,26 +504,51 @@ def _get_binned_feature_map(X: pd.DataFrame,
                                 aggfun: Union[str, Callable]) -> pd.DataFrame:
         """
         Get a dataframe that maps the binned version of a feature to the aggregates of its original values.
+
+        :param X: A dataframe which contains the columns binned_fname and fname
+        :param binned_fname: The column name of the binned data
+        :param fname: The column name of the unbinned data
+        :param aggfun: the aggregation of X[fname] based on grouping by binned_fname. The special case of 'mid' will use
+        the midpoint of the bins in X[binned_fname]
+
+        :returns: a pd.DataFrame with columns [binned_fname, fname]. The returned[fname] will contain the aggregated values.
+        :raises ValueError: if fname and binned_fname are the same
+        :raises ValueError: if the resulting aggregated values contain inf or NaN.
         """
-        assert binned_fname != fname
+        if binned_fname == fname:
+            raise ValueError("binned_fname and fname cannot be the same column.")
 
         if aggfun == 'mid':
-            # creates a df with unique values of `binned_fname` and `nans` for `fname`.
-            # this will then get filled with the midpoint below:
-            # todo: less hacky way to do this
-            df_mapping = X.groupby(binned_fname, observed=False)[fname].agg('count').reset_index()
-            df_mapping[fname] = float('nan')
-        else:
-            df_mapping = X.groupby(binned_fname, observed=False)[fname].agg(aggfun).reset_index()
-
-        # for any bins that aren't actually observed, use the midpoint:
-        midpoints = pd.Series([x.mid for x in df_mapping[binned_fname]])
-        if np.isinf(midpoints).any() and df_mapping[fname].isnull().any():
-            raise ValueError(
-                f"[{fname}] `inf` bin cuts cannot be used when no data present in the bin:"
-                f"{df_mapping[binned_fname][np.isinf(midpoints)]}"
+            aggfun = lambda series: series.name.mid
+
+        df_mapping = (
+            X
+            .groupby(
+                binned_fname,
+                group_keys=True,
+                observed=False
             )
-        df_mapping[fname].fillna(midpoints, inplace=True)
+            [fname]
+            .apply(aggfun)
+            .reset_index()
+            .assign(**{
+                fname: lambda df: (
+                    df
+                    [fname]
+                    .fillna(
+                        df
+                        [binned_fname]
+                        .map(lambda interval: interval.mid)
+                        .astype(float)
+                    )
+                )
+            })
+        )
+
+        # `mode.use_inf_as_na` is deprecated (pandas 2.1) -- treat +/-inf as missing explicitly instead:
+        if df_mapping[fname].replace([np.inf, -np.inf], np.nan).isna().any():
+            raise ValueError(f"aggfun resulted in invalid values: \n {df_mapping}")
+
         return df_mapping
 
     def _get_df_novary(self,

From b18b294d064ee6e0c5218ce9998cb75d812aeb01 Mon Sep 17 00:00:00 2001
From: Andy Wong <andy.wong@strong.io>
Date: Tue, 18 Jun 2024 16:54:43 -0600
Subject: [PATCH 6/6] cleanup

---
 foundry/evaluation/marginal_effects.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py
index 4e9dbe7..c9bf5d3 100644
--- a/foundry/evaluation/marginal_effects.py
+++ b/foundry/evaluation/marginal_effects.py
@@ -509,7 +509,8 @@ def _get_binned_feature_map(X: pd.DataFrame,
         :param binned_fname: The column name of the binned data
         :param fname: The column name of the unbinned data
         :param aggfun: the aggregation of X[fname] based on grouping by binned_fname. The special case of 'mid' will use
-        the midpoint of the bins in X[binned_fname]
+        the midpoint of the bins in X[binned_fname]. If a bin contains no observed values to aggregate, the midpoint of
+        that bin is used instead.
 
         :returns: a pd.DataFrame with columns [binned_fname, fname]. The returned[fname] will contain the aggregated values.
         :raises ValueError: if fname and binned_fname are the same
@@ -526,11 +527,11 @@ def _get_binned_feature_map(X: pd.DataFrame,
             .groupby(
                 binned_fname,
                 group_keys=True,
-                observed=False
+                observed=False,
+                as_index=False,
             )
             [fname]
             .apply(aggfun)
-            .reset_index()
             .assign(**{
                 fname: lambda df: (
                     df