From e090c212095f05666633531fd4c30915bb7b5a51 Mon Sep 17 00:00:00 2001 From: Andy Wong Date: Tue, 18 Jun 2024 15:58:49 -0600 Subject: [PATCH 1/6] sklearn 1.x - change argname --- tests/preprocessing/sklearn/test_dataframe_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/preprocessing/sklearn/test_dataframe_transformer.py b/tests/preprocessing/sklearn/test_dataframe_transformer.py index f285379..25b1e7a 100644 --- a/tests/preprocessing/sklearn/test_dataframe_transformer.py +++ b/tests/preprocessing/sklearn/test_dataframe_transformer.py @@ -19,7 +19,7 @@ class TestDataFrameTransformer: (np.zeros((3, 2)), pd.DataFrame(np.zeros((3, 2)))), # convert sparse: ( - OneHotEncoder(sparse=True).fit_transform([['a'], ['b'], ['c'], ['d']]), + OneHotEncoder(sparse_output=True).fit_transform([['a'], ['b'], ['c'], ['d']]), pd.DataFrame(np.eye(4)) ) ] From fd6326c38b19b3fa4c1044998aff1686c01479db Mon Sep 17 00:00:00 2001 From: Andy Wong Date: Tue, 18 Jun 2024 15:59:40 -0600 Subject: [PATCH 2/6] fix typehinting and imports --- foundry/evaluation/marginal_effects.py | 2 +- tests/evaluation/test_marginal_effects.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py index b9f36c7..62f022a 100644 --- a/foundry/evaluation/marginal_effects.py +++ b/foundry/evaluation/marginal_effects.py @@ -15,7 +15,7 @@ class Binned: - def __init__(self, col: str, bins: Union[int, Sequence] = 20, **kwargs): + def __init__(self, col: str, bins: Union[None, int, Sequence] = 20, **kwargs): """ This class creates an object which can bin a pandas.Series. ``` diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py index d80fc93..6084bd3 100644 --- a/tests/evaluation/test_marginal_effects.py +++ b/tests/evaluation/test_marginal_effects.py @@ -1,13 +1,15 @@ from typing import Callable -import pandas as pd +from unittest.mock import create_autospec + import numpy as np +import pandas as pd import pytest -from unittest.mock import create_autospec -from pandas.testing import assert_series_equal +from foundry.evaluation.marginal_effects import (Binned, MarginalEffects, + binned, raw) +from pandas.testing import assert_frame_equal, assert_series_equal from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline -from foundry.evaluation.marginal_effects import Binned, MarginalEffects, binned, raw class TestBinned(): @pytest.mark.parametrize( @@ -72,7 +74,7 @@ def test_binned_init(self, bins): ) ), ( - False, + None, pd.Series(list(range(20)), name="my_feature") ) ], From 3cee20eefe852fcb880dd7516dcd673ecd37bb2a Mon Sep 17 00:00:00 2001 From: Andy Wong Date: Tue, 18 Jun 2024 16:00:12 -0600 Subject: [PATCH 3/6] add test for _get_binned_feature_map --- tests/evaluation/test_marginal_effects.py | 57 +++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py index 6084bd3..8ee6859 100644 --- a/tests/evaluation/test_marginal_effects.py +++ b/tests/evaluation/test_marginal_effects.py @@ -135,3 +135,60 @@ def test_feature_names_in(self, col_transformer__columns, expected): assert isinstance(me.feature_names_in, list) assert list(sorted(expected)) == list(sorted(me.feature_names_in)) + + binned_col_A = pd.Series( + [ + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0), + ], + dtype=pd.CategoricalDtype( + categories=[ + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0) + ], + ordered=True + ), + name="binnedA" + ) + + @pytest.mark.parametrize( + argnames=["aggfun", "expected"], + argvalues=[ + ( + "mid", + pd.DataFrame({"binnedA": binned_col_A, "colA": [1.4995, 2.5]}) + ), + ( + "min", + pd.DataFrame({"binnedA": binned_col_A, "colA": [1, 3]}) + ), + ( + np.median, + pd.DataFrame({"binnedA": binned_col_A, "colA": [1.5, 3.0]}) + ), + ] + ) + def test__get_binned_feature_map(self, aggfun, expected): + df = ( + self.x_data + .assign( + **{ + "binnedA": [ + pd.Interval(0.999, 2.0), + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0), + ], + }, + ) + .astype({"binnedA": self.binned_col_A.dtype}) + ) + + test = MarginalEffects._get_binned_feature_map( + df, + "binnedA", + "colA", + aggfun=aggfun, + ) + + print(test.dtypes, expected.dtypes) + assert_frame_equal(test, expected) From 344f459114880c710a6449c0829427b20d117205 Mon Sep 17 00:00:00 2001 From: Andy Wong Date: Tue, 18 Jun 2024 16:30:00 -0600 Subject: [PATCH 4/6] empty bin test --- tests/evaluation/test_marginal_effects.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/evaluation/test_marginal_effects.py b/tests/evaluation/test_marginal_effects.py index 8ee6859..9cd1da1 100644 --- a/tests/evaluation/test_marginal_effects.py +++ b/tests/evaluation/test_marginal_effects.py @@ -192,3 +192,32 @@ def test__get_binned_feature_map(self, aggfun, expected): print(test.dtypes, expected.dtypes) assert_frame_equal(test, expected) + + def test__get_binned_feature_map_empty_bins(self): + df = ( + self.x_data + .assign( + **{ + "binnedA": pd.Categorical( + [ + pd.Interval(0.999, 2.0), + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0), + ], + categories=[ + pd.Interval(-np.inf, 0.999), + pd.Interval(0.999, 2.0), + pd.Interval(2.0, 3.0) + ], + ) + }, + ) + ) + + with pytest.raises(ValueError): + MarginalEffects._get_binned_feature_map( + df, + "binnedA", + "colA", + "median", + ) From 0de4dd2fe3ced03fa8096d62ce99d59245d41f6f Mon Sep 17 00:00:00 2001 From: Andy Wong Date: Tue, 18 Jun 2024 16:49:33 -0600 Subject: [PATCH 5/6] refactor function --- foundry/evaluation/marginal_effects.py | 57 ++++++++++++++++++-------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py index 62f022a..4e9dbe7 100644 --- a/foundry/evaluation/marginal_effects.py +++ b/foundry/evaluation/marginal_effects.py @@ -504,26 +504,51 @@ def _get_binned_feature_map(X: pd.DataFrame, aggfun: Union[str, Callable]) -> pd.DataFrame: """ Get a dataframe that maps the binned version of a feature to the aggregates of its original values. + + :param X: A dataframe which contains the columns binned_fname and fname + :param binned_fname: The column name of the binned data + :param fname: The column name of the unbinned data + :param aggfun: the aggregation of X[fname] based on grouping by binned_fname. The special case of 'mid' will use + the midpoint of the bins in X[binned_fname] + + :returns: a pd.DataFrame with columns [binned_fname, fname]. The returned[fname] will contain the aggregated values. + :raises ValueError: if fname and binned_fname are the same + :raises ValueError: if there are inf or na in the resulting aggregated values. """ - assert binned_fname != fname + if binned_fname == fname: + raise ValueError("binned_fname and fname cannot be the same column.") if aggfun == 'mid': - # creates a df with unique values of `binned_fname` and `nans` for `fname`. - # this will then get filled with the midpoint below: - # todo: less hacky way to do this - df_mapping = X.groupby(binned_fname, observed=False)[fname].agg('count').reset_index() - df_mapping[fname] = float('nan') - else: - df_mapping = X.groupby(binned_fname, observed=False)[fname].agg(aggfun).reset_index() - - # for any bins that aren't actually observed, use the midpoint: - midpoints = pd.Series([x.mid for x in df_mapping[binned_fname]]) - if np.isinf(midpoints).any() and df_mapping[fname].isnull().any(): - raise ValueError( - f"[{fname}] `inf` bin cuts cannot be used when no data present in the bin:" - f"{df_mapping[binned_fname][np.isinf(midpoints)]}" + aggfun = lambda series: series.name.mid + + df_mapping = ( + X + .groupby( + binned_fname, + group_keys=True, + observed=False ) - df_mapping[fname].fillna(midpoints, inplace=True) + [fname] + .apply(aggfun) + .reset_index() + .assign(**{ + fname: lambda df: ( + df + [fname] + .fillna( + df + [binned_fname] + .map(lambda interval: interval.mid) + .astype(float) + ) + ) + }) + ) + + with pd.option_context("mode.use_inf_as_na", True): + if df_mapping[fname].isna().any(): + raise ValueError(f"aggfun resulted in invalid values: \n {df_mapping}") + return df_mapping def _get_df_novary(self, From b18b294d064ee6e0c5218ce9998cb75d812aeb01 Mon Sep 17 00:00:00 2001 From: Andy Wong Date: Tue, 18 Jun 2024 16:54:43 -0600 Subject: [PATCH 6/6] cleanup --- foundry/evaluation/marginal_effects.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/foundry/evaluation/marginal_effects.py b/foundry/evaluation/marginal_effects.py index 4e9dbe7..c9bf5d3 100644 --- a/foundry/evaluation/marginal_effects.py +++ b/foundry/evaluation/marginal_effects.py @@ -509,7 +509,8 @@ def _get_binned_feature_map(X: pd.DataFrame, :param binned_fname: The column name of the binned data :param fname: The column name of the unbinned data :param aggfun: the aggregation of X[fname] based on grouping by binned_fname. The special case of 'mid' will use - the midpoint of the bins in X[binned_fname] + the midpoint of the bins in X[binned_fname]. In the case that there are no actual values in a bin to aggregate, the midpoint + of the bin will be used. :returns: a pd.DataFrame with columns [binned_fname, fname]. The returned[fname] will contain the aggregated values. :raises ValueError: if fname and binned_fname are the same @@ -526,11 +527,11 @@ def _get_binned_feature_map(X: pd.DataFrame, .groupby( binned_fname, group_keys=True, - observed=False + observed=False, + as_index=False, ) [fname] .apply(aggfun) - .reset_index() .assign(**{ fname: lambda df: ( df