diff --git a/sklearn/_config.py b/sklearn/_config.py
index e4c398c9c5444..fca0e92c24f74 100644
--- a/sklearn/_config.py
+++ b/sklearn/_config.py
@@ -15,6 +15,7 @@
     "enable_cython_pairwise_dist": True,
     "array_api_dispatch": False,
     "transform_output": "default",
+    "preserve_output_dtypes": False,
 }
 _threadlocal = threading.local()
 
@@ -54,6 +55,7 @@ def set_config(
     enable_cython_pairwise_dist=None,
     array_api_dispatch=None,
     transform_output=None,
+    preserve_output_dtypes=None,
 ):
     """Set global scikit-learn configuration
 
@@ -134,6 +136,11 @@ def set_config(
 
         .. versionadded:: 1.2
 
+    preserve_output_dtypes : bool, default=None
+        Preserve input pandas dtypes when ``transform_output="pandas"`` and
+        transformers emit an unmodified subset or reordering of the input
+        columns. Global default: False.
+
     See Also
     --------
     config_context : Context manager for global scikit-learn configuration.
@@ -157,6 +164,8 @@ def set_config(
         local_config["array_api_dispatch"] = array_api_dispatch
     if transform_output is not None:
         local_config["transform_output"] = transform_output
+    if preserve_output_dtypes is not None:
+        local_config["preserve_output_dtypes"] = preserve_output_dtypes
 
 
 @contextmanager
@@ -170,6 +179,7 @@ def config_context(
     enable_cython_pairwise_dist=None,
     array_api_dispatch=None,
     transform_output=None,
+    preserve_output_dtypes=None,
 ):
     """Context manager for global scikit-learn configuration.
 
@@ -249,6 +259,12 @@ def config_context(
 
         .. versionadded:: 1.2
 
+    preserve_output_dtypes : bool, default=None
+        Preserve input pandas dtypes when ``transform_output="pandas"`` and
+        transformers emit an unmodified subset or reordering of the input
+        columns. If None, the existing value won't change. The default value
+        is False.
+
     Yields
     ------
     None.
@@ -286,6 +302,7 @@ def config_context(
         enable_cython_pairwise_dist=enable_cython_pairwise_dist,
         array_api_dispatch=array_api_dispatch,
         transform_output=transform_output,
+        preserve_output_dtypes=preserve_output_dtypes,
     )
 
     try:
diff --git a/sklearn/utils/_set_output.py b/sklearn/utils/_set_output.py
index 335773c6af96c..f0072e6d50556 100644
--- a/sklearn/utils/_set_output.py
+++ b/sklearn/utils/_set_output.py
@@ -1,5 +1,6 @@
 from functools import wraps
 
+import numpy as np
 from scipy.sparse import issparse
 
 from . import check_pandas_support
@@ -12,6 +13,8 @@ def _wrap_in_pandas_container(
     *,
     columns,
     index=None,
+    original_input=None,
+    preserve_dtypes=False,
 ):
     """Create a Pandas DataFrame.
 
@@ -36,6 +39,14 @@ def _wrap_in_pandas_container(
     index : array-like, default=None
         Index for data.
 
+    original_input : DataFrame, default=None
+        Original pandas input provided to the estimator.
+
+    preserve_dtypes : bool, default=False
+        Whether to attempt preserving the dtypes from ``original_input``
+        when the output is an unmodified subset or reordering of the input
+        columns.
+
     Returns
     -------
     dataframe : DataFrame
@@ -57,9 +68,36 @@ def _wrap_in_pandas_container(
             data_to_wrap.columns = columns
         if index is not None:
             data_to_wrap.index = index
-        return data_to_wrap
-
-    return pd.DataFrame(data_to_wrap, index=index, columns=columns)
+        df = data_to_wrap
+    else:
+        df = pd.DataFrame(data_to_wrap, index=index, columns=columns)
+
+    if preserve_dtypes and isinstance(original_input, pd.DataFrame):
+        # `df` already carries its final labels at this point, so they are the
+        # single source of truth for the columns to look up in the input.
+        resolved_columns = list(df.columns)
+        if all(col in original_input.columns for col in resolved_columns):
+            original_subset = original_input.loc[:, resolved_columns]
+            # Compare through object arrays so that NumPy and extension dtypes
+            # are handled alike. `na_value` is not passed to `to_numpy`: it is
+            # only accepted by `DataFrame.to_numpy` from pandas 1.1 on, and
+            # `pd.isna` already recognizes NaN, None, pd.NA and NaT.
+            left = df.to_numpy(dtype=object)
+            right = original_subset.to_numpy(dtype=object)
+            left_isna = pd.isna(left)
+            right_isna = pd.isna(right)
+            # The first check also rejects shape mismatches (e.g. duplicated
+            # labels fanned out by `.loc`) before the masked comparison runs.
+            if np.array_equal(left_isna, right_isna) and np.array_equal(
+                left[~left_isna], right[~left_isna]
+            ):
+                preserved = original_subset.copy()
+                preserved.columns = resolved_columns
+                # Keep the labels of the wrapped output, not of the input.
+                preserved.index = df.index
+                df = preserved
+
+    return df
 
 
 def _get_output_config(method, estimator=None):
@@ -81,19 +119,24 @@ def _get_output_config(method, estimator=None):
 
         - "dense": specifies the dense container for `method`. This can be
           `"default"` or `"pandas"`.
+        - "preserve_dtypes": whether to preserve pandas dtypes when wrapping
+          the output in a DataFrame.
     """
+    global_config = get_config()
     est_sklearn_output_config = getattr(estimator, "_sklearn_output_config", {})
     if method in est_sklearn_output_config:
         dense_config = est_sklearn_output_config[method]
     else:
-        dense_config = get_config()[f"{method}_output"]
+        dense_config = global_config[f"{method}_output"]
+
+    preserve_dtypes = global_config["preserve_output_dtypes"]
 
     if dense_config not in {"default", "pandas"}:
         raise ValueError(
             f"output config must be 'default' or 'pandas' got {dense_config}"
         )
 
-    return {"dense": dense_config}
+    return {"dense": dense_config, "preserve_dtypes": preserve_dtypes}
 
 
 def _wrap_data_with_container(method, data_to_wrap, original_input, estimator):
@@ -131,6 +174,8 @@ def _wrap_data_with_container(method, data_to_wrap, original_input, estimator):
         data_to_wrap=data_to_wrap,
         index=getattr(original_input, "index", None),
         columns=estimator.get_feature_names_out,
+        original_input=original_input,
+        preserve_dtypes=output_config["preserve_dtypes"],
     )
 
 
diff --git a/sklearn/utils/tests/test_set_output.py b/sklearn/utils/tests/test_set_output.py
index ac73ca09439ff..ee684e5044e94 100644
--- a/sklearn/utils/tests/test_set_output.py
+++ b/sklearn/utils/tests/test_set_output.py
@@ -5,6 +5,9 @@
 from numpy.testing import assert_array_equal
 
 from sklearn._config import config_context, get_config
+from sklearn.base import TransformerMixin
+from sklearn.feature_selection import SelectKBest
+from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
 from sklearn.utils._set_output import _wrap_in_pandas_container
 from sklearn.utils._set_output import _safe_set_output
 from sklearn.utils._set_output import _SetOutputMixin
@@ -260,3 +263,143 @@ class C(A, B):
         pass
 
     assert C().transform(None) == "B"
+
+
+def test_preserve_pandas_dtypes_subset_selector():
+    """Selecting unmodified input columns restores their pandas dtypes."""
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame(
+        {
+            "float16_col": pd.Series([5.1, 4.9, 5.0, 5.3], dtype=np.float16),
+            "categorical_col": pd.Categorical([0, 1, 0, 1], categories=[0, 1]),
+            "float64_col": [1.4, 1.5, 1.3, 1.6],
+        }
+    )
+    y = np.array([0, 1, 0, 1])
+
+    def constant_scores(X_arr, y_arr):
+        scores = np.array([3.0, 2.0, 1.0])
+        pvalues = np.zeros_like(scores)
+        return scores, pvalues
+
+    selector = SelectKBest(score_func=constant_scores, k=2).set_output(
+        transform="pandas"
+    )
+
+    with config_context(transform_output="pandas", preserve_output_dtypes=True):
+        X_selected = selector.fit_transform(X, y)
+
+    expected_columns = ["float16_col", "categorical_col"]
+    assert list(X_selected.columns) == expected_columns
+    pd.testing.assert_series_equal(
+        X_selected.dtypes, X[expected_columns].dtypes, check_names=False
+    )
+
+
+def test_preserve_dtypes_skips_when_values_change():
+    """Output whose values differ from the input keeps its own dtype."""
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame({"float16_col": pd.Series([1.0, 2.0, 3.0], dtype=np.float16)})
+
+    def double_and_upcast(df):
+        data = df.to_numpy(dtype=np.float64) * 2
+        return pd.DataFrame(data, index=df.index, columns=df.columns)
+
+    transformer = FunctionTransformer(
+        double_and_upcast, feature_names_out="one-to-one"
+    ).set_output(transform="pandas")
+
+    with config_context(transform_output="pandas", preserve_output_dtypes=True):
+        X_trans = transformer.fit_transform(X)
+
+    assert X_trans.dtypes.iloc[0] == np.dtype(np.float64)
+
+
+def test_preserve_dtypes_skips_when_new_columns_added():
+    """One-hot encoded columns do not pick up the input categorical dtype."""
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame({"color": pd.Categorical(["red", "blue", "red"])})
+
+    encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
+
+    with config_context(transform_output="pandas", preserve_output_dtypes=True):
+        X_encoded = encoder.fit_transform(X)
+
+    assert "color" not in X_encoded.columns
+    assert not X_encoded.dtypes.eq(X["color"].dtype).any()
+
+
+class DuplicateSelector(TransformerMixin):
+    """Select columns by name; a repeated name picks its next occurrence."""
+
+    def __init__(self, columns):
+        self.columns = list(columns)
+
+    def fit(self, X, y=None):
+        name_counts = {}
+        positions = []
+        for column in self.columns:
+            offset = name_counts.get(column, 0)
+            matches = [idx for idx, name in enumerate(X.columns) if name == column]
+            if offset >= len(matches):
+                raise ValueError(f"Column {column!r} not available enough times")
+            positions.append(matches[offset])
+            name_counts[column] = offset + 1
+        self._positions = positions
+        return self
+
+    def transform(self, X, y=None):
+        # Returns a DataFrame slice, so the input dtypes are already intact here.
+        return X.iloc[:, self._positions]
+
+    def get_feature_names_out(self, input_features=None):
+        return np.asarray(self.columns, dtype=object)
+
+
+def test_preserve_dtypes_with_duplicate_columns():
+    """Duplicated column labels leave already-typed pandas output untouched."""
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame(
+        {
+            "first": pd.Series([1, 2, 3], dtype=np.int64),
+            "second": pd.Series([1.5, 2.5, 3.5], dtype=np.float32),
+        }
+    )
+    X.columns = ["feature", "feature"]
+
+    selector = DuplicateSelector(["feature", "feature"]).set_output(transform="pandas")
+
+    with config_context(transform_output="pandas", preserve_output_dtypes=True):
+        X_selected = selector.fit(X).transform(X)
+
+    pd.testing.assert_series_equal(X_selected.dtypes, X.dtypes, check_names=False)
+
+
+def test_preserve_dtypes_with_nans():
+    """Missing values in nullable columns do not break dtype preservation."""
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame(
+        {
+            "float_nan": pd.Series([1.0, np.nan, 3.0], dtype="Float32"),
+            "int_nan": pd.Series([1, None, 3], dtype="Int64"),
+            "extra": pd.Series([0.5, 0.6, 0.7], dtype=np.float64),
+        }
+    )
+
+    selector = DuplicateSelector(["float_nan", "int_nan"]).set_output(
+        transform="pandas"
+    )
+
+    with config_context(transform_output="pandas", preserve_output_dtypes=True):
+        X_selected = selector.fit(X).transform(X)
+
+    expected = X[["float_nan", "int_nan"]]
+    pd.testing.assert_series_equal(
+        X_selected.dtypes, expected.dtypes, check_names=False
+    )
+    pd.testing.assert_frame_equal(X_selected, expected)