Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions sklearn/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"enable_cython_pairwise_dist": True,
"array_api_dispatch": False,
"transform_output": "default",
"preserve_output_dtypes": False,
}
_threadlocal = threading.local()

Expand Down Expand Up @@ -54,6 +55,7 @@ def set_config(
enable_cython_pairwise_dist=None,
array_api_dispatch=None,
transform_output=None,
preserve_output_dtypes=None,
):
"""Set global scikit-learn configuration

Expand Down Expand Up @@ -134,6 +136,11 @@ def set_config(

.. versionadded:: 1.2

preserve_output_dtypes : bool, default=None
Preserve input pandas dtypes when ``transform_output="pandas"`` and
transformers emit an unmodified subset or reordering of the input
columns. Global default: False.

See Also
--------
config_context : Context manager for global scikit-learn configuration.
Expand All @@ -157,6 +164,8 @@ def set_config(
local_config["array_api_dispatch"] = array_api_dispatch
if transform_output is not None:
local_config["transform_output"] = transform_output
if preserve_output_dtypes is not None:
local_config["preserve_output_dtypes"] = preserve_output_dtypes


@contextmanager
Expand All @@ -170,6 +179,7 @@ def config_context(
enable_cython_pairwise_dist=None,
array_api_dispatch=None,
transform_output=None,
preserve_output_dtypes=None,
):
"""Context manager for global scikit-learn configuration.

Expand Down Expand Up @@ -249,6 +259,12 @@ def config_context(

.. versionadded:: 1.2

preserve_output_dtypes : bool, default=None
Preserve input pandas dtypes when ``transform_output="pandas"`` and
transformers emit an unmodified subset or reordering of the input
columns. If None, the existing value won't change. The default value
is False.

Yields
------
None.
Expand Down Expand Up @@ -286,6 +302,7 @@ def config_context(
enable_cython_pairwise_dist=enable_cython_pairwise_dist,
array_api_dispatch=array_api_dispatch,
transform_output=transform_output,
preserve_output_dtypes=preserve_output_dtypes,
)

try:
Expand Down
50 changes: 45 additions & 5 deletions sklearn/utils/_set_output.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from functools import wraps

import numpy as np
from scipy.sparse import issparse

from . import check_pandas_support
Expand All @@ -12,6 +13,8 @@ def _wrap_in_pandas_container(
*,
columns,
index=None,
original_input=None,
preserve_dtypes=False,
):
"""Create a Pandas DataFrame.

Expand All @@ -36,6 +39,14 @@ def _wrap_in_pandas_container(
index : array-like, default=None
Index for data.

original_input : DataFrame, default=None
Original pandas input provided to the estimator.

preserve_dtypes : bool, default=False
Whether to attempt preserving the dtypes from ``original_input``
when the output is an unmodified subset or reordering of the input
columns.

Returns
-------
dataframe : DataFrame
Expand All @@ -57,9 +68,31 @@ def _wrap_in_pandas_container(
data_to_wrap.columns = columns
if index is not None:
data_to_wrap.index = index
return data_to_wrap

return pd.DataFrame(data_to_wrap, index=index, columns=columns)
df = data_to_wrap
else:
df = pd.DataFrame(data_to_wrap, index=index, columns=columns)

if preserve_dtypes and isinstance(original_input, pd.DataFrame):
resolved_columns = df.columns if columns is None else columns
if resolved_columns is not None:
try:
resolved_columns = list(resolved_columns)
except TypeError:
resolved_columns = list(df.columns)

if all(col in original_input.columns for col in resolved_columns):
original_subset = original_input.loc[:, resolved_columns]
left = df.to_numpy(dtype=object, na_value=np.nan)
right = original_subset.to_numpy(dtype=object, na_value=np.nan)
left_isna = pd.isna(left)
right_isna = pd.isna(right)
if np.array_equal(left_isna, right_isna) and np.array_equal(
left[~left_isna], right[~left_isna]
):
df = original_subset.copy()
df.columns = resolved_columns

return df


def _get_output_config(method, estimator=None):
Expand All @@ -81,19 +114,24 @@ def _get_output_config(method, estimator=None):

- "dense": specifies the dense container for `method`. This can be
`"default"` or `"pandas"`.
- "preserve_dtypes": whether to preserve pandas dtypes when wrapping
the output in a DataFrame.
"""
global_config = get_config()
est_sklearn_output_config = getattr(estimator, "_sklearn_output_config", {})
if method in est_sklearn_output_config:
dense_config = est_sklearn_output_config[method]
else:
dense_config = get_config()[f"{method}_output"]
dense_config = global_config[f"{method}_output"]

preserve_dtypes = global_config["preserve_output_dtypes"]

if dense_config not in {"default", "pandas"}:
raise ValueError(
f"output config must be 'default' or 'pandas' got {dense_config}"
)

return {"dense": dense_config}
return {"dense": dense_config, "preserve_dtypes": preserve_dtypes}


def _wrap_data_with_container(method, data_to_wrap, original_input, estimator):
Expand Down Expand Up @@ -131,6 +169,8 @@ def _wrap_data_with_container(method, data_to_wrap, original_input, estimator):
data_to_wrap=data_to_wrap,
index=getattr(original_input, "index", None),
columns=estimator.get_feature_names_out,
original_input=original_input,
preserve_dtypes=output_config["preserve_dtypes"],
)


Expand Down
141 changes: 141 additions & 0 deletions sklearn/utils/tests/test_set_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from numpy.testing import assert_array_equal

from sklearn._config import config_context, get_config
from sklearn.base import TransformerMixin
from sklearn.feature_selection import SelectKBest, f_classif

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] Looks like `f_classif` is no longer used in this module—could you drop the import while you’re here?

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.utils._set_output import _wrap_in_pandas_container
from sklearn.utils._set_output import _safe_set_output
from sklearn.utils._set_output import _SetOutputMixin
Expand Down Expand Up @@ -260,3 +263,141 @@ class C(A, B):
pass

assert C().transform(None) == "B"


def test_preserve_pandas_dtypes_subset_selector():
    """SelectKBest output keeps the input pandas dtypes of the kept columns."""
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame(
        {
            "float16_col": pd.Series([5.1, 4.9, 5.0, 5.3], dtype=np.float16),
            "categorical_col": pd.Categorical([0, 1, 0, 1], categories=[0, 1]),
            "float64_col": [1.4, 1.5, 1.3, 1.6],
        }
    )
    y = np.array([0, 1, 0, 1])

    def fixed_ranking(features, targets):
        # Deterministic scores: column 0 ranked highest, column 2 lowest.
        scores = np.array([3.0, 2.0, 1.0])
        return scores, np.zeros_like(scores)

    selector = SelectKBest(score_func=fixed_ranking, k=2)
    selector.set_output(transform="pandas")

    with config_context(transform_output="pandas", preserve_output_dtypes=True):
        X_selected = selector.fit_transform(X, y)

    kept = ["float16_col", "categorical_col"]
    assert list(X_selected.columns) == kept
    pd.testing.assert_series_equal(
        X_selected.dtypes, X[kept].dtypes, check_names=False
    )


def test_preserve_dtypes_skips_when_values_change():
    """Dtype preservation is skipped when the transformer alters the values."""
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame(
        {"float16_col": pd.Series([1.0, 2.0, 3.0], dtype=np.float16)}
    )

    def scale_up(frame):
        # Casting to float64 and doubling changes both values and dtype
        # relative to the float16 input, so preservation must not kick in.
        doubled = frame.to_numpy(dtype=np.float64) * 2
        return pd.DataFrame(doubled, index=frame.index, columns=frame.columns)

    transformer = FunctionTransformer(scale_up, feature_names_out="one-to-one")
    transformer.set_output(transform="pandas")

    with config_context(transform_output="pandas", preserve_output_dtypes=True):
        X_trans = transformer.fit_transform(X)

    assert X_trans.dtypes.iloc[0] == np.dtype(np.float64)


def test_preserve_dtypes_skips_when_new_columns_added():
    """Dtype preservation is skipped when the output columns are new."""
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame({"color": pd.Categorical(["red", "blue", "red"])})

    encoder = OneHotEncoder(sparse_output=False)
    encoder.set_output(transform="pandas")

    with config_context(transform_output="pandas", preserve_output_dtypes=True):
        X_encoded = encoder.fit_transform(X)

    # One-hot encoding replaces the input column with indicator columns, so
    # neither the original name nor its categorical dtype should survive.
    assert "color" not in X_encoded.columns
    assert not X_encoded.dtypes.eq(X["color"].dtype).any()


class DuplicateSelector(TransformerMixin):
    """Select columns by name, honoring repeated names positionally.

    Each requested name consumes the next matching column position in ``X``,
    which lets the selector address DataFrames with duplicate column labels.
    """

    def __init__(self, columns):
        self.columns = list(columns)

    def fit(self, X, y=None):
        # Map each label to all of its positions once, then hand those
        # positions out in order as the label is requested repeatedly.
        occurrences = {}
        for idx, name in enumerate(X.columns):
            occurrences.setdefault(name, []).append(idx)

        consumed = {}
        positions = []
        for column in self.columns:
            used = consumed.get(column, 0)
            available = occurrences.get(column, [])
            if used >= len(available):
                raise ValueError(f"Column {column!r} not available enough times")
            positions.append(available[used])
            consumed[column] = used + 1
        self._positions = positions
        return self

    def transform(self, X, y=None):
        # Positional indexing keeps duplicate labels distinct.
        return X.iloc[:, self._positions]

    def get_feature_names_out(self, input_features=None):
        return np.asarray(self.columns, dtype=object)


def test_preserve_dtypes_with_duplicate_columns():
    """Duplicate column labels keep their respective dtypes after selection."""
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame(
        {
            "first": pd.Series([1, 2, 3], dtype=np.int64),
            "second": pd.Series([1.5, 2.5, 3.5], dtype=np.float32),
        }
    )
    # Give both columns the same label to exercise duplicate handling.
    X.columns = ["feature", "feature"]

    selector = DuplicateSelector(["feature", "feature"])
    selector.set_output(transform="pandas")

    with config_context(transform_output="pandas", preserve_output_dtypes=True):
        X_selected = selector.fit(X).transform(X)

    pd.testing.assert_series_equal(
        X_selected.dtypes, X.dtypes, check_names=False
    )


def test_preserve_dtypes_with_nans():
    """Nullable extension dtypes with missing values survive selection."""
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame(
        {
            "float_nan": pd.Series([1.0, np.nan, 3.0], dtype="Float32"),
            "int_nan": pd.Series([1, None, 3], dtype="Int64"),
            "extra": pd.Series([0.5, 0.6, 0.7], dtype=np.float64),
        }
    )

    selector = DuplicateSelector(["float_nan", "int_nan"])
    selector.set_output(transform="pandas")

    with config_context(transform_output="pandas", preserve_output_dtypes=True):
        X_selected = selector.fit(X).transform(X)

    expected = X[["float_nan", "int_nan"]]
    pd.testing.assert_series_equal(
        X_selected.dtypes, expected.dtypes, check_names=False
    )
    pd.testing.assert_frame_equal(X_selected, expected)