diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 1d918bc0c4643..0a1d26911ec91 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -112,10 +112,27 @@ class IterativeImputer(_BaseImputer): imputed target feature. Can provide significant speed-up when the number of features is huge. If `None`, all features will be used. - initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \ + initial_strategy : {'mean', 'median', 'most_frequent', 'constant'} or \ + estimator with ``fit``, ``transform``, ``get_params`` and \ + ``get_feature_names_out``, \ default='mean' - Which strategy to use to initialize the missing values. Same as the - `strategy` parameter in :class:`~sklearn.impute.SimpleImputer`. + Strategy or imputer used to initialize the missing values. When a + string is provided, it matches the `strategy` parameter of + :class:`~sklearn.impute.SimpleImputer`. When an imputer instance is + provided, it is cloned and used for the initialization step. The + instance must implement ``fit``, ``transform``, ``get_params`` and + ``get_feature_names_out``. In this case, the ``missing_values`` and + ``keep_empty_features`` values of this estimator are set on the clone + when the clone exposes those parameters, and ``fill_value`` is + ignored. + + fill_value : object, default=None + When ``initial_strategy='constant'``, the value used to replace + missing values prior to the iterative imputation rounds. Mirrors the + ``fill_value`` parameter of :class:`~sklearn.impute.SimpleImputer`. + Either a scalar value or an array of shape ``(n_features,)`` can be + provided. ``np.nan`` is accepted. Ignored when ``initial_strategy`` is + an imputer instance. 
imputation_order : {'ascending', 'descending', 'roman', 'arabic', \ 'random'}, default='ascending' @@ -181,8 +198,10 @@ class IterativeImputer(_BaseImputer): Attributes ---------- - initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer` - Imputer used to initialize the missing values. + initial_imputer_ : estimator + Imputer used to initialize the missing values. It is a + :class:`~sklearn.impute.SimpleImputer` when ``initial_strategy`` is a + string, otherwise it is a clone of the provided imputer instance. imputation_sequence_ : list of tuples Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where @@ -279,8 +298,15 @@ class IterativeImputer(_BaseImputer): "tol": [Interval(Real, 0, None, closed="left")], "n_nearest_features": [None, Interval(Integral, 1, None, closed="left")], "initial_strategy": [ - StrOptions({"mean", "median", "most_frequent", "constant"}) + StrOptions({"mean", "median", "most_frequent", "constant"}), + HasMethods([ + "fit", + "transform", + "get_params", + "get_feature_names_out", + ]), ], + "fill_value": "no_validation", "imputation_order": [ StrOptions({"ascending", "descending", "roman", "arabic", "random"}) ], @@ -301,6 +327,7 @@ def __init__( tol=1e-3, n_nearest_features=None, initial_strategy="mean", + fill_value=None, imputation_order="ascending", skip_complete=False, min_value=-np.inf, @@ -322,6 +349,7 @@ def __init__( self.tol = tol self.n_nearest_features = n_nearest_features self.initial_strategy = initial_strategy + self.fill_value = fill_value self.imputation_order = imputation_order self.skip_complete = skip_complete self.min_value = min_value @@ -610,11 +638,31 @@ def _initial_imputation(self, X, in_fit=False): X_missing_mask = _get_mask(X, self.missing_values) mask_missing_values = X_missing_mask.copy() if self.initial_imputer_ is None: - self.initial_imputer_ = SimpleImputer( - missing_values=self.missing_values, - strategy=self.initial_strategy, - keep_empty_features=self.keep_empty_features, - ) + 
if isinstance(self.initial_strategy, str): + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy, + fill_value=self.fill_value, + keep_empty_features=self.keep_empty_features, + ) + else: + self.initial_imputer_ = clone(self.initial_strategy) + init_params = self.initial_imputer_.get_params(deep=False) + params_to_set = {} + if "missing_values" in init_params: + params_to_set["missing_values"] = self.missing_values + if "keep_empty_features" in init_params: + params_to_set["keep_empty_features"] = ( + self.keep_empty_features + ) + if params_to_set: + self.initial_imputer_.set_params(**params_to_set) + if self.fill_value is not None: + warnings.warn( + "'fill_value' is ignored when ``initial_strategy`` is set " + "to an imputer instance.", + UserWarning, + ) X_filled = self.initial_imputer_.fit_transform(X) else: X_filled = self.initial_imputer_.transform(X) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 86553effafcbf..d99c68e7e6e8f 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -12,10 +12,12 @@ from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._param_validation import InvalidParameterError # make IterativeImputer available from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.base import clone from sklearn.datasets import load_diabetes from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer @@ -94,6 +96,86 @@ def test_imputation_shape(strategy): assert X_imputed.shape == (10, 2) +class _MinimalImputer: + def fit(self, X, y=None): + return self + + def transform(self, X): + return X + + def get_params(self, deep=True): + return {} + + +@pytest.mark.parametrize("fill_value", [7, np.nan]) 
+def test_iterative_imputer_constant_strategy_respects_fill_value(fill_value): + X = np.array([[np.nan, 1], [2, np.nan], [3, 4]], dtype=float) + + expected = SimpleImputer( + strategy="constant", fill_value=fill_value + ).fit_transform(X) + + imputer = IterativeImputer( + initial_strategy="constant", fill_value=fill_value, max_iter=0, random_state=0 + ) + transformed = imputer.fit_transform(X) + + assert_allclose(transformed, expected, equal_nan=True) + + +def test_iterative_imputer_constant_default_fill_value_matches_simple_imputer(): + X = np.array([[np.nan, 1], [2, np.nan], [3, 4]], dtype=float) + + expected = SimpleImputer(strategy="constant").fit_transform(X) + + imputer = IterativeImputer(initial_strategy="constant", max_iter=0, random_state=0) + transformed = imputer.fit_transform(X) + + assert_allclose(transformed, expected, equal_nan=True) + + +def test_iterative_imputer_accepts_imputer_instance_for_initial_strategy(): + X = np.array([[-1, 1, -1], [2, -1, 3], [-1, -1, -1]], dtype=float) + + seed_imputer = SimpleImputer(strategy="most_frequent") + expected = clone(seed_imputer).set_params( + missing_values=-1, keep_empty_features=True + ).fit_transform(X) + + imputer = IterativeImputer( + initial_strategy=seed_imputer, + max_iter=0, + missing_values=-1, + keep_empty_features=True, + random_state=0, + ) + + transformed = imputer.fit_transform(X) + + assert_allclose(transformed, expected, equal_nan=True) + + +def test_iterative_imputer_warns_when_fill_value_with_imputer_instance(): + X = np.array([[np.nan, 1], [2, np.nan]], dtype=float) + + seed_imputer = SimpleImputer(strategy="mean") + imputer = IterativeImputer( + initial_strategy=seed_imputer, fill_value=0, max_iter=0, random_state=0 + ) + + with pytest.warns(UserWarning, match="'fill_value' is ignored"): + imputer.fit_transform(X) + + +def test_iterative_imputer_requires_feature_names_out(): + X = np.array([[np.nan, 1], [2, np.nan]], dtype=float) + + imputer = 
IterativeImputer(initial_strategy=_MinimalImputer(), max_iter=0) + + with pytest.raises(InvalidParameterError): + imputer.fit(X) + + @pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) def test_imputation_deletion_warning(strategy): X = np.ones((3, 5))