Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 59 additions & 11 deletions sklearn/impute/_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,27 @@ class IterativeImputer(_BaseImputer):
imputed target feature. Can provide significant speed-up when the
number of features is huge. If `None`, all features will be used.

initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \
initial_strategy : {'mean', 'median', 'most_frequent', 'constant'} or \
estimator with ``fit``, ``transform``, ``get_params`` and \
``get_feature_names_out``, \
default='mean'
Which strategy to use to initialize the missing values. Same as the
`strategy` parameter in :class:`~sklearn.impute.SimpleImputer`.
Strategy or imputer used to initialize the missing values. When a
string is provided, it matches the `strategy` parameter of
:class:`~sklearn.impute.SimpleImputer`. When an imputer instance is
provided, it is cloned and used for the initialization step. The
instance must implement ``fit``, ``transform``, ``get_params`` and
``get_feature_names_out``. In this case, this estimator's
``missing_values`` and ``keep_empty_features`` are copied onto the
clone whenever it exposes parameters with those names, and
``fill_value`` is ignored.

fill_value : object, default=None
When ``initial_strategy='constant'``, the value used to replace
missing values prior to the iterative imputation rounds. Mirrors the
``fill_value`` parameter of :class:`~sklearn.impute.SimpleImputer`.
Either a scalar value or an array of shape ``(n_features,)`` can be
provided. ``np.nan`` is accepted. Ignored when ``initial_strategy`` is
an imputer instance; in that case a ``UserWarning`` is issued if
``fill_value`` is not ``None``.

imputation_order : {'ascending', 'descending', 'roman', 'arabic', \
'random'}, default='ascending'
Expand Down Expand Up @@ -181,8 +198,10 @@ class IterativeImputer(_BaseImputer):

Attributes
----------
initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer`
Imputer used to initialize the missing values.
initial_imputer_ : estimator
Imputer used to initialize the missing values. It is a
:class:`~sklearn.impute.SimpleImputer` when ``initial_strategy`` is a
string, otherwise it is a clone of the provided imputer instance.

imputation_sequence_ : list of tuples
Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where
Expand Down Expand Up @@ -279,8 +298,15 @@ class IterativeImputer(_BaseImputer):
"tol": [Interval(Real, 0, None, closed="left")],
"n_nearest_features": [None, Interval(Integral, 1, None, closed="left")],
"initial_strategy": [
StrOptions({"mean", "median", "most_frequent", "constant"})
StrOptions({"mean", "median", "most_frequent", "constant"}),
HasMethods([
"fit",
"transform",
"get_params",
"get_feature_names_out",
]),
],
"fill_value": "no_validation",
"imputation_order": [
StrOptions({"ascending", "descending", "roman", "arabic", "random"})
],
Expand All @@ -301,6 +327,7 @@ def __init__(
tol=1e-3,
n_nearest_features=None,
initial_strategy="mean",
fill_value=None,
imputation_order="ascending",
skip_complete=False,
min_value=-np.inf,
Expand All @@ -322,6 +349,7 @@ def __init__(
self.tol = tol
self.n_nearest_features = n_nearest_features
self.initial_strategy = initial_strategy
self.fill_value = fill_value
self.imputation_order = imputation_order
self.skip_complete = skip_complete
self.min_value = min_value
Expand Down Expand Up @@ -610,11 +638,31 @@ def _initial_imputation(self, X, in_fit=False):
X_missing_mask = _get_mask(X, self.missing_values)
mask_missing_values = X_missing_mask.copy()
if self.initial_imputer_ is None:
self.initial_imputer_ = SimpleImputer(
missing_values=self.missing_values,
strategy=self.initial_strategy,
keep_empty_features=self.keep_empty_features,
)
if isinstance(self.initial_strategy, str):
self.initial_imputer_ = SimpleImputer(
missing_values=self.missing_values,
strategy=self.initial_strategy,
fill_value=self.fill_value,
keep_empty_features=self.keep_empty_features,
)
else:
self.initial_imputer_ = clone(self.initial_strategy)
init_params = self.initial_imputer_.get_params(deep=False)
params_to_set = {}
if "missing_values" in init_params:
params_to_set["missing_values"] = self.missing_values
if "keep_empty_features" in init_params:
params_to_set["keep_empty_features"] = (
self.keep_empty_features
)
if params_to_set:
self.initial_imputer_.set_params(**params_to_set)
if self.fill_value is not None:
warnings.warn(
"'fill_value' is ignored when ``initial_strategy`` is set "
"to an imputer instance.",
UserWarning,
)
X_filled = self.initial_imputer_.fit_transform(X)
else:
X_filled = self.initial_imputer_.transform(X)
Expand Down
82 changes: 82 additions & 0 deletions sklearn/impute/tests/test_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._param_validation import InvalidParameterError

# make IterativeImputer available
from sklearn.experimental import enable_iterative_imputer # noqa

from sklearn.base import clone
from sklearn.datasets import load_diabetes
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
Expand Down Expand Up @@ -94,6 +96,86 @@ def test_imputation_shape(strategy):
assert X_imputed.shape == (10, 2)


class _MinimalImputer:
def fit(self, X, y=None):
return self

def transform(self, X):
return X

def get_params(self, deep=True):
return {}


@pytest.mark.parametrize("fill_value", [7, np.nan])
def test_iterative_imputer_constant_strategy_respects_fill_value(fill_value):
    """Check that ``fill_value`` is honoured for ``initial_strategy='constant'``.

    ``max_iter=0`` is used so the output can be compared directly against a
    ``SimpleImputer`` configured with the same constant.
    """
    X = np.array([[np.nan, 1], [2, np.nan], [3, 4]], dtype=float)

    reference = SimpleImputer(strategy="constant", fill_value=fill_value)
    expected = reference.fit_transform(X)

    imputer = IterativeImputer(
        initial_strategy="constant",
        fill_value=fill_value,
        max_iter=0,
        random_state=0,
    )

    assert_allclose(imputer.fit_transform(X), expected, equal_nan=True)


def test_iterative_imputer_constant_default_fill_value_matches_simple_imputer():
    """Check the default ``fill_value`` against ``SimpleImputer``.

    Neither imputer is given an explicit ``fill_value``; with ``max_iter=0``
    both are expected to produce the same filled array.
    """
    X = np.array([[np.nan, 1], [2, np.nan], [3, 4]], dtype=float)

    reference = SimpleImputer(strategy="constant")
    expected = reference.fit_transform(X)

    imputer = IterativeImputer(
        initial_strategy="constant",
        max_iter=0,
        random_state=0,
    )

    assert_allclose(imputer.fit_transform(X), expected, equal_nan=True)


def test_iterative_imputer_accepts_imputer_instance_for_initial_strategy():
    """Check that an imputer instance can be passed as ``initial_strategy``.

    The expected output comes from a clone of the seed imputer that is given
    the same ``missing_values`` and ``keep_empty_features`` as the
    ``IterativeImputer``; ``max_iter=0`` allows a direct comparison.
    """
    X = np.array([[-1, 1, -1], [2, -1, 3], [-1, -1, -1]], dtype=float)
    seed_imputer = SimpleImputer(strategy="most_frequent")

    # Build the reference on a clone so the seed instance itself is not
    # reconfigured before being handed to IterativeImputer.
    reference = clone(seed_imputer)
    reference.set_params(missing_values=-1, keep_empty_features=True)
    expected = reference.fit_transform(X)

    imputer = IterativeImputer(
        initial_strategy=seed_imputer,
        max_iter=0,
        missing_values=-1,
        keep_empty_features=True,
        random_state=0,
    )
    transformed = imputer.fit_transform(X)

    assert_allclose(transformed, expected, equal_nan=True)


def test_iterative_imputer_warns_when_fill_value_with_imputer_instance():
    """Check that a ``UserWarning`` is raised when ``fill_value`` is set
    together with an imputer instance as ``initial_strategy``.
    """
    X = np.array([[np.nan, 1], [2, np.nan]], dtype=float)

    imputer = IterativeImputer(
        initial_strategy=SimpleImputer(strategy="mean"),
        fill_value=0,
        max_iter=0,
        random_state=0,
    )

    expected_message = "'fill_value' is ignored"
    with pytest.warns(UserWarning, match=expected_message):
        imputer.fit_transform(X)


def test_iterative_imputer_requires_feature_names_out():
    """Check that an ``initial_strategy`` object lacking
    ``get_feature_names_out`` is rejected with ``InvalidParameterError``.
    """
    X = np.array([[np.nan, 1], [2, np.nan]], dtype=float)

    incomplete_seed = _MinimalImputer()
    imputer = IterativeImputer(initial_strategy=incomplete_seed, max_iter=0)

    with pytest.raises(InvalidParameterError):
        imputer.fit(X)


@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"])
def test_imputation_deletion_warning(strategy):
X = np.ones((3, 5))
Expand Down