Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 20 additions & 14 deletions sklearn/feature_selection/_sequential.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
import warnings

from ._base import SelectorMixin
from ..base import BaseEstimator, MetaEstimatorMixin, clone
from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier
from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions
from ..utils._param_validation import RealNotInt
from ..utils._tags import _safe_tags
from ..utils.validation import check_is_fitted
from ..model_selection import cross_val_score
from ..model_selection import check_cv, cross_val_score
from ..metrics import get_scorer_names


Expand Down Expand Up @@ -79,19 +79,23 @@ class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator

If None, the estimator's score method is used.

cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
cv : int, cross-validation generator, or iterable, default=None
Determines the cross-validation splitting strategy. Possible inputs
for ``cv`` are:

- None, to use the default 5-fold cross validation,
- ``None``, to use the default 5-fold cross validation,
- integer, to specify the number of folds in a `(Stratified)KFold`,
- :term:`CV splitter`,
- An iterable yielding (train, test) splits as arrays of indices.
- an iterable yielding (train, test) splits as arrays of indices.

For integer/None inputs, if the estimator is a classifier and ``y`` is
either binary or multiclass, :class:`StratifiedKFold` is used. In all
other cases, :class:`KFold` is used. These splitters are instantiated
with `shuffle=False` so the splits will be the same across calls.
For integer/``None`` inputs, if the estimator is a classifier and ``y``
is either binary or multiclass, :class:`StratifiedKFold` is used. In
all other cases, :class:`KFold` is used. These splitters are
instantiated with ``shuffle=False`` so the splits will be the same
across calls.

Iterables, including generators, are materialized once for reuse. This
can increase memory consumption when the number of folds is large.

Refer to the :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
Expand Down Expand Up @@ -273,9 +277,11 @@ def fit(self, X, y=None):

old_score = -np.inf
is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

for _ in range(n_iterations):
new_feature_idx, new_score = self._get_best_new_feature_score(
cloned_estimator, X, y, current_mask
cloned_estimator, X, y, current_mask, cv
)
if is_auto_select and ((new_score - old_score) < self.tol):
break
Expand All @@ -291,7 +297,7 @@ def fit(self, X, y=None):

return self

def _get_best_new_feature_score(self, estimator, X, y, current_mask):
def _get_best_new_feature_score(self, estimator, X, y, current_mask, cv):
# Return the best new feature and its score to add to the current_mask,
# i.e. return the best new feature and its score to add (resp. remove)
# when doing forward selection (resp. backward selection).
Expand All @@ -309,7 +315,7 @@ def _get_best_new_feature_score(self, estimator, X, y, current_mask):
estimator,
X_new,
y,
cv=self.cv,
cv=cv,
scoring=self.scoring,
n_jobs=self.n_jobs,
).mean()
Expand Down
39 changes: 37 additions & 2 deletions sklearn/feature_selection/tests/test_sequential.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_regression, make_blobs
from sklearn.datasets import make_blobs, make_classification, make_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier


def test_bad_n_features_to_select():
Expand Down Expand Up @@ -314,3 +315,37 @@ def test_backward_neg_tol():

assert 0 < sfs.get_support().sum() < X.shape[1]
assert new_score < initial_score


def test_sfs_supports_iterable_cv_generator():
    """Non-reusable cv iterables (e.g. generators) must be accepted.

    SFS evaluates many candidate feature subsets, so a one-shot generator
    would be exhausted after the first evaluation; this checks the splits
    are materialized once and reused for every candidate.
    """
    X, y = make_classification(n_samples=40, n_features=8, random_state=0)

    # Two equal-sized groups -> LeaveOneGroupOut yields exactly two splits.
    half = y.size // 2
    groups = np.zeros_like(y, dtype=int)
    groups[half:] = 1

    splits = LeaveOneGroupOut().split(X, y, groups=groups)

    sfs = SequentialFeatureSelector(
        KNeighborsClassifier(n_neighbors=3),
        n_features_to_select=3,
        scoring="accuracy",
        cv=splits,
    )
    sfs.fit(X, y)

    assert sfs.get_support().sum() == 3


def test_sfs_baseline_cv_int_runs():
    """Sanity check: the standard integer-cv path still works end to end."""
    X, y = make_regression(n_samples=60, n_features=10, random_state=0)

    sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=4, cv=5)
    sfs.fit(X, y)

    # Exactly the requested number of features is selected and transform
    # reduces the feature dimension accordingly.
    support = sfs.get_support()
    assert support.sum() == 4
    assert sfs.transform(X).shape[1] == 4