From a977d150f6052d7dd21e51b350722ac289f6b8f1 Mon Sep 17 00:00:00 2001 From: Casey Brooks Date: Fri, 26 Dec 2025 13:38:50 +0000 Subject: [PATCH] fix(feature_selection): preserve iterable cv in sfs --- sklearn/feature_selection/_sequential.py | 34 +++++++++------- .../tests/test_sequential.py | 39 ++++++++++++++++++- 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index e983c55de7d25..c9ecdb2616e9f 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -8,12 +8,12 @@ import warnings from ._base import SelectorMixin -from ..base import BaseEstimator, MetaEstimatorMixin, clone +from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier from ..utils._param_validation import HasMethods, Hidden, Interval, StrOptions from ..utils._param_validation import RealNotInt from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted -from ..model_selection import cross_val_score +from ..model_selection import check_cv, cross_val_score from ..metrics import get_scorer_names @@ -79,19 +79,23 @@ class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator If None, the estimator's score method is used. - cv : int, cross-validation generator or an iterable, default=None - Determines the cross-validation splitting strategy. - Possible inputs for cv are: + cv : int, cross-validation generator, or iterable, default=None + Determines the cross-validation splitting strategy. Possible inputs + for ``cv`` are: - - None, to use the default 5-fold cross validation, + - ``None``, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - - An iterable yielding (train, test) splits as arrays of indices. + - an iterable yielding (train, test) splits as arrays of indices. 
- For integer/None inputs, if the estimator is a classifier and ``y`` is - either binary or multiclass, :class:`StratifiedKFold` is used. In all - other cases, :class:`KFold` is used. These splitters are instantiated - with `shuffle=False` so the splits will be the same across calls. + For integer/``None`` inputs, if the estimator is a classifier and ``y`` + is either binary or multiclass, :class:`StratifiedKFold` is used. In + all other cases, :class:`KFold` is used. These splitters are + instantiated with ``shuffle=False`` so the splits will be the same + across calls. + + Iterables, including generators, are materialized once for reuse. This + can increase memory consumption when the number of folds is large. Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. @@ -273,9 +277,11 @@ def fit(self, X, y=None): old_score = -np.inf is_auto_select = self.tol is not None and self.n_features_to_select == "auto" + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) + for _ in range(n_iterations): new_feature_idx, new_score = self._get_best_new_feature_score( - cloned_estimator, X, y, current_mask + cloned_estimator, X, y, current_mask, cv ) if is_auto_select and ((new_score - old_score) < self.tol): break @@ -291,7 +297,7 @@ def fit(self, X, y=None): return self - def _get_best_new_feature_score(self, estimator, X, y, current_mask): + def _get_best_new_feature_score(self, estimator, X, y, current_mask, cv): # Return the best new feature and its score to add to the current_mask, # i.e. return the best new feature and its score to add (resp. remove) # when doing forward selection (resp. backward selection). 
@@ -309,7 +315,7 @@ def _get_best_new_feature_score(self, estimator, X, y, current_mask): estimator, X_new, y, - cv=self.cv, + cv=cv, scoring=self.scoring, n_jobs=self.n_jobs, ).mean() diff --git a/sklearn/feature_selection/tests/test_sequential.py b/sklearn/feature_selection/tests/test_sequential.py index f6451a36005ac..176a1446edfdd 100644 --- a/sklearn/feature_selection/tests/test_sequential.py +++ b/sklearn/feature_selection/tests/test_sequential.py @@ -6,11 +6,12 @@ from sklearn.preprocessing import StandardScaler from sklearn.pipeline import make_pipeline from sklearn.feature_selection import SequentialFeatureSelector -from sklearn.datasets import make_regression, make_blobs +from sklearn.datasets import make_blobs, make_classification, make_regression from sklearn.linear_model import LinearRegression from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.model_selection import cross_val_score +from sklearn.model_selection import LeaveOneGroupOut, cross_val_score from sklearn.cluster import KMeans +from sklearn.neighbors import KNeighborsClassifier def test_bad_n_features_to_select(): @@ -314,3 +315,37 @@ def test_backward_neg_tol(): assert 0 < sfs.get_support().sum() < X.shape[1] assert new_score < initial_score + + +def test_sfs_supports_iterable_cv_generator(): + X, y = make_classification(n_samples=40, n_features=8, random_state=0) + + groups = np.zeros_like(y, dtype=int) + groups[y.size // 2 :] = 1 + + logo = LeaveOneGroupOut() + cv = logo.split(X, y, groups=groups) + + selector = SequentialFeatureSelector( + KNeighborsClassifier(n_neighbors=3), + n_features_to_select=3, + scoring="accuracy", + cv=cv, + ) + + selector.fit(X, y) + + assert selector.get_support().sum() == 3 + + +def test_sfs_baseline_cv_int_runs(): + X, y = make_regression(n_samples=60, n_features=10, random_state=0) + + selector = SequentialFeatureSelector( + LinearRegression(), n_features_to_select=4, cv=5 + ) + + selector.fit(X, y) + + assert 
selector.get_support().sum() == 4 + assert selector.transform(X).shape[1] == 4