From 8891056f729c12cf046c39e2898ba12fb3efa2ef Mon Sep 17 00:00:00 2001 From: Casey Brooks Date: Fri, 26 Dec 2025 04:25:42 +0000 Subject: [PATCH 1/2] fix(model_selection): randomize stratified shuffle --- doc/whats_new/v0.21.rst | 5 +++++ sklearn/model_selection/_split.py | 5 ++--- sklearn/model_selection/tests/test_split.py | 11 +++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 7bc4389e51a48..f4cd3bcbe3fbf 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -208,6 +208,11 @@ Support for Python 3.4 and below has been officially dropped. :func:`~model_selection.validation_curve` only the latter is required. :issue:`12613` and :issue:`12669` by :user:`Marc Torrellas `. +- |Fix| Fixed a bug where :class:`model_selection.StratifiedKFold` + shuffles each class's samples with the same ``random_state``, + making ``shuffle=True`` ineffective. + :issue:`13124` by :user:`Hanmin Qin `. + :mod:`sklearn.neighbors` ........................ diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 24394f8691c7e..0c09ff3b0513a 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -576,8 +576,7 @@ class StratifiedKFold(_BaseKFold): ``n_splits`` default value will change from 3 to 5 in v0.22. shuffle : boolean, optional - Whether to shuffle each stratification of the data before splitting - into batches. + Whether to shuffle each class's samples before splitting into batches. random_state : int, RandomState instance or None, optional, default=None If int, random_state is the seed used by the random number generator; @@ -620,7 +619,7 @@ def __init__(self, n_splits='warn', shuffle=False, random_state=None): super().__init__(n_splits, shuffle, random_state) def _make_test_folds(self, X, y=None): - rng = self.random_state + rng = check_random_state(self.random_state) y = np.asarray(y) type_of_target_y = type_of_target(y) allowed_target_types = ('binary', 'multiclass') diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 5981df285f54b..4d55b5f3c9351 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -493,6 +493,17 @@ def test_shuffle_stratifiedkfold(): assert_not_equal(set(test0), set(test1)) check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5) + # Ensure that we shuffle each class's samples with different + # random_state in StratifiedKFold + # See https://github.com/scikit-learn/scikit-learn/pull/13124 + X = np.arange(10) + y = [0] * 5 + [1] * 5 + kf1 = StratifiedKFold(5, shuffle=True, random_state=0) + kf2 = StratifiedKFold(5, shuffle=True, random_state=1) + test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)]) + test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)]) + assert test_set1 != test_set2 + def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372 # The digits samples are dependent: they are apparently grouped by authors From d8ed924793c70791517ee0aab0bf880fa20881b6 Mon Sep 17 00:00:00 2001 From: Casey Brooks Date: Fri, 26 Dec 2025 04:33:09 +0000 Subject: [PATCH 2/2] test(model_selection): ensure stratified shuffle diverges --- sklearn/model_selection/tests/test_split.py | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 4d55b5f3c9351..f0036c56bfbef 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -493,16 +493,18 @@ def test_shuffle_stratifiedkfold(): assert_not_equal(set(test0), set(test1)) check_cv_coverage(kf0, X_40, y, groups=None, expected_n_splits=5) - # Ensure that we shuffle each class's samples with different - # random_state in StratifiedKFold - # See https://github.com/scikit-learn/scikit-learn/pull/13124 - X = np.arange(10) - y = [0] * 5 + [1] * 5 - kf1 = StratifiedKFold(5, shuffle=True, random_state=0) - kf2 = StratifiedKFold(5, shuffle=True, random_state=1) - test_set1 = sorted([tuple(s[1]) for s in kf1.split(X, y)]) - test_set2 = sorted([tuple(s[1]) for s in kf2.split(X, y)]) - assert test_set1 != test_set2 + # Ensure that with shuffle=True each class receives a distinct permutation + # even when using the same integer random_state. Prior to fixing + # https://github.com/scikit-learn/scikit-learn/pull/13124 the per-class + # permutations were identical, making this assertion fail. + y = np.array([0] * 40 + [1] * 40) + X = np.zeros_like(y) + skf = StratifiedKFold(5, shuffle=True, random_state=0) + perm0, perm1 = [], [] + for _, test_idx in skf.split(X, y): + perm0.extend(test_idx[test_idx < 40]) + perm1.extend(test_idx[test_idx >= 40] - 40) + assert perm0 != perm1 def test_kfold_can_detect_dependent_samples_on_digits(): # see #2372