diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
index dd969c12b2833..99406d0f702bd 100644
--- a/sklearn/preprocessing/_discretization.py
+++ b/sklearn/preprocessing/_discretization.py
@@ -172,8 +172,9 @@ def fit(self, X, y=None):
                 # 1D k-means procedure
                 km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                 centers = km.fit(column[:, None]).cluster_centers_[:, 0]
-                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
-                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]
+                centers.sort()  # midpoints below only ascend if centers do
+                interior = (centers[1:] + centers[:-1]) * 0.5  # inner edges
+                bin_edges[jj] = np.r_[col_min, interior, col_max]
 
         self.bin_edges_ = bin_edges
         self.n_bins_ = n_bins
diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py
index c2e307e85d51e..9de8f2afe718a 100644
--- a/sklearn/preprocessing/tests/test_discretization.py
+++ b/sklearn/preprocessing/tests/test_discretization.py
@@ -203,6 +203,30 @@ def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
     assert_array_equal(expected_3bins, Xt.ravel())
 
 
+def test_kmeans_strategy_handles_unsorted_bin_edges_regression():
+    X = np.array([0, 0.5, 2, 3, 9, 10]).reshape(-1, 1)  # one column
+    n_bins = 5
+    est = KBinsDiscretizer(n_bins=n_bins, strategy='kmeans',
+                           encode='ordinal')
+
+    Xt = est.fit_transform(X)
+
+    Xt = Xt.ravel()
+    assert Xt.min() >= 0  # ordinal codes must fall in [0, n_bins - 1]
+    assert Xt.max() <= n_bins - 1
+
+
+def test_kmeans_strategy_bin_edges_are_sorted():
+    rng = np.random.RandomState(0)  # fixed seed keeps the data reproducible
+    X = rng.uniform(low=[-3, 5, 10], high=[7, 15, 20], size=(200, 3))
+    est = KBinsDiscretizer(n_bins=5, strategy='kmeans', encode='ordinal')
+
+    est.fit(X)
+
+    for edges in est.bin_edges_:  # one edge array per fitted feature
+        assert np.all(np.diff(edges) >= 0)  # edges must be non-decreasing
+
+
 @pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
 @pytest.mark.parametrize('encode', ['ordinal', 'onehot', 'onehot-dense'])
 def test_inverse_transform(strategy, encode):