k_means: draw the per-initialisation seeds once, before choosing a code path

Before this patch the single-job branch handed the shared `random_state`
object returned by check_random_state() to every kmeans_single() call,
while the parallel branch drew one integer seed per run from that object.
The patch hoists the seed draw above the branch: `n_init` integer seeds are
drawn from `rng` up front, the single-job loop iterates over them and passes
each one as `random_state=seed`, and the parallel branch no longer draws its
own seeds (it relies on the hoisted `seeds` array).

Two tests are added. Each fits on the same make_blobs data with
n_jobs in [1, 2, 3] and asserts that the resulting inertia values agree
(rtol=1e-12), once through the KMeans estimator and once through the
k_means function, for both the 'full' and 'elkan' algorithms.

NOTE(review): the line breaks and indentation below were reconstructed from
the hunk line counts and Python syntax; confirm the whitespace against the
target files before applying.

diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index b7fbdf7da3ad1..b63f5e663d730 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -291,7 +291,7 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
     if n_init <= 0:
         raise ValueError("Invalid number of initializations."
                          " n_init=%d must be bigger than zero." % n_init)
-    random_state = check_random_state(random_state)
+    rng = check_random_state(random_state)
 
     if max_iter <= 0:
         raise ValueError('Number of iterations should be a positive number,'
@@ -346,6 +346,8 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
     # precompute squared norms of data points
     x_squared_norms = row_norms(X, squared=True)
 
+    seeds = rng.randint(0, np.iinfo(np.int32).max, size=n_init)
+
     best_labels, best_inertia, best_centers = None, None, None
     if n_clusters == 1:
         # elkan doesn't make sense for a single cluster, full will produce
@@ -363,13 +365,13 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
     if effective_n_jobs(n_jobs) == 1:
         # For a single thread, less memory is needed if we just store one set
         # of the best results (as opposed to one set per run per thread).
-        for it in range(n_init):
+        for seed in seeds:
             # run a k-means once
             labels, inertia, centers, n_iter_ = kmeans_single(
                 X, sample_weight, n_clusters, max_iter=max_iter, init=init,
                 verbose=verbose, precompute_distances=precompute_distances,
                 tol=tol, x_squared_norms=x_squared_norms,
-                random_state=random_state)
+                random_state=seed)
             # determine if these results are the best so far
             if best_inertia is None or inertia < best_inertia:
                 best_labels = labels.copy()
@@ -378,7 +380,6 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
                 best_n_iter = n_iter_
     else:
         # parallelisation of k-means runs
-        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
         results = Parallel(n_jobs=n_jobs, verbose=0)(
             delayed(kmeans_single)(X, sample_weight, n_clusters,
                                    max_iter=max_iter, init=init,
diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 362b0a9145fca..e66f37f3561f1 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -239,6 +239,34 @@ def test_k_means_plus_plus_init_2_jobs():
     _check_fitted_model(km)
 
 
+@if_safe_multiprocessing_with_blas
+@pytest.mark.parametrize('algorithm', ['full', 'elkan'])
+def test_kmeans_consistent_inertia_across_n_jobs(algorithm):
+    X_local, _ = make_blobs(n_samples=200, n_features=5, centers=4,
+                            random_state=0)
+    inertias = []
+    for n_jobs in [1, 2, 3]:
+        km = KMeans(n_clusters=4, n_init=8, random_state=42,
+                    algorithm=algorithm, n_jobs=n_jobs)
+        km.fit(X_local)
+        inertias.append(km.inertia_)
+    assert_allclose(inertias, inertias[0], rtol=1e-12)
+
+
+@if_safe_multiprocessing_with_blas
+@pytest.mark.parametrize('algorithm', ['full', 'elkan'])
+def test_k_means_function_consistent_inertia_across_n_jobs(algorithm):
+    X_local, _ = make_blobs(n_samples=200, n_features=5, centers=4,
+                            random_state=0)
+    inertias = []
+    for n_jobs in [1, 2, 3]:
+        _, _, inertia = k_means(X_local, n_clusters=4, n_init=8,
+                                random_state=42, algorithm=algorithm,
+                                n_jobs=n_jobs)
+        inertias.append(inertia)
+    assert_allclose(inertias, inertias[0], rtol=1e-12)
+
+
 def test_k_means_precompute_distances_flag():
     # check that a warning is raised if the precompute_distances flag is not
     # supported