Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions sklearn/cluster/k_means_.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
if n_init <= 0:
raise ValueError("Invalid number of initializations."
" n_init=%d must be bigger than zero." % n_init)
random_state = check_random_state(random_state)
rng = check_random_state(random_state)

if max_iter <= 0:
raise ValueError('Number of iterations should be a positive number,'
Expand Down Expand Up @@ -346,6 +346,8 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
# precompute squared norms of data points
x_squared_norms = row_norms(X, squared=True)

seeds = rng.randint(0, np.iinfo(np.int32).max, size=n_init)

best_labels, best_inertia, best_centers = None, None, None
if n_clusters == 1:
# elkan doesn't make sense for a single cluster, full will produce
Expand All @@ -363,13 +365,13 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
if effective_n_jobs(n_jobs) == 1:
# For a single thread, less memory is needed if we just store one set
# of the best results (as opposed to one set per run per thread).
for it in range(n_init):
for seed in seeds:
# run a k-means once
labels, inertia, centers, n_iter_ = kmeans_single(
X, sample_weight, n_clusters, max_iter=max_iter, init=init,
verbose=verbose, precompute_distances=precompute_distances,
tol=tol, x_squared_norms=x_squared_norms,
random_state=random_state)
random_state=seed)
# determine if these results are the best so far
if best_inertia is None or inertia < best_inertia:
best_labels = labels.copy()
Expand All @@ -378,7 +380,6 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++',
best_n_iter = n_iter_
else:
# parallelisation of k-means runs
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
results = Parallel(n_jobs=n_jobs, verbose=0)(
delayed(kmeans_single)(X, sample_weight, n_clusters,
max_iter=max_iter, init=init,
Expand Down
28 changes: 28 additions & 0 deletions sklearn/cluster/tests/test_k_means.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,34 @@ def test_k_means_plus_plus_init_2_jobs():
_check_fitted_model(km)


@if_safe_multiprocessing_with_blas
@pytest.mark.parametrize('algorithm', ['full', 'elkan'])
def test_kmeans_consistent_inertia_across_n_jobs(algorithm):
    # KMeans draws one seed per initialisation up front, so the sequential
    # (n_jobs=1) and parallel code paths should run identical inits and
    # the final inertia must not depend on the degree of parallelism.
    data, _ = make_blobs(n_samples=200, n_features=5, centers=4,
                         random_state=0)

    def fitted_inertia(jobs):
        # Fit a fresh estimator with identical hyper-parameters, varying
        # only the number of worker processes.
        est = KMeans(n_clusters=4, n_init=8, random_state=42,
                     algorithm=algorithm, n_jobs=jobs)
        return est.fit(data).inertia_

    observed = [fitted_inertia(jobs) for jobs in (1, 2, 3)]
    assert_allclose(observed, observed[0], rtol=1e-12)


@if_safe_multiprocessing_with_blas
@pytest.mark.parametrize('algorithm', ['full', 'elkan'])
def test_k_means_function_consistent_inertia_across_n_jobs(algorithm):
    # The functional `k_means` interface must also yield the same inertia
    # regardless of how many workers execute the n_init restarts, since
    # each restart receives a pre-drawn seed.
    data, _ = make_blobs(n_samples=200, n_features=5, centers=4,
                         random_state=0)
    observed = []
    for jobs in (1, 2, 3):
        _, _, run_inertia = k_means(data, n_clusters=4, n_init=8,
                                    random_state=42, algorithm=algorithm,
                                    n_jobs=jobs)
        observed.append(run_inertia)
    assert_allclose(observed, observed[0], rtol=1e-12)


def test_k_means_precompute_distances_flag():
# check that a warning is raised if the precompute_distances flag is not
# supported
Expand Down