forked from eriklindernoren/ML-From-Scratch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
k_means.py
116 lines (96 loc) · 3.91 KB
/
k_means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import sys
import os
import math
import random
from sklearn import datasets
import numpy as np
# Import helper functions
from mlfromscratch.utils.data_manipulation import normalize
from mlfromscratch.utils.data_operation import euclidean_distance
from mlfromscratch.unsupervised_learning import *
from mlfromscratch.utils import Plot
class KMeans():
"""A simple clustering method that forms k clusters by iteratively reassigning
samples to the closest centroids and after that moves the centroids to the center
of the new formed clusters.
Parameters:
-----------
k: int
The number of clusters the algorithm will form.
max_iterations: int
The number of iterations the algorithm will run for if it does
not converge before that.
"""
def __init__(self, k=2, max_iterations=500):
self.k = k
self.max_iterations = max_iterations
def _init_random_centroids(self, X):
""" Initialize the centroids as random samples """
n_samples, n_features = np.shape(X)
centroids = np.zeros((self.k, n_features))
for i in range(self.k):
centroid = X[np.random.choice(range(n_samples))]
centroids[i] = centroid
return centroids
def _closest_centroid(self, sample, centroids):
""" Return the index of the closest centroid to the sample """
closest_i = None
closest_distance = float("inf")
for i, centroid in enumerate(centroids):
distance = euclidean_distance(sample, centroid)
if distance < closest_distance:
closest_i = i
closest_distance = distance
return closest_i
def _create_clusters(self, centroids, X):
""" Assign the samples to the closest centroids to create clusters """
n_samples = np.shape(X)[0]
clusters = [[] for _ in range(self.k)]
for sample_i, sample in enumerate(X):
centroid_i = self._closest_centroid(sample, centroids)
clusters[centroid_i].append(sample_i)
return clusters
def _calculate_centroids(self, clusters, X):
""" Calculate new centroids as the means of the samples in each cluster """
n_features = np.shape(X)[1]
centroids = np.zeros((self.k, n_features))
for i, cluster in enumerate(clusters):
centroid = np.mean(X[cluster], axis=0)
centroids[i] = centroid
return centroids
def _get_cluster_labels(self, clusters, X):
""" Classify samples as the index of their clusters """
# One prediction for each sample
y_pred = np.zeros(np.shape(X)[0])
for cluster_i, cluster in enumerate(clusters):
for sample_i in cluster:
y_pred[sample_i] = cluster_i
return y_pred
def predict(self, X):
""" Do K-Means clustering and return cluster indices """
# Initialize centroids
centroids = self._init_random_centroids(X)
# Iterate until convergence or for max iterations
for _ in range(self.max_iterations):
# Assign samples to closest centroids (create clusters)
clusters = self._create_clusters(centroids, X)
prev_centroids = centroids
# Calculate new centroids from the clusters
centroids = self._calculate_centroids(clusters, X)
# If no centroids have changed => convergence
diff = centroids - prev_centroids
if not diff.any():
break
return self._get_cluster_labels(clusters, X)
def main():
# Load the dataset
X, y = datasets.make_blobs()
# Cluster the data using K-Means
clf = KMeans(k=3)
y_pred = clf.predict(X)
# Project the data onto the 2 primary principal components
p = MatplotlibWrapper()
p.plot_in_2d(X, y_pred, title="K-Means Clustering")
p.plot_in_2d(X, y, title="Actual Clustering")
if __name__ == "__main__":
main()