-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
executable file
·171 lines (155 loc) · 7.44 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import numpy as np
import utils
import operator
def kmeans(tseries, nbclusters, nb_iterations=10, init_clusters=None, init_avg_with_clusters=False,
averaging_algorithm='dba', distance_algorithm='dtw', tseries_pad=None):
"""
Performs the k-means algorithm using the averaging algorithm specified.
:param tseries: The list of time series on which we would like to apply k-means.
:param nb_iterations: Maximum number of k-means iterations.
:param nbclusters: The number of clusters a.k.a 'k' in the k-means
"""
# get the averaging method (check constants.py)
avg_method = utils.constants.AVERAGING_ALGORITHMS[averaging_algorithm]
# get the distance method (check constants.py)
distance_method = utils.constants.DISTANCE_ALGORITHMS[distance_algorithm]
# get dsitance method parameters
dist_fun_params = utils.constants.DISTANCE_ALGORITHMS_PARAMS[distance_algorithm]
if distance_algorithm == 'shapedtw':
utils.utils.add_dist_fun_params_for_shapedtw(dist_fun_params, tseries[0].shape[0])
current_pad = None
# init the clusters
if init_clusters is None:
# init the k-means algorithm randomly
clusterIdx = np.random.permutation(len(tseries))[:nbclusters]
clusters = [tseries[i] for i in clusterIdx]
else:
clusters = init_clusters
for i in range(nb_iterations):
if distance_algorithm == 'shapedtw':
reach = dist_fun_params['shape_reach']
clusters_pad = np.pad(np.asarray(clusters), ((0, 0), (reach, reach)), mode='edge')
# affect the time series to its closest clusters
affect = []
# init array of distance to closest cluster for each time series
distances_clust = []
# start the affectation of each time series
for idx_series, series in enumerate(tseries):
# initialize the cluster index of the series
cidx = -1
# initialize the minimum distance to a cluster
minDist = np.inf
# loop through all clusters
for i, cluster in enumerate(clusters):
# calculate the distance between the cluster and the series
ss = series
cc = cluster
if distance_algorithm == 'shapedtw':
ss = tseries_pad[idx_series]
cc = clusters_pad[i]
dist, __ = distance_method(ss, cc, **dist_fun_params)
# check if it is better than best so far minimum distance
if dist < minDist:
# assign the new cluster and the minimum distance
minDist = dist
cidx = i
# add the best cluster index to the affectation array
affect.append(cidx)
# this is used to solve the empty clusters problem
distances_clust.append((idx_series, minDist))
# empty clusters list
empty_clusters = []
# recompute the clusters
for i in range(nbclusters):
# get the new group of time series affected to cluster index i
current = [tseries[j] for j, x in enumerate(affect) if x == i]
if distance_algorithm == 'shapedtw':
current_pad = [tseries_pad[j] for j, x in enumerate(affect) if x == i]
# check if cluster is empty
if len(current) == 0:
# add the cluster index to the empty clusters list
empty_clusters.append(i)
# skip averaging an empty cluster
continue
# compute the new avg or center of this cluster
current = np.asarray(current)
if distance_algorithm == 'shapedtw':
current_pad = np.asarray(current_pad)
dba_avg = avg_method(current, tseries_pad=current_pad)
else:
dba_avg = avg_method(current)
# assign this new cluster center whose index is i
clusters[i] = dba_avg
# check if we have empty clusters
if len(empty_clusters) > 0:
# sort the distances to get the ones furthest from their clusters
distances_clust.sort(key=operator.itemgetter(1), reverse=True)
# loop through the empty clusters
for i, idx_clust in enumerate(empty_clusters):
# replace the empty cluster with the farest time series from its old cluster
clusters[idx_clust] = tseries[distances_clust[i][0]]
# re-affectation
# affect the time series to its closest clusters
affect = []
# start the affectation of each time series
for idx_series, series in enumerate(tseries):
# initialize the cluster index of the series
cidx = -1
# initialize the minimum distance to a cluster
minDist = np.inf
# loop through all clusters
for i, cluster in enumerate(clusters):
ss = series
cc = cluster
if distance_algorithm == 'shapedtw':
ss = tseries_pad[idx_series]
cc = clusters_pad[i]
# calculate the distance between the cluster and the series
dist, __ = distance_method(ss, cc, **dist_fun_params)
# check if it is better than best so far minimum distance
if dist < minDist:
# assign the new cluster and the minimum distance
minDist = dist
cidx = i
# add the best cluster index to the affectation array
affect.append(cidx)
# return the clusters as well as the corre
return clusters, affect
def kmeans_save_itr(curr_dir, tseries, nbclusters, nb_iterations=10, init_clusters=None,
averaging_algorithm='dba', distance_algorithm='dtw'):
"""
This function is similar to normal kmeans
but saves the state at each kmeans iteration
in order to continue execution on mesocentre
:param curr_dir:
:param tseries:
:param nbclusters:
:param nb_iterations:
:param init_clusters:
:param averaging_algorithm:
:param distance_algorithm:
:return: usual kmeans result
"""
tseries_pad = None
# if shapedtw is used then you should pad before
# this is used because padding inside the distance function is very expensive
if distance_algorithm == 'shapedtw':
reach = utils.constants.DISTANCE_ALGORITHMS_PARAMS[distance_algorithm]['shape_reach']
tseries_pad = np.pad(np.asarray(tseries), ((0, 0), (reach, reach)), mode='edge')
clusters = init_clusters
for k_itr in range(nb_iterations):
kmeans_file = curr_dir + '/kmeans_clusters_' + str(k_itr) + '.npy'
kmeans_affect_file = curr_dir + '/kmeans_affect_' + str(k_itr) + '.npy'
if utils.utils.check_if_file_exits(kmeans_file):
# clusters already computed till this iteration
clusters = np.load(kmeans_file)
affect = np.load(kmeans_affect_file)
else:
# run the kmeans algorithm for one iteration
# with this clusters as init
clusters, affect = kmeans(tseries, nbclusters, nb_iterations=1, init_clusters=clusters,
averaging_algorithm=averaging_algorithm, init_avg_with_clusters=True,
distance_algorithm=distance_algorithm, tseries_pad=tseries_pad)
np.save(kmeans_file, np.array(clusters))
np.save(kmeans_affect_file, np.array(affect))
return init_clusters, affect