# --------------------------------
# 2nd Delivery of the PRI project
# 86379 - Ana Evans
# 86389 - Artur Guimarães
# 86417 - Francisco Rosa
# --------------------------------
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import (pairwise_distances, silhouette_score,
                             calinski_harabasz_score, davies_bouldin_score,
                             adjusted_rand_score, v_measure_score,
                             completeness_score, fowlkes_mallows_score,
                             homogeneity_score, mutual_info_score)
# File imports
from file_treatment import read_from_file
from file_treatment import get_files_from_directory
from data_set_treatment import tfidf_process
from data_set_treatment import get_topics
from data_set_treatment import get_R_set
from data_set_treatment import process_collection
# Global state shared between clustering() and evaluate()
topics = {}
tfidf_vec_info = [None, None, None]  # [vectorizer, document keys, document vectors]
labels_pred = []
# --------------------------------------------------------------------------------
# get_clustering_score() - Gets a clustering method's score based on supervised,
# unsupervised or mixed metrics, returning the mean of all individual results
# --------------------------------------------------------------------------------
def get_clustering_score(x, labels_true, labels_pred, target):
    unsupervised_methods = {'sil_score': silhouette_score, 'calin_score': calinski_harabasz_score,
                            'davies_score': davies_bouldin_score}
    supervised_methods = {'rands_score': adjusted_rand_score, 'v_score': v_measure_score,
                          'complete_score': completeness_score, 'fowlkes_score': fowlkes_mallows_score,
                          'homogeneity_score': homogeneity_score, 'mutual_score': mutual_info_score}

    if target == 'supervised':
        unsupervised_methods = {}
    elif target == 'unsupervised':
        supervised_methods = {}

    result = 0
    # Unsupervised metrics score the predicted labels against the feature matrix
    for method in unsupervised_methods:
        result += unsupervised_methods[method](x, labels_pred)
    # Supervised metrics compare the predicted labels with the ground truth
    for method in supervised_methods:
        score = supervised_methods[method](labels_true, labels_pred)
        result += score
        print('{} = {}'.format(method, score))
    return result / (len(unsupervised_methods) + len(supervised_methods))
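
# Usage sketch (hypothetical toy labels, for illustration only): with
# target='supervised' the feature matrix x is never used, so None can be
# passed; the returned value is the mean of the six supervised metrics.
#
#   toy_true = [0, 0, 1, 1]
#   toy_pred = [0, 0, 1, 0]
#   toy_score = get_clustering_score(None, toy_true, toy_pred, 'supervised')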
# --------------------------------------------------------------------------------
# trainKmeans - trains the KMeans algorithm
#
# Input: vec_D - the set of documents or topics to be clustered
#        y - the set of ids and relevance info
#        clusters - the list with the numbers of clusters to be attempted
#        distance - parameter for the evaluation measures (currently unused)
#
# Behaviour: creates the KMeans clusters for each number of clusters previously
# defined, and then applies a set of evaluation measures to select the best one
#
# Output: A list containing the fitted model, the predicted labels, the mean
# unsupervised score and the best number of clusters
# --------------------------------------------------------------------------------
def trainKmeans(vec_D, y, clusters, distance):
    # Dense copy for the unsupervised metrics, which require a dense matrix
    array_D = vec_D.toarray()
    # [model, labels, score, n_clusters]
    best_result = [None, None, 0, 0]
    for i in clusters:
        clustering_kmeans = KMeans(n_clusters=i).fit(vec_D)
        labels_pred = clustering_kmeans.labels_
        score_mean = get_clustering_score(array_D, y, labels_pred, 'unsupervised')
        if score_mean > best_result[2]:
            best_result = [clustering_kmeans, labels_pred, score_mean, i]
    return best_result
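
# Model-selection sketch (assumes doc_vectors is a sparse tf-idf matrix such as
# tfidf_process(D)[2], and y the relevance labels built in clustering()): the
# best run over k = 2..9 is returned as [model, labels, mean score, k]. The
# same calling pattern applies to trainAgglomerative below.
#
#   best = trainKmeans(doc_vectors, y, range(2, 10), 'euclidean')
#   model, labels, score, k = best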
# --------------------------------------------------------------------------------
# trainAgglomerative - trains the Agglomerative Clustering algorithm
#
# Input: vec_D - the set of documents or topics to be clustered
#        y - the set of ids and relevance info
#        clusters - the list with the numbers of clusters to be attempted
#        distance - parameter for the evaluation measures (currently unused)
#
# Behaviour: creates the Agglomerative clusters for each number of clusters previously
# defined, and then applies a set of evaluation measures to select the best one
#
# Output: A list containing the fitted model, the predicted labels, the mean
# unsupervised score and the best number of clusters
# --------------------------------------------------------------------------------
def trainAgglomerative(vec_D, y, clusters, distance):
    # AgglomerativeClustering requires a dense matrix
    vec_D = vec_D.toarray()
    # [model, labels, score, n_clusters]
    best_result = [None, None, 0, 0]
    for i in clusters:
        clustering_agg = AgglomerativeClustering(n_clusters=i).fit(vec_D)
        labels_pred = clustering_agg.labels_
        score_mean = get_clustering_score(vec_D, y, labels_pred, 'unsupervised')
        if score_mean > best_result[2]:
            best_result = [clustering_agg, labels_pred, score_mean, i]
    return best_result
# ----------------------------------------------------------------------------------------------------
# clustering: Applies clustering, an unsupervised learning technique, to classify
# sets of documents instead of individual documents
#
# Input: D - set of documents or topics to be clustered
#        **kwargs - Optional parameters with the following functionality (default
#        values prefixed by *)
#           mode [*docs | topics]: Whether D is a document or a topic collection
#           methods [*[trainKmeans, trainAgglomerative] | list of functions]: Clustering
#           algorithms to attempt
#           clusters [*range(2,100) | list of ints > 0]: List with the numbers of clusters to attempt
#           in the clustering algorithms
#           distance [*euclidean | manhattan]: Distance measure used
#           top_cluster_words [*5 | int]: Number of top words that represent each cluster
#
# Behaviour: Starts by obtaining the processed collection of documents. Then vectorizes
# them through the function tfidf_process and obtains the vectorizer itself, the document
# keys and the document vectors. Afterwards, we obtain the r_set entries relevant to the doc keys
# and deal with the kwargs information. These arguments are sent to the clustering training functions,
# which return the information regarding the best clustering they found. We compare KMeans and
# Agglomerative Clustering through their mean unsupervised scores, and then process the information
# to output.
#
# Output: A list of cluster results. These results consist of a pair per cluster, with the cluster
# centroid (its top words) and the set of document/topic ids which comprise it
# ----------------------------------------------------------------------------------------------------
def clustering(D, **kwargs):
    global tfidf_vec_info
    global labels_pred

    mode = 'docs' if 'mode' not in kwargs else kwargs['mode']
    r_set = None
    y = []

    if mode == 'docs':
        tfidf_vec_info = tfidf_process(D, remove_stopwords='english', **kwargs)
        r_set = get_R_set('material/', index='doc_id')[0][1]
    elif mode == 'topics':
        tfidf_vec_info = tfidf_process(D, min_df=1, max_df=1.0, **kwargs)
        r_set = get_R_set('material/')[0][1]
    doc_keys = tfidf_vec_info[1]
    doc_vectors = tfidf_vec_info[2]

    # Build the ground-truth labels by pairing each relevance judgement with its value
    for i in range(len(doc_keys)):
        y.append([])
        if doc_keys[i] in r_set:
            for r in r_set[doc_keys[i]]:
                y[i].append('{}_{}'.format(r, r_set[doc_keys[i]][r]))
    y = np.array(y, dtype=object)

    clustering_methods = [trainKmeans, trainAgglomerative] if 'methods' not in kwargs else kwargs['methods']
    clusters = list(range(2, 100)) if 'clusters' not in kwargs else kwargs['clusters']
    distances = ['euclidean'] if 'distance' not in kwargs else kwargs['distance']
    top_cluster_words = 5 if 'top_cluster_words' not in kwargs else kwargs['top_cluster_words']

    # [Model, Labels, Score, n_clusters]
    best_clusters = [None, None, 0, 0]
    for method in clustering_methods:
        for dist in distances:
            solution = method(doc_vectors, y, clusters, dist)
            if solution[2] > best_clusters[2]:
                best_clusters = solution

    result = []
    doc_labels = best_clusters[1]
    labels_pred = best_clusters[1]
    for i in range(best_clusters[3]):
        result.append([None, []])

    centroids = None
    if isinstance(best_clusters[0], KMeans):
        # KMeans exposes its cluster centroids directly
        centroids = np.array(best_clusters[0].cluster_centers_)
        for i in range(len(doc_keys)):
            centroid = doc_labels[i]
            result[centroid][1].append(doc_keys[i])
    else:
        # AgglomerativeClustering has no centroids, so each cluster's centroid
        # is computed as the mean of its document vectors
        docs_per_centroid = {}
        for i in range(len(doc_keys)):
            centroid = doc_labels[i]
            result[centroid][1].append(doc_keys[i])
            if centroid not in docs_per_centroid:
                docs_per_centroid[centroid] = [doc_vectors[i].toarray()[0]]
            else:
                docs_per_centroid[centroid].append(doc_vectors[i].toarray()[0])
        # Sort by cluster label so that centroids[i] lines up with result[i]
        centroids = []
        for centroid in sorted(docs_per_centroid):
            centroids.append(np.mean(docs_per_centroid[centroid], axis=0))

    for i in range(len(centroids)):
        # Rank the centroid's nonzero entries by descending tf-idf weight;
        # inverse_transform yields the matching words in feature-index order
        aux_centroid = centroids[i][centroids[i] != 0.]
        sorted_args = np.argsort(aux_centroid)[::-1]
        inverse_transformed = tfidf_vec_info[0].inverse_transform(centroids[i].reshape(1, -1))[0]
        n_args = len(inverse_transformed)
        entry_range = top_cluster_words if n_args > top_cluster_words else n_args
        centroid_result = []
        for j in range(entry_range):
            centroid_result.append(inverse_transformed[sorted_args[j]])
        result[i][0] = centroid_result
    return result
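
# Usage sketch (assumes doc_dic is a processed collection such as the output
# of process_collection, and that the 'material/' relevance files are on disk):
# each returned entry pairs a centroid's top words with the ids of the
# documents in that cluster.
#
#   solution = clustering(doc_dic, clusters=range(2, 10), top_cluster_words=3)
#   for top_words, doc_ids in solution:
#       print(top_words, doc_ids)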
# ----------------------------------------------------------------------------------------------------
# interpret(): Outputs the representation of a cluster containing a centroid and a medoid
#
# Input: cluster - A document/topic cluster
#        D - Set of documents or topics in cluster
#        **kwargs - Optional parameters with the following functionality (default
#        values prefixed by *)
#
# Behaviour: Creates a TF-IDF vectorizer to compute pairwise distances between the documents
# in a cluster and find its medoid, and uses the previously computed top words to represent
# the centroid.
#
# Output: A list with the centroid representation in top cluster words, and a medoid represented
# by a document or topic id
# ----------------------------------------------------------------------------------------------------
def interpret(cluster, D, **kwargs):
    documents = {}
    for doc_id in cluster[1]:
        documents[doc_id] = D[doc_id]
    doc_vectors = tfidf_process(documents, min_df=1, max_df=1.0, **kwargs)[2]
    distance_matrix = pairwise_distances(doc_vectors)
    centroid = cluster[0]
    # The medoid is the document closest to all others in the cluster,
    # i.e. the one whose summed pairwise distance is minimal
    medoid_index = np.argmin(distance_matrix.sum(axis=0))
    return [centroid, cluster[1][medoid_index]]
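
# Medoid sketch with made-up distances: for the 3x3 symmetric matrix below,
# the column sums are [3.0, 2.1, 2.7], so the document at index 1 is the
# medoid (it is, on average, closest to the other two).
#
#   toy = np.array([[0.0, 1.2, 1.8],
#                   [1.2, 0.0, 0.9],
#                   [1.8, 0.9, 0.0]])
#   assert np.argmin(toy.sum(axis=0)) == 1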
# ----------------------------------------------------------------------------------------------------
# get_topic_subset() - Retrieves topics from the global variable topics that are contained within
# Q
# ----------------------------------------------------------------------------------------------------
def get_topic_subset(q_test):
    result = {}
    for topic in topics:
        if topic in q_test:
            result[topic] = topics[topic]
    return result
# ----------------------------------------------------------------------------------------------------
# get_category_ids() - Helper function to get category ids from collection D to serve as ground
# truth
# ----------------------------------------------------------------------------------------------------
def get_category_ids(doc_keys):
    clustered_docs = {}
    for doc in doc_keys:
        clustered_docs[doc] = True
    D = get_files_from_directory('../rcv1/', clustered_docs)
    D[0].extend(D[1])
    D = D[0]
    codes_y = []
    for doc in D:
        codes_y.append([])
        # The category codes used as ground truth live in each document's second <codes> section
        if len(doc.find_all('codes')) > 1:
            topic_section = doc.find_all('codes')[1]
            topic_ids = topic_section.find_all('code')
            for iden in topic_ids:
                codes_y[-1].append(iden['code'])
    codes_y = np.array(codes_y, dtype=object)
    return codes_y
# ----------------------------------------------------------------------------------------------------
# evaluate: Serves as the main entry point for the clustering process and outputs its results
#
# Input: D - Document or topic collection to use
#        **kwargs - Optional parameters with the following functionality (default
#        values prefixed by *)
#           mode [*docs | topics]: Chooses whether we are running a document or topic collection
#           external [off by default]: If truthy (docs mode only), also runs a supervised
#           evaluation against the collection's category ids
#
# Behaviour: It encapsulates the entirety of the clustering process and prints the results
# of each individual cluster.
#
# Output: None
# ----------------------------------------------------------------------------------------------------
def evaluate(D, **kwargs):
    global tfidf_vec_info
    global labels_pred

    mode = 'docs' if 'mode' not in kwargs else kwargs['mode']
    doc_dic = D
    if mode == 'docs':
        doc_dic = process_collection(D, False, **kwargs)
    elif mode == 'topics':
        doc_dic = get_topic_subset(D)

    clusters = clustering(doc_dic, **kwargs)
    cluster_info = []
    for cluster in clusters:
        cluster_info.append(interpret(cluster, doc_dic, **kwargs))

    n_clusters = len(clusters)
    name = 'document' if mode == 'docs' else 'topic'
    print("The clustering solution has k = {} clusters".format(n_clusters))
    for i in range(n_clusters):
        print("\nCluster {}:".format(i + 1))
        print("Centroid has top words {}".format(cluster_info[i][0]))
        print("Medoid is {} with id {}".format(name, cluster_info[i][1]))
        print("Cluster is composed of {} {}s".format(len(clusters[i][1]), name))
        print("{}s in cluster -> {}".format(name, clusters[i][1]))

    if mode == 'docs' and 'external' in kwargs and kwargs['external']:
        print('\nExternal evaluation for clustering solution:')
        doc_keys = tfidf_vec_info[1]
        category_ids = get_category_ids(doc_keys)
        final_score = get_clustering_score(None, category_ids, labels_pred, 'supervised')
        print('\nFinal Mean supervised score = {}'.format(final_score))
    return
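
# Usage sketch (assumes an RCV1-style collection loaded through
# get_files_from_directory, as in the commented-out lines in main(), and the
# 'material/' relevance files on disk):
#
#   docs = get_files_from_directory('../rcv1_test/19960821/', None)[1]
#   evaluate(docs, mode='docs', external=True)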
# --------------------------------------------------------------------------------
# ~ Just the Main Function ~
# --------------------------------------------------------------------------------
def main():
    global topics
    material_dic = 'material/'
    topics = get_topics(material_dic)
    #D_set = get_files_from_directory('../rcv1_test/19961001')[1]
    #D = get_files_from_directory('../rcv1_test/19960821/', None)[1]
    D = list(range(101, 201, 1))
    #print(read_from_file('topics_processed'))
    evaluate(D, mode='topics')

if __name__ == '__main__':
    main()