-
Notifications
You must be signed in to change notification settings - Fork 0
/
02_k-means.py
92 lines (61 loc) · 2.77 KB
/
02_k-means.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import cPickle as cpkl
import sys
import datetime
import numpy as np
from sklearn.cluster import KMeans
from joblib import Parallel, delayed
import multiprocessing
import support_functions as sup
def kmeans_p(kmeans, matrix):
    """Fit ``kmeans`` on ``matrix`` and hand back the result of ``fit``.

    Kept as a plain module-level function so that it can be wrapped with
    ``joblib.delayed`` and dispatched by ``Parallel`` further down.

    :param kmeans: object exposing a ``fit(matrix)`` method.
    :param matrix: data passed straight through to ``fit``.
    :return: whatever ``kmeans.fit(matrix)`` returns.
    """
    fitted = kmeans.fit(matrix)
    return fitted
# ---------------------------------------------------------------------------
# Script setup: configuration, dataset metadata and rating matrices.
#
# Usage: python 02_k-means.py <data_path>
#
# <data_path> must contain 'dataset_size.pkl', 'cluster_list.pkl' and a
# 'train' sub-directory holding the training fold CSV.
# ---------------------------------------------------------------------------

# One worker per CPU for the joblib runs below.
num_cores = multiprocessing.cpu_count()

# Training-fold file naming is fixed here rather than read from the command
# line; only the data directory comes from argv.
file_pattern = 'MyTrain'
file_ext = 'csv'
data_path = sys.argv[1]
has_reader = 1  # NOTE(review): not read anywhere in this script
train_fold_path = os.path.join(data_path, 'train')

# Dataset dimensions, stored as a pickled two-element sequence.
# NOTE: unpickling can execute arbitrary code; only point data_path at
# directories whose contents are trusted.
with open(os.path.join(data_path, 'dataset_size.pkl'), 'rb') as size_file:
    NUM_USERS, NUM_ITEMS = cpkl.load(size_file)

# Cluster counts to try; each entry becomes one KMeans(n_clusters=...) run.
with open(os.path.join(data_path, 'cluster_list.pkl'), 'rb') as clusters_file:
    cluster_list = cpkl.load(clusters_file)

log_file = sup.create_logfile(sys.argv[0])

# Only fold 0 is processed.
file_base = file_pattern + '{}.' + file_ext
fold = 0
train_file = os.path.join(train_fold_path, file_base.format(fold))

sup.logmsg('Creating ratings matrix for user-based and item-based clustering', log_file)
# np.float64 is the dtype the removed ``np.float`` alias used to resolve to.
rating_matrix_user = sup.read_ratings_csv_to_matrix(
    train_file, True, NUM_USERS, NUM_ITEMS, dtype=np.float64)
# The item-based view is the transpose of the user-based matrix.
rating_matrix_item = np.matrix(rating_matrix_user).getT()
def _fit_and_store(kind, rating_matrix, estimators):
    """Fit every estimator on ``rating_matrix`` and write each result to disk.

    The fits run concurrently through joblib's threading backend, one task
    per estimator. Results are written under ``<data_path>/kmeans`` as
    ``<kind>_<num_clusters>_<fold>.cpkl``.

    :param kind: label used in log messages and file names ('user' or 'item').
    :param rating_matrix: data matrix handed to each estimator's ``fit``.
    :param estimators: estimators in the same order as ``cluster_list``.
    """
    sup.logmsg('Running {}-based k-means'.format(kind), log_file)
    clustered_data = Parallel(n_jobs=num_cores, backend="threading")(
        delayed(kmeans_p)(km, rating_matrix) for km in estimators)

    sup.logmsg('Writing {}-based kmeans data structure on disk'.format(kind), log_file)
    write_path = os.path.join(data_path, 'kmeans')
    # Results come back in submission order, so they line up with cluster_list.
    for num_clusters, fitted in zip(cluster_list, clustered_data):
        write_file_name = '{}_{}_{}.cpkl'.format(kind, num_clusters, fold)
        sup.write_cpkl2(fitted, write_path, write_file_name)
    sup.logmsg('Done with {}-based'.format(kind), log_file)


sup.logmsg('Creating kmeans base structure', log_file)
# One unfitted estimator per requested cluster count. Construction is cheap,
# so a plain comprehension is enough (no need for a Parallel dispatch).
# NOTE(review): ``precompute_distances``, ``n_jobs`` and ``algorithm="full"``
# belong to the older scikit-learn KMeans API this script targets; they are
# kept as-is and would need revisiting on a newer scikit-learn.
kmeans = [
    KMeans(n_clusters=num_clusters, n_init=10, algorithm="full",
           precompute_distances=True, n_jobs=1)
    for num_clusters in cluster_list
]

# The same estimator objects are fitted twice. scikit-learn's ``fit`` returns
# the estimator itself, so the user-based results must be written to disk
# before the item-based pass refits (and overwrites) them. Keep this order.
_fit_and_store('user', rating_matrix_user, kmeans)
_fit_and_store('item', rating_matrix_item, kmeans)

sup.logmsg('Done with fold kmeans', log_file)