@@ -24,12 +24,10 @@ import numpy as np
24
24
from six.moves import range
25
25
from six import iteritems
26
26
27
-
28
def cosine(n_x, yr, min_support, common_ratings_only=True):
    """Compute the cosine similarity between all pairs of users (or items).

    The cosine similarity is the dot product of two rating vectors divided
    by the product of their Euclidean norms. For details, see `Wikipedia
    <https://en.wikipedia.org/wiki/Cosine_similarity#Definition>`__.

    Depending on the ``common_ratings_only`` field of ``sim_options``,
    either only common users (or items) are taken into account, or full
    rating vectors are used, with missing ratings counted as zeros
    (default: True).
    """

    # Select the concrete implementation once, then delegate to it.
    compute = (cosine_common_ratings_only if common_ratings_only
               else cosine_full_rating_vectors)
    return compute(n_x, yr, min_support)
63
+
64
+
65
def cosine_common_ratings_only(n_x, yr, min_support):
    """Cosine similarity computed over **common** ratings only.

    Only the ys rated by both xi and xj contribute to each pair's dot
    product and norms.

    Args:
        n_x: number of users (or items).
        yr: dict mapping each y to a list of ``(x, rating)`` tuples.
        min_support: minimum number of common ys below which the
            similarity is forced to 0.

    Returns:
        The ``n_x x n_x`` symmetric similarity matrix, with 1s on the
        diagonal.
    """

    # NOTE(review): the middle of this function was elided by the diff
    # hunk; it is restored here from the identical preamble visible in
    # cosine_full_rating_vectors — confirm against the full file.

    # sum (r_xy * r_x'y) for common ys
    cdef np.ndarray[np.double_t, ndim=2] prods
    # number of common ys
    cdef np.ndarray[np.int_t, ndim=2] freq
    # sum (r_xy ^ 2) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sqi
    # sum (r_x'y ^ 2) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sqj
    # the similarity matrix
    cdef np.ndarray[np.double_t, ndim=2] sim

    cdef int xi, xj
    cdef double ri, rj
    cdef int min_sprt = min_support

    prods = np.zeros((n_x, n_x), np.double)
    # np.int_ instead of the np.int alias, which was removed in NumPy 1.24
    # (np.int_ matches the np.int_t buffer dtype above).
    freq = np.zeros((n_x, n_x), np.int_)
    sqi = np.zeros((n_x, n_x), np.double)
    sqj = np.zeros((n_x, n_x), np.double)
    sim = np.zeros((n_x, n_x), np.double)

    for y, y_ratings in iteritems(yr):
        for xi, ri in y_ratings:
            for xj, rj in y_ratings:
                freq[xi, xj] += 1
                prods[xi, xj] += ri * rj
                sqi[xi, xj] += ri ** 2
                sqj[xi, xj] += rj ** 2

    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):
            if freq[xi, xj] < min_sprt:
                sim[xi, xj] = 0
            else:
                denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
                sim[xi, xj] = prods[xi, xj] / denum

            sim[xj, xi] = sim[xi, xj]

    return sim
108
+
109
+
110
def cosine_full_rating_vectors(n_x, yr, min_support):
    """Cosine similarity computed over **full** rating vectors.

    Missing ratings are substituted with 0, so every pair (xi, xj) is
    taken into account for every y, not only the ys rated in common.

    Args:
        n_x: number of users (or items).
        yr: dict mapping each y to a list of ``(x, rating)`` tuples.
        min_support: minimum number of ys below which the similarity is
            forced to 0. NOTE(review): with full vectors every pair sees
            every y, so ``freq[xi, xj] == len(yr)`` for all pairs —
            confirm the min_support semantics are intended here.

    Returns:
        The ``n_x x n_x`` symmetric similarity matrix, with 1s on the
        diagonal.
    """

    # sum (r_xy * r_x'y) over all ys (missing ratings count as 0)
    cdef np.ndarray[np.double_t, ndim=2] prods
    # number of ys seen for each pair
    cdef np.ndarray[np.int_t, ndim=2] freq
    # sum (r_xy ^ 2)
    cdef np.ndarray[np.double_t, ndim=2] sqi
    # sum (r_x'y ^ 2)
    cdef np.ndarray[np.double_t, ndim=2] sqj
    # the similarity matrix
    cdef np.ndarray[np.double_t, ndim=2] sim
    # dense rating vector for the current y (0 = missing)
    cdef np.ndarray[np.double_t, ndim=1] dense

    cdef int xi, xj
    cdef double ri, rj
    cdef int min_sprt = min_support

    prods = np.zeros((n_x, n_x), np.double)
    # np.int_ instead of the np.int alias, which was removed in NumPy 1.24
    # (np.int_ matches the np.int_t buffer dtype above).
    freq = np.zeros((n_x, n_x), np.int_)
    sqi = np.zeros((n_x, n_x), np.double)
    sqj = np.zeros((n_x, n_x), np.double)
    sim = np.zeros((n_x, n_x), np.double)

    for y, y_ratings in iteritems(yr):
        # yr is sparse, but here every x-index must contribute, with 0 for
        # missing ratings. Expanding to a dense vector replaces the
        # original fragile double sorted-iterator merge with an equivalent
        # and far simpler scheme.
        dense = np.zeros(n_x, np.double)
        for xi, ri in y_ratings:
            dense[xi] = ri

        for xi in range(n_x):
            ri = dense[xi]
            for xj in range(n_x):
                rj = dense[xj]
                freq[xi, xj] += 1
                prods[xi, xj] += ri * rj
                sqi[xi, xj] += ri ** 2
                sqj[xi, xj] += rj ** 2

    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):
            if freq[xi, xj] < min_sprt:
                sim[xi, xj] = 0
            else:
                denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
                sim[xi, xj] = prods[xi, xj] / denum

            sim[xj, xi] = sim[xi, xj]

    return sim
@@ -149,7 +243,7 @@ def msd(n_x, yr, min_support):
149
243
for y, y_ratings in iteritems(yr):
150
244
for xi, ri in y_ratings:
151
245
for xj, rj in y_ratings:
152
- sq_diff[xi, xj] += (ri - rj)** 2
246
+ sq_diff[xi, xj] += (ri - rj) ** 2
153
247
freq[xi, xj] += 1
154
248
155
249
for xi in range (n_x):
@@ -232,8 +326,8 @@ def pearson(n_x, yr, min_support):
232
326
for xj, rj in y_ratings:
233
327
prods[xi, xj] += ri * rj
234
328
freq[xi, xj] += 1
235
- sqi[xi, xj] += ri** 2
236
- sqj[xi, xj] += rj** 2
329
+ sqi[xi, xj] += ri ** 2
330
+ sqj[xi, xj] += rj ** 2
237
331
si[xi, xj] += ri
238
332
sj[xi, xj] += rj
239
333
@@ -341,8 +435,8 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
341
435
diff_i = (ri - (partial_bias + x_biases_[xi]))
342
436
diff_j = (rj - (partial_bias + x_biases_[xj]))
343
437
prods[xi, xj] += diff_i * diff_j
344
- sq_diff_i[xi, xj] += diff_i** 2
345
- sq_diff_j[xi, xj] += diff_j** 2
438
+ sq_diff_i[xi, xj] += diff_i ** 2
439
+ sq_diff_j[xi, xj] += diff_j ** 2
346
440
347
441
for xi in range (n_x):
348
442
sim[xi, xi] = 1
0 commit comments