Skip to content

Commit 3fd53d4

Browse files
committed
Cosine full vector implementation, as discussed in #135
1 parent fa85c0d commit 3fd53d4

File tree

4 files changed

+160
-15
lines changed

4 files changed

+160
-15
lines changed

doc/source/prediction_algorithms.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,9 @@ argument is a dictionary with the following (all optional) keys:
130130
``'False'``) for the similarity not to be zero. Simply put, if
131131
:math:`|I_{uv}| < \text{min_support}` then :math:`\text{sim}(u, v) = 0`. The
132132
same goes for items.
133+
- ``'common_ratings_only'``: Determines whether only common user/item ratings are
134+
taken into account, or the full rating vectors are considered
135+
(only relevant for cosine-based similarity). Default is True.
133136
- ``'shrinkage'``: Shrinkage parameter to apply (only relevant for
134137
:func:`pearson_baseline <surprise.similarities.pearson_baseline>` similarity).
135138
Default is 100.

surprise/prediction_algorithms/algo_base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,10 @@ def compute_similarities(self):
285285
bx, by = bi, bu
286286

287287
args += [self.trainset.global_mean, bx, by, shrinkage]
288+
elif name == 'cosine':
289+
common_ratings_only = self.sim_options.get('common_ratings_only',
290+
True)
291+
args += [common_ratings_only]
288292

289293
try:
290294
print('Computing the {0} similarity matrix...'.format(name))

surprise/similarities.pyx

Lines changed: 105 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,10 @@ import numpy as np
2424
from six.moves import range
2525
from six import iteritems
2626

27-
28-
def cosine(n_x, yr, min_support):
27+
def cosine(n_x, yr, min_support, common_ratings_only=True):
2928
"""Compute the cosine similarity between all pairs of users (or items).
3029
31-
Only **common** users (or items) are taken into account. The cosine
32-
similarity is defined as:
30+
The cosine similarity is defined as:
3331
3432
.. math::
3533
\\text{cosine_sim}(u, v) = \\frac{
@@ -52,8 +50,20 @@ def cosine(n_x, yr, min_support):
5250
5351
For details on cosine similarity, see on `Wikipedia
5452
<https://en.wikipedia.org/wiki/Cosine_similarity#Definition>`__.
53+
54+
Depending on the ``common_ratings_only`` field of ``sim_options``,
55+
either only common users (or items) are taken into account, or the full
56+
rating vectors are considered (default: True).
5557
"""
5658

59+
if common_ratings_only:
60+
return cosine_common_ratings_only(n_x, yr, min_support)
61+
else:
62+
return cosine_full_rating_vectors(n_x, yr, min_support)
63+
64+
65+
def cosine_common_ratings_only(n_x, yr, min_support):
66+
5767
# sum (r_xy * r_x'y) for common ys
5868
cdef np.ndarray[np.double_t, ndim=2] prods
5969
# number of common ys
@@ -80,8 +90,92 @@ def cosine(n_x, yr, min_support):
8090
for xj, rj in y_ratings:
8191
freq[xi, xj] += 1
8292
prods[xi, xj] += ri * rj
83-
sqi[xi, xj] += ri**2
84-
sqj[xi, xj] += rj**2
93+
sqi[xi, xj] += ri ** 2
94+
sqj[xi, xj] += rj ** 2
95+
96+
for xi in range(n_x):
97+
sim[xi, xi] = 1
98+
for xj in range(xi + 1, n_x):
99+
if freq[xi, xj] < min_sprt:
100+
sim[xi, xj] = 0
101+
else:
102+
denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
103+
sim[xi, xj] = prods[xi, xj] / denum
104+
105+
sim[xj, xi] = sim[xi, xj]
106+
107+
return sim
108+
109+
110+
def cosine_full_rating_vectors(n_x, yr, min_support):
111+
112+
# sum (r_xy * r_x'y) for common ys
113+
cdef np.ndarray[np.double_t, ndim=2] prods
114+
# number of common ys
115+
cdef np.ndarray[np.int_t, ndim=2] freq
116+
# sum (r_xy ^ 2) for common ys
117+
cdef np.ndarray[np.double_t, ndim=2] sqi
118+
# sum (r_x'y ^ 2) for common ys
119+
cdef np.ndarray[np.double_t, ndim=2] sqj
120+
# the similarity matrix
121+
cdef np.ndarray[np.double_t, ndim=2] sim
122+
123+
cdef int xi, xj
124+
cdef double ri, rj
125+
cdef int min_sprt = min_support
126+
127+
prods = np.zeros((n_x, n_x), np.double)
128+
freq = np.zeros((n_x, n_x), np.int)
129+
sqi = np.zeros((n_x, n_x), np.double)
130+
sqj = np.zeros((n_x, n_x), np.double)
131+
sim = np.zeros((n_x, n_x), np.double)
132+
133+
for y, y_ratings in iteritems(yr):
134+
135+
# yr_ratings data structure is sparse. But for cosine similarity it is
136+
# necessary to obtain all pairs, substituting missing ratings for 0.
137+
# Implementation:
138+
# Iterate through the range of x-indexes, taking 0-rating for each
139+
# index unless this index is actually present in the iter
140+
sorted_y_ratings = sorted(y_ratings, key=lambda x: x[0])
141+
xi_iter = iter(sorted_y_ratings)
142+
try:
143+
xi_non_missing, ri_non_missing = next(xi_iter)
144+
except StopIteration:
145+
xi_non_missing = n_x
146+
for xi_all in range(n_x):
147+
if xi_all < xi_non_missing:
148+
xi = xi_all
149+
ri = 0
150+
else:
151+
xi = xi_non_missing
152+
ri = ri_non_missing
153+
try:
154+
xi_non_missing, ri_non_missing = next(xi_iter)
155+
except StopIteration:
156+
xi_non_missing = n_x
157+
158+
xj_iter = iter(sorted_y_ratings)
159+
try:
160+
xj_non_missing, rj_non_missing = next(xj_iter)
161+
except StopIteration:
162+
xj_non_missing = n_x
163+
for xj_all in range(n_x):
164+
if xj_all < xj_non_missing:
165+
xj = xj_all
166+
rj = 0
167+
else:
168+
xj = xj_non_missing
169+
rj = rj_non_missing
170+
try:
171+
xj_non_missing, rj_non_missing = next(xj_iter)
172+
except StopIteration:
173+
xj_non_missing = n_x
174+
175+
freq[xi, xj] += 1
176+
prods[xi, xj] += ri * rj
177+
sqi[xi, xj] += ri ** 2
178+
sqj[xi, xj] += rj ** 2
85179

86180
for xi in range(n_x):
87181
sim[xi, xi] = 1
@@ -149,7 +243,7 @@ def msd(n_x, yr, min_support):
149243
for y, y_ratings in iteritems(yr):
150244
for xi, ri in y_ratings:
151245
for xj, rj in y_ratings:
152-
sq_diff[xi, xj] += (ri - rj)**2
246+
sq_diff[xi, xj] += (ri - rj) ** 2
153247
freq[xi, xj] += 1
154248

155249
for xi in range(n_x):
@@ -232,8 +326,8 @@ def pearson(n_x, yr, min_support):
232326
for xj, rj in y_ratings:
233327
prods[xi, xj] += ri * rj
234328
freq[xi, xj] += 1
235-
sqi[xi, xj] += ri**2
236-
sqj[xi, xj] += rj**2
329+
sqi[xi, xj] += ri ** 2
330+
sqj[xi, xj] += rj ** 2
237331
si[xi, xj] += ri
238332
sj[xi, xj] += rj
239333

@@ -341,8 +435,8 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
341435
diff_i = (ri - (partial_bias + x_biases_[xi]))
342436
diff_j = (rj - (partial_bias + x_biases_[xj]))
343437
prods[xi, xj] += diff_i * diff_j
344-
sq_diff_i[xi, xj] += diff_i**2
345-
sq_diff_j[xi, xj] += diff_j**2
438+
sq_diff_i[xi, xj] += diff_i ** 2
439+
sq_diff_j[xi, xj] += diff_j ** 2
346440

347441
for xi in range(n_x):
348442
sim[xi, xi] = 1

tests/test_similarities.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@
1212

1313
n_x = 8
1414
yr_global = {
15-
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
15+
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
1616
1: [(0, 4), (1, 4), (2, 4), ], # noqa
1717
2: [ (2, 5), (3, 2), (4, 3) ], # noqa
18-
3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
19-
4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
18+
3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
19+
4: [ (1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
2020
}
2121

2222

@@ -48,7 +48,51 @@ def test_cosine_sim():
4848
# cosine sim is necessarily 1
4949
assert sim[3, 4] == 1
5050

51-
# pairs of users (0, 3) have no common items
51+
# pairs of users (0, 3) and (0, 4) have no common items
52+
assert sim[0, 3] == 0
53+
assert sim[0, 4] == 0
54+
55+
# check for float point support and computation correctness
56+
dot_product56 = 1 * 1.5 + 3 * 3.5 + 2 * 2.5
57+
assert sim[5, 6] == (dot_product56 /
58+
((1 ** 2 + 3 ** 2 + 2 ** 2) *
59+
(1.5 ** 2 + 3.5 ** 2 + 2.5 ** 2)) ** 0.5
60+
)
61+
62+
# ensure min_support is taken into account. Only users 1 and 2 have more
63+
# than 4 common ratings.
64+
sim = sims.cosine(n_x, yr, min_support=4)
65+
for i in range(n_x):
66+
for j in range(i + 1, n_x):
67+
if i != 1 and j != 2:
68+
assert sim[i, j] == 0
69+
70+
71+
def test_cosine_full_vectors_sim():
72+
"""Tests for the cosine similarity."""
73+
74+
yr = yr_global.copy()
75+
76+
# # shuffle every rating list, to ensure the order in which ratings are
77+
# # processed does not matter (it's important because it used to be error
78+
# # prone when we were using itertools.combinations)
79+
# for _, ratings in yr.items():
80+
# random.shuffle(ratings)
81+
82+
sim = sims.cosine(n_x, yr, min_support=1, common_ratings_only=False)
83+
84+
# check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
85+
for xi in range(n_x):
86+
assert sim[xi, xi] == 1
87+
for xj in range(n_x):
88+
assert sim[xi, xj] == sim[xj, xi]
89+
assert 0 <= sim[xi, xj] <= 1
90+
91+
# users 0, 1 and 2 have different ratings when non-common items considered
92+
assert sim[0, 1] < 1
93+
assert sim[0, 2] < 1
94+
95+
# pairs of users (0, 3) and (0,4) have no common items
5296
assert sim[0, 3] == 0
5397
assert sim[0, 4] == 0
5498

0 commit comments

Comments
 (0)