Skip to content

Commit 3fd53d4

Browse files
committed
Cosine full vector implementation, as discussed in #135
1 parent fa85c0d commit 3fd53d4

File tree

4 files changed

+160
-15
lines changed

4 files changed

+160
-15
lines changed

doc/source/prediction_algorithms.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,9 @@ argument is a dictionary with the following (all optional) keys:
130130
``'False'``) for the similarity not to be zero. Simply put, if
131131
:math:`|I_{uv}| < \text{min_support}` then :math:`\text{sim}(u, v) = 0`. The
132132
same goes for items.
133+
- ``'common_ratings_only'``: Determines whether only common user/item ratings are
134+
taken into account, or the full rating vectors are considered
135+
(only relevant for cosine-based similarity). Default is True.
133136
- ``'shrinkage'``: Shrinkage parameter to apply (only relevant for
134137
:func:`pearson_baseline <surprise.similarities.pearson_baseline>` similarity).
135138
Default is 100.

surprise/prediction_algorithms/algo_base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,10 @@ def compute_similarities(self):
285285
bx, by = bi, bu
286286

287287
args += [self.trainset.global_mean, bx, by, shrinkage]
288+
elif name == 'cosine':
289+
common_ratings_only = self.sim_options.get('common_ratings_only',
290+
True)
291+
args += [common_ratings_only]
288292

289293
try:
290294
print('Computing the {0} similarity matrix...'.format(name))

surprise/similarities.pyx

Lines changed: 105 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,10 @@ import numpy as np
2424
from six.moves import range
2525
from six import iteritems
2626

27-
28-
def cosine(n_x, yr, min_support):
27+
def cosine(n_x, yr, min_support, common_ratings_only=True):
2928
"""Compute the cosine similarity between all pairs of users (or items).
3029
31-
Only **common** users (or items) are taken into account. The cosine
32-
similarity is defined as:
30+
The cosine similarity is defined as:
3331
3432
.. math::
3533
\\text{cosine_sim}(u, v) = \\frac{
@@ -52,8 +50,20 @@ def cosine(n_x, yr, min_support):
5250
5351
For details on cosine similarity, see on `Wikipedia
5452
<https://en.wikipedia.org/wiki/Cosine_similarity#Definition>`__.
53+
54+
Depending on the ``common_ratings_only`` field of ``sim_options``,
55+
either only common users (or items) are taken into account, or the full
56+
rating vectors are considered (default: True).
5557
"""
5658

59+
if common_ratings_only:
60+
return cosine_common_ratings_only(n_x, yr, min_support)
61+
else:
62+
return cosine_full_rating_vectors(n_x, yr, min_support)
63+
64+
65+
def cosine_common_ratings_only(n_x, yr, min_support):
66+
5767
# sum (r_xy * r_x'y) for common ys
5868
cdef np.ndarray[np.double_t, ndim=2] prods
5969
# number of common ys
@@ -80,8 +90,92 @@ def cosine(n_x, yr, min_support):
8090
for xj, rj in y_ratings:
8191
freq[xi, xj] += 1
8292
prods[xi, xj] += ri * rj
83-
sqi[xi, xj] += ri**2
84-
sqj[xi, xj] += rj**2
93+
sqi[xi, xj] += ri ** 2
94+
sqj[xi, xj] += rj ** 2
95+
96+
for xi in range(n_x):
97+
sim[xi, xi] = 1
98+
for xj in range(xi + 1, n_x):
99+
if freq[xi, xj] < min_sprt:
100+
sim[xi, xj] = 0
101+
else:
102+
denum = np.sqrt(sqi[xi, xj] * sqj[xi, xj])
103+
sim[xi, xj] = prods[xi, xj] / denum
104+
105+
sim[xj, xi] = sim[xi, xj]
106+
107+
return sim
108+
109+
110+
def cosine_full_rating_vectors(n_x, yr, min_support):
111+
112+
# sum (r_xy * r_x'y) for common ys
113+
cdef np.ndarray[np.double_t, ndim=2] prods
114+
# number of common ys
115+
cdef np.ndarray[np.int_t, ndim=2] freq
116+
# sum (r_xy ^ 2) for common ys
117+
cdef np.ndarray[np.double_t, ndim=2] sqi
118+
# sum (r_x'y ^ 2) for common ys
119+
cdef np.ndarray[np.double_t, ndim=2] sqj
120+
# the similarity matrix
121+
cdef np.ndarray[np.double_t, ndim=2] sim
122+
123+
cdef int xi, xj
124+
cdef double ri, rj
125+
cdef int min_sprt = min_support
126+
127+
prods = np.zeros((n_x, n_x), np.double)
128+
freq = np.zeros((n_x, n_x), np.int)
129+
sqi = np.zeros((n_x, n_x), np.double)
130+
sqj = np.zeros((n_x, n_x), np.double)
131+
sim = np.zeros((n_x, n_x), np.double)
132+
133+
for y, y_ratings in iteritems(yr):
134+
135+
# yr_ratings data structure is sparse. But for cosine similarity it is
136+
# necessary to obtain all pairs, substituting missing ratings for 0.
137+
# Implementation:
138+
# Iterate through the range of x-indexes, taking 0-rating for each
139+
# index unless this index is actually present in the iter
140+
sorted_y_ratings = sorted(y_ratings, key=lambda x: x[0])
141+
xi_iter = iter(sorted_y_ratings)
142+
try:
143+
xi_non_missing, ri_non_missing = next(xi_iter)
144+
except StopIteration:
145+
xi_non_missing = n_x
146+
for xi_all in range(n_x):
147+
if xi_all < xi_non_missing:
148+
xi = xi_all
149+
ri = 0
150+
else:
151+
xi = xi_non_missing
152+
ri = ri_non_missing
153+
try:
154+
xi_non_missing, ri_non_missing = next(xi_iter)
155+
except StopIteration:
156+
xi_non_missing = n_x
157+
158+
xj_iter = iter(sorted_y_ratings)
159+
try:
160+
xj_non_missing, rj_non_missing = next(xj_iter)
161+
except StopIteration:
162+
xj_non_missing = n_x
163+
for xj_all in range(n_x):
164+
if xj_all < xj_non_missing:
165+
xj = xj_all
166+
rj = 0
167+
else:
168+
xj = xj_non_missing
169+
rj = rj_non_missing
170+
try:
171+
xj_non_missing, rj_non_missing = next(xj_iter)
172+
except StopIteration:
173+
xj_non_missing = n_x
174+
175+
freq[xi, xj] += 1
176+
prods[xi, xj] += ri * rj
177+
sqi[xi, xj] += ri ** 2
178+
sqj[xi, xj] += rj ** 2
85179

86180
for xi in range(n_x):
87181
sim[xi, xi] = 1
@@ -149,7 +243,7 @@ def msd(n_x, yr, min_support):
149243
for y, y_ratings in iteritems(yr):
150244
for xi, ri in y_ratings:
151245
for xj, rj in y_ratings:
152-
sq_diff[xi, xj] += (ri - rj)**2
246+
sq_diff[xi, xj] += (ri - rj) ** 2
153247
freq[xi, xj] += 1
154248

155249
for xi in range(n_x):
@@ -232,8 +326,8 @@ def pearson(n_x, yr, min_support):
232326
for xj, rj in y_ratings:
233327
prods[xi, xj] += ri * rj
234328
freq[xi, xj] += 1
235-
sqi[xi, xj] += ri**2
236-
sqj[xi, xj] += rj**2
329+
sqi[xi, xj] += ri ** 2
330+
sqj[xi, xj] += rj ** 2
237331
si[xi, xj] += ri
238332
sj[xi, xj] += rj
239333

@@ -341,8 +435,8 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
341435
diff_i = (ri - (partial_bias + x_biases_[xi]))
342436
diff_j = (rj - (partial_bias + x_biases_[xj]))
343437
prods[xi, xj] += diff_i * diff_j
344-
sq_diff_i[xi, xj] += diff_i**2
345-
sq_diff_j[xi, xj] += diff_j**2
438+
sq_diff_i[xi, xj] += diff_i ** 2
439+
sq_diff_j[xi, xj] += diff_j ** 2
346440

347441
for xi in range(n_x):
348442
sim[xi, xi] = 1

tests/test_similarities.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@
1212

1313
n_x = 8
1414
yr_global = {
15-
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
15+
0: [(0, 3), (1, 3), (2, 3), (5, 1), (6, 1.5), (7, 3)], # noqa
1616
1: [(0, 4), (1, 4), (2, 4), ], # noqa
1717
2: [ (2, 5), (3, 2), (4, 3) ], # noqa
18-
3: [(1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
19-
4: [(1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
18+
3: [ (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)], # noqa
19+
4: [ (1, 5), (2, 1), (5, 2), (6, 2.5), (7, 2.5)], # noqa
2020
}
2121

2222

@@ -48,7 +48,51 @@ def test_cosine_sim():
4848
# cosine sim is necessarily 1
4949
assert sim[3, 4] == 1
5050

51-
# pairs of users (0, 3) have no common items
51+
# pairs of users (0, 3) and (0, 4) have no common items
52+
assert sim[0, 3] == 0
53+
assert sim[0, 4] == 0
54+
55+
# check for float point support and computation correctness
56+
dot_product56 = 1 * 1.5 + 3 * 3.5 + 2 * 2.5
57+
assert sim[5, 6] == (dot_product56 /
58+
((1 ** 2 + 3 ** 2 + 2 ** 2) *
59+
(1.5 ** 2 + 3.5 ** 2 + 2.5 ** 2)) ** 0.5
60+
)
61+
62+
# ensure min_support is taken into account. Only users 1 and 2 have more
63+
# than 4 common ratings.
64+
sim = sims.cosine(n_x, yr, min_support=4)
65+
for i in range(n_x):
66+
for j in range(i + 1, n_x):
67+
if i != 1 and j != 2:
68+
assert sim[i, j] == 0
69+
70+
71+
def test_cosine_full_vectors_sim():
72+
"""Tests for the cosine similarity."""
73+
74+
yr = yr_global.copy()
75+
76+
# # shuffle every rating list, to ensure the order in which ratings are
77+
# # processed does not matter (it's important because it used to be error
78+
# # prone when we were using itertools.combinations)
79+
# for _, ratings in yr.items():
80+
# random.shuffle(ratings)
81+
82+
sim = sims.cosine(n_x, yr, min_support=1, common_ratings_only=False)
83+
84+
# check symmetry and bounds (as ratings are > 0, cosine sim must be >= 0)
85+
for xi in range(n_x):
86+
assert sim[xi, xi] == 1
87+
for xj in range(n_x):
88+
assert sim[xi, xj] == sim[xj, xi]
89+
assert 0 <= sim[xi, xj] <= 1
90+
91+
# users 0, 1 and 2 have different ratings when non-common items considered
92+
assert sim[0, 1] < 1
93+
assert sim[0, 2] < 1
94+
95+
# pairs of users (0, 3) and (0,4) have no common items
5296
assert sim[0, 3] == 0
5397
assert sim[0, 4] == 0
5498

0 commit comments

Comments
 (0)