-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathRBO.py
124 lines (96 loc) · 4.32 KB
/
RBO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import math
from bisect import bisect_left
def _numtest(floatn):
return "{:.3f}".format(floatn)
def set_at_depth(lst, depth):
ans = set()
for v in lst[:depth]:
if isinstance(v, set):
ans.update(v)
else:
ans.add(v)
return ans
def raw_overlap(list1, list2, depth):
set1, set2 = set_at_depth(list1, depth), set_at_depth(list2, depth)
return len(set1.intersection(set2)), len(set1), len(set2)
def overlap(list1, list2, depth):
return agreement(list1, list2, depth) * min(depth, len(list1), len(list2))
# NOTE: comment the preceding and uncomment the following line if you want
# to stick to the algorithm as defined by the paper
# return raw_overlap(list1, list2, depth)[0]
def agreement(list1, list2, depth):
len_intersection, len_set1, len_set2 = raw_overlap(list1, list2, depth)
return 2 * len_intersection / (len_set1 + len_set2)
def cumulative_agreement(list1, list2, depth):
return (agreement(list1, list2, d) for d in range(1, depth + 1))
def average_overlap(list1, list2, depth=None):
depth = min(len(list1), len(list2)) if depth is None else depth
return sum(cumulative_agreement(list1, list2, depth)) / depth
def rbo_at_k(list1, list2, p, depth=None):
depth = min(len(list1), len(list2)) if depth is None else depth
d_a = enumerate(cumulative_agreement(list1, list2, depth))
return (1 - p) * sum(p ** d * a for (d, a) in d_a)
def rbo_min(list1, list2, p, depth=None):
depth = min(len(list1), len(list2)) if depth is None else depth
x_k = overlap(list1, list2, depth)
log_term = x_k * math.log(1 - p)
sum_term = sum(p ** d / d * (overlap(list1, list2, d) - x_k)
for d in range(1, depth + 1))
return (1 - p) / p * (sum_term - log_term)
def rbo_res(list1, list2, p):
S, L = sorted((list1, list2), key=len)
s, l = len(S), len(L)
x_l = overlap(list1, list2, l)
# since overlap(...) can be fractional in the general case of ties and f
# must be an integer → math.ceil()
f = math.ceil(l + s - x_l)
# upper bound of range() is non-inclusive, therefore + 1 is needed
term1 = s * sum(p ** d / d for d in range(s + 1, f + 1))
term2 = l * sum(p ** d / d for d in range(l + 1, f + 1))
term3 = x_l * (math.log(1 / (1 - p)) - sum(p ** d / d for d in range(1, f + 1)))
return p ** s + p ** l - p ** f - (1 - p) / p * (term1 + term2 + term3)
def rbo_ext(list1, list2, p):
S, L = sorted((list1, list2), key=len)
s, l = len(S), len(L)
x_l = overlap(list1, list2, l)
x_s = overlap(list1, list2, s)
# the paper says overlap(..., d) / d, but it should be replaced by
# agreement(..., d) defined as per equation (28) so that ties are handled
# properly (otherwise values > 1 will be returned)
# sum1 = sum(p**d * overlap(list1, list2, d)[0] / d for d in range(1, l + 1))
sum1 = sum(p ** d * agreement(list1, list2, d) for d in range(1, l + 1))
sum2 = sum(p ** d * x_s * (d - s) / s / d for d in range(s + 1, l + 1))
term1 = (1 - p) / p * (sum1 + sum2)
term2 = p ** l * ((x_l - x_s) / l + x_s / s)
return term1 + term2
def rbo(list1, list2, p):
if not 0 <= p <= 1:
raise ValueError("The ``p`` parameter must be between 0 and 1.")
args = (list1, list2, p)
return dict(min=rbo_min(*args), res=rbo_res(*args), ext=rbo_ext(*args))
def sort_dict(dct):
scores = []
items = []
# items should be unique, scores don't have to
for item, score in dct.items():
# sort in descending order, i.e. according to ``-score``
score = -score
i = bisect_left(scores, score)
if i == len(scores):
scores.append(score)
items.append(item)
elif scores[i] == score:
existing_item = items[i]
if isinstance(existing_item, set):
existing_item.add(item)
else:
items[i] = {existing_item, item}
else:
scores.insert(i, score)
items.insert(i, item)
return items
def rbo_dict(dict1, dict2, p=0.95):
"""RBO - Rank Biased Overlap
This is the API, The final function to use!"""
list1, list2 = sort_dict(dict1), sort_dict(dict2)
return rbo(list1, list2, p)