#!/usr/bin/env python3
from collections import Counter
import json
import matplotlib.pyplot as plt
from math import log
from math import isclose
import numpy as np
from math import prod  # numpy.product is deprecated and was removed in NumPy 2.0
from scipy.integrate import quad
from scipy.stats import rv_discrete
from accuracy import probability_precise
from accuracy import probability_optimized
from accuracy import accuracy_optimized
from algorithms import scale
from algorithms import epsilon
from algorithms import factor
from algorithms import Laplace
from naive import sparse
datasets = {
    'bms-pos': "BMS-POS",
    'kosarak': "Kosarak",
    'aol': "AOL",
    'zipf': "Zipf",
}
e = 0.1 # this is what [@svt] used
cs = list(range(25, 301, 25))
b_min = 0.01 # only compute probabilities down to this value
N = 100
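# T is the midpoint between the c-th and (c+1)-th largest true query answers
# (`queries` is assumed to be sorted in descending order, see the data layout
# note above read_data). Worked example: for queries = [10, 9, 8, 7] and
# c = 2, T = (9 + 8)/2 = 8.5.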
def threshold(c, queries):
    return (queries[c-1] + queries[c])/2
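# Candidate ratios r between the two privacy budget shares (the exact use of
# r is decided by epsilon() in algorithms.py, which is not shown here):
# 1, 3, m*c, and (m*c)^(2/3), where m = factor(monotonic) adjusts for
# monotonic queries. (m*c)^(2/3) is presumably the split examined in [@svt].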
def ratios(c, monotonic=True):
    m = factor(monotonic)
    return {
        '1': 1,
        '3': 3,
        'c': m*c,
        'c23': (m*c)**(2/3),
    }
colors = ['black', 'magenta', 'blue', 'red']
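# Expected data layout (inferred from usage, not documented anywhere here):
# data/<name>.json maps items to counts, and data/<name>.txt holds the counts
# sorted in descending order, so array[0] is the largest query value k.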
def read_data(data):
    with open('data/{}.json'.format(data)) as f:
        items = json.load(f, object_hook=convert)
    counts = Counter(items.values())
    array = np.loadtxt('data/{}.txt'.format(data), dtype=int)
    return counts, array
def above(queries, T, a):
    return {k: v for k, v in queries.items() if k >= T + a}
def below(queries, T, a):
    return {k: v for k, v in queries.items() if k <= T - a}
def write_alphas(data, start=None, end=None):
    # only compute probabilities for the unique (#below, #above) tuples of
    # queries outside the T +/- alpha range
    query_counts, query_array = read_data(data)
    k = query_array[0]
    print("loaded {}, max. value: {}".format(data, k))
    for c in cs[start:end]:
        T = threshold(c, query_array)
        print('T:', T, end=' ')
        result = compute_alphas(c, T, k, query_counts)
        with open('experiments/{}-alphas {}.txt'.format(data, c), 'w') as f:
            json.dump(result, f, separators=(',', ':'))
def compute_alphas(c, T, k, query_counts):
    # collect unique sets of queries outside the T +/- alpha range;
    # this implicitly takes the largest `a` for each set
    above_below = {}
    for a in range(int(T)):
        queries_below = below(query_counts, T, a)
        queries_above = above(query_counts, T, a)
        key = (len(queries_below), len(queries_above))
        above_below[key] = {
            'alpha': a,
            'below': queries_below,
            'above': queries_above,
        }
        print('c:', c, len(above_below), end='\r')
    result = {}
    for _, v in above_below.items():
        result[v['alpha']] = {
            'below': v['below'],
            'above': v['above'],
        }
    return result
def read_alphas(data, c):
    with open('experiments/{}-alphas {}.txt'.format(data, c)) as f:
        return json.load(f, object_hook=convert)
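# JSON object keys are always strings, so this object_hook restores integer
# keys on load, e.g. json.loads('{"5": 2}', object_hook=convert) == {5: 2}.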
def convert(d):
    result = {}
    for k, v in d.items():
        if k.isdigit():
            result[int(k)] = v
        else:
            result[k] = v
    return result
def basic(a, k, s1, s2, *args):
    return probability_precise(a, k, s1, s2)
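# What precise() computes, spelled out: with the noisy threshold x following
# a Laplace distribution with location T and scale s1,
#   Pr[all responses correct]
#     = integral of Laplace(s1, T).pdf(x)
#       * prod over q of Pr[q answers on its side of T | x]^n_q dx,
# where n_q is the number of queries with true answer q; the function returns
# beta = 1 - Pr[all responses correct].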
def precise(a, k, s1, s2, queries, alphas, T):
    below = alphas[a]['below']
    above = alphas[a]['above']
    # IMPORTANT: we *must* compute the probability of a correct response here,
    # even though we eventually want the error probability. The reason is that
    # queries are usually so far away from the threshold that getting a wrong
    # response is incredibly improbable, which in effect produces zeroes
    # instead of *very very* small numbers, so small that even gmpy2 with
    # precision 10000 doesn't capture them properly.
    rs = [False] * len(below) + [True] * len(above)
    qs = {**below, **above}
    def pred(x):
        return prod([query_above(s2, q, r, x)**n for (r, (q, n)) in zip(rs, qs.items())])
    def state(x):
        return Laplace(s1, T).pdf(x) * pred(x)
    # integrate over a sufficiently large quantile of the threshold
    # distribution; here we take everything except a 2*error part.
    # bounds of [0, max(queries)] as used before give catastrophically
    # wrong results.
    error = 1/1e12
    T_bound = s1 * log(1/error)
    return 1 - quad(state, T-T_bound, T+T_bound, points=[T])[0]
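# Sanity check: a query sitting exactly at the noisy threshold (loc ==
# threshold) is equally likely to fall on either side, since the Laplace CDF
# at its own location is 1/2, so both branches return 0.5 there.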
def query_above(scale, loc, is_above, threshold):
    """Pr(query(scale, loc) => is_above | threshold_value)"""
    pr_above = 1 - Laplace(scale, loc).cdf(threshold)
    if is_above:
        return pr_above
    else:
        return 1 - pr_above
def write_probability(data, func, start=None, end=None):
    query_counts, query_array = read_data(data)
    k = query_array[0]
    for c in cs[start:end]:
        T = threshold(c, query_array)
        alphas = read_alphas(data, c)
        total = len(alphas)
        for s, r in ratios(c).items():
            print("c: {}, r: {}".format(c, s))
            s1, s2 = scale(*epsilon(e, r), c)
            last = 1
            for i, a in enumerate(alphas.keys()):
                p = func(a, k, s1, s2, query_array, alphas, T)
                # catch problems with integration: beta must not grow with
                # alpha, so stop once the sequence increases, stalls below 1,
                # or drops to zero
                if p > last or (p == last and p < 1) or p <= 0:
                    break
                else:
                    last = p
                print(total - i, a, p, end='\r')
                with open('experiments/{}-{} {} {}.txt'.format(data, func.__name__, c, s), 'a') as f:
                    print(a, p, file=f)
            print()
def write_samples(data):
    """Recreate the experiments from [@svt]."""
    database = np.loadtxt('data/{}.txt'.format(data), dtype=int)
    for n in range(N):
        print(n)
        queries = np.random.permutation(range(len(database)))
        for c in cs:
            print(c, end='\r')
            T = threshold(c, database)
            for s, r in ratios(c).items():
                ser = []
                response = sparse(database, queries, T, e, r, c)
                ser.append(score_error_rate(database, queries, response, c))
                with open('experiments/{}-samples {} {}.txt'.format(data, c, s), 'a') as f:
                    for x in ser:
                        print(x, file=f)
        print()
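# SER as sampled here: 1 minus the ratio of the average true count of the
# queries answered "above" to the average true count of the top c queries;
# 0 means the mechanism selected a top-c set, values near 1 mean it did badly.
# (Note: this assumes `response` contains at least one positive answer.)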
def score_error_rate(database, queries, response, c):
    best_avg = sum(database[:c]) / c
    sampled_score = sum(database[q] for q, x in zip(queries, response) if x)
    sampled_length = len([x for x in response if x])
    sampled_avg = sampled_score / sampled_length
    return 1 - sampled_avg / best_avg
mean = np.mean
median = np.median
def plot_samples(data, average=mean):
    fig, ax = plt.subplots(figsize=(5, 3))
    for s, color in zip(ratios(1).keys(), colors):
        ys = []
        std = []
        for c in cs:
            samples = np.loadtxt('experiments/{}-samples {} {}.txt'.format(data, c, s))
            ys.append(average(samples))
            std.append((0, np.std(samples)))
        ax.errorbar(cs, ys, yerr=list(zip(*std)), color=color, capsize=5, fmt='-o', barsabove=True)
    ax.set_aspect(150) # this value makes no sense to me
    plt.xlim(min(cs), max(cs))
    plt.ylim(0, 1)
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.xticks(cs)
    plt.xlabel("c")
    plt.ylabel("SER")
    plt.title("{}, SER samples".format(datasets[data]))
    plt.show()
def plot_accuracy(data, func):
    fig, ax = plt.subplots(figsize=(5, 3))
    counts, array = read_data(data)
    for s, color in zip(ratios(1).keys(), colors):
        ys = []
        std = []
        for c in cs:
            results = np.genfromtxt('experiments/{}-{} {} {}.txt'.format(data, func.__name__, c, s), dtype=None)
            ser, prob = to_pdf(*to_cdf(results, c, array))
            # need to handle the case where the estimation is too bad
            # and does not lead to a proper distribution
            if isclose(sum(prob), 1, rel_tol=1e-05):
                rv = rv_discrete(values=(ser, prob))
                ys.append(rv.mean())
                std.append((0, rv.std()))
            else:
                ys.append(1)
                std.append((0, 0))
        ax.errorbar(cs, ys, yerr=list(zip(*std)), color=color, capsize=5, fmt='-o', barsabove=True)
    ax.set_aspect(150) # this value makes no sense to me
    plt.xlim(min(cs), max(cs))
    plt.ylim(0, 1)
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.xticks(cs)
    plt.xlabel("c")
    plt.ylabel("SER")
    plt.title("{}, SER estimation".format(datasets[data]))
    plt.show()
def plot_accuracy_alpha(data, func):
    fig, ax = plt.subplots(figsize=(5, 3))
    counts, array = read_data(data)
    for c in cs:
        results = np.genfromtxt('experiments/{}-{} {} c23.txt'.format(data, func.__name__, c), dtype=None)
        results = np.atleast_1d(results) # https://stackoverflow.com/a/24247766
        ser, prob = to_pdf(*to_cdf(results, c, array))
        prob = discrete_pdf(prob)
        colors = np.zeros((len(ser), 4))
        colors[:, 0] = 1 # red
        colors[:, 3] = prob # alpha
        ax.scatter([c] * len(ser), ser, color=colors, marker='s', edgecolors='none')
    ax.set_aspect(150) # this value makes no sense to me
    plt.xlim(min(cs), max(cs))
    plt.ylim(0, 1)
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.xticks([1] + cs)
    plt.xlabel("c")
    plt.ylabel("SER")
    plt.title("{}, SER estimation".format(datasets[data]))
    plt.show()
def plot_accuracy_parameters(c, k):
    fig, ax = plt.subplots(figsize=(6, 4))
    b_min = 0.01
    MAX = 0
    rs = {s: r for r, s in zip(ratios(c).values(), ["1", "3", "c", r"c$^{2/3}$"])}
    for s, r in rs.items():
        s1, s2 = scale(*epsilon(e, r), c)
        new_MAX = accuracy_optimized(b_min, k, s1, s2)
        if MAX < new_MAX:
            MAX = new_MAX
    for (s, r), color in zip(rs.items(), colors):
        s1, s2 = scale(*epsilon(e, r), c)
        xs = np.arange(MAX)
        ys = [probability_optimized(x, k, s1, s2) for x in xs]
        ax.plot(xs, ys, color=color, linewidth=2.0, label=s)
    plt.ylim(0, 1)
    plt.xlim(0, MAX)
    plt.legend(loc='lower left')
    plt.xlabel(r"$\alpha$")
    plt.ylabel(r"$\beta$")
    plt.title(r"$\epsilon$ = {}, c = {}, k = {}".format(e, c, k))
    plt.show()
def plot_accuracy_slice(data, func, e, c, s):
    fig, ax = plt.subplots(figsize=(7, 4))
    _counts, array = read_data(data)
    results = np.genfromtxt('experiments/{}-{} {} {}.txt'.format(data, func.__name__, c, s), dtype=None)
    ser, prob = to_cdf(results, c, array)
    ax.bar(ser, prob, color="red", align='edge',
           label=r"$\mathrm{Pr}(\mathrm{SER} \leq x) \geq 1 - \beta$")
    ax.bar(ser, discrete_pdf(prob), width=1/100, color="blue", align='edge',
           label=r"$\mathrm{Pr}(\mathrm{SER} = x) \approx 1 - \beta$")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel(r"$x$")
    plt.ylabel(r"$1 - \beta$")
    ax.legend(loc='upper left')
    plt.title(r"{}, $k = {}, c = {}, \epsilon = {}$".format(datasets[data], len(array), c, e))
    plt.show()
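# The "<data>-<func> <c> <ratio>.txt" files contain (alpha, beta) pairs;
# to_cdf turns each alpha into its worst-case SER and each beta into a
# confidence 1 - beta, yielding points of the CDF Pr(SER <= x), and to_pdf
# differences those into point masses.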
def to_cdf(results, c, database):
    ser = []
    prob = []
    for a, b in results:
        ser.append(score_error_rate_alpha(database, c, a))
        prob.append(1 - b)
    return ser, prob
def to_pdf(ser, prob):
    return ser, discrete_pdf(prob)
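# Worst-case SER at distance alpha: every query answered "above" is
# guaranteed a true count of at least T - alpha, so the worst admissible
# selection is the c smallest counts still >= T - alpha.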
def score_error_rate_alpha(database, c, a):
    T = threshold(c, database)
    # the length is always c, so we don't need to divide
    best_case = sum(database[:c])
    worst_case = sum(database[database >= T - a][-c:])
    return 1 - worst_case / best_case
def discrete_pdf(ys):
    return [ys[0]] + [ys[i] - ys[i-1] for i in range(1, len(ys))]
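# Worked example: discrete_pdf([0.2, 0.5, 1.0]) gives [0.2, 0.3, 0.5] (up to
# floating-point rounding), which sums back to 1.

if __name__ == '__main__':
    # Minimal driver sketch (assumptions: the data/ files described above
    # exist and an experiments/ directory has been created); the steps build
    # on each other in this order:
    # write_alphas('bms-pos')
    # write_probability('bms-pos', precise)
    # write_samples('bms-pos')
    # plot_samples('bms-pos')
    # plot_accuracy('bms-pos', precise)
    pass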