-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhll.py
151 lines (117 loc) · 3.95 KB
/
hll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from random import *
from math import *
class ResizeFactor(object):
def __init__(self, lg_):
self.X1 = 0;
self.X2 = 1;
self.X4 = 2;
self.X8 = 3;
self._lg = lg_
def lg(self):
return self._lg
def getRF(self, lg_):
if lg_ == self.X1:
return self.X1
if lg_ == self.X2:
return self.X2
if lg_ == self.X4:
return self.X4
return self.X8;
def getValue(self):
return 1 << self._lg;
DEFAULT_UPDATE_SEED = 9001
MIN_LG_ARR_ITEMS = 4
MAX_ITEMS_SEEN = 0xFFFFFFFFFF
class ReservoirSketch(object):
def startingSubMultiple(self, lgtarget, lgrf, lgmin):
if lgtarget <= lgmin:
return lgmin
elif lgrf == 0:
return lgtarget
return ((lgtarget-lgmin) % (lgrf+lgmin))
def getAdjustedSize(self, mxsize, resizetarget):
if (mxsize - (resizetarget<<1)) < 0:
return mxsize
return resizetarget
def __init__(self, k, rf_, randseed=DEFAULT_UPDATE_SEED):
self.MIN_LG_ARR_ITEMS = 4
self.MAX_ITEMS_SEEN = 0xFFFFFFFFFF
self.DEFAULT_RESIZE_FACTOR = ResizeFactor(100)
self.rf = ResizeFactor(100)
#var rrand : RandomStream(real);
#var irand : RandomStream(int);
self.reservoirSize = k
self.itemsSeen = 0
self.ceilingLgK = log(self.reservoirSize ** 2)/log(2)
self.initialLgSize = self.startingSubMultiple(self.ceilingLgK, rf_.lg(), MIN_LG_ARR_ITEMS)
self.currItemsAlloc = self.getAdjustedSize(self.reservoirSize, 1 << int(self.initialLgSize))
self.arrDom = self.currItemsAlloc
self.arr = [ 0 for i in xrange(self.arrDom)]
# TODO
#rrand = new RandomStream(real, randseed);
#irand = new RandomStream(int, randseed);
def ReservoirSketch(self, data, itemsseen, rf_, k, randseed=DEFAULT_UPDATE_SEED):
self.reservoirSize = k
self.currItemsAlloc = len(data)
self.itemsSeen = itemsseen
self.rf = rf_
self.arrDom = len(data)
self.arr = data
# TODO
#rrand = new RandomStream(real, randseed);
#irand = new RandomStream(int, randseed);
def numsamples(self):
return min(self.reservoirSize, self.itemsSeen)
def update(self, item):
if self.itemsSeen == MAX_ITEMS_SEEN:
return False
if self.itemsSeen < self.reservoirSize:
if self.itemsSeen >= self.currItemsAlloc:
self.growReservoir()
self.arr[self.itemsSeen] = item
self.itemsSeen +=1
else:
self.itemsSeen+=1;
if (randint(self.reservoirSize) * self.itemsSeen) < self.reservoirSize:
newSlot = random() % self.reservoirSize
self.arr[newSlot] = item
return True
def reset(self):
self.ceilingLgK = log2(pow2(self.reservoirSize))
self.initialLgSize = self.startingSubMultiple(ceilingLgK, self.rf.lg(), MIN_LG_ARR_ITEMS)
self.currItemsAlloc = self.getAdjustedSize(self.reservoirSize, 1 << self.initialLgSize);
self.arrDom = currItemsAlloc
self.itemsSeen = 0
def downSampledCopy(self, maxK):
ris = ReservoirSketch(maxK, self.rf)
for item in arr:
ris.update[item]
if ris.itemsSeen < self.itemsSeen:
ris.itemsSeen += self.itemsSeen - ris.itemsSeen
return ris
def growReservoir(self):
self.currItemsAlloc = self.getAdjustedSize(self.reservoirSize, self.currItemsAlloc << self.rf.lg())
if self.arrDom < (self.currItemsAlloc-1):
self.arrDom = currItemsAlloc
def samples(self):
return self.arr
def implicitSampleWeight(self):
if itemsSeen < reservoirSize:
return 1.0
return ((1.0 * float(itemsSeen)) / float(reservoirSize))
def get(self, pos):
return self.arr[pos]
def put(self, value, pos):
self.arr[pos] = value
def these(self):
for a in self.arr:
yield a
if __name__ == "__main__":
rf = ResizeFactor(10)
rs = ReservoirSketch(100, rf)
dataDom = 100
data = [ i for i in xrange(dataDom)]
for i in data:
rs.update(i)
for i in rs.these():
print(i)