-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsampling.chpl
181 lines (143 loc) · 4.01 KB
/
sampling.chpl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
use Random;
record ResizeFactor {
var X1 = 0;
var X2 = 1;
var X4 = 2;
var X8 = 3;
var _lg:int;
proc ResizeFactor(lg_) {
_lg = lg_;
}
proc lg() { return _lg; }
proc getRF(lg_) {
select lg_ {
when X1 do return X1;
when X2 do return X2;
when X4 do return X4;
}
return X8;
}
proc getValue() {
return 1 << _lg;
}
}
const DEFAULT_UPDATE_SEED = 9001:int;
record ReservoirSketch {
type T;
var MIN_LG_ARR_ITEMS = 4;
var MAX_ITEMS_SEEN = 0xFFFFFFFFFF;
var DEFAULT_RESIZE_FACTOR:ResizeFactor;
var reservoirSize:int;
var currItemsAlloc:int;
var itemsSeen:int;
var rf:ResizeFactor;
var arrDom: domain(1);
var arr:[arrDom] T;
var rrand : RandomStream(real);
var irand : RandomStream(int);
proc startingSubMultiple(lgtarget, lgrf, lgmin) {
return if lgtarget <= lgmin then lgmin else if lgrf == 0 then lgtarget else ((lgtarget-lgmin) % (lgrf+lgmin));
}
proc getAdjustedSize(mxsize, resizetarget) {
return if (mxsize - (resizetarget << 1)) < 0 then mxsize else resizetarget;
}
proc ReservoirSketch(type T, k:int, rf_:ResizeFactor, randseed=DEFAULT_UPDATE_SEED) {
reservoirSize = k;
itemsSeen = 0;
var ceilingLgK = log2(reservoirSize ** 2);
var initialLgSize = startingSubMultiple(ceilingLgK, rf_.lg(), MIN_LG_ARR_ITEMS);
currItemsAlloc = getAdjustedSize(reservoirSize, 1 << initialLgSize);
arrDom = {0..#currItemsAlloc};
rrand = new RandomStream(real, randseed);
irand = new RandomStream(int, randseed);
}
proc ReservoirSketch(type T, data:[]T, itemsseen:int, rf_:ResizeFactor, k:int, randseed=DEFAULT_UPDATE_SEED) {
reservoirSize = k;
currItemsAlloc = data.domain.high;
itemsSeen = itemsseen;
rf = rf_;
arrDom = {data.domain.low..data.domain.high};
arr = data;
rrand = new RandomStream(real, randseed);
irand = new RandomStream(int, randseed);
}
proc numsamples() { return min(reservoirSize, itemsSeen):int; }
proc update(item:T) {
if itemsSeen == MAX_ITEMS_SEEN {
return false;
}
if itemsSeen < reservoirSize {
if itemsSeen >= currItemsAlloc {
growReservoir();
}
arr(itemsSeen) = item;
itemsSeen +=1;
}
else {
itemsSeen+=1;
if (rrand.getNext():int * itemsSeen) < reservoirSize {
var val = irand.getNext();
val *= if val < 0 then -1 else 1;
var newSlot = val % reservoirSize;
newSlot *= if newSlot < 0 then -1 else 1;
arr(newSlot) = item;
}
}
return true;
}
proc reset() {
var ceilingLgK = log2(pow2(reservoirSize));
var initialLgSize = startingSubMultiple(ceilingLgK, rf.lg(), MIN_LG_ARR_ITEMS);
currItemsAlloc = getAdjustedSize(reservoirSize, 1 << initialLgSize);
arrDom = {0..#currItemsAlloc};
itemsSeen = 0;
}
proc downSampledCopy(maxK) {
var ris = new ReservoirSketch(T, maxK, rf);
for item in arr {
ris.update(item);
}
if ris.itemsSeen < itemsSeen {
ris.itemsSeen += itemsSeen - ris.itemsSeen;
}
return ris;
}
proc growReservoir() {
currItemsAlloc = getAdjustedSize(reservoirSize, currItemsAlloc << rf.lg());
if arrDom.high < (currItemsAlloc-1) {
arrDom = {arrDom.low..#currItemsAlloc};
}
}
proc samples() {
return arr;
}
proc implicitSampleWeight() {
return if itemsSeen < reservoirSize then 1.0 else ((1.0 * itemsSeen:real) / reservoirSize:real);
}
proc this(pos) { return arr(pos); }
proc put(value:T, pos:int) {
arr(pos) = value;
}
iter these() {
for a in arr {
yield a;
}
}
}
/*
proc main() {
var rf = new ResizeFactor(10);
var rs = new ReservoirSketch(int, 100, rf);
var dataDom = {0..#100};
var data : [dataDom] int;
forall i in dataDom {
data(i) = i;
}
for i in data {
rs.update(i);
}
for i in rs {
writeln(i);
}
}
*/