// interdataset.cpp
// standard headers used directly below (some may already be pulled in
// via shared.hpp)
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>
#include <strings.h> // POSIX ffs()
#include <unordered_map>
#include "shared.hpp"
#include "interdataset.hpp"
#include "cuda/cudasampler.hpp"
// calculate the index of item (i,j) in a (n x n) lower-triangular matrix
// calculate the product sums
interdataset::interdataset(const int K) : _nfiles(K)
{
    // subsets of data files are represented as bits of an int, so K
    // must fit within an int's bit width.  In practice a K above about
    // 8 will already take a *long* time to complete, so whether the cap
    // is 32 or 64 bits here doesn't matter.
    if ((size_t)K > sizeof(int)*8) {
        fprintf(stderr, "Error: too many data files (K=%i, max=%zu)\n", K, sizeof(int)*8);
        throw "fail!";
    }
    int
        m  = nphis(),
        m2 = 1<<m;
    std::vector<int> idx, set(K);
    if (true) {
        // this uses the sensible ordering of phis (i.e. the same as is
        // used for the MCMC output)
        for (int i = 0; i < K-1; i++) {
            for (int j = i+1; j < K; j++) {
                idx.push_back(i);
                idx.push_back(j);
            }
        }
    } else {
        // this is the order used in the Matlab MDI code (reversed, for
        // some reason!)
        for (int i = K-2; i >= 0; --i) {
            for (int j = K-1; j > i; --j) {
                idx.push_back(i);
                idx.push_back(j);
            }
        }
    }
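    // Illustrative example: for K = 3 the loop above yields the pairs
    // (0,1), (0,2), (1,2), so idx = {0,1, 0,2, 1,2} and phi number j
    // couples files idx[2*j] and idx[2*j+1].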
    std::unordered_map<std::vector<bool>,uint32_t> map;
    _weightord.push_back(0);
    _phiord.reserve(m2);
    std::vector<bool> key((1<<K)-1);
    for (int i = 0; i < m2; i++) {
        // zero out the key
        std::fill(key.begin(), key.end(), false);
        // assign every file to its own cluster
        for (int j = 0; j < K; j++) set[j] = j;
        int used = (1<<K)-1;
        // loop through the phis we're interested in (the set bits of
        // @i) and merge clusters according to which files are together
        int x = i;
        // we want to loop through all set bits, so the terminating
        // condition is when there aren't any left
        while (x) {
            // ffs() is "find first set" bit, so @j iterates over set
            // bits only and @a and @b are the clusters we want to merge
            const int
                j = ffs(x)-1,
                a = set[idx[j * 2]],
                b = set[idx[j * 2 + 1]];
            // merge cluster @b into @a
            for (int k = 0; k < K; k++) {
                if (set[k] == b)
                    set[k] = a;
            }
            // update our cluster used flags
            used = (used & ~(1<<b)) | (1<<a);
            // mask this entry off so we don't find it next time around
            x &= ~(1<<j);
        }
        // loop through all the used clusters
        while (used) {
            // get the first cluster and collect its members into the
            // bitmask @l
            int k = ffs(used)-1, l = 0;
            for (int j = 0; j < K; j++) {
                if (set[j] == k) {
                    l |= 1<<j;
                }
            }
            // record that the product sum for this subset of files is
            // needed
            key[l-1] = true;
            // mask this cluster off as being used so we don't hit it
            // next time around
            used &= ~(1<<k);
        }
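        // Worked example (illustrative): with K = 3 and i = 0b001, only
        // phi_{0,1} is active, so files 0 and 1 are merged and the
        // partition is {0,1},{2}; its blocks have bitmasks 0b011 and
        // 0b100, so key[2] and key[3] end up set.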
        // look the key up, inserting it if it's new; @mapidx is the
        // index of this partition's coefficient pattern
        uint32_t mapidx;
        auto got = map.find(key);
        if (got == map.end()) {
            mapidx = map.size();
            map.insert(std::make_pair(key, mapidx));
            // add this into our table so access can be FAST later
            int pos = _weightord.size(), n = 0;
            _weightord.push_back(0);
            for (size_t j = 0; j < key.size(); j++) {
                if (key[j]) {
                    _weightord.push_back(j);
                    n++;
                }
            }
            _weightord[pos] = n;
        } else {
            mapidx = got->second;
        }
        _phiord.push_back(mapidx);
    }
    _weightord[0] = map.size();
    _weightord.shrink_to_fit();
    _phiord.shrink_to_fit();
}
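// Layout of the tables built above (inferred from the construction):
// _weightord is a flat table where _weightord[0] holds the number of
// distinct partition patterns, and each pattern then follows as a count
// n and n indices into _prodsum (one per block of the partition,
// encoded as the block's file bitmask minus one).  _phiord maps each
// subset of phis (as a bitmask) to the index of its pattern, so subsets
// inducing the same partition of the files share a coefficient.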
void
mdisampler::sampleFromPrior()
{
    /* TODO: this doesn't currently sample from the prior, just from a
     * set of independent Gammas, which is not what the function name
     * suggests but is fine for seeding parameters for a random walk */
    gamma_distribution<> ginit(2,1);
    for (auto i = 0; i < _dpmass.size(); i++)
        _dpmass(i) = ginit(generator);
    for (auto i = 0; i < _weight.cols(); i++)
        for (auto j = 0; j < _weight.rows(); j++)
            _weight(j, i) = ginit(generator);
    for (auto i = 0; i < _phi.size(); i++)
        _phi(i) = ginit(generator);
    _nu = ginit(generator);
    updateColumnProducts();
}
void
mdisampler::updateColumnProducts()
{
    // build, for every non-empty subset of files, the elementwise
    // product of its weight columns; column (mask-1) of _prod holds the
    // product for subset @mask, and _prodsum caches the column sums
    for (int k = 0; k < nfiles(); k++) {
        const int k_ = (1<<k)-1;
        _prodsum(k_) = (_prod.col(k_) = _weight.col(k)).sum();
        for (int l = 0; l < k_; l++) {
            int l_ = k_+l+1;
            _prodsum(l_) = (_prod.col(l_) = _prod.col(k_).array() * _prod.col(l).array()).sum();
        }
    }
}
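// Worked example (illustrative): for nfiles() == 2 the columns are
// col 0 = weight col 0 (mask 0b01), col 1 = weight col 1 (mask 0b10),
// and col 2 (mask 0b11) = their elementwise product; the conditionals
// below consume the cached sums in _prodsum.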
Eigen::VectorXd
mdisampler::calcProdCoef () const
{
    // walk the flat _weightord table: entry 0 is the number of
    // patterns, then each pattern is a count followed by that many
    // _prodsum indices to multiply together
    const auto& weightord = _inter.getWeightOrd();
    Eigen::VectorXd coef(weightord[0]);
    for (int i = 0, j = 1; i < coef.size(); i++) {
        double prod = 1;
        for (int n = weightord[j++]; n > 0; n--) {
            prod *= _prodsum(weightord[j++]);
        }
        assert(std::isfinite(prod));
        coef[i] = prod;
    }
    return coef;
}
double
mdisampler::calcProdPhi(const Eigen::VectorXd &coef) const
{
    const auto& phiord = _inter.getPhiOrd();
    int KK = nphis();
    double sum = 0;
    for (size_t i = 0; i < phiord.size(); i++) {
        double prod = coef[phiord[i]];
        for (int j = 0; j < KK; j++) {
            if (i & (1<<j))
                prod *= _phi(j);
        }
        assert(std::isfinite(prod));
        sum += prod;
    }
    return sum;
}
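// In effect calcProdPhi evaluates
//   sum over subsets S of the phis of  coef[phiord[S]] * prod_{j in S} phi_j
// where S ranges over the bitmask @i; with coef from calcProdCoef()
// this is the normalisation constant used below.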
// calculate the normalisation constant from the given product vector
double
mdisampler::nuConditionalBeta() const
{
    if (nfiles() == 1)
        return _prodsum(0);
    return calcProdPhi(calcProdCoef());
}
// calculate the conditional distribution for phi_{k,l}
double
mdisampler::phiConditionalBeta (int k, int l) const
{
    const auto& phiord = _inter.getPhiOrd();
    int
        KK = nphis(),
        phiidx = phiBetween(k, l);
    Eigen::VectorXd coef = calcProdCoef();
    double sum = 0;
    // only subsets containing phi_{k,l} contribute
    for (size_t i = 0; i < phiord.size(); i++) {
        if (!(i & (1<<phiidx)))
            continue;
        double prod = coef[phiord[i]];
        for (int j = 0; j < KK; j++) {
            if (i & (1<<j))
                prod *= _phi(j);
        }
        sum += prod;
    }
    return _nu * sum / _phi(phiidx);
}
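// Note (inferred): every term in the sum above carries a factor of
// _phi(phiidx), so dividing it back out leaves the linear coefficient
// of phi_{k,l} in the normalisation constant, i.e. the rate term for
// its Gamma conditional.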
double
mdisampler::gammaConditionalBeta (int k, int clus) const
{
    if (nfiles() == 1)
        return _nu;
    const auto& weightord = _inter.getWeightOrd();
    Eigen::VectorXd coef(weightord[0]);
    for (int i = 0, j = 1; i < coef.size(); i++) {
        double prod = 1;
        for (int n = weightord[j++]; n > 0; n--) {
            int kk = weightord[j++];
            // if this product is affected by our dataset of interest
            if ((kk+1) & (1<<k)) {
                // drop file @k from the subset and take the weights of
                // cluster @clus for the remaining files
                int kk1 = (kk+1) & ~(1<<k);
                for (int m = 0; m < nfiles(); m++) {
                    if (kk1 & (1<<m)) {
                        prod *= _weight(clus,m);
                    }
                }
            } else {
                // otherwise, just use the precalculated value
                prod *= _prodsum(kk);
            }
        }
        coef[i] = prod;
    }
    return _nu * calcProdPhi(coef);
}
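// Note (inferred): terms involving file @k cannot use the cached column
// sums because the conditional fixes cluster @clus; those terms
// collapse to the single row @clus with _weight(clus,k) itself factored
// out, so the result is the coefficient of _weight(clus,k) in the
// normalisation constant.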
// void
// mdisampler::setWeight(int j, int k, double weight)
// {
//     assert(weight >= 0);
//     _weight(j,k) = weight;
//     // should be able to optimise this a bit more!
//     updateColumnProducts();
// }
void
mdisampler::setDpMass(int k, double mass)
{
    assert(mass >= 0);
    _dpmass(k) = mass;
}
void
mdisampler::setPhi(int k, int l, double phi)
{
    assert(phi >= 0);
    _phi[phiBetween (k, l)] = phi;
}
void
mdisampler::setNu(double x)
{
    assert(x >= 0);
    _nu = x;
}
void
mdisampler::setWeight(const Eigen::MatrixXf &x)
{
    assert(x.rows() == nclus() && x.cols() == nfiles());
    assert(x.minCoeff() >= 0);
    _weight = x;
}
void
mdisampler::setWeight(int k, const Eigen::VectorXf &x)
{
    assert(x.minCoeff() >= 0);
    _weight.col(k) = x;
}
void
mdisampler::setDpMass(const Eigen::VectorXf &x)
{
    assert(x.size() == nfiles());
    _dpmass = x;
}
void
mdisampler::setPhis(const Eigen::VectorXf &x)
{
    assert(x.size() == nphis());
    _phi = x;
}