/*
 * range_cache.h
 *
 * Classes that encapsulate the caching of ranges of resolved reference
 * offsets, so that a BWT range encountered before (or reachable from one
 * encountered before) need not be re-resolved from scratch.
 */
#ifndef RANGE_CACHE_H_
#define RANGE_CACHE_H_
#include <iostream>
#include <map>
#include <set>
#include <stdexcept>
#include <stdint.h>
#include <string.h>
#include <utility>
#include "ds.h"
#include "ebwt.h"
#include "row_chaser.h"
#define RANGE_NOT_SET OFF_MASK
#define RANGE_CACHE_BAD_ALLOC OFF_MASK
/**
* Manages a pool of memory used exclusively for range cache entries.
* This manager is allocate-only; it exists mainly so that we can avoid
* lots of new[]s and delete[]s.
*
* A given stretch of words may be one of two types: a cache entry, or
* a cache entry wrapper. A cache entry has a length and a list of
* already-resolved reference positions. A cache entry wrapper has a
* pointer to a cache entry for a different range, along with an
* integer indicating how many "jumps" to the left that range is from
* the one that owns the wrapper.
*/
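/*
 * Word layout used by the pool (a sketch inferred from RangeCacheEntry::init()
 * and RangeCache::tunnel() below; CACHE_WRAPPER_BIT is defined elsewhere):
 *
 *   cache entry:    word 0    words 1 .. len
 *                   [ len ]   [ resolved offsets; OFF_MASK where unresolved ]
 *
 *   entry wrapper:  word 0                         word 1
 *                   [ CACHE_WRAPPER_BIT | jumps ]  [ pool index of target entry ]
 */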
class RangeCacheMemPool {
public:
RangeCacheMemPool(TIndexOffU lim /* max cache size in bytes */) :
lim_(lim >> 2 /* convert to words */), occ_(0), buf_(NULL),
closed_(false)
{
if(lim_ > 0) {
try {
buf_ = new TIndexOffU[lim_];
if(buf_ == NULL) throw std::bad_alloc();
} catch(std::bad_alloc& e) {
cerr << "Allocation error allocating " << lim
<< " words of range-cache memory" << endl;
throw 1;
}
assert(buf_ != NULL);
// Fill with all-1 bits (0xff bytes) to mark every word as
// uninitialized
memset(buf_, 0xff, lim_ << 2 /* convert back to bytes */);
}
}
~RangeCacheMemPool() {
// Release all word memory!
if(lim_ > 0) delete[] buf_;
}
/**
 * Allocate numElts words from the pool and return the index of the
 * first one, or RANGE_CACHE_BAD_ALLOC if the pool cannot satisfy the
 * request.
 */
TIndexOffU alloc(TIndexOffU numElts) {
assert_gt(numElts, 0);
assert_leq(occ_, lim_);
if(occ_ + numElts > lim_ || numElts >= CACHE_WRAPPER_BIT) {
return RANGE_CACHE_BAD_ALLOC;
}
assert_gt(lim_, 0);
TIndexOffU ret = occ_;
assert(allocs_.find(ret) == allocs_.end());
ASSERT_ONLY(allocs_.insert(ret));
// Clear the first elt so that we don't think there's already
// something there
#ifndef NDEBUG
for(TIndexOffU i = 0; i < numElts; i++) {
assert_eq(OFF_MASK, buf_[occ_ + i]);
}
#endif
buf_[occ_] = 0;
occ_ += numElts;
assert_leq(occ_, lim_);
if(lim_ - occ_ < 10) {
// Fewer than 10 words left - close the pool so callers stop trying
closed_ = true;
}
return ret;
}
/**
* Turn a pool-array index into a pointer; check that it doesn't
* fall outside the pool first.
*/
inline TIndexOffU *get(TIndexOffU off) {
assert_gt(lim_, 0);
assert_lt(off, lim_);
assert(allocs_.find(off) != allocs_.end());
TIndexOffU *ret = buf_ + off;
assert_neq(CACHE_WRAPPER_BIT, ret[0]);
assert_neq(OFF_MASK, ret[0]);
return ret;
}
/**
* Return true iff there's no more room in the cache.
*/
inline bool closed() {
return closed_;
}
private:
TIndexOffU lim_; /// limit on the number of words to hand out in total
TIndexOffU occ_; /// number of occupied words
TIndexOffU *buf_; /// buffer of TIndexOffU words
bool closed_; /// true once the pool is (nearly) full and no further allocs should be attempted
#ifndef NDEBUG
std::set<TIndexOffU> allocs_; // elements allocated
#endif
};
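/*
 * Illustrative sketch only (not part of the original header): how a caller
 * carves a direct (non-wrapper) entry out of the pool. It mirrors the
 * allocation pattern used in RangeCache::tunnel() below; the function name
 * and parameter are hypothetical.
 */
static inline TIndexOffU sketchAllocDirectEntry(RangeCacheMemPool& pool,
                                                TIndexOffU spread)
{
	// One word for the length plus 'spread' words for the resolved offsets
	TIndexOffU idx = pool.alloc(spread + 1);
	if(idx == RANGE_CACHE_BAD_ALLOC) {
		return RANGE_CACHE_BAD_ALLOC; // pool full or request too large
	}
	TIndexOffU *ent = pool.get(idx); // translate pool index into a pointer
	ent[0] = spread;                 // word 0 records the entry length
	// Words 1..spread are expected to still hold OFF_MASK ("unresolved");
	// they are filled in later via RangeCacheEntry::install()
	return idx;
}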
/**
 * A view onto a range of cached reference positions; it may refer to
 * the entry cached for a range further down the tunnel, in which case
 * jumps_ records the distance.
 */
class RangeCacheEntry {
public:
/**
 * Create an empty, invalid entry.
 */
RangeCacheEntry(bool sanity = false) :
top_(OFF_MASK), jumps_(0), len_(0), ents_(NULL), ebwt_(NULL),
verbose_(false), sanity_(sanity)
{ }
/**
 * Create a new RangeCacheEntry from the data in the pool at 'ent'.
 */
RangeCacheEntry(RangeCacheMemPool& pool, TIndexOffU top,
TIndexOffU ent, Ebwt* ebwt, bool sanity = false) :
verbose_(false), sanity_(sanity)
{
init(pool, top, ent, ebwt);
}
/**
* Initialize a RangeCacheEntry from the data in the pool at 'ent'.
*/
void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU ent, Ebwt* ebwt) {
assert(ebwt != NULL);
top_ = top;
ebwt_ = ebwt;
TIndexOffU *ents = pool.get(ent);
assert_neq(CACHE_WRAPPER_BIT, ents[0]);
// Is hi bit set?
if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
// If so, the target is a wrapper and the non-hi bits
// contain the # jumps
jumps_ = (ents[0] & ~CACHE_WRAPPER_BIT);
assert_gt(jumps_, 0);
assert_leq(jumps_, ebwt_->_eh._len);
// Get the target entry
TIndexOffU *dest = pool.get(ents[1]);
// Get the length from the target entry
len_ = dest[0];
assert_leq(top_ + len_, ebwt_->_eh._len);
assert_gt(len_, 0);
assert_leq(len_, ebwt_->_eh._len);
// Get the pointer to the entries themselves
ents_ = dest + 1;
} else {
// Not a wrapper, so there are no jumps
jumps_ = 0;
// Get the length from the target entry
len_ = ents[0];
assert_leq(top_ + len_, ebwt_->_eh._len);
assert_gt(len_, 0);
assert_leq(len_, ebwt_->_eh._len);
// Get the pointer to the entries themselves
ents_ = ents + 1;
}
if (sanity_)
assert(sanityCheckEnts());
}
/**
* Initialize a wrapper with given number of jumps and given target
* entry index.
*/
void init(RangeCacheMemPool& pool, TIndexOffU top, TIndexOffU jumps,
TIndexOffU ent, Ebwt* ebwt)
{
assert(ebwt != NULL);
ebwt_ = ebwt;
top_ = top;
jumps_ = jumps;
TIndexOffU *ents = pool.get(ent);
// Must not be a wrapper
assert_eq(0, ents[0] & CACHE_WRAPPER_BIT);
// Get the length from the target entry
len_ = ents[0];
assert_gt(len_, 0);
assert_leq(len_, ebwt_->_eh._len);
// Get the pointer to the entries themselves
ents_ = ents + 1;
assert_leq(top_ + len_, ebwt_->_eh._len);
if (sanity_)
assert(sanityCheckEnts());
}
TIndexOffU len() const {
assert(ents_ != NULL);
assert(ebwt_ != NULL);
return len_;
}
TIndexOffU jumps() const {
assert(ents_ != NULL);
assert(ebwt_ != NULL);
return jumps_;
}
/**
 * Reset to the invalid (unset) state.
 */
void reset() {
ents_ = NULL;
}
/**
* Return true iff this object represents a valid cache entry.
*/
bool valid() const {
return ents_ != NULL;
}
Ebwt *ebwt() {
return ebwt_;
}
/**
 * Install a result obtained by a client of this cache, adjusting it
 * for how many jumps down the tunnel this entry sits from the range
 * that was queried.
 */
void install(TIndexOffU elt, TIndexOffU val) {
if(ents_ == NULL) {
// This is not a valid cache entry; do nothing
return;
}
assert(ents_ != NULL);
assert(ebwt_ != NULL);
assert_leq(jumps_, val);
assert_neq(OFF_MASK, val);
assert_leq(top_ + len_, ebwt_->_eh._len);
if(elt < len_) {
val -= jumps_;
if(verbose_) cout << "Installed reference offset: " << (top_ + elt) << endl;
ASSERT_ONLY(TIndexOffU sanity = RowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
assert_eq(sanity, val);
#ifndef NDEBUG
for(size_t i = 0; i < len_; i++) {
if(i == elt) continue;
assert_neq(val, ents_[i]);
}
#endif
ents_[elt] = val;
} else {
// ignore install request
if(verbose_) cout << "Fell off end of cache entry for install: " << (top_ + elt) << endl;
}
}
/**
* Get an element from the cache, adjusted for tunnel jumps.
*/
inline TIndexOffU get(TIndexOffU elt) const {
if(ents_ == NULL) {
// This is not a valid cache entry; do nothing
return RANGE_NOT_SET;
}
assert(ents_ != NULL);
assert(ebwt_ != NULL);
assert_leq(top_ + len_, ebwt_->_eh._len);
if(elt < len_ && ents_[elt] != RANGE_NOT_SET) {
if(verbose_) cout << "Retrieved result from cache: " << (top_ + elt) << endl;
TIndexOffU ret = ents_[elt] + jumps_;
ASSERT_ONLY(TIndexOffU sanity = RowChaser::toFlatRefOff(ebwt_, 1, top_ + elt));
assert_eq(sanity, ret);
return ret;
} else {
if(verbose_) cout << "Cache entry not set: " << (top_ + elt) << endl;
return RANGE_NOT_SET;
}
}
/**
 * Check that the given length and ents array make sense: every
 * resolved offset must lie within the reference and no two resolved
 * offsets may be equal.
 */
static bool sanityCheckEnts(TIndexOffU len, TIndexOffU *ents, Ebwt* ebwt) {
assert_gt(len, 0);
assert_leq(len, ebwt->_eh._len);
if(len < 10) {
for(size_t i = 0; i < len; i++) {
if(ents[i] == OFF_MASK) continue;
assert_leq(ents[i], ebwt->_eh._len);
for(size_t j = i+1; j < len; j++) {
if(ents[j] == OFF_MASK) continue;
assert_neq(ents[i], ents[j]);
}
}
} else {
std::set<TIndexOffU> seen;
for(size_t i = 0; i < len; i++) {
if(ents[i] == OFF_MASK) continue;
assert(seen.find(ents[i]) == seen.end());
seen.insert(ents[i]);
}
}
return true;
}
/**
* Check that len_ and the ents_ array both make sense.
*/
bool sanityCheckEnts() {
return RangeCacheEntry::sanityCheckEnts(len_, ents_, ebwt_);
}
private:
TIndexOffU top_; /// top pointer for this range
TIndexOffU jumps_; /// how many tunnel-jumps it is away from the requester
TIndexOffU len_; /// # of entries in cache entry
TIndexOffU *ents_; /// ptr to entries, which are flat offs within joined ref
Ebwt *ebwt_; /// index that alignments are in
bool verbose_; /// be talkative?
bool sanity_; /// do consistency checks?
};
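/*
 * Illustrative sketch only (not part of the original header): the intended
 * check-then-install pattern for a RangeCacheEntry. 'resolvedOff' stands in
 * for an offset the caller resolved itself (e.g. via RowChaser) after a
 * cache miss; the helper name is hypothetical.
 */
static inline TIndexOffU sketchResolveElt(RangeCacheEntry& ent,
                                          TIndexOffU elt,
                                          TIndexOffU resolvedOff)
{
	if(!ent.valid()) {
		return resolvedOff; // no usable cache entry; keep the caller's answer
	}
	TIndexOffU off = ent.get(elt); // hit: value comes back with jumps() added
	if(off != RANGE_NOT_SET) {
		return off;
	}
	// Miss: remember the caller's resolution so later queries (possibly made
	// against ranges further down the tunnel) hit the cache; install()
	// subtracts jumps() before storing
	ent.install(elt, resolvedOff);
	return resolvedOff;
}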
/**
 * Caches reference offsets previously resolved for BWT ranges, keyed by
 * each range's 'top' offset. A lookup may also be satisfied by an entry
 * cached for a range further down the same tunnel (reached by walking
 * top and bot to the left), via a lightweight wrapper entry. See the
 * usage sketch after this class.
 */
class RangeCache {
typedef EList<TIndexOffU> TUVec;
typedef std::map<TIndexOffU, TIndexOffU> TMap;
typedef std::map<TIndexOffU, TIndexOffU>::iterator TMapItr;
public:
RangeCache(TIndexOffU lim, Ebwt* ebwt) :
lim_(lim), map_(), pool_(lim), closed_(false), ebwt_(ebwt), sanity_(true) { }
/**
* Given top and bot offsets, retrieve the canonical cache entry
* that best covers that range. The cache entry may not directly
correspond to the top offset provided; rather, it might be an
* entry that lies "at the end of the tunnel" when top and bot are
* walked backward.
*/
bool lookup(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
if(ebwt_ == NULL || lim_ == 0) return false;
assert_gt(bot, top);
ent.reset();
TMapItr itr = map_.find(top);
if(itr == map_.end()) {
// No cache entry for the given 'top' offset
if(closed_) {
return false; // failed to get cache entry
} else {
if(pool_.closed()) {
closed_ = true;
return false; // failed to get cache entry
}
}
// Use the tunnel
bool ret = tunnel(top, bot, ent);
return ret;
} else {
// There is a cache entry for the given 'top' offset
TIndexOffU ret = itr->second;
ent.init(pool_, top, ret, ebwt_);
return true; // success
}
}
/**
* Exhaustively check all entries linked to from map_ to ensure
* they're well-formed.
*/
bool repOk() {
#ifndef NDEBUG
for(TMapItr itr = map_.begin(); itr != map_.end(); itr++) {
TIndexOffU top = itr->first;
TIndexOffU idx = itr->second;
TIndexOffU jumps = 0;
assert_leq(top, ebwt_->_eh._len);
TIndexOffU *ents = pool_.get(idx);
if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
jumps = ents[0] & ~CACHE_WRAPPER_BIT;
assert_leq(jumps, ebwt_->_eh._len);
idx = ents[1];
ents = pool_.get(idx);
}
TIndexOffU len = ents[0];
assert_leq(top + len, ebwt_->_eh._len);
if (sanity_)
RangeCacheEntry::sanityCheckEnts(len, ents + 1, ebwt_);
}
#endif
return true;
}
protected:
/**
* Tunnel through to the first range that 1) includes all the same
* suffixes (though longer) as the given range, and 2) has a cache
* entry for it.
*/
bool tunnel(TIndexOffU top, TIndexOffU bot, RangeCacheEntry& ent) {
assert_gt(bot, top);
TUVec tops;
const TIndexOffU spread = bot - top;
SideLocus tloc, bloc;
SideLocus::initFromTopBot(top, bot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
TIndexOffU newtop = top, newbot = bot;
TIndexOffU jumps = 0;
// Walk left through the tunnel
while(true) {
if(ebwt_->rowL(tloc) != ebwt_->rowL(bloc)) {
// Different characters at top and bot positions of
// BWT; this means that the calls to mapLF below are
// guaranteed to yield rows in two different character-
// sections of the BWT.
break;
}
// Advance top and bot
newtop = ebwt_->mapLF(tloc);
newbot = ebwt_->mapLF(bloc);
assert_geq(newbot, newtop);
assert_leq(newbot - newtop, spread);
// If the new spread is the same as the old spread, we can
// be confident that the new range includes all of the same
// suffixes as the last range (though longer by 1 char)
if((newbot - newtop) == spread) {
// Check if newtop is already cached
TMapItr itr = map_.find(newtop);
jumps++;
if(itr != map_.end()) {
// This range, which is further to the left in the
// same tunnel as the query range, has a cache
// entry already, so use that
TIndexOffU idx = itr->second;
TIndexOffU *ents = pool_.get(idx);
if((ents[0] & CACHE_WRAPPER_BIT) != 0) {
// The cache entry we found was a wrapper; make
// a new wrapper that points to that wrapper's
// target, with the appropriate number of jumps
jumps += (ents[0] & ~CACHE_WRAPPER_BIT);
idx = ents[1];
}
// Allocate a new wrapper
TIndexOffU newentIdx = pool_.alloc(2);
if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
// We successfully made a new wrapper entry;
// now populate it and install it in map_
TIndexOffU *newent = pool_.get(newentIdx); // get ptr to it
assert_eq(0, newent[0]);
newent[0] = CACHE_WRAPPER_BIT | jumps; // set jumps
newent[1] = idx; // set target
assert(map_.find(top) == map_.end());
map_[top] = newentIdx;
if(sanity_) assert(repOk());
}
// Initialize the entry
ent.init(pool_, top, jumps, idx, ebwt_);
return true;
}
// Save this range
tops.push_back(newtop);
SideLocus::initFromTopBot(newtop, newbot, ebwt_->_eh, ebwt_->_ebwt, tloc, bloc);
} else {
// Not all the suffixes were preserved, so we can't
// link the source range's cached result to this
// range's cached results
break;
}
assert_eq(jumps, tops.size());
}
assert_eq(jumps, tops.size());
// Try to create a new cache entry for the leftmost range in
// the tunnel (which might be the query range)
TIndexOffU newentIdx = pool_.alloc(spread + 1);
if(newentIdx != RANGE_CACHE_BAD_ALLOC) {
// Successfully allocated new range cache entry; install it
TIndexOffU *newent = pool_.get(newentIdx);
assert_eq(0, newent[0]);
// Store cache-range length in first word
newent[0] = spread;
assert_lt(newent[0], CACHE_WRAPPER_BIT);
assert_eq(spread, newent[0]);
TIndexOffU entTop = top;
TIndexOffU jumps = 0;
if(tops.size() > 0) {
entTop = tops.back();
jumps = (TIndexOffU)tops.size();
}
// Cache the entry for the end of the tunnel
assert(map_.find(entTop) == map_.end());
map_[entTop] = newentIdx;
if(sanity_) assert(repOk());
ent.init(pool_, entTop, jumps, newentIdx, ebwt_);
assert_eq(spread, newent[0]);
if(jumps > 0) {
assert_neq(entTop, top);
// Cache a wrapper entry for the query range (if possible)
TIndexOffU wrapentIdx = pool_.alloc(2);
if(wrapentIdx != RANGE_CACHE_BAD_ALLOC) {
TIndexOffU *wrapent = pool_.get(wrapentIdx);
assert_eq(0, wrapent[0]);
wrapent[0] = CACHE_WRAPPER_BIT | jumps;
wrapent[1] = newentIdx;
assert(map_.find(top) == map_.end());
map_[top] = wrapentIdx;
if(sanity_) assert(repOk());
}
}
return true;
} else {
// Could not allocate new range cache entry
return false;
}
}
TIndexOffU lim_; /// Total number of key/val bytes to keep in cache
TMap map_; /// Map from BWT-range 'top' offsets to indices into pool_
RangeCacheMemPool pool_; /// Memory pool
bool closed_; /// Out of space; no new entries
Ebwt* ebwt_; /// Index that alignments are in
bool sanity_; /// do consistency checks?
};
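/*
 * Illustrative sketch only (not part of the original header): typical use of
 * RangeCache from a caller's point of view. lookup() may return an entry that
 * was cached for a range further to the left in the same tunnel; get() and
 * install() on the returned RangeCacheEntry account for that via jumps().
 * The function name is hypothetical.
 */
static inline void sketchQueryRangeCache(RangeCache& cache,
                                         TIndexOffU top,
                                         TIndexOffU bot)
{
	RangeCacheEntry ent;
	if(!cache.lookup(top, bot, ent)) {
		// Cache disabled or out of space; the caller resolves rows on its own
		return;
	}
	for(TIndexOffU i = 0; i < ent.len(); i++) {
		if(ent.get(i) == RANGE_NOT_SET) {
			// Row top+i not resolved yet: the caller would resolve it
			// (e.g. with RowChaser) and then call ent.install(i, off)
		}
	}
}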
#endif /* RANGE_CACHE_H_ */