Skip to content

Commit

Permalink
make RMQ return index instead of value, make USE_SDSL a compile-time …
Browse files Browse the repository at this point in the history
…option
  • Loading branch information
Anton Pirogov committed Jun 9, 2016
1 parent 8c7f0dc commit 453839d
Show file tree
Hide file tree
Showing 11 changed files with 125 additions and 92 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CXX := g++
CFLAGS := -DVECBIT=0 -std=c++11 -Isrc -Ilibdivsufsort/include -Igsl/include -Isdsl/include -Wall -Wextra -O2 -g -ggdb -Wno-missing-braces -funroll-loops # -pg -Wshadow
CFLAGS := -DUSE_SDSL -std=c++11 -Isrc -Ilibdivsufsort/include -Igsl/include -Isdsl/include -Wall -Wextra -O2 -g -ggdb -Wno-missing-braces -funroll-loops # -pg -Wshadow
LDFLAGS := -lm -lgsl -lgslcblas -lblas -ldivsufsort -ldivsufsort64 -lsdsl -Llibdivsufsort/lib -Lgsl/lib -Lsdsl/lib # -pg
TARGET := dnalc

Expand Down
43 changes: 22 additions & 21 deletions src/esa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,20 @@ using namespace std;
#include "bench.h"
#include "esa.h"

using namespace sdsl;

// calculate suffix array using divsufsort
int_vector<VECBIT> getSa(char const *seq, size_t n) {
uint_vec getSa(char const *seq, size_t n) {
sauchar_t *t = (sauchar_t *)seq;
vector<saidx64_t> sa(n + 1);
if (divsufsort64(t, sa.data(), (saidx64_t)n) != 0) {
cout << "ERROR[esa]: suffix sorting failed." << endl;
exit(-1);
}
int_vector<VECBIT> ret(n+1);
uint_vec ret(n+1);
for (size_t i=0; i<n+1; i++)
ret[i] = sa[i];
if (!VECBIT)
util::bit_compress(ret);
#ifdef USE_SDSL
sdsl::util::bit_compress(ret);
#endif
return ret;
}

Expand Down Expand Up @@ -62,8 +61,9 @@ void calcLcp(Esa &esa) {
h--;
}
}
if (!VECBIT)
util::bit_compress(lcp);
#ifdef USE_SDSL
sdsl::util::bit_compress(lcp);
#endif
}

Esa::Esa(char const *seq, size_t len) : str(seq), n(len) {
Expand All @@ -73,11 +73,12 @@ Esa::Esa(char const *seq, size_t len) : str(seq), n(len) {
sa = getSa(seq, n);
tock("libdivsufsort");

isa = int_vector<VECBIT>(n+1);
isa = uint_vec(n+1);
for (size_t i = 0; i < n; i++)
isa[sa[i]] = i;
if (!VECBIT)
util::bit_compress(isa);
#ifdef USE_SDSL
sdsl::util::bit_compress(isa);
#endif

tick();
calcLcp(*this);
Expand All @@ -91,23 +92,23 @@ void Esa::print() const {
<< endl;
}

RMQ Esa::precomputeLcp() const { return RMQ(lcp); }
RMQ Esa::precomputeLcp() const { return RMQ(&lcp); }

int64_t Esa::getLcp(RMQ const &rmq, size_t sai, size_t saj) const {
if (sai == saj)
return this->n - sai;
size_t l = min(this->isa[sai], this->isa[saj]) + 1;
size_t r = max(this->isa[sai], this->isa[saj]);
return rmq(l, r);
return lcp[rmq(l, r)];
}

// reduce esa to half (seq+$+revseq+$ -> seq+$) without recomputing
// important! asserting that n and str are replaced by user!
void reduceEsa(Esa &esa) {
size_t n = esa.sa.size() / 2;
int_vector<VECBIT> sa(n);
int_vector<VECBIT> isa(n);
int_vector<VECBIT> lcp(n + 1);
uint_vec sa(n);
uint_vec isa(n);
uint_vec lcp(n + 1);
lcp[0] = lcp[n] = 0;
sa[0] = n - 1;
isa[n - 1] = 0;
Expand All @@ -127,11 +128,11 @@ void reduceEsa(Esa &esa) {
esa.sa.resize(0);
esa.isa.resize(0);
esa.lcp.resize(0);
if (!VECBIT) {
util::bit_compress(sa);
util::bit_compress(isa);
util::bit_compress(lcp);
}
#ifdef USE_SDSL
sdsl::util::bit_compress(sa);
sdsl::util::bit_compress(isa);
sdsl::util::bit_compress(lcp);
#endif
esa.sa = sa;
esa.isa = isa;
esa.lcp = lcp;
Expand Down
9 changes: 3 additions & 6 deletions src/esa.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@
#include "rmq.h"
#include <divsufsort64.h>

#include <sdsl/int_vector.hpp>
#include <sdsl/suffix_arrays.hpp>

/* define data container */
class Esa {
public:
Expand All @@ -20,9 +17,9 @@ class Esa {
int64_t getLcp(RMQ const &rmq, size_t sai, size_t saj) const;
void print() const;

sdsl::int_vector<VECBIT> sa; /* suffix array */
sdsl::int_vector<VECBIT> isa; /* inverse suffix array */
sdsl::int_vector<VECBIT> lcp; /* longest common prefix array */
uint_vec sa; /* suffix array */
uint_vec isa; /* inverse suffix array */
uint_vec lcp; /* longest common prefix array */
char const *str; /* pointer to underlying string */
size_t n; /* length of sa and lcp */
};
Expand Down
7 changes: 4 additions & 3 deletions src/factors.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
#include <cstddef>
#include <cstdint>
#include <vector>
#include <sdsl/int_vector.hpp>

#include "rmq.h"

/* a factorization of a string (LempelZiv or MatchLength) */
struct Fact {
void print() const;

sdsl::int_vector<VECBIT> fact; /* positions of factors */
sdsl::int_vector<VECBIT> lpf; /* lpf in case of Lempel-Ziv, otherwise empty */
uint_vec fact; /* positions of factors */
uint_vec lpf; /* lpf in case of Lempel-Ziv, otherwise empty */
std::vector<int64_t> prevOcc; /* LZ: prevOcc array (previous occurence of factor s_j) */

char const *str; /* string */
Expand Down
23 changes: 11 additions & 12 deletions src/lempelziv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@
#include <vector>
using namespace std;

#include <sdsl/int_vector.hpp>
using namespace sdsl;

// sum type used in lpf algorithm
struct Elem {
Elem(int64_t i, int64_t lcp);
Expand All @@ -38,12 +35,12 @@ Elem::Elem(int64_t a, int64_t b) : i(a), lcp(b) {}
* Computer Society, Los Alamitos, CA, 2008,
* pp. 482-488.
*/
void computeLpf(int_vector<VECBIT> &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
void computeLpf(uint_vec &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
size_t n = esa.n;
auto &sa = esa.sa;
auto &lcp = esa.lcp;

lpf = int_vector<VECBIT>(n + 1);
lpf = uint_vec(n + 1);
prevOcc = vector<int64_t>(n);

lpf[n] = 0;
Expand Down Expand Up @@ -73,14 +70,14 @@ void computeLpf(int_vector<VECBIT> &lpf, vector<int64_t> &prevOcc, Esa const &es
}

// alternative prevOcc calculation (from same paper)
void computeLpf2(int_vector<VECBIT> &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
void computeLpf2(uint_vec &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
size_t n = esa.n;
auto &sa = esa.sa;
vector<int64_t> lprev(n);
vector<int64_t> lnext(n);
vector<int64_t> prevl(n);
vector<int64_t> prevr(n);
lpf = int_vector<VECBIT>(n, 0);
lpf = uint_vec(n, 0);
vector<int64_t> lpfl(n, 0);
vector<int64_t> lpfr(n, 0);
prevOcc = vector<int64_t>(n);
Expand Down Expand Up @@ -134,18 +131,20 @@ void computeLZFact(Fact &lzf, Esa const &esa, bool alternative) {
computeLpf2(lzf.lpf, lzf.prevOcc, esa);
else
computeLpf(lzf.lpf, lzf.prevOcc, esa);
if (!VECBIT)
util::bit_compress(lzf.lpf);
#ifdef USE_SDSL
sdsl::util::bit_compress(lzf.lpf);
#endif

vector<size_t> lzftmp;
lzftmp.push_back(0);
while (lzftmp.back() < esa.n)
lzftmp.push_back(lzftmp.back() + max(1UL, (size_t)lzf.lpf[lzftmp.back()]));
lzftmp.pop_back();

lzf.fact = int_vector<VECBIT>(lzftmp.size());
lzf.fact = uint_vec(lzftmp.size());
for (size_t i=0; i<lzftmp.size(); i++)
lzf.fact[i] = lzftmp[i];
if (!VECBIT)
util::bit_compress(lzf.fact);
#ifdef USE_SDSL
sdsl::util::bit_compress(lzf.fact);
#endif
}
4 changes: 2 additions & 2 deletions src/matchlength.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include "shulen.h"
#include <algorithm>
#include <vector>
#include <sdsl/int_vector.hpp>
using namespace std;

void computeMLFact(Fact &mlf, Esa const &esa) {
Expand All @@ -34,6 +33,7 @@ void computeMLFact(Fact &mlf, Esa const &esa) {
mlf.fact.resize(factmp.size());
for (i=0; i<factmp.size(); i++)
mlf.fact[i] = factmp[i];
if (!VECBIT)
#ifdef USE_SDSL
sdsl::util::bit_compress(mlf.fact);
#endif
}
73 changes: 48 additions & 25 deletions src/rmq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,13 @@
#include <cmath>
#include <algorithm>
using namespace std;
#include <sdsl/int_vector.hpp>
using namespace sdsl;

#include "rmq.h"

#define BLOCKSIZE(n) ((size_t)(log2(n) / 4) + 1)
#define BLOCKNUM(n) (n / BLOCKSIZE(n) + 1)

int_vector<VECBIT> precomputePow2RMQ(int_vector<VECBIT> const &A) {
uint_vec precomputePow2RMQ(uint_vec const &A) {
auto n = A.size();
size_t row = log2(n) + 1;
vector<int64_t> B(n * row);
Expand All @@ -30,15 +28,16 @@ int_vector<VECBIT> precomputePow2RMQ(int_vector<VECBIT> const &A) {
B[i * row + j] = B[i2 * row + (j - 1)];
}
}
int_vector<VECBIT> ret(B.size());
uint_vec ret(B.size());
for (size_t i=0; i<B.size(); i++)
ret[i] = B[i];
if (!VECBIT)
util::bit_compress(ret);
#ifdef USE_SDSL
sdsl::util::bit_compress(ret);
#endif
return ret;
}

int64_t getRMQwithPow2(int_vector<VECBIT> const &A, int_vector<VECBIT> const &B, size_t l,
size_t getRMQwithPow2(uint_vec const &A, uint_vec const &B, size_t l,
size_t r) {
auto n = A.size();
assert(l <= r);
Expand All @@ -53,16 +52,18 @@ int64_t getRMQwithPow2(int_vector<VECBIT> const &A, int_vector<VECBIT> const &B,
range >>= 1;
power--;
size_t l2 = l + (diff - range);
int64_t cand1 = A[B[l * row + power] - 1];
int64_t cand2 = A[B[l2 * row + power] - 1];
return min(cand1, cand2);
size_t cand1 = B[l * row + power] - 1;
size_t cand2 = B[l2 * row + power] - 1;
if (A[cand1] <= A[cand2])
return cand1;
return cand2;
}

int_vector<VECBIT> precomputeBlockRMQ(sdsl::int_vector<VECBIT> const &A) {
uint_vec precomputeBlockRMQ(uint_vec const &A) {
auto n = A.size();
size_t bSz = BLOCKSIZE(n);
size_t bNum = BLOCKNUM(n);
int_vector<VECBIT> B(bNum);
uint_vec B(bNum);
for (size_t i = 0; i < bNum; i++) {
size_t next = min((i + 1) * bSz, n);
size_t min = SIZE_MAX;
Expand All @@ -71,47 +72,69 @@ int_vector<VECBIT> precomputeBlockRMQ(sdsl::int_vector<VECBIT> const &A) {
min = A[j];
B[i] = min;
}
if (!VECBIT)
util::bit_compress(B);
#ifdef USE_SDSL
sdsl::util::bit_compress(B);
#endif
return B;
}

RMQ::RMQ(sdsl::int_vector<VECBIT> const &A) : arr(&A) {
this->btab = precomputeBlockRMQ(A);
RMQ_custom::RMQ_custom(uint_vec const *A) : arr(A) {
this->btab = precomputeBlockRMQ(*A);
this->ptab = precomputePow2RMQ(this->btab);
}

size_t RMQ::operator()(size_t l, size_t r) const {
size_t RMQ_custom::operator()(size_t l, size_t r) const {
auto &A = *arr;
auto n = arr->size();
assert(l <= r);
if (l == r)
return A[l];
return l;

size_t bSz = BLOCKSIZE(n);

size_t lblock = l / bSz;
size_t rblock = r / bSz;
if (lblock == rblock) {
size_t min = SIZE_MAX;
size_t minind=0;
for (size_t i = l; i <= r; i++)
if (A[i] < min)
if (A[i] < min) {
min = A[i];
return min;
minind = i;
}
return minind;
}

size_t lmin = SIZE_MAX;
size_t rmin = SIZE_MAX;
size_t medmin = SIZE_MAX;
size_t lminind=0, rminind=0, medminind=0;

size_t nextbs = min((lblock + 1) * bSz, n);
for (size_t i = l; i < nextbs; i++)
if (A[i] < lmin)
if (A[i] < lmin) {
lmin = A[i];
lminind = i;
}
for (size_t i = rblock * bSz; i <= r; i++)
if (A[i] < rmin)
if (A[i] < rmin) {
rmin = A[i];
if (rblock - lblock > 1)
medmin = getRMQwithPow2(btab, ptab, lblock + 1, rblock - 1);
return min(medmin, min(lmin, rmin));
rminind = i;
}
if (rblock - lblock > 1) {
size_t btabind = getRMQwithPow2(btab, ptab, lblock + 1, rblock - 1);
medmin = btab[btabind];
nextbs = min((btabind+1)*bSz, n);
for (size_t i = btabind*bSz; i < nextbs; i++)
if (A[i] == medmin) {
medminind = i;
break;
}
}

if (lmin <= medmin && lmin <= rmin)
return lminind;
if (medmin <= rmin)
return medminind;
return rminind;
}
Loading

0 comments on commit 453839d

Please sign in to comment.