diff --git a/Makefile b/Makefile index 3d1321f..f65589a 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CXX := g++ -CFLAGS := -DVECBIT=0 -std=c++11 -Isrc -Ilibdivsufsort/include -Igsl/include -Isdsl/include -Wall -Wextra -O2 -g -ggdb -Wno-missing-braces -funroll-loops # -pg -Wshadow +CFLAGS := -DUSE_SDSL -std=c++11 -Isrc -Ilibdivsufsort/include -Igsl/include -Isdsl/include -Wall -Wextra -O2 -g -ggdb -Wno-missing-braces -funroll-loops # -pg -Wshadow LDFLAGS := -lm -lgsl -lgslcblas -lblas -ldivsufsort -ldivsufsort64 -lsdsl -Llibdivsufsort/lib -Lgsl/lib -Lsdsl/lib # -pg TARGET := dnalc diff --git a/src/esa.cpp b/src/esa.cpp index 79e4e38..89d0f25 100644 --- a/src/esa.cpp +++ b/src/esa.cpp @@ -19,21 +19,20 @@ using namespace std; #include "bench.h" #include "esa.h" -using namespace sdsl; - // calculate suffix array using divsufsort -int_vector getSa(char const *seq, size_t n) { +uint_vec getSa(char const *seq, size_t n) { sauchar_t *t = (sauchar_t *)seq; vector sa(n + 1); if (divsufsort64(t, sa.data(), (saidx64_t)n) != 0) { cout << "ERROR[esa]: suffix sorting failed." << endl; exit(-1); } - int_vector ret(n+1); + uint_vec ret(n+1); for (size_t i=0; i(n+1); + isa = uint_vec(n+1); for (size_t i = 0; i < n; i++) isa[sa[i]] = i; - if (!VECBIT) - util::bit_compress(isa); +#ifdef USE_SDSL + sdsl::util::bit_compress(isa); +#endif tick(); calcLcp(*this); @@ -91,23 +92,23 @@ void Esa::print() const { << endl; } -RMQ Esa::precomputeLcp() const { return RMQ(lcp); } +RMQ Esa::precomputeLcp() const { return RMQ(&lcp); } int64_t Esa::getLcp(RMQ const &rmq, size_t sai, size_t saj) const { if (sai == saj) return this->n - sai; size_t l = min(this->isa[sai], this->isa[saj]) + 1; size_t r = max(this->isa[sai], this->isa[saj]); - return rmq(l, r); + return lcp[rmq(l, r)]; } // reduce esa to half (seq+$+revseq+$ -> seq+$) without recomputing // important! asserting that n and str are replaced by user! void reduceEsa(Esa &esa) { size_t n = esa.sa.size() / 2; - int_vector sa(n); - int_vector isa(n); - int_vector lcp(n + 1); + uint_vec sa(n); + uint_vec isa(n); + uint_vec lcp(n + 1); lcp[0] = lcp[n] = 0; sa[0] = n - 1; isa[n - 1] = 0; @@ -127,11 +128,11 @@ void reduceEsa(Esa &esa) { esa.sa.resize(0); esa.isa.resize(0); esa.lcp.resize(0); - if (!VECBIT) { - util::bit_compress(sa); - util::bit_compress(isa); - util::bit_compress(lcp); - } +#ifdef USE_SDSL + sdsl::util::bit_compress(sa); + sdsl::util::bit_compress(isa); + sdsl::util::bit_compress(lcp); +#endif esa.sa = sa; esa.isa = isa; esa.lcp = lcp; diff --git a/src/esa.h b/src/esa.h index 33ea345..b6f08ef 100644 --- a/src/esa.h +++ b/src/esa.h @@ -9,9 +9,6 @@ #include "rmq.h" #include -#include -#include - /* define data container */ class Esa { public: @@ -20,9 +17,9 @@ class Esa { int64_t getLcp(RMQ const &rmq, size_t sai, size_t saj) const; void print() const; - sdsl::int_vector sa; /* suffix array */ - sdsl::int_vector isa; /* inverse suffix array */ - sdsl::int_vector lcp; /* longest common prefix array */ + uint_vec sa; /* suffix array */ + uint_vec isa; /* inverse suffix array */ + uint_vec lcp; /* longest common prefix array */ char const *str; /* pointer to underlying string */ size_t n; /* length of sa and lcp */ }; diff --git a/src/factors.h b/src/factors.h index f946785..7a72fef 100644 --- a/src/factors.h +++ b/src/factors.h @@ -2,14 +2,15 @@ #include #include #include -#include + +#include "rmq.h" /* a factorization of a string (LempelZiv or MatchLength) */ struct Fact { void print() const; - sdsl::int_vector fact; /* positions of factors */ - sdsl::int_vector lpf; /* lpf in case of Lempel-Ziv, otherwise empty */ + uint_vec fact; /* positions of factors */ + uint_vec lpf; /* lpf in case of Lempel-Ziv, otherwise empty */ std::vector prevOcc; /* LZ: prevOcc array (previous occurence of factor s_j) */ char const *str; /* string */ diff --git a/src/lempelziv.cpp b/src/lempelziv.cpp index efb65b6..aa347e9 100644 --- a/src/lempelziv.cpp +++ b/src/lempelziv.cpp @@ -18,9 +18,6 @@ #include using namespace std; -#include -using namespace sdsl; - // sum type used in lpf algorithm struct Elem { Elem(int64_t i, int64_t lcp); @@ -38,12 +35,12 @@ Elem::Elem(int64_t a, int64_t b) : i(a), lcp(b) {} * Computer Society, Los Alamitos, CA, 2008, * pp. 482-488. */ -void computeLpf(int_vector &lpf, vector &prevOcc, Esa const &esa) { +void computeLpf(uint_vec &lpf, vector &prevOcc, Esa const &esa) { size_t n = esa.n; auto &sa = esa.sa; auto &lcp = esa.lcp; - lpf = int_vector(n + 1); + lpf = uint_vec(n + 1); prevOcc = vector(n); lpf[n] = 0; @@ -73,14 +70,14 @@ void computeLpf(int_vector &lpf, vector &prevOcc, Esa const &es } // alternative prevOcc calculation (from same paper) -void computeLpf2(int_vector &lpf, vector &prevOcc, Esa const &esa) { +void computeLpf2(uint_vec &lpf, vector &prevOcc, Esa const &esa) { size_t n = esa.n; auto &sa = esa.sa; vector lprev(n); vector lnext(n); vector prevl(n); vector prevr(n); - lpf = int_vector(n, 0); + lpf = uint_vec(n, 0); vector lpfl(n, 0); vector lpfr(n, 0); prevOcc = vector(n); @@ -134,8 +131,9 @@ void computeLZFact(Fact &lzf, Esa const &esa, bool alternative) { computeLpf2(lzf.lpf, lzf.prevOcc, esa); else computeLpf(lzf.lpf, lzf.prevOcc, esa); - if (!VECBIT) - util::bit_compress(lzf.lpf); +#ifdef USE_SDSL + sdsl::util::bit_compress(lzf.lpf); +#endif vector lzftmp; lzftmp.push_back(0); @@ -143,9 +141,10 @@ void computeLZFact(Fact &lzf, Esa const &esa, bool alternative) { lzftmp.push_back(lzftmp.back() + max(1UL, (size_t)lzf.lpf[lzftmp.back()])); lzftmp.pop_back(); - lzf.fact = int_vector(lzftmp.size()); + lzf.fact = uint_vec(lzftmp.size()); for (size_t i=0; i #include -#include using namespace std; void computeMLFact(Fact &mlf, Esa const &esa) { @@ -34,6 +33,7 @@ void computeMLFact(Fact &mlf, Esa const &esa) { mlf.fact.resize(factmp.size()); for (i=0; i #include using namespace std; -#include -using namespace sdsl; #include "rmq.h" #define BLOCKSIZE(n) ((size_t)(log2(n) / 4) + 1) #define BLOCKNUM(n) (n / BLOCKSIZE(n) + 1) -int_vector precomputePow2RMQ(int_vector const &A) { +uint_vec precomputePow2RMQ(uint_vec const &A) { auto n = A.size(); size_t row = log2(n) + 1; vector B(n * row); @@ -30,15 +28,16 @@ int_vector precomputePow2RMQ(int_vector const &A) { B[i * row + j] = B[i2 * row + (j - 1)]; } } - int_vector ret(B.size()); + uint_vec ret(B.size()); for (size_t i=0; i const &A, int_vector const &B, size_t l, +size_t getRMQwithPow2(uint_vec const &A, uint_vec const &B, size_t l, size_t r) { auto n = A.size(); assert(l <= r); @@ -53,16 +52,18 @@ int64_t getRMQwithPow2(int_vector const &A, int_vector const &B, range >>= 1; power--; size_t l2 = l + (diff - range); - int64_t cand1 = A[B[l * row + power] - 1]; - int64_t cand2 = A[B[l2 * row + power] - 1]; - return min(cand1, cand2); + size_t cand1 = B[l * row + power] - 1; + size_t cand2 = B[l2 * row + power] - 1; + if (A[cand1] <= A[cand2]) + return cand1; + return cand2; } -int_vector precomputeBlockRMQ(sdsl::int_vector const &A) { +uint_vec precomputeBlockRMQ(uint_vec const &A) { auto n = A.size(); size_t bSz = BLOCKSIZE(n); size_t bNum = BLOCKNUM(n); - int_vector B(bNum); + uint_vec B(bNum); for (size_t i = 0; i < bNum; i++) { size_t next = min((i + 1) * bSz, n); size_t min = SIZE_MAX; @@ -71,22 +72,23 @@ int_vector precomputeBlockRMQ(sdsl::int_vector const &A) { min = A[j]; B[i] = min; } - if (!VECBIT) - util::bit_compress(B); +#ifdef USE_SDSL + sdsl::util::bit_compress(B); +#endif return B; } -RMQ::RMQ(sdsl::int_vector const &A) : arr(&A) { - this->btab = precomputeBlockRMQ(A); +RMQ_custom::RMQ_custom(uint_vec const *A) : arr(A) { + this->btab = precomputeBlockRMQ(*A); this->ptab = precomputePow2RMQ(this->btab); } -size_t RMQ::operator()(size_t l, size_t r) const { +size_t RMQ_custom::operator()(size_t l, size_t r) const { auto &A = *arr; auto n = arr->size(); assert(l <= r); if (l == r) - return A[l]; + return l; size_t bSz = BLOCKSIZE(n); @@ -94,24 +96,45 @@ size_t RMQ::operator()(size_t l, size_t r) const { size_t rblock = r / bSz; if (lblock == rblock) { size_t min = SIZE_MAX; + size_t minind=0; for (size_t i = l; i <= r; i++) - if (A[i] < min) + if (A[i] < min) { min = A[i]; - return min; + minind = i; + } + return minind; } size_t lmin = SIZE_MAX; size_t rmin = SIZE_MAX; size_t medmin = SIZE_MAX; + size_t lminind=0, rminind=0, medminind=0; size_t nextbs = min((lblock + 1) * bSz, n); for (size_t i = l; i < nextbs; i++) - if (A[i] < lmin) + if (A[i] < lmin) { lmin = A[i]; + lminind = i; + } for (size_t i = rblock * bSz; i <= r; i++) - if (A[i] < rmin) + if (A[i] < rmin) { rmin = A[i]; - if (rblock - lblock > 1) - medmin = getRMQwithPow2(btab, ptab, lblock + 1, rblock - 1); - return min(medmin, min(lmin, rmin)); + rminind = i; + } + if (rblock - lblock > 1) { + size_t btabind = getRMQwithPow2(btab, ptab, lblock + 1, rblock - 1); + medmin = btab[btabind]; + nextbs = min((btabind+1)*bSz, n); + for (size_t i = btabind*bSz; i < nextbs; i++) + if (A[i] == medmin) { + medminind = i; + break; + } + } + + if (lmin <= medmin && lmin <= rmin) + return lminind; + if (medmin <= rmin) + return medminind; + return rminind; } diff --git a/src/rmq.h b/src/rmq.h index 0bd807e..dc70f41 100644 --- a/src/rmq.h +++ b/src/rmq.h @@ -3,20 +3,35 @@ #include #include +#ifdef USE_SDSL #include +#include +#ifndef VECBIT +#define VECBIT 0 +#endif +typedef sdsl::int_vector uint_vec; +#else +typedef std::vector uint_vec; +#endif -class RMQ { +class RMQ_custom { public: - RMQ(sdsl::int_vector const &A); + RMQ_custom(uint_vec const *A); size_t operator()(size_t l, size_t r) const; - sdsl::int_vector const *arr; + uint_vec const *arr; - sdsl::int_vector btab; - sdsl::int_vector ptab; + uint_vec btab; + uint_vec ptab; }; -sdsl::int_vector precomputePow2RMQ(std::vector const &A); +uint_vec precomputePow2RMQ(uint_vec const &A); int64_t getRMQwithPow2(std::vector const &A, std::vector const &B, size_t l, size_t r); -sdsl::int_vector precomputeBlockRMQ(std::vector const &A); +uint_vec precomputeBlockRMQ(uint_vec const &A); + +#ifdef USE_SDSL +typedef sdsl::rmq_succinct_sct<> RMQ; +#else +typedef RMQ_custom RMQ; +#endif diff --git a/tests/esa_tests.cpp b/tests/esa_tests.cpp index 5eadebd..a34659b 100644 --- a/tests/esa_tests.cpp +++ b/tests/esa_tests.cpp @@ -52,8 +52,8 @@ void test_revEsaRnd() { Esa resa(srev, n); auto rmq = resa.precomputeLcp(); - /* printEsa(esa); */ - /* printEsa(resa); */ + // esa.print(); + // resa.print(); for (size_t i = 0; i < n - 1; i++) for (size_t j = i + 1; j < n; j++) { @@ -61,7 +61,7 @@ void test_revEsaRnd() { int64_t obs = resa.getLcp(rmq, n - i - 1, n - j - 1); if (exp != obs) printf("%zu %zu -> %zu %zu\n", i, j, n - i - 1, n - j - 1); - mu_assert_eq(exp, obs, "lcs does not match"); + mu_assert_eq(exp, obs, "lcs does not match at pos " << i); } } diff --git a/tests/lempelziv_tests.cpp b/tests/lempelziv_tests.cpp index 96b5a09..df0b0ed 100644 --- a/tests/lempelziv_tests.cpp +++ b/tests/lempelziv_tests.cpp @@ -83,7 +83,7 @@ void test_randomSequence() { int64_t po = lzf.prevOcc[lzf.fact[i]]; bool tmp = po <= (int64_t)lzf.fact[i]; if (!tmp) - printf("prev: %ld curr: %ld\n", po, lzf.fact[i]); + printf("prev: %ld curr: %ld\n", po, (size_t)lzf.fact[i]); mu_assert(tmp, "prev can not be after"); if (po >= 0) mu_assert(!strncmp(esa.str + lzf.fact[i], esa.str + po, factLen(lzf, i)), diff --git a/tests/rmq_tests.cpp b/tests/rmq_tests.cpp index 6c906df..b8f09e4 100644 --- a/tests/rmq_tests.cpp +++ b/tests/rmq_tests.cpp @@ -2,19 +2,16 @@ #include "minunit.h" #include "rmq.h" -#include -using namespace sdsl; - void test_rmqSmall() { size_t n = 25; - int_vector array(n); + uint_vec array(n); for (size_t i = 0; i < n; i++) { array[i] = rand() % n; /* printf("%ld ", array[i]); */ } /* printf("\n\n"); */ - RMQ rmq(array); + RMQ rmq(&array); // rmq_succinct_sct<> rmq(&array); for (size_t i = 0; i < n; i++) @@ -25,20 +22,20 @@ void test_rmqSmall() { exp = array[k]; } - // int64_t obs = array[rmq(i, j)]; - int64_t obs = rmq(i, j); + int64_t obs = array[rmq(i, j)]; + // int64_t obs = rmq(i, j); mu_assert_eq(exp, obs, "wrong range minimum (tried " << i << " and " << j << ")"); } } void test_rmqRand() { size_t n = 10000; - sdsl::int_vector array(n); + uint_vec array(n); for (size_t i = 0; i < n; i++) { array[i] = rand() % n; } - RMQ rmq(array); + RMQ rmq(&array); // rmq_succinct_sct<> rmq(&array); for (size_t i = 0; i < 1000; i++) { @@ -49,8 +46,8 @@ void test_rmqRand() { if ((int64_t)array[k] < exp) exp = array[k]; - // int64_t obs = array[rmq(l, r)]; - int64_t obs = rmq(l, r); + int64_t obs = array[rmq(l, r)]; + // int64_t obs = rmq(l, r); mu_assert_eq(exp, obs, "wrong range minimum (tried " << l << " and " << r << ")"); } }