make RMQ return index instead of value, make USE_SDSL a compile-time …

…option
EvolBioInf · Jun 9, 2016 · 453839d · 453839d
1 parent 8c7f0dc
commit 453839d
Show file tree

Hide file tree

Showing 11 changed files with 125 additions and 92 deletions.
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 CXX := g++
-CFLAGS := -DVECBIT=0 -std=c++11 -Isrc -Ilibdivsufsort/include -Igsl/include -Isdsl/include -Wall -Wextra -O2 -g -ggdb -Wno-missing-braces -funroll-loops # -pg -Wshadow
+CFLAGS := -DUSE_SDSL -std=c++11 -Isrc -Ilibdivsufsort/include -Igsl/include -Isdsl/include -Wall -Wextra -O2 -g -ggdb -Wno-missing-braces -funroll-loops # -pg -Wshadow
 LDFLAGS := -lm -lgsl -lgslcblas -lblas -ldivsufsort -ldivsufsort64 -lsdsl -Llibdivsufsort/lib -Lgsl/lib -Lsdsl/lib # -pg
 TARGET := dnalc
 

diff --git a/src/esa.cpp b/src/esa.cpp
@@ -19,21 +19,20 @@ using namespace std;
 #include "bench.h"
 #include "esa.h"
 
-using namespace sdsl;
-
 // calculate suffix array using divsufsort
-int_vector<VECBIT> getSa(char const *seq, size_t n) {
+uint_vec getSa(char const *seq, size_t n) {
   sauchar_t *t = (sauchar_t *)seq;
   vector<saidx64_t> sa(n + 1);
   if (divsufsort64(t, sa.data(), (saidx64_t)n) != 0) {
     cout << "ERROR[esa]: suffix sorting failed." << endl;
     exit(-1);
   }
-  int_vector<VECBIT> ret(n+1);
+  uint_vec ret(n+1);
   for (size_t i=0; i<n+1; i++)
     ret[i] = sa[i];
-  if (!VECBIT)
-    util::bit_compress(ret);
+#ifdef USE_SDSL
+    sdsl::util::bit_compress(ret);
+#endif
   return ret;
 }
 
@@ -62,8 +61,9 @@ void calcLcp(Esa &esa) {
         h--;
     }
   }
-  if (!VECBIT)
-    util::bit_compress(lcp);
+#ifdef USE_SDSL
+  sdsl::util::bit_compress(lcp);
+#endif
 }
 
 Esa::Esa(char const *seq, size_t len) : str(seq), n(len) {
@@ -73,11 +73,12 @@ Esa::Esa(char const *seq, size_t len) : str(seq), n(len) {
   sa = getSa(seq, n);
   tock("libdivsufsort");
 
-  isa = int_vector<VECBIT>(n+1);
+  isa = uint_vec(n+1);
   for (size_t i = 0; i < n; i++)
     isa[sa[i]] = i;
-  if (!VECBIT)
-    util::bit_compress(isa);
+#ifdef USE_SDSL
+  sdsl::util::bit_compress(isa);
+#endif
 
   tick();
   calcLcp(*this);
@@ -91,23 +92,23 @@ void Esa::print() const {
          << endl;
 }
 
-RMQ Esa::precomputeLcp() const { return RMQ(lcp); }
+RMQ Esa::precomputeLcp() const { return RMQ(&lcp); }
 
 int64_t Esa::getLcp(RMQ const &rmq, size_t sai, size_t saj) const {
   if (sai == saj)
     return this->n - sai;
   size_t l = min(this->isa[sai], this->isa[saj]) + 1;
   size_t r = max(this->isa[sai], this->isa[saj]);
-  return rmq(l, r);
+  return lcp[rmq(l, r)];
 }
 
 // reduce esa to half (seq+$+revseq+$ -> seq+$) without recomputing
 // important! asserting that n and str are replaced by user!
 void reduceEsa(Esa &esa) {
   size_t n = esa.sa.size() / 2;
-  int_vector<VECBIT> sa(n);
-  int_vector<VECBIT> isa(n);
-  int_vector<VECBIT> lcp(n + 1);
+  uint_vec sa(n);
+  uint_vec isa(n);
+  uint_vec lcp(n + 1);
   lcp[0] = lcp[n] = 0;
   sa[0] = n - 1;
   isa[n - 1] = 0;
@@ -127,11 +128,11 @@ void reduceEsa(Esa &esa) {
   esa.sa.resize(0);
   esa.isa.resize(0);
   esa.lcp.resize(0);
-  if (!VECBIT) {
-    util::bit_compress(sa);
-    util::bit_compress(isa);
-    util::bit_compress(lcp);
-  }
+#ifdef USE_SDSL
+  sdsl::util::bit_compress(sa);
+  sdsl::util::bit_compress(isa);
+  sdsl::util::bit_compress(lcp);
+#endif
   esa.sa = sa;
   esa.isa = isa;
   esa.lcp = lcp;

diff --git a/src/esa.h b/src/esa.h
@@ -9,9 +9,6 @@
 #include "rmq.h"
 #include <divsufsort64.h>
 
-#include <sdsl/int_vector.hpp>
-#include <sdsl/suffix_arrays.hpp>
-
 /* define data container */
 class Esa {
 public:
@@ -20,9 +17,9 @@ class Esa {
   int64_t getLcp(RMQ const &rmq, size_t sai, size_t saj) const;
   void print() const;
 
-  sdsl::int_vector<VECBIT> sa;    /* suffix array */
-  sdsl::int_vector<VECBIT> isa;   /* inverse suffix array */
-  sdsl::int_vector<VECBIT> lcp;   /* longest common prefix array */
+  uint_vec sa;    /* suffix array */
+  uint_vec isa;   /* inverse suffix array */
+  uint_vec lcp;   /* longest common prefix array */
   char const *str;            /* pointer to underlying string */
   size_t n;                   /* length of sa and lcp */
 };

diff --git a/src/factors.h b/src/factors.h
@@ -2,14 +2,15 @@
 #include <cstddef>
 #include <cstdint>
 #include <vector>
-#include <sdsl/int_vector.hpp>
+
+#include "rmq.h"
 
 /* a factorization of a string (LempelZiv or MatchLength) */
 struct Fact {
   void print() const;
 
-  sdsl::int_vector<VECBIT> fact;     /* positions of factors */
-  sdsl::int_vector<VECBIT> lpf;      /* lpf in case of Lempel-Ziv, otherwise empty */
+  uint_vec fact;     /* positions of factors */
+  uint_vec lpf;      /* lpf in case of Lempel-Ziv, otherwise empty */
   std::vector<int64_t> prevOcc; /* LZ: prevOcc array (previous occurence of factor s_j) */
 
   char const *str; /* string */

diff --git a/src/lempelziv.cpp b/src/lempelziv.cpp
@@ -18,9 +18,6 @@
 #include <vector>
 using namespace std;
 
-#include <sdsl/int_vector.hpp>
-using namespace sdsl;
-
 // sum type used in lpf algorithm
 struct Elem {
   Elem(int64_t i, int64_t lcp);
@@ -38,12 +35,12 @@ Elem::Elem(int64_t a, int64_t b) : i(a), lcp(b) {}
  *   Computer Society, Los Alamitos, CA, 2008,
  *   pp. 482-488.
  */
-void computeLpf(int_vector<VECBIT> &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
+void computeLpf(uint_vec &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
   size_t n = esa.n;
   auto &sa = esa.sa;
   auto &lcp = esa.lcp;
 
-  lpf = int_vector<VECBIT>(n + 1);
+  lpf = uint_vec(n + 1);
   prevOcc = vector<int64_t>(n);
 
   lpf[n] = 0;
@@ -73,14 +70,14 @@ void computeLpf(int_vector<VECBIT> &lpf, vector<int64_t> &prevOcc, Esa const &es
 }
 
 // alternative prevOcc calculation (from same paper)
-void computeLpf2(int_vector<VECBIT> &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
+void computeLpf2(uint_vec &lpf, vector<int64_t> &prevOcc, Esa const &esa) {
   size_t n = esa.n;
   auto &sa = esa.sa;
   vector<int64_t> lprev(n);
   vector<int64_t> lnext(n);
   vector<int64_t> prevl(n);
   vector<int64_t> prevr(n);
-  lpf = int_vector<VECBIT>(n, 0);
+  lpf = uint_vec(n, 0);
   vector<int64_t> lpfl(n, 0);
   vector<int64_t> lpfr(n, 0);
   prevOcc = vector<int64_t>(n);
@@ -134,18 +131,20 @@ void computeLZFact(Fact &lzf, Esa const &esa, bool alternative) {
     computeLpf2(lzf.lpf, lzf.prevOcc, esa);
   else
     computeLpf(lzf.lpf, lzf.prevOcc, esa);
-  if (!VECBIT)
-    util::bit_compress(lzf.lpf);
+#ifdef USE_SDSL
+  sdsl::util::bit_compress(lzf.lpf);
+#endif
 
   vector<size_t> lzftmp;
   lzftmp.push_back(0);
   while (lzftmp.back() < esa.n)
     lzftmp.push_back(lzftmp.back() + max(1UL, (size_t)lzf.lpf[lzftmp.back()]));
   lzftmp.pop_back();
 
-  lzf.fact = int_vector<VECBIT>(lzftmp.size());
+  lzf.fact = uint_vec(lzftmp.size());
   for (size_t i=0; i<lzftmp.size(); i++)
     lzf.fact[i] = lzftmp[i];
-  if (!VECBIT)
-    util::bit_compress(lzf.fact);
+#ifdef USE_SDSL
+  sdsl::util::bit_compress(lzf.fact);
+#endif
 }
diff --git a/src/matchlength.cpp b/src/matchlength.cpp
@@ -7,7 +7,6 @@
 #include "shulen.h"
 #include <algorithm>
 #include <vector>
-#include <sdsl/int_vector.hpp>
 using namespace std;
 
 void computeMLFact(Fact &mlf, Esa const &esa) {
@@ -34,6 +33,7 @@ void computeMLFact(Fact &mlf, Esa const &esa) {
   mlf.fact.resize(factmp.size());
   for (i=0; i<factmp.size(); i++)
     mlf.fact[i] = factmp[i];
-  if (!VECBIT)
+#ifdef USE_SDSL
     sdsl::util::bit_compress(mlf.fact);
+#endif
 }
diff --git a/src/rmq.cpp b/src/rmq.cpp
@@ -2,15 +2,13 @@
 #include <cmath>
 #include <algorithm>
 using namespace std;
-#include <sdsl/int_vector.hpp>
-using namespace sdsl;
 
 #include "rmq.h"
 
 #define BLOCKSIZE(n) ((size_t)(log2(n) / 4) + 1)
 #define BLOCKNUM(n) (n / BLOCKSIZE(n) + 1)
 
-int_vector<VECBIT> precomputePow2RMQ(int_vector<VECBIT> const &A) {
+uint_vec precomputePow2RMQ(uint_vec const &A) {
   auto n = A.size();
   size_t row = log2(n) + 1;
   vector<int64_t> B(n * row);
@@ -30,15 +28,16 @@ int_vector<VECBIT> precomputePow2RMQ(int_vector<VECBIT> const &A) {
         B[i * row + j] = B[i2 * row + (j - 1)];
     }
   }
-  int_vector<VECBIT> ret(B.size());
+  uint_vec ret(B.size());
   for (size_t i=0; i<B.size(); i++)
     ret[i] = B[i];
-  if (!VECBIT)
-    util::bit_compress(ret);
+#ifdef USE_SDSL
+  sdsl::util::bit_compress(ret);
+#endif
   return ret;
 }
 
-int64_t getRMQwithPow2(int_vector<VECBIT> const &A, int_vector<VECBIT> const &B, size_t l,
+size_t getRMQwithPow2(uint_vec const &A, uint_vec const &B, size_t l,
                        size_t r) {
   auto n = A.size();
   assert(l <= r);
@@ -53,16 +52,18 @@ int64_t getRMQwithPow2(int_vector<VECBIT> const &A, int_vector<VECBIT> const &B,
   range >>= 1;
   power--;
   size_t l2 = l + (diff - range);
-  int64_t cand1 = A[B[l * row + power] - 1];
-  int64_t cand2 = A[B[l2 * row + power] - 1];
-  return min(cand1, cand2);
+  size_t cand1 = B[l * row + power] - 1;
+  size_t cand2 = B[l2 * row + power] - 1;
+  if (A[cand1] <= A[cand2])
+    return cand1;
+  return cand2;
 }
 
-int_vector<VECBIT> precomputeBlockRMQ(sdsl::int_vector<VECBIT> const &A) {
+uint_vec precomputeBlockRMQ(uint_vec const &A) {
   auto n = A.size();
   size_t bSz = BLOCKSIZE(n);
   size_t bNum = BLOCKNUM(n);
-  int_vector<VECBIT> B(bNum);
+  uint_vec B(bNum);
   for (size_t i = 0; i < bNum; i++) {
     size_t next = min((i + 1) * bSz, n);
     size_t min = SIZE_MAX;
@@ -71,47 +72,69 @@ int_vector<VECBIT> precomputeBlockRMQ(sdsl::int_vector<VECBIT> const &A) {
         min = A[j];
     B[i] = min;
   }
-  if (!VECBIT)
-    util::bit_compress(B);
+#ifdef USE_SDSL
+  sdsl::util::bit_compress(B);
+#endif
   return B;
 }
 
-RMQ::RMQ(sdsl::int_vector<VECBIT> const &A) : arr(&A) {
-  this->btab = precomputeBlockRMQ(A);
+RMQ_custom::RMQ_custom(uint_vec const *A) : arr(A) {
+  this->btab = precomputeBlockRMQ(*A);
   this->ptab = precomputePow2RMQ(this->btab);
 }
 
-size_t RMQ::operator()(size_t l, size_t r) const {
+size_t RMQ_custom::operator()(size_t l, size_t r) const {
   auto &A = *arr;
   auto n = arr->size();
   assert(l <= r);
   if (l == r)
-    return A[l];
+    return l;
 
   size_t bSz = BLOCKSIZE(n);
 
   size_t lblock = l / bSz;
   size_t rblock = r / bSz;
   if (lblock == rblock) {
     size_t min = SIZE_MAX;
+    size_t minind=0;
     for (size_t i = l; i <= r; i++)
-      if (A[i] < min)
+      if (A[i] < min) {
         min = A[i];
-    return min;
+        minind = i;
+      }
+    return minind;
   }
 
   size_t lmin = SIZE_MAX;
   size_t rmin = SIZE_MAX;
   size_t medmin = SIZE_MAX;
+  size_t lminind=0, rminind=0, medminind=0;
 
   size_t nextbs = min((lblock + 1) * bSz, n);
   for (size_t i = l; i < nextbs; i++)
-    if (A[i] < lmin)
+    if (A[i] < lmin) {
       lmin = A[i];
+      lminind = i;
+    }
   for (size_t i = rblock * bSz; i <= r; i++)
-    if (A[i] < rmin)
+    if (A[i] < rmin) {
       rmin = A[i];
-  if (rblock - lblock > 1)
-    medmin = getRMQwithPow2(btab, ptab, lblock + 1, rblock - 1);
-  return min(medmin, min(lmin, rmin));
+      rminind = i;
+    }
+  if (rblock - lblock > 1) {
+    size_t btabind = getRMQwithPow2(btab, ptab, lblock + 1, rblock - 1);
+    medmin = btab[btabind];
+    nextbs = min((btabind+1)*bSz, n);
+    for (size_t i = btabind*bSz; i < nextbs; i++)
+      if (A[i] == medmin) {
+        medminind = i;
+        break;
+      }
+  }
+
+  if (lmin <= medmin && lmin <= rmin)
+    return lminind;
+  if (medmin <= rmin)
+    return medminind;
+  return rminind;
 }