Skip to content

Commit

Permalink
Added the MTF generation of the BWT for testcase of self - delimiting
Browse files Browse the repository at this point in the history
codes benchmark.
  • Loading branch information
wayne authored and wayne committed Aug 25, 2014
1 parent da70b32 commit 1c9902b
Show file tree
Hide file tree
Showing 18 changed files with 1,082 additions and 13 deletions.
83 changes: 83 additions & 0 deletions benchmark/self_delimiting_codes/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
include ../../Make.helper
SRC_DIR = src
BIN_DIR = bin
LIBS = -lsdsl

# Comments are kept on their own lines: with a trailing "#comment" the
# whitespace in front of the '#' would become part of the variable value.
#result file of benchmark
RES_FILE = results/result.csv
#vector assignment table (vector name -> sdsl type)
VAT_FILE = results/vat.csv
#test case table (contains only test case names)
TC_FILE = results/tc.csv

#utility
empty:=
space:= $(empty) $(empty)
comma:= ,

#load test cases
# TC_IDS:   ids of all test cases listed in test_case.config
# TC_SRC:   column 2 of every test case
# TC_FILES: file the benchmark is run on; for test cases whose column 6
#           contains BWT_MTF this is the generated ../tmp/BWT_MTF.<id>,
#           otherwise it is column 2 of the test case
TC_IDS := $(call config_ids,test_case.config)
TC_SRC := $(foreach TC_ID,$(TC_IDS),\
	$(call config_select,test_case.config,$(TC_ID),2))
TC_FILES := $(foreach TC_ID,$(TC_IDS),\
	$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\
	../tmp/BWT_MTF.$(TC_ID),\
	$(call config_select,test_case.config,$(TC_ID),2)))

# Neither target produces a file of its own name.
.PHONY: all timing

all: $(RES_FILE)

# Runs the benchmark and then builds the report in visualize/.
# $(MAKE) -C is used so that a missing directory is an error (instead of
# running make in the current directory) and so that make's flags are
# handed to the sub-make.
timing: $(RES_FILE)
	@$(MAKE) -C visualize

#compilation of bwt - mtf - transform algorithm
# C_OPTIONS is read here as well: it is otherwise only set inside the recipe
# of $(BIN_DIR)/sdcbenchmark and would be empty whenever this rule runs first.
$(BIN_DIR)/gen_bwt_mtf: $(SRC_DIR)/gen_bwt_mtf.cpp compile_options.config
	$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
	@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -L$(LIB_DIR) "$(SRC_DIR)/gen_bwt_mtf.cpp"\
	-I$(INC_DIR) -o "$(BIN_DIR)/gen_bwt_mtf" $(LIBS) -ldivsufsort -ldivsufsort64

#generation of MTF of BWT
# $* is the test case id matched by '%', $@ is the file to generate.
# Column 2 of the test case is the input file, column 5 the num_byte argument.
../tmp/BWT_MTF.%: $(TC_SRC) $(BIN_DIR)/gen_bwt_mtf
	@$(BIN_DIR)/gen_bwt_mtf $(call config_select,test_case.config,$*,2) $@ ../tmp $(call config_select,test_case.config,$*,5)

#compilation and creation of vector assignment table
# VTYPES: comma separated list built from column 2 of vectors.config
# VNAMES: brace enclosed list of quoted strings built from column 3 of vectors.config
# Both are handed to the compiler as macros of the same name.
# The table is written with printf '%b': whether echo interprets "\n" depends
# on the shell that make uses.
$(BIN_DIR)/sdcbenchmark: $(SRC_DIR)/sdc_benchmark.cpp vectors.config compile_options.config
	$(eval VTYPES := $(subst $(space),$(comma),$(strip $(call config_column,vectors.config,2))))
	$(eval VNAMES := $(subst $(space),\"$(comma)\",$(strip $(call config_column,vectors.config,3))))
	$(eval VNAMES := $(addprefix {\",$(VNAMES)))
	$(eval VNAMES := $(addsuffix \"},$(VNAMES)))
	$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
	@echo "Compiling build for vectors $(VNAMES)"
	@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -DVTYPES="$(VTYPES)" -DVNAMES="$(VNAMES)" -L$(LIB_DIR)\
	"$(SRC_DIR)/sdc_benchmark.cpp" -I$(INC_DIR) -o "$(BIN_DIR)/sdcbenchmark" $(LIBS)
	$(eval V_IDS := $(call config_ids,vectors.config))
	$(eval V_ASSIGNMENTTABLE := $(subst $(space),\n,$(strip $(foreach V_ID,$(V_IDS),\
	$(call config_select,vectors.config,$(V_ID),3);$(call config_select,vectors.config,$(V_ID),2)))))
	@echo "Writing Vector Assignment Table"
	@echo "vector;sdsltype" > $(VAT_FILE)
	@printf '%b\n' "$(V_ASSIGNMENTTABLE)" >> $(VAT_FILE)

#execution and creation of test case table
# For every test case three arguments are handed to the benchmark: column 3 of
# the test case, the file to run on (the generated ../tmp/BWT_MTF.<id> if
# column 6 contains BWT_MTF, column 2 otherwise) and column 5.
# The test case table is written with printf: whether echo interprets "\n"
# depends on the shell that make uses.
# NOTE(review): the exit status of the benchmark pipeline is the one of tee,
# so a failing benchmark run does not stop make.
$(RES_FILE): test_case.config $(TC_FILES) $(BIN_DIR)/sdcbenchmark
	$(eval ARGS := $(foreach TC_ID,$(TC_IDS),\
	$(call config_select,test_case.config,$(TC_ID),3) $(space) \
	$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\
	../tmp/BWT_MTF.$(TC_ID),\
	$(call config_select,test_case.config,$(TC_ID),2)) $(space) \
	$(call config_select,test_case.config,$(TC_ID),5) ) )
	@echo "Executing Benchmark"
	@$(BIN_DIR)/sdcbenchmark $(ARGS) | tee $(RES_FILE)
	$(eval TC_TABLE := $(subst $(space),\n,$(strip $(call config_column,test_case.config,3))))
	@echo "Writing Test Case file"
	@printf 'testcase\nOverall\n' > $(TC_FILE)
	@printf '%b\n' "$(TC_TABLE)" >> $(TC_FILE)

include ../Make.download

# The clean targets do not produce files of their own name.
.PHONY: clean-build clean-result cleanall

# Removes everything in $(BIN_DIR) that the shell glob '*' matches.
clean-build:
	@echo "Remove executables"
	rm -f $(BIN_DIR)/*

# Removes everything in results/ that the shell glob '*' matches.
clean-result:
	@echo "Remove results"
	rm -f results/*

# Removes binaries and results and cleans the report directory.
# $(MAKE) -C is used so that a missing directory is an error instead of
# running "make cleanall" in the current directory again.
cleanall: clean-build clean-result
	@$(MAKE) -C visualize cleanall
55 changes: 55 additions & 0 deletions benchmark/self_delimiting_codes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Benchmarking self delimiting codes

## Methodology

Explored dimensions:

* self-delimiting code implementations
* test cases
* methods (`encoding`, `decoding`)

## Directory structure

* [bin](./bin): Contains the executables of the project.
* [results](./results): Contains the results of the experiments.
* [src](./src): Contains the source code of the benchmark.
* [visualize](./visualize): Contains LaTeX files and a Makefile for generating a report.

## Prerequisites

* To run the test on larger test cases (>= 200 MB), you should have at least 2 GB
of free memory (some vectors have very poor compression).
* For the visualization you need the following software:
- [pdflatex][LT] to generate the pdf reports.
- [pgfplots][PGFP] version 1.10 installed in [LT] to generate plots in pdf reports.

## Usage

* `make timing` compiles the programs, downloads or generates
the test instances, builds the compression vectors,
runs the performance tests and generates a report located at
`visualize/self_delimiting_codes.pdf`. The raw numbers of the encoding / decoding
rates and compression can be found in the file `results/result.csv`.
The used test cases can be found in file `results/tc.csv`.
The tested vectors can be found in file `results/vat.csv`.
The default benchmark took about 6 hours on my machine (Asus P50IJ
Pentium(R) Dual-Core CPU T4500 @ 2.30GHz 2GB).
* All created binaries and test results can be deleted
by calling `make cleanall`.

## Customization of the benchmark

The project contains several configuration files:

* [vectors.config][VCONFIG]: Specify the compression vectors and the coders they use.
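* [test_case.config][TCCONFIG]: Specify test instances by ID, path, LaTeX-name
for the report, download URL, integer format of the file (`num_byte`), and an
optional `BWT_MTF` marker. For a test case carrying that marker, the benchmark
is run on the generated file `../tmp/BWT_MTF.<ID>` instead of on the instance itself.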
* [compile_options.config][CCONFIG]: Specify compile options by option string.

Note that the benchmark will execute every combination of vectors and test cases.

[LT]: http://www.tug.org/applications/pdftex/ "pdflatex"
[PGFP]: http://www.ctan.org/pkg/pgfplots "pgfplots"
[VCONFIG]: ./vectors.config "vectors.config"
[TCCONFIG]: ./test_case.config "test_case.config"
[CCONFIG]: ./compile_options.config "compile_options.config"
2 changes: 2 additions & 0 deletions benchmark/self_delimiting_codes/bin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
2 changes: 2 additions & 0 deletions benchmark/self_delimiting_codes/compile_options.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Compile options
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG
2 changes: 2 additions & 0 deletions benchmark/self_delimiting_codes/results/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
119 changes: 119 additions & 0 deletions benchmark/self_delimiting_codes/src/gen_bwt_mtf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include <iostream>
#include <fstream>
#include <sdsl/suffix_arrays.hpp>
#include <string>
#include <vector>

using namespace sdsl;

//routine to save a vector in different formats, see lower implementations
// INT_VECTOR: type of the vector to save
// num_byte: selects the output format; the generic version and the
//           specializations for 0 and 'd' are defined further below
// v: vector to save
// dest: path of the file to write
template<class INT_VECTOR, uint8_t num_byte>
void saveVector(const INT_VECTOR &v, const char *dest);

//main function to generate the MTF (move-to-front) transform of the BWT of an integer vector.
// CSA_WT: used wavelet - tree - based suffix array implementation
// INT_VECTOR: used integer vector for extracting BWT
// num_byte: value indicating how result has to be opened / saved
// srcfile: file from which to generate
// destfile: file where to save result
// tmpdir: directory used for temporary results
// conf_bwt_key: key that is able to fetch the bwt after suffix array construction
template<class CSA_WT, class INT_VECTOR, uint8_t num_byte>
void gen_bwt_mtf(const char *srcfile, const char *destfile, const char *tmpdir,
                 const char *conf_bwt_key) {
    //utility for CSA generation; the files registered in it are removed
    //explicitly at the end of this function
    cache_config cc(false, tmpdir, "gen_bwt_mtf_");
    INT_VECTOR bwt;

    //create suffix array
    CSA_WT wt;
    construct(wt, srcfile, cc, num_byte);

    //compute alphabet table from suffix array.
    //The table has to start out empty: constructing it with wt.sigma elements
    //and appending afterwards would leave wt.sigma zeros in front of the
    //symbols and shift every rank computed below.
    std::vector<uint64_t> alph_tbl;
    alph_tbl.reserve( wt.sigma );
    for (uint64_t i = 0; i < wt.sigma; i++) {
        alph_tbl.push_back( wt.comp2char[i] );
    }

    //fetch bwt
    load_from_file(bwt, cache_file_name(conf_bwt_key, cc));

    //create mtf (in place, bwt[i] is replaced by the rank of its symbol)
    for (uint64_t i = 0; i < bwt.size(); i++) {
        uint64_t c = bwt[i];
        //find c in alphabet table and move it to front:
        //the first pass stores c at the front, every pass shifts one entry
        //one position back; the loop ends as soon as the shifted-out entry
        //is c itself, i.e. its old position has been overwritten.
        //assumes every symbol of the bwt is contained in the table - the
        //loop has no other termination condition
        uint64_t j = 0;
        do {
            uint64_t tmp = alph_tbl[j];
            alph_tbl[j++] = c;
            c = tmp;
        } while (c != alph_tbl.front());
        //and write its index to mtf transform of bwt
        bwt[i] = j-1;
    }

    //save everything
    saveVector<INT_VECTOR, num_byte>( bwt, destfile );

    //and free resources
    util::delete_all_files(cc.file_map);
}

//functions for saving an integer vector in different formats
//generic version (raw output): writes the first num_byte * v.size() bytes of
//the memory returned by v.data() to the file dest.
// INT_VECTOR: type of the vector, has to offer data() and size()
// num_byte: number of bytes that are written per element
// NOTE(review): this presumes that every element occupies exactly num_byte
// bytes in the memory behind v.data() - confirm for vectors whose element
// width is chosen at runtime.
template<class INT_VECTOR, uint8_t num_byte>
void saveVector(const INT_VECTOR &v, const char *dest) {
    //binary mode: the payload is raw bytes, no newline translation may happen
    std::ofstream out(dest, std::ios::binary);
    out.write(reinterpret_cast<const char *>(v.data()),
              static_cast<std::streamsize>(num_byte * v.size()));
}
//num_byte == 0: the vector is handed to store_to_file and written to dest
template<>
void saveVector<int_vector<>, 0>(const int_vector<> &v, const char *dest) {
    const std::string target(dest);
    store_to_file(v, target);
}
//num_byte == 'd': the elements are written as text, separated by single
//blanks (no leading or trailing separator)
template<>
void saveVector<int_vector<>, 'd'>(const int_vector<> &v, const char *dest) {
    std::ofstream out(dest);
    const uint64_t count = v.size();
    for (uint64_t pos = 0; pos < count; ++pos) {
        if (pos != 0) {
            out << " ";
        }
        out << v[pos];
    }
}

//main function
// expects exactly four arguments: input_file output_file temp_dir num_byte.
// The first character of num_byte selects the index type and output format.
// returns 0 after the transform has been generated, 1 on a wrong number of
// arguments or an unknown num_byte
int main(int argc, char* argv[]) {
    if (argc != 5) {
        std::cout<<"Usage: input_file output_file temp_dir num_byte" << std::endl;
        return 1;
    }
    const char *input = argv[1];
    const char *output = argv[2];
    const char *tmp_dir = argv[3];
    const char format = argv[4][0];

    std::cout << "Calculate MTF Transform of BWT of " << input
              << " and store it to " << output << std::endl;

    using csa_wt_byte = csa_wt<>;
    using csa_wt_int = csa_wt<wt_int<>, 64, 64, sa_order_sa_sampling<>, int_vector<>, int_alphabet<>>;

    if (format == 'd') {
        //decimal digits
        gen_bwt_mtf<csa_wt_int, int_vector<>, 'd'>(input, output, tmp_dir, conf::KEY_BWT_INT);
    } else if (format == '0') {
        //serialized integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 0>(input, output, tmp_dir, conf::KEY_BWT_INT);
    } else if (format == '1') {
        //byte integer vector
        gen_bwt_mtf<csa_wt_byte, int_vector<8>, 1>(input, output, tmp_dir, conf::KEY_BWT);
    } else if (format == '2') {
        //2 byte integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 2>(input, output, tmp_dir, conf::KEY_BWT_INT);
    } else if (format == '4') {
        //4 byte integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 4>(input, output, tmp_dir, conf::KEY_BWT_INT);
    } else if (format == '8') {
        //8 byte integer vector
        gen_bwt_mtf<csa_wt_int, int_vector<>, 8>(input, output, tmp_dir, conf::KEY_BWT_INT);
    } else {
        std::cout << "Illegal num_byte, allowed are 'd', 0, 1, 2, 4, 8" << std::endl;
        return 1;
    }
    return 0;
}
Loading

0 comments on commit 1c9902b

Please sign in to comment.