-
Notifications
You must be signed in to change notification settings - Fork 349
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added the MTF generation of the BWT for testcase of self - delimiting
codes benchmark.
- Loading branch information
wayne
authored and
wayne
committed
Aug 25, 2014
1 parent
da70b32
commit 1c9902b
Showing
18 changed files
with
1,082 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
include ../../Make.helper | ||
SRC_DIR = src | ||
BIN_DIR = bin | ||
LIBS = -lsdsl | ||
RES_FILE = results/result.csv #result file of benchmark | ||
VAT_FILE = results/vat.csv #vector assignment table (vector name -> sdsl type) | ||
TC_FILE = results/tc.csv #test case table (contains only test case names) | ||
|
||
#utility | ||
empty:= | ||
space:= $(empty) $(empty) | ||
comma:= , | ||
|
||
#load test cases | ||
TC_IDS := $(call config_ids,test_case.config) | ||
TC_SRC := $(foreach TC_ID,$(TC_IDS),\ | ||
$(call config_select,test_case.config,$(TC_ID),2)) | ||
TC_FILES := $(foreach TC_ID,$(TC_IDS),\ | ||
$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\ | ||
../tmp/BWT_MTF.$(TC_ID),\ | ||
$(call config_select,test_case.config,$(TC_ID),2))) | ||
|
||
#default target: run the benchmark and produce the result file.
#Declared phony so that a file named "all" can never mask it.
.PHONY: all
all: $(RES_FILE)
|
||
#run the benchmark (if necessary) and build the report in ./visualize.
#$(MAKE) is used for the sub-make so that command line flags and the
#jobserver are passed on to it.
.PHONY: timing
timing: $(RES_FILE)
	@$(MAKE) -C visualize
|
||
#compilation of the BWT-MTF transform generator
#C_OPTIONS is loaded here as well: in this file it is otherwise only set
#inside the sdcbenchmark recipe, which may be expanded after this one, so
#the generator would be compiled without the configured options.
$(BIN_DIR)/gen_bwt_mtf: $(SRC_DIR)/gen_bwt_mtf.cpp compile_options.config
	$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
	@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -L$(LIB_DIR) "$(SRC_DIR)/gen_bwt_mtf.cpp"\
		-I$(INC_DIR) -o "$(BIN_DIR)/gen_bwt_mtf" $(LIBS) -ldivsufsort -ldivsufsort64
|
||
#generation of MTF of BWT
#The pattern stem is the test case ID; the source path (column 2) and the
#num_byte value (column 5) of that test case are read from test_case.config.
../tmp/BWT_MTF.%: $(TC_SRC) $(BIN_DIR)/gen_bwt_mtf
	$(eval TC_ID:=$*)
	$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
	$(eval NUM_BYTE:=$(call config_select,test_case.config,$(TC_ID),5))
	@$(BIN_DIR)/gen_bwt_mtf $(TC_PATH) $@ $(@D) $(NUM_BYTE)
|
||
#compilation and creation of vector assignment table
#VTYPES becomes a comma separated list of column 2 of vectors.config,
#VNAMES a brace-enclosed list of the quoted entries of column 3; both are
#handed to the compiler as preprocessor definitions.
$(BIN_DIR)/sdcbenchmark: $(SRC_DIR)/sdc_benchmark.cpp vectors.config compile_options.config
	$(eval VTYPES := $(subst $(space),$(comma),$(strip $(call config_column,vectors.config,2))))
	$(eval VNAMES := $(subst $(space),\"$(comma)\",$(strip $(call config_column,vectors.config,3))))
	$(eval VNAMES := $(addprefix {\",$(addsuffix \"},$(VNAMES))))
	$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
	@echo "Compiling build for vectors $(VNAMES)"
	@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -DVTYPES="$(VTYPES)" -DVNAMES="$(VNAMES)" -L$(LIB_DIR)\
		"$<" -I$(INC_DIR) -o "$@" $(LIBS)
	$(eval V_IDS := $(call config_ids,vectors.config))
	$(eval V_ASSIGNMENTTABLE := $(subst $(space),\n,$(strip $(foreach V_ID,$(V_IDS),\
		$(call config_select,vectors.config,$(V_ID),3);$(call config_select,vectors.config,$(V_ID),2)))))
	@echo "Writing Vector Assignment Table"
	@echo "vector;sdsltype" > $(VAT_FILE)
	@echo "$(V_ASSIGNMENTTABLE)" >> $(VAT_FILE)
|
||
#execution and creation of test case table
#For every test case the benchmark receives three arguments: column 3 of
#test_case.config, the file to read (the generated ../tmp/BWT_MTF.<id> if
#column 6 contains BWT_MTF, column 2 otherwise) and column 5.
$(RES_FILE): test_case.config $(TC_FILES) $(BIN_DIR)/sdcbenchmark
	$(eval ARGS := $(foreach TC_ID,$(TC_IDS),\
		$(call config_select,test_case.config,$(TC_ID),3) $(space) \
		$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\
			../tmp/BWT_MTF.$(TC_ID),\
			$(call config_select,test_case.config,$(TC_ID),2)) $(space) \
		$(call config_select,test_case.config,$(TC_ID),5) ) )
	@echo "Executing Benchmark"
	@$(BIN_DIR)/sdcbenchmark $(ARGS) | tee $@
	$(eval TC_TABLE := $(subst $(space),\n,$(strip $(call config_column,test_case.config,3))))
	@echo "Writing Test Case file"
	@echo "testcase\\nOverall" > $(TC_FILE)
	@echo "$(TC_TABLE)" >> $(TC_FILE)
|
||
include ../Make.download | ||
|
||
#remove all compiled executables.
#Declared phony so that a file named "clean-build" can never mask it.
.PHONY: clean-build
clean-build:
	@echo "Remove executables"
	rm -f $(BIN_DIR)/*
|
||
#remove all benchmark results.
#Declared phony so that a file named "clean-result" can never mask it.
.PHONY: clean-result
clean-result:
	@echo "Remove results"
	rm -f results/*
|
||
#remove executables and results, then clean the report directory.
#$(MAKE) is used for the sub-make so that command line flags are passed on.
.PHONY: cleanall
cleanall: clean-build clean-result
	@$(MAKE) -C visualize cleanall
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Benchmarking self-delimiting codes | ||
|
||
## Methodology | ||
|
||
Explored dimensions: | ||
|
||
* self-delimiting code implementations | ||
* test cases | ||
* methods (`encoding`, `decoding`) | ||
|
||
## Directory structure | ||
|
||
* [bin](./bin): Contains the executables of the project. | ||
* [results](./results): Contains the results of the experiments. | ||
* [src](./src): Contains the source code of the benchmark. | ||
* [visualize](./visualize): Contains LaTeX files and a makefile for generating a report. | ||
|
||
## Prerequisites | ||
|
||
* To run the test on larger test cases (>= 200 MB), you should have at least 2 GB | ||
of free memory (some vectors have very poor compression). | ||
* For the visualization you need the following software: | ||
- [pdflatex][LT] to generate the pdf reports. | ||
- [pgfplots][PGFP] version 1.10 installed in [LT] to generate plots in pdf reports. | ||
|
||
## Usage | ||
|
||
* `make timing` compiles the programs, downloads or generates | ||
the test instances, builds the compression vectors, | ||
runs the performance tests and generates a report located at | ||
`visualize/self_delimiting_codes.pdf`. The raw numbers of the encoding / decoding | ||
rates and compression can be found in the file `results/result.csv`. | ||
The used test cases can be found in file `results/tc.csv`. | ||
The tested vectors can be found in file `results/vat.csv`. | ||
The default benchmark took about 6 hours on my machine (Asus P50IJ | ||
Pentium(R) Dual-Core CPU T4500 @ 2.30GHz 2GB). | ||
* All created binaries and test results can be deleted | ||
by calling `make cleanall`. | ||
|
||
## Customization of the benchmark | ||
|
||
The project contains several configuration files: | ||
|
||
* [vectors.config][VCONFIG]: Specify the compression vectors and the coders they use. | ||
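* [test_case.config][TCCONFIG]: Specify test instances by ID, path, LaTeX name | ||
for the report, download URL, and the `num_byte` format of the file. A test case | ||
marked `BWT_MTF` is benchmarked on the MTF-transformed BWT of its file, which | ||
is generated into `../tmp`. | ||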
* [compile_options.config][CCONFIG]: Specify compile options by option string. | ||
|
||
Note that the benchmark will execute every combination of vectors and test cases. | ||
|
||
[LT]: http://www.tug.org/applications/pdftex/ "pdflatex" | ||
[PGFP]: http://www.ctan.org/pkg/pgfplots "pgfplots" | ||
[VCONFIG]: ./vectors.config "vectors.config" | ||
[TCCONFIG]: ./test_case.config "test_case.config" | ||
[CCONFIG]: ./compile_options.config "compile_options.config" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Compile options | ||
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#include <iostream> | ||
#include <fstream> | ||
#include <sdsl/suffix_arrays.hpp> | ||
#include <string> | ||
#include <vector> | ||
|
||
using namespace sdsl; | ||
|
||
//routine to save a vector in different formats, see lower implementations | ||
template<class INT_VECTOR, uint8_t num_byte> | ||
void saveVector(const INT_VECTOR &v, const char *dest); | ||
|
||
//main function to generate MTF of BWT of an integer vector. | ||
// CSA_WT: used wavelet - tree - based suffix array implementation | ||
// INT_VECTOR: used integer vector for extracting BWT | ||
// num_byte: value indicating how result has to be opened / saved | ||
// srcfile: file from which to generate | ||
// destfile: file where to save result | ||
// tmpdir: directory used for temporary results | ||
// conf_bwt_key: key what is able to fetch bwt after suffix array construction | ||
template<class CSA_WT, class INT_VECTOR, uint8_t num_byte> | ||
void gen_bwt_mtf(const char *srcfile, const char *destfile, const char *tmpdir, | ||
const char *conf_bwt_key) { | ||
//utility for CSA generation | ||
cache_config cc(false, tmpdir, "gen_bwt_mtf_"); | ||
INT_VECTOR bwt; | ||
|
||
//create suffix array | ||
CSA_WT wt; | ||
construct(wt, srcfile, cc, num_byte); | ||
|
||
//compute alphabet table from suffix array | ||
std::vector<uint64_t> alph_tbl( wt.sigma ); | ||
for (uint64_t i = 0; i < wt.sigma; i++) { | ||
alph_tbl.push_back( wt.comp2char[i] ); | ||
} | ||
|
||
//fetch bwt | ||
load_from_file(bwt, cache_file_name(conf_bwt_key, cc)); | ||
|
||
//create mtf | ||
for (uint64_t i = 0; i < bwt.size(); i++) { | ||
uint64_t c = bwt[i]; | ||
//find c in alphabet table and move it to front | ||
uint64_t j = 0; | ||
do { | ||
uint64_t tmp = alph_tbl[j]; | ||
alph_tbl[j++] = c; | ||
c = tmp; | ||
} while (c != alph_tbl.front()); | ||
//and write it's index to mtf transform of bwt | ||
bwt[i] = j-1; | ||
} | ||
|
||
//save everything | ||
saveVector<INT_VECTOR, num_byte>( bwt, destfile ); | ||
|
||
//and free resources | ||
util::delete_all_files(cc.file_map); | ||
} | ||
|
||
//functions for saving an integer vector in different formats
//generic version (raw output): writes the first num_byte * v.size() bytes
//of the vector's underlying storage to the file dest.
//  v:    vector to save; has to provide data() and size()
//  dest: path of the file to write
template<class INT_VECTOR, uint8_t num_byte>
void saveVector(const INT_VECTOR &v, const char *dest) {
    //binary mode: the payload is raw bytes and must not be subject to
    //newline translation on platforms that distinguish text and binary files
    std::ofstream out(dest, std::ios::binary);
    out.write(reinterpret_cast<const char *>(v.data()),
              static_cast<std::streamsize>(num_byte * v.size()));
}
//serialization of integer vector | ||
template<> | ||
void saveVector<int_vector<>, 0>(const int_vector<> &v, const char *dest) { | ||
store_to_file(v, dest); | ||
} | ||
//decimal digits | ||
template<> | ||
void saveVector<int_vector<>, 'd'>(const int_vector<> &v, const char *dest) { | ||
std::ofstream out(dest); | ||
if (v.size()) out << v[0]; | ||
for (uint64_t i = 1; i < v.size(); i++) { | ||
out << " " << v[i]; | ||
} | ||
} | ||
|
||
//main function | ||
int main(int argc, char* argv[]) { | ||
if (argc != 5) { | ||
std::cout<<"Usage: input_file output_file temp_dir num_byte" << std::endl; | ||
return 1; | ||
} | ||
std::cout << "Calculate MTF Transform of BWT of " << argv[1] | ||
<< " and store it to " << argv[2] << std::endl; | ||
|
||
typedef csa_wt<> csa_wt_byte; | ||
typedef csa_wt<wt_int<>, 64, 64, sa_order_sa_sampling<>, int_vector<>, int_alphabet<>> csa_wt_int; | ||
|
||
switch (argv[4][0]) { | ||
case 'd': //decimal digits | ||
gen_bwt_mtf<csa_wt_int, int_vector<>, 'd'>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT); | ||
return 0; | ||
case '0': //serialized integer vector | ||
gen_bwt_mtf<csa_wt_int, int_vector<>, 0>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT); | ||
return 0; | ||
case '1': //byte integer vector | ||
gen_bwt_mtf<csa_wt_byte, int_vector<8>, 1>(argv[1], argv[2], argv[3], conf::KEY_BWT); | ||
return 0; | ||
case '2': //2 byte integer vector | ||
gen_bwt_mtf<csa_wt_int, int_vector<>, 2>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT); | ||
return 0; | ||
case '4': //4 byte integer vector | ||
gen_bwt_mtf<csa_wt_int, int_vector<>, 4>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT); | ||
return 0; | ||
case '8': //8 byte integer vector | ||
gen_bwt_mtf<csa_wt_int, int_vector<>, 8>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT); | ||
return 0; | ||
default: | ||
std::cout << "Illegal num_byte, allowed are 'd', 0, 1, 2, 4, 8" << std::endl; | ||
return 1; | ||
} | ||
} |
Oops, something went wrong.