Commit
add more datasets to the alp dataset.
azimafroozeh committed Sep 11, 2024
1 parent 8e19a77 commit 31e25d8
Showing 20 changed files with 432,131 additions and 311 deletions.
85 changes: 72 additions & 13 deletions benchmarks/bench_compression_ratio/alp.cpp
@@ -121,13 +121,67 @@ class alp_test : public ::testing::Test {
}
};

// read from the binary file when no csv path is given; otherwise read from the csv file
void read_data(std::vector<double>& data, const std::string& csv_file_path, const std::string& bin_file_path) {
if (csv_file_path.empty()) {

// Open the binary file in input mode
std::ifstream file(bin_file_path, std::ios::binary | std::ios::in);

if (!file) { throw std::runtime_error("Failed to open file: " + bin_file_path); }

// Get the size of the file
file.seekg(0, std::ios::end);
std::streamsize fileSize = file.tellg();
file.seekg(0, std::ios::beg);

// Ensure the file size is a multiple of the size of a double
if (fileSize % sizeof(double) != 0) { throw std::runtime_error("File size is not a multiple of double size!"); }
// Calculate the number of doubles
std::size_t numDoubles = fileSize / sizeof(double);

// Resize the vector to hold all the doubles
data.resize(numDoubles);

// Read the data into the vector
file.read(reinterpret_cast<char*>(data.data()), fileSize);

// Close the file
file.close();
return;
}
if (bin_file_path.empty()) {
const auto& path = csv_file_path;
std::ifstream file(path);

if (!file) { throw std::runtime_error("Failed to open file: " + path); }

std::string line;
// Read each line, convert it to double, and store it in the vector
while (std::getline(file, line)) {
try {
// Convert the string to double and add to the vector
data.push_back(std::stod(line));
} catch (const std::invalid_argument& e) {
throw std::runtime_error("Invalid data in file: " + line);
} catch (const std::out_of_range& e) {
throw std::runtime_error("Number out of range in file: " + line);
}
}

file.close();
return;
}
throw std::runtime_error("No bin or csv file specified");
}
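// Usage sketch (paths are hypothetical): pass exactly one source and leave the
// other argument as an empty string, e.g.
//   std::vector<double> values;
//   read_data(values, "", "/data/full/example.bin");    // raw doubles from a binary file
//   read_data(values, "/data/samples/example.csv", ""); // one decimal value per line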

/*
* Test to encode and decode whole datasets using ALP.
* This test prints and writes to a file the estimated bits/value after compression with ALP.
*/

TEST_F(alp_test, test_alp_on_whole_datasets) {

if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v == nullptr) {
throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!");
}
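// Example invocation with the variable set (directory and target name are hypothetical):
//   ALP_DATASET_DIR_PATH=/data/full ./bench_alp_compression_ratio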
@@ -141,19 +195,22 @@ TEST_F(alp_test, test_alp_on_whole_datasets) {
std::cout << dataset.name << std::endl;

std::vector<alp_bench::VectorMetadata> compression_metadata;
size_t tuples_count;
auto* data_column = mapper::mmap_file<double>(tuples_count, dataset.binary_file_path);
std::vector<double> data;
read_data(data, dataset.csv_file_path, dataset.binary_file_path);
double* data_column = data.data();
size_t n_tuples = data.size();

double value_to_encode {0.0};
size_t vector_idx {0};
size_t rowgroup_counter {0};
size_t rowgroup_offset {0};
alp::state stt;
size_t rowgroups_count = std::ceil(static_cast<double>(tuples_count) / ROWGROUP_SIZE);
size_t vectors_count = tuples_count / VECTOR_SIZE;
size_t rowgroups_count = std::ceil(static_cast<double>(n_tuples) / ROWGROUP_SIZE);
size_t vectors_count = n_tuples / VECTOR_SIZE;
/* Init */
alp::AlpEncode<double>::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt);
alp::AlpEncode<double>::init(data_column, rowgroup_offset, n_tuples, smp_arr, stt);
/* Encode - Decode - Validate. */
for (size_t i = 0; i < tuples_count; i++) {
for (size_t i = 0; i < n_tuples; i++) {
value_to_encode = data_column[i];
dbl_arr[vector_idx] = value_to_encode;
vector_idx = vector_idx + 1;
@@ -163,7 +220,7 @@ TEST_F(alp_test, test_alp_on_whole_datasets) {
if (vector_idx != VECTOR_SIZE) { continue; }
if (rowgroup_counter == ROWGROUP_SIZE) {
rowgroup_counter = 0;
alp::AlpEncode<double>::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt);
alp::AlpEncode<double>::init(data_column, rowgroup_offset, n_tuples, smp_arr, stt);
}
alp::AlpEncode<double>::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, encoded_arr, stt);
alp::AlpEncode<double>::analyze_ffor(encoded_arr, bit_width, base_arr);
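// Worked example (ROWGROUP_SIZE and VECTOR_SIZE are assumed here to be 100 * 1024 and
// 1024): for n_tuples = 250'000, rowgroups_count = ceil(250'000 / 102'400) = 3 and
// vectors_count = 250'000 / 1024 = 244; the integer division drops the trailing
// 144 values, which never fill a complete vector.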
@@ -207,8 +264,10 @@ TEST_F(alp_test, test_alprd_on_whole_datasets) {
if (!dataset.suitable_for_cutting) { continue; }

std::vector<alp_bench::VectorMetadata> compression_metadata;
size_t tuples_count;
auto* data_column = mapper::mmap_file<double>(tuples_count, dataset.binary_file_path);
std::vector<double> data;
read_data(data, dataset.csv_file_path, dataset.binary_file_path);
double* data_column = data.data();
size_t n_tuples = data.size();
double value_to_encode = 0.0;
size_t vector_idx {0};
size_t rowgroup_counter {0};
Expand All @@ -218,14 +277,14 @@ TEST_F(alp_test, test_alprd_on_whole_datasets) {
size_t vectors_count {1};

/* Init */
alp::AlpEncode<double>::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt);
alp::AlpEncode<double>::init(data_column, rowgroup_offset, n_tuples, smp_arr, stt);

ASSERT_EQ(stt.scheme, alp::SCHEME::ALP_RD);

alp::AlpRD<double>::init(data_column, rowgroup_offset, tuples_count, smp_arr, stt);
alp::AlpRD<double>::init(data_column, rowgroup_offset, n_tuples, smp_arr, stt);

/* Encode - Decode - Validate. */
for (size_t i = 0; i < tuples_count; i++) {
for (size_t i = 0; i < n_tuples; i++) {
value_to_encode = data_column[i];
dbl_arr[vector_idx] = value_to_encode;
vector_idx = vector_idx + 1;
2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_alp_cutter_decode.cpp
@@ -84,7 +84,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
glue_arr = new (std::align_val_t {64}) double[VECTOR_SIZE];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
if (!dataset.suitable_for_cutting) { continue; }
if (dataset.name.find("bw") != std::string::npos) { continue; }

2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_alp_cutter_encode.cpp
@@ -82,7 +82,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
glue_arr = new (std::align_val_t {64}) double[VECTOR_SIZE];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
if (!dataset.suitable_for_cutting) { continue; }
if (dataset.name.find("bw") != std::string::npos) { continue; }

2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_alp_encode.cpp
@@ -61,7 +61,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
rg_smp_arr = new (std::align_val_t {64}) double[1024];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
if (dataset.suitable_for_cutting) { continue; }
if (dataset.name.find("bw") != std::string::npos) { continue; }

2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_alp_without_sampling.cpp
@@ -100,7 +100,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
rg_smp_arr = new (std::align_val_t {64}) double[1024];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
if (dataset.suitable_for_cutting) { continue; }
if (dataset.name.find("bw") != std::string::npos) { continue; }

2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_chimp.cpp
@@ -132,7 +132,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
leading_zero_unpacked = new (std::align_val_t {64}) uint8_t[1024];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);

// check to see that the file was opened correctly:
if (!ifile.is_open()) {
2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_chimp128.cpp
@@ -217,7 +217,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
unpacked_data_arr = new (std::align_val_t {64}) alp_bench::UnpackedData[1024];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
if (dataset.name.find("bw") != std::string::npos) { continue; }

// check to see that the file was opened correctly:
2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_gorillas.cpp
@@ -106,7 +106,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
flags = new (std::align_val_t {64}) alp_bench::GorillasConstants::Flags[1024];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
if (dataset.name.find("bw") != std::string::npos) { continue; }

// check to see that the file was opened correctly:
2 changes: 1 addition & 1 deletion benchmarks/bench_speed/bench_patas.cpp
@@ -101,7 +101,7 @@ void benchmark_all(benchmark::Benchmark& benchmark) {
dbl_arr = new (std::align_val_t {64}) double[1024];

for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);

// check to see that the file was opened correctly:
if (!ifile.is_open()) {
2 changes: 1 addition & 1 deletion benchmarks/test/test_chimp.cpp
@@ -43,7 +43,7 @@ class chimp_test : public ::testing::Test {

TEST_F(chimp_test, test_chimp) {
for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
ASSERT_EQ(ifile.fail(), false);

// Read Data
2 changes: 1 addition & 1 deletion benchmarks/test/test_chimp128.cpp
@@ -57,7 +57,7 @@ class chimp128_test : public ::testing::Test {

TEST_F(chimp128_test, test_chimp) {
for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
ASSERT_EQ(ifile.fail(), false);

// Read Data
2 changes: 1 addition & 1 deletion benchmarks/test/test_gorillas.cpp
@@ -34,7 +34,7 @@ class gorillas_test : public ::testing::Test {

TEST_F(gorillas_test, test_gorillas) {
for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
ASSERT_EQ(ifile.fail(), false);

// Read Data
2 changes: 1 addition & 1 deletion benchmarks/test/test_patas.cpp
@@ -41,7 +41,7 @@ class patas_test : public ::testing::Test {

TEST_F(patas_test, one_vec) {
for (auto& dataset : alp_bench::alp_dataset) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);
std::ifstream ifile(dataset.csv_file_path, std::ios::in);
ASSERT_EQ(ifile.fail(), false);

// Read Data
12 changes: 7 additions & 5 deletions data/include/column.hpp
@@ -9,7 +9,7 @@ namespace alp_bench {
struct Column {
uint64_t id;
std::string name;
const std::string sample_csv_file_path;
const std::string csv_file_path;
const std::string binary_file_path;
uint8_t factor {0};
uint16_t exponent {0};
@@ -19,10 +19,12 @@
};
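// Sketch of a dataset entry after the rename (values are hypothetical; members not
// listed keep their in-class defaults):
//   Column example_column {
//       42,                                  // id
//       "example_dataset",                   // name
//       "/data/samples/example_dataset.csv", // csv_file_path (was sample_csv_file_path)
//       "/data/full/example_dataset.bin",    // binary_file_path
//   };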

struct paths {
std::string GENERATED_COLUMNS_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/generated/";
std::string ALP_DATASET_SAMPLE_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/samples/";
std::string EDGE_DATASET_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/edge_case/";
std::string RESULT_DIR_PATH = std::string {CMAKE_SOURCE_DIR} + "/publication/";
std::string GENERATED_COLUMNS_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/generated/";
std::string ALP_DATASET_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/samples/";
std::string EDGE_DATASET_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/edge_case/";
std::string RESULT_DIR_PATH = std::string {CMAKE_SOURCE_DIR} + "/publication/";
std::string ALP_ISSUE_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/issue/";

std::string ALP_DATASET_BINARY_DIR_PATH = " ";

explicit paths() {
