Skip to content

Commit

Permalink
add evalimplsts dataset. not complete yet.
Browse files Browse the repository at this point in the history
  • Loading branch information
azimafroozeh committed Sep 15, 2024
1 parent 184382b commit 17bebe9
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 49 deletions.
168 changes: 123 additions & 45 deletions benchmarks/bench_compression_ratio/bench_alp_compression_ratio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,61 @@ double get_average_exception_count(std::vector<alp_bench::VectorMetadata>& vecto
return avg_exceptions_count;
}

// we prefer the binary_path over csv_path
// Load a dataset into `data`, preferring the binary file over the CSV file.
//
// If `bin_file_path` is non-empty, the file is read as a raw array of doubles
// (its size must be an exact multiple of sizeof(double)) and `data` is resized
// to hold exactly that payload. Otherwise, if `csv_file_path` is non-empty,
// one double per non-empty line is parsed and appended to `data`.
//
// Throws std::runtime_error when neither path is given, when a file cannot be
// opened or fully read, or when a CSV line does not parse as a double.
void read_data(std::vector<double>& data, const std::string& csv_file_path, const std::string& bin_file_path) {
	if (!bin_file_path.empty()) {
		// Open the binary file in input mode
		std::ifstream file(bin_file_path, std::ios::binary | std::ios::in);
		if (!file) { throw std::runtime_error("Failed to open file: " + bin_file_path); }

		// Determine the file size by seeking to the end.
		file.seekg(0, std::ios::end);
		const std::streamsize file_size = file.tellg();
		file.seekg(0, std::ios::beg);

		// The payload must be a whole number of doubles.
		if (file_size % sizeof(double) != 0) {
			throw std::runtime_error("File size is not a multiple of double size!");
		}
		const std::size_t n_doubles = file_size / sizeof(double);

		// Resize the vector and read the whole payload in one call.
		data.resize(n_doubles);
		file.read(reinterpret_cast<char*>(data.data()), file_size);
		// Bug fix: a short or failed read used to be silently accepted,
		// leaving uninitialized garbage in the tail of `data`.
		if (!file) { throw std::runtime_error("Failed to read file: " + bin_file_path); }
		return;
	}
	if (!csv_file_path.empty()) {
		std::ifstream file(csv_file_path);
		if (!file) { throw std::runtime_error("Failed to open file: " + csv_file_path); }

		// Read each line, convert it to double, and append it to the vector.
		std::string line;
		while (std::getline(file, line)) {
			// Tolerate blank lines (e.g. a trailing newline at EOF) instead
			// of failing the whole load with "Invalid data".
			if (line.empty()) { continue; }
			try {
				data.push_back(std::stod(line));
			} catch (const std::invalid_argument&) {
				throw std::runtime_error("Invalid data in file: " + line);
			} catch (const std::out_of_range&) {
				throw std::runtime_error("Number out of range in file: " + line);
			}
		}
		return;
	}
	throw std::runtime_error("No bin or csv file specified");
}

class alp_test : public ::testing::Test {
public:
double* intput_buf {};
Expand Down Expand Up @@ -119,41 +174,30 @@ class alp_test : public ::testing::Test {
delete[] unffor_right_buf;
delete[] unffor_left_arr;
}
};

/*
* Test to encode and decode whole datasets using ALP
* This test will output and write a file with the estimated bits/value after compression with alp
*/

TEST_F(alp_test, test_alp_on_whole_datasets) {

if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v == nullptr) {
throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!");
}

std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "alp_compression_ratio.csv", std::ios::out);
ofile << "dataset,size,rowgroups_count,vectors_count\n";

for (auto& dataset : alp_bench::alp_dataset) {
if (dataset.suitable_for_cutting) { continue; }
void bench_alp_compression_ratio(const alp_bench::Column& dataset, std::ofstream& ofile) {
if (dataset.suitable_for_cutting) { return; }

std::cout << dataset.name << std::endl;

std::vector<double> data;
read_data(data, dataset.csv_file_path, dataset.binary_file_path);
double* data_column = data.data();
size_t n_tuples = data.size();

std::vector<alp_bench::VectorMetadata> compression_metadata;
size_t tuples_count;
auto* data_column = mapper::mmap_file<double>(tuples_count, dataset.binary_file_path);
double value_to_encode {0.0};
size_t vector_idx {0};
size_t rowgroup_counter {0};
size_t rowgroup_offset {0};
alp::state<double> stt;
size_t rowgroups_count = std::ceil(static_cast<double>(tuples_count) / ROWGROUP_SIZE);
size_t vectors_count = tuples_count / VECTOR_SIZE;
double value_to_encode {0.0};
size_t vector_idx {0};
size_t rowgroup_counter {0};
size_t rowgroup_offset {0};
alp::state<double> stt;
size_t rowgroups_count = std::ceil(static_cast<double>(n_tuples) / ROWGROUP_SIZE);
size_t vectors_count = n_tuples / VECTOR_SIZE;

/* Init */
alp::encoder<double>::init(data_column, rowgroup_offset, tuples_count, sample_buf, stt);
alp::encoder<double>::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt);
/* Encode - Decode - Validate. */
for (size_t i = 0; i < tuples_count; i++) {
for (size_t i = 0; i < n_tuples; i++) {
value_to_encode = data_column[i];
intput_buf[vector_idx] = value_to_encode;
vector_idx = vector_idx + 1;
Expand All @@ -163,7 +207,7 @@ TEST_F(alp_test, test_alp_on_whole_datasets) {
if (vector_idx != VECTOR_SIZE) { continue; }
if (rowgroup_counter == ROWGROUP_SIZE) {
rowgroup_counter = 0;
alp::encoder<double>::init(data_column, rowgroup_offset, tuples_count, sample_buf, stt);
alp::encoder<double>::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt);
}
alp::encoder<double>::encode(intput_buf, exc_arr, pos_arr, exc_c_arr, encoded_buf, stt);
alp::encoder<double>::analyze_ffor(encoded_buf, bit_width, base_buf);
Expand Down Expand Up @@ -193,22 +237,17 @@ TEST_F(alp_test, test_alp_on_whole_datasets) {
ASSERT_EQ(alp_bench::to_str(compression_ratio), alp_bench::results.find(dataset.name)->second);
}
}
}

/*
* Test to encode and decode whole datasets using ALP RD (aka ALP Cutter)
* This test will output and write a file with the estimated bits/value after compression with alp
*/
TEST_F(alp_test, test_alprd_on_whole_datasets) {
std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "alp_rd_compression_ratio.csv", std::ios::out);
ofile << "dataset,size,rowgroups_count,vectors_count\n";

for (auto& dataset : alp_bench::alp_dataset) {
if (!dataset.suitable_for_cutting) { continue; }
void bench_alp_rd_compression_ratio(const alp_bench::Column& dataset, std::ofstream& ofile) {
if (!dataset.suitable_for_cutting) { return; }

std::vector<alp_bench::VectorMetadata> compression_metadata;
size_t tuples_count;
auto* data_column = mapper::mmap_file<double>(tuples_count, dataset.binary_file_path);

std::vector<double> data;
read_data(data, dataset.csv_file_path, dataset.binary_file_path);
double* data_column = data.data();
size_t n_tuples = data.size();

double value_to_encode = 0.0;
size_t vector_idx {0};
size_t rowgroup_counter {0};
Expand All @@ -218,14 +257,14 @@ TEST_F(alp_test, test_alprd_on_whole_datasets) {
size_t vectors_count {1};

/* Init */
alp::encoder<double>::init(data_column, rowgroup_offset, tuples_count, sample_buf, stt);
alp::encoder<double>::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt);

ASSERT_EQ(stt.scheme, alp::Scheme::ALP_RD);

alp::rd_encoder<double>::init(data_column, rowgroup_offset, tuples_count, sample_buf, stt);
alp::rd_encoder<double>::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt);

/* Encode - Decode - Validate. */
for (size_t i = 0; i < tuples_count; i++) {
for (size_t i = 0; i < n_tuples; i++) {
value_to_encode = data_column[i];
intput_buf[vector_idx] = value_to_encode;
vector_idx = vector_idx + 1;
Expand Down Expand Up @@ -281,6 +320,45 @@ TEST_F(alp_test, test_alprd_on_whole_datasets) {
ASSERT_EQ(alp_bench::to_str(compression_ratio), alp_bench::results.find(dataset.name)->second);
}
}
};

/*
 * Encode and decode every whole dataset with ALP.
 * Emits the estimated bits/value after compression into alp_compression_ratio.csv.
 */
TEST_F(alp_test, test_alp_on_whole_datasets) {
	if (std::getenv("ALP_DATASET_DIR_PATH") == nullptr) {
		throw std::runtime_error("Environment variable ALP_DATASET_DIR_PATH is not set!");
	}

	std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "alp_compression_ratio.csv", std::ios::out);
	ofile << "dataset,size,rowgroups_count,vectors_count\n";

	// Each dataset appends one result row; cuttable datasets are skipped inside.
	for (const auto& dataset : alp_bench::alp_dataset) {
		bench_alp_compression_ratio(dataset, ofile);
	}
}

/*
 * Encode and decode every whole dataset with ALP RD (aka ALP Cutter).
 * Emits the estimated bits/value after compression into alp_rd_compression_ratio.csv.
 */
TEST_F(alp_test, test_alprd_on_whole_datasets) {
	std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "alp_rd_compression_ratio.csv", std::ios::out);
	ofile << "dataset,size,rowgroups_count,vectors_count\n";

	// Each dataset appends one result row; non-cuttable datasets are skipped inside.
	for (const auto& dataset : alp_bench::alp_dataset) {
		bench_alp_rd_compression_ratio(dataset, ofile);
	}
}

/*
 * Run the ALP RD compression-ratio benchmark over the evalimplsts datasets,
 * writing one result row per dataset into evalimplsts.csv.
 */
TEST_F(alp_test, test_alprd_on_evalimplsts) {
	std::ofstream ofile(alp_bench::PATHS.RESULT_DIR_PATH + "evalimplsts.csv", std::ios::out);
	ofile << "dataset,size,rowgroups_count,vectors_count\n";

	for (const auto& dataset : alp_bench::evalimplsts) {
		bench_alp_rd_compression_ratio(dataset, ofile);
	}
}

// NOLINTEND
File renamed without changes.
2 changes: 1 addition & 1 deletion data/include/column.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct paths {
std::string ALP_DATASET_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/samples/";
std::string EDGE_DATASET_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/edge_case/";
std::string RESULT_DIR_PATH = std::string {CMAKE_SOURCE_DIR} + "/publication/";
std::string ALP_ISSUE_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/issue/";
std::string EVALIMPLSTS_CSV_PATH = std::string {CMAKE_SOURCE_DIR} + "/data/evalimplsts/";

std::string ALP_DATASET_BINARY_DIR_PATH = " ";

Expand Down
1 change: 1 addition & 0 deletions data/include/data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "column.hpp"
#include "double_columns.hpp"
#include "edge_case.hpp"
#include "evalimplsts.hpp"
#include "float_columns.hpp"
#include "generated_columns.hpp"

Expand Down
3 changes: 1 addition & 2 deletions data/include/double_columns.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

namespace alp_bench {

inline std::array<Column, 31> alp_dataset = {{
inline std::array<Column, 30> alp_dataset = {{

{1,
"Air-Pressure",
Expand Down Expand Up @@ -280,7 +280,6 @@ inline std::array<Column, 31> alp_dataset = {{
12,
0,
16},
{31, "issue_8", PATHS.ALP_ISSUE_CSV_PATH + "active_power.csv", "", 0, 0, 0, 0, true},

}};
} // namespace alp_bench
Expand Down
14 changes: 14 additions & 0 deletions data/include/evalimplsts.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef ALP_DOUBLE_EVALIMPLSTS_HPP
#define ALP_DOUBLE_EVALIMPLSTS_HPP

#include "column.hpp"

namespace alp_bench {

// Column descriptors for the "evalimplsts" benchmark datasets.
// Fields appear to be (id, name, csv_file_path, binary_file_path, ...numeric
// defaults..., suitable_for_cutting) — confirm against Column in column.hpp.
// The empty string means no binary file; the CSV under
// PATHS.EVALIMPLSTS_CSV_PATH is the data source.
inline std::array<Column, 1> evalimplsts = {{
    // prev issue_8
    {0, "active_power", PATHS.EVALIMPLSTS_CSV_PATH + "active_power.csv", "", 0, 0, 0, 0, true},

}};
} // namespace alp_bench
#endif // ALP_DOUBLE_EVALIMPLSTS_HPP
1 change: 0 additions & 1 deletion publication/alp_rd_compression_ratio.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
dataset,size,rowgroups_count,vectors_count
POI-lat,55.74,5,415
POI-lon,56.56,5,415
issue_8,61.47,5,422
2 changes: 2 additions & 0 deletions publication/evalimplsts.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dataset,size,rowgroups_count,vectors_count
active_power,61.47,5,422

0 comments on commit 17bebe9

Please sign in to comment.