Skip to content

Commit

Permalink
Availability & Reproducibility (#17)
Browse files Browse the repository at this point in the history
-  avx512 source
-  avx2
  • Loading branch information
azimafroozeh authored Sep 24, 2024
1 parent 819df1e commit dd65567
Show file tree
Hide file tree
Showing 17 changed files with 60,092 additions and 152 deletions.
6 changes: 3 additions & 3 deletions include/alp/encoder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,11 +347,11 @@ struct encoder {

#ifdef __AVX512F__
for (size_t i {0}; i < config::VECTOR_SIZE; i = i + 8) {
__m512d l = _mm512_loadu_pd(tmp_dbl_arr + i);
__m512d r = _mm512_loadu_pd(input_vector + i);
__m512d l = _mm512_loadu_pd(ENCODED_DBL_ARR + i);
__m512d r = _mm512_loadu_pd(DBL_ARR_WITHOUT_SPECIALS + i);
__m512i index = _mm512_loadu_pd(INDEX_ARR + i);
auto is_exception = _mm512_cmpneq_pd_mask(l, r);
_mm512_mask_compressstoreu_pd(tmp_index + exceptions_idx, is_exception, index);
_mm512_mask_compressstoreu_pd(TMP_INDEX_ARR + exceptions_idx, is_exception, index);
exceptions_idx += LOOKUP_TABLE[is_exception];
}
#else
Expand Down
24 changes: 12 additions & 12 deletions include/fastlanes/macros.hpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
#ifndef FASTLANES_MACROS_HPP
#define FASTLANES_MACROS_HPP

// Width-qualified aliases for x86 SIMD intrinsics.
//
// Each macro maps a name of the form `_mm<width>_<op>` onto the identifier the
// intrinsic is actually published under: the 128-bit family uses a bare `_mm_`
// prefix, and the 64-bit broadcast intrinsics carry a trailing `x`.
// NOTE(review): presumably this lets generated kernels spell every intrinsic
// from (register width, operation) alone; confirm against the code generator.

// --- 256-bit ---------------------------------------------------------------
#define _mm256_set1_epi64 _mm256_set1_epi64x

// --- 128-bit: broadcast ----------------------------------------------------
#define _mm128_set1_epi8 _mm_set1_epi8
#define _mm128_set1_epi16 _mm_set1_epi16
#define _mm128_set1_epi32 _mm_set1_epi32
#define _mm128_set1_epi64 _mm_set1_epi64x

// --- 128-bit: load / store -------------------------------------------------
#define _mm128_loadu_si128 _mm_loadu_si128
#define _mm128_storeu_si128 _mm_storeu_si128

// --- 128-bit: bitwise logic ------------------------------------------------
#define _mm128_and_si128 _mm_and_si128
#define _mm128_or_si128 _mm_or_si128

// --- 128-bit: 64-bit lane shifts -------------------------------------------
#define _mm128_srli_epi64 _mm_srli_epi64
#define _mm128_slli_epi64 _mm_slli_epi64

#endif // FASTLANES_MACROS_HPP
9 changes: 9 additions & 0 deletions publication/master_script/master_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,15 @@ else
fi

# Run benchmarks

# Architecture-specific kernel benchmarks.
#
# The build only generates the benchmark binaries for the host architecture
# (either the arm64v8 or the x86_64 subdirectory is added by CMake), so the
# binaries of the other architecture do not exist. Run each one only when it
# was actually built instead of failing on a missing file.
#
# $1: absolute path of the benchmark executable.
# Returns the executable's exit status, or 0 when it is skipped.
run_if_built() {
    if [ -x "$1" ]; then
        "$1"
    else
        echo "Skipping $1: not built on this machine" >&2
    fi
}

# arm64 arch
run_if_built "$TARGET_DIR/build/publication/source_code/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench"

# x86 arch
run_if_built "$TARGET_DIR/build/publication/source_code/generated/x86_64/avx2_intrinsic_uf1/x86_64_avx2_intrinsic_1024_uf1_falp_bench"
run_if_built "$TARGET_DIR/build/publication/source_code/generated/x86_64/avx512bw_intrinsic_uf1/x86_64_avx512bw_intrinsic_1024_uf1_falp_bench"

# rest
"$TARGET_DIR/build/publication/source_code/bench_compression_ratio/bench_alp_compression_ratio"
"$TARGET_DIR/build/publication/source_code/bench_compression_ratio/bench_alp32_compression_ratio"
"$TARGET_DIR/build/publication/source_code/bench_compression_ratio/bench_zstd_compression_ratio"
Expand Down
2 changes: 1 addition & 1 deletion publication/source_code/generated/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ else ()
add_subdirectory(arm64v8)
add_compile_definitions(ALP_ARM64V8)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
# add_subdirectory(x86_64)
add_subdirectory(x86_64)
add_compile_definitions(ALP_X86_64)
else ()
endif ()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,130 +1,147 @@
#include "arm64v8_neon_intrinsic_1024_uf1_falp_bench.hpp"
#include "alp/alp.hpp"
#include "datasets.hpp"
#include "alp/ffor.hpp"
#include "alp/unffor.hpp"
static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::Dataset& dataset, int64_t* ffor_arr, uint8_t bw, int64_t*base_arr,uint8_t factor,uint8_t exponent,double* dec_dbl_arr,double* exc_arr,uint16_t* pos_arr,uint16_t* exc_c_arr)
{
#include "alp.hpp"
#include "data.hpp"
static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::Column& dataset,
int64_t* ffor_arr,
uint8_t bw,
int64_t* base_arr,
uint8_t factor,
uint8_t exponent,
double* dec_dbl_arr,
double* exc_arr,
uint16_t* pos_arr,
uint16_t* exc_c_arr) {
int benchmark_number = dataset.id;

#ifdef NDEBUG
uint64_t iterations = 3000000;
#else
uint64_t iterations = 1;
#endif

std::string benchmark_name = dataset.name + "_fused";

uint64_t cycles = benchmark::cycleclock::Now();
for (uint64_t i = 0; i < iterations; ++i) {
generated::falp::arm64v8::neon::falp(reinterpret_cast<uint64_t*>(ffor_arr),
dec_dbl_arr,
bw,
reinterpret_cast<uint64_t*>(base_arr),
factor,
exponent);
alp::AlpDecode<double>::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
}

cycles = benchmark::cycleclock::Now() - cycles;

return benchmark::BenchmarkReporter::Run(
benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
#ifdef NDEBUG
uint64_t iterations = 3000000;
#else
uint64_t iterations = 1;
#endif

std::string benchmark_name = dataset.name + "_fused";

uint64_t cycles = benchmark::cycleclock::Now();
for (uint64_t i = 0; i < iterations; ++i) {
generated::falp::arm64v8::neon::falp(reinterpret_cast<uint64_t*>(ffor_arr),
dec_dbl_arr,
bw,
reinterpret_cast<uint64_t*>(base_arr),
factor,
exponent);
alp::decoder<double>::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
}

cycles = benchmark::cycleclock::Now() - cycles;

return benchmark::BenchmarkReporter::Run(
benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));
}
static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::Dataset& dataset, int64_t* ffor_arr, int64_t* unffor_arr, uint8_t bw, int64_t* base_arr, uint8_t factor, uint8_t exponent, double* dec_dbl_arr, double* exc_arr, uint16_t* pos_arr, uint16_t* exc_c_arr)
{

int benchmark_number = dataset.id;

#ifdef NDEBUG
uint64_t iterations = 3000000;
#else
uint64_t iterations = 1;
#endif

std::string benchmark_name = dataset.name + "";

uint64_t cycles = benchmark::cycleclock::Now();
for (uint64_t i = 0; i < iterations; ++i) {
alp::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
alp::AlpDecode<double>(reinterpret_cast<const uint64_t*>(unffor_arr), factor, exponent, dec_dbl_arr);
alp::AlpDecode<double>::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
}

cycles = benchmark::cycleclock::Now() - cycles;

return benchmark::BenchmarkReporter::Run(
benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024));

/// Times the non-fused decode path for one dataset.
///
/// Every repetition runs three steps in sequence: scalar `unffor` from
/// `ffor_arr` into `unffor_arr`, `alp::decoder<double>::decode` from
/// `unffor_arr` into `dec_dbl_arr`, then `patch_exceptions` on `dec_dbl_arr`.
///
/// @param dataset     supplies the benchmark id and name reported in the result
/// @param ffor_arr    input of the `unffor` step
/// @param unffor_arr  output of `unffor`, input of `decode`
/// @param bw          bit width forwarded to `unffor`
/// @param base_arr    base values forwarded to `unffor`
/// @param factor      forwarded to `decode`
/// @param exponent    forwarded to `decode`
/// @param dec_dbl_arr decoded output, patched in place
/// @param exc_arr     forwarded to `patch_exceptions`
/// @param pos_arr     forwarded to `patch_exceptions`
/// @param exc_c_arr   forwarded to `patch_exceptions`
/// @return a Run holding the repetition count and the elapsed cycles divided
///         by (repetitions * 1024)
static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::Column& dataset,
                                                                                    int64_t*           ffor_arr,
                                                                                    int64_t*           unffor_arr,
                                                                                    uint8_t            bw,
                                                                                    int64_t*           base_arr,
                                                                                    uint8_t            factor,
                                                                                    uint8_t            exponent,
                                                                                    double*            dec_dbl_arr,
                                                                                    double*            exc_arr,
                                                                                    uint16_t*          pos_arr,
                                                                                    uint16_t*          exc_c_arr) {
    // Debug builds run a single repetition; release builds (NDEBUG) run the full count.
#ifdef NDEBUG
    const uint64_t repetitions = 3000000;
#else
    const uint64_t repetitions = 1;
#endif

    const int   run_id = dataset.id;
    std::string label  = dataset.name + ""; // no suffix for the non-fused variant

    const uint64_t started_at = benchmark::cycleclock::Now();
    for (uint64_t rep = 0; rep != repetitions; ++rep) {
        fastlanes::generated::unffor::fallback::scalar::unffor(ffor_arr, unffor_arr, bw, base_arr);
        alp::decoder<double>::decode(unffor_arr, factor, exponent, dec_dbl_arr);
        alp::decoder<double>::patch_exceptions(dec_dbl_arr, exc_arr, pos_arr, exc_c_arr);
    }
    const uint64_t elapsed_cycles = benchmark::cycleclock::Now() - started_at;

    // Normalise by repetitions and by the 1024 divisor used for every benchmark in this file.
    const double cycles_per_value =
        static_cast<double>(elapsed_cycles) / (static_cast<double>(repetitions) * 1024);

    return benchmark::BenchmarkReporter::Run(run_id, label, repetitions, cycles_per_value);
}
void benchmark_all(benchmark::Benchmark& benchmark)
{

double* dbl_arr;
double* exc_arr;
uint16_t* pos_arr;
uint16_t* exc_c_arr;
int64_t* ffor_arr;
int64_t* unffor_arr;

int64_t* base_arr;
int64_t* dig_arr;
double* dec_dbl_arr;

uint8_t bw;
uint8_t factor;
uint8_t exponent;

alp::state stt;

dbl_arr = new (std::align_val_t {64}) double[1024];
exc_arr = new (std::align_val_t {64}) double[1024];
pos_arr = new (std::align_val_t {64}) uint16_t[1024];
dig_arr = new (std::align_val_t {64}) int64_t[1024];
dec_dbl_arr = new (std::align_val_t {64}) double[1024];
exc_c_arr = new (std::align_val_t {64}) uint16_t[1024];
ffor_arr = new (std::align_val_t {64}) int64_t[1024];
unffor_arr = new (std::align_val_t {64}) int64_t[1024];
base_arr = new (std::align_val_t {64}) int64_t[1024];

for (auto& dataset : alp_bench::datasets) {
std::ifstream ifile(dataset.sample_csv_file_path, std::ios::in);

// check to see that the file was opened correctly:
if (!ifile.is_open()) {
exit(1); // exit or do additional error checking
}

double num = 0.0;
// keep storing values from the text file so long as data exists:
size_t c {0};
while (ifile >> num) {
dbl_arr[c] = num;
c += 1;
}

factor = dataset.factor;
exponent = dataset.exponent;

alp::AlpEncode<double>::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
alp::AlpEncode<double>::analyze_ffor(dig_arr, bw, base_arr);
alp::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);

benchmark.Run(bench_alp_fused_decode(
dataset, unffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));

benchmark.Run(bench_alp_decode(
dataset, ffor_arr, unffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));

ifile.close();}
/// Runs the fused and the non-fused decode benchmark for every dataset
/// returned by `alp_bench::get_alp_dataset()`.
///
/// Per dataset: read up to 1024 doubles from `dataset.csv_file_path`, encode
/// them with `alp::encoder<double>`, FFOR-pack the encoded integers, then
/// report both decode benchmarks through `benchmark.Run`.
///
/// The process exits with status 1 if a dataset file cannot be opened.
/// The scratch buffers are not freed in this function.
///
/// @param benchmark reporter that receives each benchmark result
void benchmark_all(benchmark::Benchmark& benchmark) {
    // Capacity, in elements, of every scratch buffer below.
    constexpr size_t vector_size = 1024;

    uint8_t bw;
    uint8_t factor;
    uint8_t exponent;

    alp::state<double> stt;

    double*   dbl_arr     = new (std::align_val_t {64}) double[vector_size];
    double*   exc_arr     = new (std::align_val_t {64}) double[vector_size];
    uint16_t* pos_arr     = new (std::align_val_t {64}) uint16_t[vector_size];
    int64_t*  dig_arr     = new (std::align_val_t {64}) int64_t[vector_size];
    double*   dec_dbl_arr = new (std::align_val_t {64}) double[vector_size];
    uint16_t* exc_c_arr   = new (std::align_val_t {64}) uint16_t[vector_size];
    int64_t*  ffor_arr    = new (std::align_val_t {64}) int64_t[vector_size];
    int64_t*  unffor_arr  = new (std::align_val_t {64}) int64_t[vector_size];
    int64_t*  base_arr    = new (std::align_val_t {64}) int64_t[vector_size];
    double*   smp_arr     = new double[vector_size];

    for (auto& dataset : alp_bench::get_alp_dataset()) {
        std::ifstream ifile(dataset.csv_file_path, std::ios::in);

        // check to see that the file was opened correctly:
        if (!ifile.is_open()) {
            exit(1); // exit or do additional error checking
        }

        // Read at most `vector_size` values: a longer file must not write past
        // the end of `dbl_arr`.
        double num = 0.0;
        size_t c {0};
        while (c < vector_size && ifile >> num) {
            dbl_arr[c] = num;
            c += 1;
        }
        // A shorter file would otherwise leave indeterminate (first dataset) or
        // stale (later datasets) values in the tail that the encoder reads.
        for (; c < vector_size; ++c) {
            dbl_arr[c] = 0.0;
        }

        factor   = dataset.factor;
        exponent = dataset.exponent;

        alp::encoder<double>::init(dbl_arr, 0, 1024, smp_arr, stt);

        alp::encoder<double>::encode(dbl_arr, exc_arr, pos_arr, exc_c_arr, dig_arr, stt);
        alp::encoder<double>::analyze_ffor(dig_arr, bw, base_arr);
        fastlanes::generated::ffor::fallback::scalar::ffor(dig_arr, ffor_arr, bw, base_arr);

        // The fused kernel takes the FFOR-packed buffer. It was previously given
        // `unffor_arr`, which holds no data before the non-fused benchmark has
        // run and never holds packed data.
        benchmark.Run(bench_alp_fused_decode(
            dataset, ffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));

        benchmark.Run(bench_alp_decode(
            dataset, ffor_arr, unffor_arr, bw, base_arr, factor, exponent, dec_dbl_arr, exc_arr, pos_arr, exc_c_arr));

        ifile.close();
    }
}
int main()
{
benchmark::Benchmark benchmark =
benchmark::create("arm64v8_neon_intrinsic_1024_uf1_falp")
.save()
.at(std::string(SOURCE_DIR) + "/alp_pub/results/" + benchmark::CmakeInfo::getCmakeToolchainFile())
.print()
.add_extra_info(benchmark::CmakeInfo::getCmakeInfo());
benchmark_all(benchmark);
int main() {
benchmark::Benchmark benchmark =
benchmark::create("arm64v8_neon_intrinsic_1024_uf1_falp")
.save()
.at(std::string(SOURCE_DIR) + "/publication/results/" + benchmark::CmakeInfo::getCmakeToolchainFile())
.print()
.add_extra_info(benchmark::CmakeInfo::getCmakeInfo());
benchmark_all(benchmark);
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,13 @@ LIST(APPEND ALP_GENERATED_OBJECT_FILES
get_target_property(TARGET_NAME arm64v8_neon_intrinsic_1024_uf1_falp NAME)
get_target_property(TARGET_COMPILE_OPTIONS arm64v8_neon_intrinsic_1024_uf1_falp COMPILE_OPTIONS)
#------------------------------------------------------------------------------------------------------
if (BUILD_TESTING)
add_executable(arm64v8_neon_intrinsic_1024_uf1_falp_test arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp)
target_link_libraries(arm64v8_neon_intrinsic_1024_uf1_falp_test PRIVATE ALP gtest_main arm64v8_neon_intrinsic_1024_uf1_falp)
target_include_directories(arm64v8_neon_intrinsic_1024_uf1_falp_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
gtest_discover_tests(arm64v8_neon_intrinsic_1024_uf1_falp_test)
endif ()
add_executable(arm64v8_neon_intrinsic_1024_uf1_falp_test arm64v8_neon_intrinsic_1024_uf1_falp_test.cpp)
target_link_libraries(arm64v8_neon_intrinsic_1024_uf1_falp_test PRIVATE ALP gtest_main arm64v8_neon_intrinsic_1024_uf1_falp)
target_include_directories(arm64v8_neon_intrinsic_1024_uf1_falp_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
gtest_discover_tests(arm64v8_neon_intrinsic_1024_uf1_falp_test)
#------------------------------------------------------------------------------------------------------
if (BUILD_BENCHMARK)
configure_file(${CMAKE_SOURCE_DIR}/alp_bench/alp_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/arm64v8_neon_intrinsic_1024_uf1_falp_bench.hpp)
add_executable(arm64v8_neon_intrinsic_1024_uf1_falp_bench arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp)
target_link_libraries(arm64v8_neon_intrinsic_1024_uf1_falp_bench PRIVATE ALP arm64v8_neon_intrinsic_1024_uf1_falp)
target_include_directories(arm64v8_neon_intrinsic_1024_uf1_falp_bench PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
add_alp_benchmark(arm64v8_neon_intrinsic_1024_uf1_falp_bench)
endif ()
configure_file(${CMAKE_SOURCE_DIR}/benchmarks/fls_bench/fls_bench.hpp ${CMAKE_CURRENT_BINARY_DIR}/arm64v8_neon_intrinsic_1024_uf1_falp_bench.hpp)
add_executable(arm64v8_neon_intrinsic_1024_uf1_falp_bench arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp)
target_link_libraries(arm64v8_neon_intrinsic_1024_uf1_falp_bench PRIVATE ALP arm64v8_neon_intrinsic_1024_uf1_falp)
target_include_directories(arm64v8_neon_intrinsic_1024_uf1_falp_bench PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
add_alp_benchmark(arm64v8_neon_intrinsic_1024_uf1_falp_bench)
6 changes: 6 additions & 0 deletions publication/source_code/generated/x86_64/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Add one subdirectory per x86-64 instruction-set variant.
foreach (ISA_VARIANT avx2_intrinsic_uf1 avx512bw_intrinsic_uf1)
    add_subdirectory(${ISA_VARIANT})
endforeach ()

# Re-export the current value of FLS_GENERATED_OBJECT_FILES to the directory
# that included this one.
# NOTE(review): presumably the subdirectories above append to it; confirm there.
set(FLS_GENERATED_OBJECT_FILES ${FLS_GENERATED_OBJECT_FILES} PARENT_SCOPE)
Loading

0 comments on commit dd65567

Please sign in to comment.