From 000174be47f98d0c263adaba08707f5b6fdabc18 Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 22 Oct 2023 14:10:08 +0200 Subject: [PATCH 1/4] Add Prefetching to Hash Join This commit adds prefetching to our hash joins. Rather than doing a single lookup call, we now split the logic into three calls: 1. A hash call -> Performs the key hash on the hash table 2. A prefetch call -> Prefetches the respective slots from the hash table 2. The actual lookup call -> Does the key lookup, on the slot that was previously prefetched This will be interesting for our vectorized backend, as we can issue many independent memory loads in short succession. In the next commits we will: 1. Allow disabling the prefetch calls for JIT compiled code. They are rather pointless as the lookup right after will load the data into cache anyways. 2. Perform dynamic chunking in the vectorized code. At the end of the pipeline when we move into hash table operations, we will dynamically reduce the chunk size to ~256 in order to make sure the prefetching only fills the L1/L2 caches. If the prefetching range becomes too large, then we start evicting parts of the hash table from cache again. After this, we should have a significantly faster vectorized execution backend. --- bench/ht_benchmark.cpp | 3 +- bench/vectorized_ht.cpp | 72 +++++++++++++++++- src/algebra/Join.cpp | 16 +++- src/algebra/Join.h | 5 ++ .../suboperators/RuntimeFunctionSubop.cpp | 35 ++------- .../suboperators/RuntimeFunctionSubop.h | 76 ++++++++++++++++++- .../RuntimeFunctionSubopFragmentizer.cpp | 34 ++++++++- src/runtime/HashTableRuntime.cpp | 68 +++++++++++++++++ src/runtime/HashTableRuntime.h | 12 ++- src/runtime/NewHashTables.cpp | 40 ++++++++-- src/runtime/NewHashTables.h | 12 +++ test/runtime/test_atomic_hash_table.cpp | 9 ++- .../test_atomic_hash_table_complex_key.cpp | 8 +- 13 files changed, 334 insertions(+), 56 deletions(-) diff --git a/bench/ht_benchmark.cpp b/bench/ht_benchmark.cpp index c831ac1..6227565 100644 --- a/bench/ht_benchmark.cpp +++ b/bench/ht_benchmark.cpp @@ -229,8 +229,7 @@ BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{ BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{1'000, 100'000, 10'000'000, 50'000'000}}); BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{1'000, 100'000, 10'000'000, 50'000'000}}); BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{1'000, 100'000, 10'000'000, 50'000'000}}); -*/ - +*/ } } diff --git a/bench/vectorized_ht.cpp b/bench/vectorized_ht.cpp index dc28c7a..a2823f8 100644 --- a/bench/vectorized_ht.cpp +++ b/bench/vectorized_ht.cpp @@ -1,4 +1,5 @@ #include "benchmark/benchmark.h" +#include "runtime/NewHashTables.h" #include "xxhash.h" #include #include @@ -6,6 +7,8 @@ #include #include +using namespace inkfuse; + /** * Microbenchmarks inspired by Peter's feedback: In vectorized engines, * parallel hash table access can be made more efficient than in a tuple-at-a time @@ -48,7 +51,6 @@ * BM_ht_perf_vectorized/524288/256 10098416 ns 10093265 ns 72 items_per_second=51.9443M/s * BM_ht_perf_vectorized/33554432/256 971872286 ns 971838853 ns 1 items_per_second=34.5267M/s * BM_ht_perf_vectorized/1073741824/256 51425526675 ns 51422464322 ns 1 items_per_second=20.8808M/s - * BM_ht_perf_vectorized/33554432/256 933936147 ns 933873161 ns 1 items_per_second=35.9304M/s * */ namespace { @@ -188,10 +190,72 @@ void BM_ht_perf_vectorized(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * num_elems); } +void BM_ht_perf_tat_inkfuse(benchmark::State& state) 
{ + const uint64_t num_elems = state.range(0); + inkfuse::SimpleKeyComparator comp{8}; + AtomicHashTable ht{comp, 16, 2 * num_elems}; + for (uint64_t k = 1; k <= num_elems; ++k) { + const uint64_t key = 7 * k; + char* value = ht.insert(reinterpret_cast(&key)); + reinterpret_cast(value)[1] = k; + } + for (auto _ : state) { + for (uint64_t k = 1; k <= num_elems; ++k) { + const uint64_t key = 7 * k; + char* res = ht.lookup(reinterpret_cast(&key)); + if (reinterpret_cast(res)[1] > num_elems) { + throw std::runtime_error("bad ht lookup for " + std::to_string(k)); + } + } + } + state.SetItemsProcessed(state.iterations() * num_elems); +} + +void BM_ht_perf_vectorized_inkfuse(benchmark::State& state) { + const uint64_t num_elems = state.range(0); + const uint64_t batch_size = state.range(1); + inkfuse::SimpleKeyComparator comp{8}; + AtomicHashTable ht{comp, 16, 2 * num_elems}; + for (uint64_t k = 1; k <= num_elems; ++k) { + const uint64_t key = 7 * k; + char* value = ht.insert(reinterpret_cast(&key)); + reinterpret_cast(value)[1] = k; + } + std::vector keys(batch_size); + std::vector hashes(batch_size); + for (auto _ : state) { + // Lookup every key again. + for (uint64_t k = 1; k <= num_elems; k += batch_size) { + const auto curr_batch = std::min(batch_size, num_elems - k + 1); + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + keys[tid] = 7 * (k + tid); + } + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + hashes[tid] = ht.compute_hash(reinterpret_cast(&keys[tid])); + } + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + ht.slot_prefetch(hashes[tid]); + } + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + const auto* res = ht.lookup(reinterpret_cast(&keys[tid]), hashes[tid]); + if (reinterpret_cast(res)[1] > num_elems) { + throw std::runtime_error("bad ht lookup for " + std::to_string(k)); + } + } + } + } + state.SetItemsProcessed(state.iterations() * num_elems); +} + BENCHMARK(BM_ht_perf_tat)->Arg(1 << 9)->Arg(1 << 13)->Arg(1 << 15)->Arg(1 << 19)->Arg(1 << 25)->Arg(1 << 30); -// Different hash table sizes. +BENCHMARK(BM_ht_perf_tat_inkfuse)->Arg(1 << 9)->Arg(1 << 13)->Arg(1 << 15)->Arg(1 << 19)->Arg(1 << 25)->Arg(1 << 30); + BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 9, 256)->ArgPair(1 << 13, 256)->ArgPair(1 << 15, 256)->ArgPair(1 << 19, 256)->ArgPair(1 << 25, 256)->ArgPair(1 << 30, 256); -// Different internal batch sizes. -BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1<<25, 16192); +// Different internal batch sizes. 256 is a good value. +BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1 << 25, 16192); + +BENCHMARK(BM_ht_perf_vectorized_inkfuse)->ArgPair(1 << 9, 256)->ArgPair(1 << 13, 256)->ArgPair(1 << 15, 256)->ArgPair(1 << 19, 256)->ArgPair(1 << 25, 256)->ArgPair(1 << 30, 256); +// Different internal batch sizes. 256 is a good value. 
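+// Rough intuition for the batch-size sweep below: the batch must be large enough to
+// keep many independent loads in flight, but small enough that the prefetched slots
+// are still resident in L1/L2 once the lookup pass touches them. Much larger batches
+// start evicting earlier prefetches before they are used.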
+BENCHMARK(BM_ht_perf_vectorized_inkfuse)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1 << 25, 16192); } // namespacf diff --git a/src/algebra/Join.cpp b/src/algebra/Join.cpp index c5e3609..43de72c 100644 --- a/src/algebra/Join.cpp +++ b/src/algebra/Join.cpp @@ -131,6 +131,11 @@ void Join::plan() { lookup_left.emplace(IR::Pointer::build(IR::Char::build())); lookup_right.emplace(IR::Pointer::build(IR::Char::build())); filter_pseudo_iu.emplace(IR::Void::build()); + + // The probe hash is always a unit64_t. + hash_right.emplace(IR::UnsignedInt::build(8)); + // Pseudo IU for making sure we prefetch before we probe. + prefetch_pseudo.emplace(IR::Void::build()); } void Join::decay(inkfuse::PipelineDAG& dag) const { @@ -245,12 +250,19 @@ void Join::decayPkJoin(inkfuse::PipelineDAG& dag) const { pseudo.push_back(&pseudo_iu); } + // 2.2.1 Compute the hash. + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htHash>(this, *hash_right, *scratch_pad_right, std::move(pseudo), &ht_state)); + + // 2.2.2 Prefetch the slot. + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htPrefetch>(this, &*prefetch_pseudo, *hash_right, &ht_state)); + + // 2.2.3 Perfom the lookup. if (type == JoinType::LeftSemi) { // Lookup on a slot disables the slot, giving semi-join behaviour. - probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupDisable(this, *lookup_right, *scratch_pad_right, std::move(pseudo), &ht_state)); + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, true>(this, *lookup_right, *scratch_pad_right, *hash_right, &*prefetch_pseudo, &ht_state)); } else { // Regular lookup that does not disable slots. - probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookup>(this, *lookup_right, *scratch_pad_right, std::move(pseudo), &ht_state)); + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, false>(this, *lookup_right, *scratch_pad_right, *hash_right, &*prefetch_pseudo, &ht_state)); } // 2.3 Filter on probe matches. diff --git a/src/algebra/Join.h b/src/algebra/Join.h index 332ec31..547646d 100644 --- a/src/algebra/Join.h +++ b/src/algebra/Join.h @@ -62,6 +62,11 @@ struct Join : public RelAlgOp { /// Packed scratch pad IU right. std::optional scratch_pad_right; + /// Computed hash on the probe side. + std::optional hash_right; + /// Prefetch pseudo IU - ensures that we prefetch before probing. + std::optional prefetch_pseudo; + /// Lookup result left. std::optional lookup_left; /// Lookup result right. diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.cpp b/src/algebra/suboperators/RuntimeFunctionSubop.cpp index 0ba7167..453a2fc 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.cpp +++ b/src/algebra/suboperators/RuntimeFunctionSubop.cpp @@ -53,29 +53,6 @@ std::unique_ptr RuntimeFunctionSubop::htInsert(const inkfu pointers_)); } -std::unique_ptr RuntimeFunctionSubop::htLookupDisable(const RelAlgOp* source, const IU& pointers_, const IU& keys_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_) { - std::string fct_name = "ht_at_sk_lookup_disable"; - std::vector in_ius{&keys_}; - for (auto pseudo : pseudo_ius_) { - // Pseudo IUs are used as input IUs in the backing graph, but do not influence arguments. 
- in_ius.push_back(pseudo); - } - std::vector ref{keys_.type->id() != "ByteArray" && keys_.type->id() != "Ptr_Char"}; - std::vector out_ius_{&pointers_}; - std::vector args{&keys_}; - const IU* out = &pointers_; - return std::unique_ptr( - new RuntimeFunctionSubop( - source, - state_init_, - std::move(fct_name), - std::move(in_ius), - std::move(out_ius_), - std::move(args), - std::move(ref), - out)); -} - std::unique_ptr RuntimeFunctionSubop::htNoKeyLookup(const RelAlgOp* source, const IU& pointers_, const IU& input_dependency, DefferredStateInitializer* state_init_) { std::string fct_name = "ht_nk_lookup"; std::vector in_ius{&input_dependency}; @@ -122,12 +99,14 @@ void RuntimeFunctionSubop::consumeAllChildren(CompilationContext& context) { std::unordered_set provided; - // Declare the output IUs. + // Declare the output IUs that are not pseudo IUs. for (const IU* out_iu : provided_ius) { - provided.emplace(out_iu); - auto iu_name = context.buildIUIdentifier(*out_iu); - const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); - context.declareIU(*out_iu, declare); + if (out_iu->type->id() != "Void") { + provided.emplace(out_iu); + auto iu_name = context.buildIUIdentifier(*out_iu); + const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); + context.declareIU(*out_iu, declare); + } } // Assemble the input expressions. diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.h b/src/algebra/suboperators/RuntimeFunctionSubop.h index 4b6a4d8..794c422 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.h +++ b/src/algebra/suboperators/RuntimeFunctionSubop.h @@ -30,8 +30,80 @@ struct RuntimeFunctionSubop : public TemplatedSuboperator htInsert(const RelAlgOp* source, const IU* pointers_, const IU& key_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr); - /// Build a hash table lookup function that disables every found slot. - static std::unique_ptr htLookupDisable(const RelAlgOp* source, const IU& pointers_, const IU& key_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr); + /// Hash a key with the hash table's hash function. + template + static std::unique_ptr htHash(const RelAlgOp* source, const IU& hash_, const IU& key_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr) { + std::string fct_name = "ht_" + HashTable::ID + "_compute_hash"; + std::vector in_ius{&key_}; + for (auto pseudo : pseudo_ius_) { + // Pseudo IUs are used as input IUs in the backing graph, but do not influence arguments. + in_ius.push_back(pseudo); + } + std::vector ref{key_.type->id() != "ByteArray" && key_.type->id() != "Ptr_Char"}; + std::vector out_ius_{&hash_}; + std::vector args{&key_}; + const IU* out = &hash_; + return std::unique_ptr( + new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + out)); + } + + /// Hash a key with the hash table's hash function. 
+ template + static std::unique_ptr htPrefetch(const RelAlgOp* source, const IU* prefetch_pseudo, const IU& hash_, DefferredStateInitializer* state_init_ = nullptr) { + std::string fct_name = "ht_" + HashTable::ID + "_slot_prefetch"; + std::vector in_ius{&hash_}; + std::vector ref{false}; + std::vector out_ius_{}; + if (prefetch_pseudo) { + out_ius_.push_back(prefetch_pseudo); + } + std::vector args{&hash_}; + return std::unique_ptr( + new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + /* out = */ nullptr)); + } + + /// Build a hash table lookup function. + template + static std::unique_ptr htLookupWithHash(const RelAlgOp* source, const IU& pointers_, const IU& key_, const IU& hash_, const IU* prefetch_pseudo_, DefferredStateInitializer* state_init_ = nullptr) { + std::string fct_name = "ht_" + HashTable::ID + "_lookup_with_hash"; + if constexpr (disable_slot) { + fct_name += "_disable"; + } + std::vector in_ius{&key_, &hash_}; + if (prefetch_pseudo_) { + in_ius.push_back(prefetch_pseudo_); + } + std::vector ref{key_.type->id() != "ByteArray" && key_.type->id() != "Ptr_Char", false}; + std::vector out_ius_{&pointers_}; + std::vector args{&key_, &hash_}; + const IU* out = &pointers_; + return std::unique_ptr( + new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + out)); + } /// Build a hash table lookup function. template diff --git a/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp b/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp index 3fb62de..c55c78c 100644 --- a/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp +++ b/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp @@ -38,13 +38,31 @@ RuntimeFunctionSubopFragmentizer::RuntimeFunctionSubopFragmentizer() { name = op.id(); } - // Fragmentize hash table lookup that disables the slot (for left semi joins). + // Fragmentize Vectorized Hash Table Primitives { + // Hash: auto& [name, pipe] = pipes.emplace_back(); const auto& key = generated_ius.emplace_back(in_type); - const auto& result_ptr = generated_ius.emplace_back(IR::Pointer::build(IR::Char::build())); - // No pseudo-IU inputs, these only matter for more complex DAGs. 
- const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htLookupDisable(nullptr, result_ptr, key, {})); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htHash>(nullptr, hash, key, {})); + name = op.id(); + } + { + // Lookup don't disable slot: + auto& [name, pipe] = pipes.emplace_back(); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& key = generated_ius.emplace_back(in_type); + const auto& result = generated_ius.emplace_back(IR::Pointer::build(IR::Char::build())); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, false>(nullptr, result, key, hash, nullptr)); + name = op.id(); + } + { + // Lookup disable slot: + auto& [name, pipe] = pipes.emplace_back(); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& key = generated_ius.emplace_back(in_type); + const auto& result = generated_ius.emplace_back(IR::Pointer::build(IR::Char::build())); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, true>(nullptr, result, key, hash, nullptr)); name = op.id(); } @@ -88,6 +106,14 @@ RuntimeFunctionSubopFragmentizer::RuntimeFunctionSubopFragmentizer() { } } + // Fragmentize Prefetch. + { + auto& [name, pipe] = pipes.emplace_back(); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htPrefetch>(nullptr, nullptr, hash)); + name = op.id(); + } + // Fragmentize tuple materialization. { auto& [name, pipe] = pipes.emplace_back(); diff --git a/src/runtime/HashTableRuntime.cpp b/src/runtime/HashTableRuntime.cpp index 21c1f96..149197f 100644 --- a/src/runtime/HashTableRuntime.cpp +++ b/src/runtime/HashTableRuntime.cpp @@ -58,6 +58,38 @@ extern "C" void HashTableRuntime::ht_dl_it_advance(void* table, char** it_data, } // Atomic hash table. 
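+// The _sk_ entry points below operate on the simple-key comparator variant of the
+// atomic hash table, the _ck_ entry points on the complex-key variant. Each variant
+// now exposes the probe phases separately, i.e. compute_hash, slot_prefetch and
+// lookup_with_hash (plus the _disable flavour used for left semi joins), mirroring
+// the hash -> prefetch -> lookup split of the join suboperators.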
+extern "C" uint64_t HashTableRuntime::ht_at_sk_compute_hash(void* table, char* key) { + return reinterpret_cast*>(table)->compute_hash(key); +} + +extern "C" void HashTableRuntime::ht_at_sk_slot_prefetch(void* table, uint64_t hash) { + reinterpret_cast*>(table)->slot_prefetch(hash); +} + +extern "C" char* HashTableRuntime::ht_at_sk_lookup_with_hash(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookup(key, hash); +} + +extern "C" char* HashTableRuntime::ht_at_sk_lookup_with_hash_disable(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookupDisable(key, hash); +} + +extern "C" uint64_t HashTableRuntime::ht_at_ck_compute_hash(void* table, char* key) { + return reinterpret_cast*>(table)->compute_hash(key); +} + +extern "C" void HashTableRuntime::ht_at_ck_slot_prefetch(void* table, uint64_t hash) { + reinterpret_cast*>(table)->slot_prefetch(hash); +} + +extern "C" char* HashTableRuntime::ht_at_ck_lookup_with_hash(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookup(key, hash); +} + +extern "C" char* HashTableRuntime::ht_at_ck_lookup_with_hash_disable(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookupDisable(key, hash); +} + extern "C" char* HashTableRuntime::ht_at_sk_lookup(void* table, char* key) { return reinterpret_cast*>(table)->lookup(key); } @@ -139,6 +171,42 @@ void HashTableRuntime::registerRuntime() { RuntimeFunctionBuilder("ht_at_ck_lookup", IR::Pointer::build(IR::Char::build())) .addArg("table", IR::Pointer::build(IR::Void::build())) .addArg("key", IR::Pointer::build(IR::Char::build()), true); + + RuntimeFunctionBuilder("ht_at_sk_compute_hash", IR::UnsignedInt::build(8)) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build()), true); + + RuntimeFunctionBuilder("ht_at_sk_slot_prefetch", IR::Void::build()) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_sk_lookup_with_hash", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_sk_lookup_with_hash_disable", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_ck_compute_hash", IR::UnsignedInt::build(8)) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build()), true); + + RuntimeFunctionBuilder("ht_at_ck_slot_prefetch", IR::Void::build()) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_ck_lookup_with_hash", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_ck_lookup_with_hash_disable", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); } } diff --git a/src/runtime/HashTableRuntime.h b/src/runtime/HashTableRuntime.h 
index 8fa418d..b9c041f 100644 --- a/src/runtime/HashTableRuntime.h +++ b/src/runtime/HashTableRuntime.h @@ -29,13 +29,21 @@ extern "C" char* ht_at_sk_lookup(void* table, char* key); extern "C" char* ht_at_sk_lookup_disable(void* table, char* key); extern "C" char* ht_at_ck_lookup(void* table, char* key); +extern "C" uint64_t ht_at_sk_compute_hash(void* table, char* key); +extern "C" void ht_at_sk_slot_prefetch(void* table, uint64_t hash); +extern "C" char* ht_at_sk_lookup_with_hash(void* table, char* key, uint64_t hash); +extern "C" char* ht_at_sk_lookup_with_hash_disable(void* table, char* key, uint64_t hash); + +extern "C" uint64_t ht_at_ck_compute_hash(void* table, char* key); +extern "C" void ht_at_ck_slot_prefetch(void* table, uint64_t hash); +extern "C" char* ht_at_ck_lookup_with_hash(void* table, char* key, uint64_t hash); +extern "C" char* ht_at_ck_lookup_with_hash_disable(void* table, char* key, uint64_t hash); + /// Special lookup function if we know we have a 0-byte key. extern "C" char* ht_nk_lookup(void* table); void registerRuntime(); }; - - } #endif //INKFUSE_HASHTABLERUNTIME_H diff --git a/src/runtime/NewHashTables.cpp b/src/runtime/NewHashTables.cpp index ad3b3cd..43882f7 100644 --- a/src/runtime/NewHashTables.cpp +++ b/src/runtime/NewHashTables.cpp @@ -83,14 +83,27 @@ AtomicHashTable::AtomicHashTable(Comparator comp_, uint16_t total_sl } template -char* AtomicHashTable::lookup(const char* key) const { +uint64_t AtomicHashTable::compute_hash(const char* key) const { + return comp.hash(key); +} + +template +void AtomicHashTable::slot_prefetch(uint64_t hash) const { + const uint64_t slot_id = hash & mod_mask; + // Prefetch the actual data array. + __builtin_prefetch(&data[slot_id * total_slot_size]); + // Prefetch the bitmask slot. + __builtin_prefetch(&tags[slot_id]); +} + +template +char* AtomicHashTable::lookup(const char* key, uint64_t hash) const { + const uint64_t slot_id = hash & mod_mask; // Look up the initial slot in the linear probing chain. - const uint64_t hash = comp.hash(key); - const auto idx = hash & mod_mask; IteratorState it{ - .idx = idx, - .data_ptr = &data[idx * total_slot_size], - .tag_ptr = &tags[idx], + .idx = slot_id, + .data_ptr = &data[slot_id * total_slot_size], + .tag_ptr = &tags[slot_id], }; // The tag we are looking for. const uint8_t target_tag = tag_fill_mask | static_cast(hash >> 56ul); @@ -111,9 +124,14 @@ char* AtomicHashTable::lookup(const char* key) const { } template -char* AtomicHashTable::lookupDisable(const char* key) { - // Look up the initial slot in the linear probing chain. +char* AtomicHashTable::lookup(const char* key) const { const uint64_t hash = comp.hash(key); + return lookup(key, hash); +} + +template +char* AtomicHashTable::lookupDisable(const char* key, uint64_t hash) { + // Look up the initial slot in the linear probing chain. 
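+   // As in lookup() above: the low hash bits (masked with mod_mask) select the starting
+   // slot of the probe chain, while the top eight bits (hash >> 56) become the tag that
+   // is checked against the tag array, so non-matching slots can usually be rejected
+   // without comparing full keys.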
const auto idx = hash & mod_mask; IteratorState it{ .idx = idx, @@ -148,6 +166,12 @@ char* AtomicHashTable::lookupDisable(const char* key) { return it.data_ptr; } +template +char* AtomicHashTable::lookupDisable(const char* key) { + const uint64_t hash = comp.hash(key); + return lookupDisable(key, hash); +} + template template char* AtomicHashTable::insert(const char* key) { diff --git a/src/runtime/NewHashTables.h b/src/runtime/NewHashTables.h index de265cd..6fb2e70 100644 --- a/src/runtime/NewHashTables.h +++ b/src/runtime/NewHashTables.h @@ -43,6 +43,18 @@ struct AtomicHashTable { AtomicHashTable(Comparator comp_, uint16_t total_slot_size_, size_t num_slots_); + /// Compute the hash for a given key. + uint64_t compute_hash(const char* key) const; + /// Prefetch the tag and data slots for a specific hash. + void slot_prefetch(uint64_t hash) const; + /// Get the pointer to a given key, or nullptr if the group does not exist. + /// Already requires the hash was computed. + char* lookup(const char* key, uint64_t hash) const; + /// Get the pointer to a given key, or nullptr if the group does not exist. + /// If it finds a slot, disables it. Needed for e.g. left semi joins. + /// Already requires the hash was computed. + char* lookupDisable(const char* key, uint64_t hash); + /// Get the pointer to a given key, or nullptr if the group does not exist. char* lookup(const char* key) const; /// Get the pointer to a given key, or nullptr if the group does not exist. diff --git a/test/runtime/test_atomic_hash_table.cpp b/test/runtime/test_atomic_hash_table.cpp index 9fbcf3a..3dd7b30 100644 --- a/test/runtime/test_atomic_hash_table.cpp +++ b/test/runtime/test_atomic_hash_table.cpp @@ -62,7 +62,9 @@ struct AtomicHashTableTestT : public ::testing::TestWithParam { void checkContains(const RandomDataResult& data, size_t idx) { const char* key_ptr = &data.keys[idx * std::get<0>(GetParam())]; const char* payload_ptr = &data.payloads[idx * 16]; - auto slot_lookup = ht.lookup(key_ptr); + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot_lookup = ht.lookup(key_ptr, hash); ASSERT_NE(slot_lookup, nullptr); // Check that key was serialized properly. EXPECT_EQ(std::memcmp(slot_lookup, key_ptr, std::get<0>(GetParam())), 0); @@ -71,7 +73,10 @@ struct AtomicHashTableTestT : public ::testing::TestWithParam { } void checkNotContains(const RandomDataResult& data, size_t idx) { - auto slot = ht.lookup(&data.keys[idx * std::get<0>(GetParam())]); + const char* key_ptr = &data.keys[idx * std::get<0>(GetParam())]; + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot = ht.lookup(key_ptr, hash); EXPECT_EQ(slot, nullptr); } diff --git a/test/runtime/test_atomic_hash_table_complex_key.cpp b/test/runtime/test_atomic_hash_table_complex_key.cpp index 4f68792..2cb2fb8 100644 --- a/test/runtime/test_atomic_hash_table_complex_key.cpp +++ b/test/runtime/test_atomic_hash_table_complex_key.cpp @@ -63,7 +63,9 @@ struct AtomicComplexHashTableTestT : public ::testing::TestWithParam { void checkContains(const std::vector& data, size_t idx) { const char* raw_string = data[idx].data(); const char* key_ptr = reinterpret_cast(&raw_string); - auto slot_lookup = ht.lookup(key_ptr); + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot_lookup = ht.lookup(key_ptr, hash); ASSERT_NE(slot_lookup, nullptr); // Check that key was serialized properly. 
EXPECT_EQ(std::strcmp(*reinterpret_cast(slot_lookup), *reinterpret_cast(key_ptr)), 0); @@ -75,7 +77,9 @@ struct AtomicComplexHashTableTestT : public ::testing::TestWithParam { if (std::find(data_exists.begin(), data_exists.end(), str) == data_exists.end()) { const char* raw_string = str.data(); const char* key_ptr = reinterpret_cast(&raw_string); - auto slot = ht.lookup(key_ptr); + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot = ht.lookup(key_ptr, hash); EXPECT_EQ(slot, nullptr); } } From ce337a6f6db53a63bd51ccabd07e3a4eae8f5e95 Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 22 Oct 2023 14:58:43 +0200 Subject: [PATCH 2/4] Allow Supberators to Only Generate Code for the Vectorized Backend This commit is the next one in the chain to generate a faster vectorized backend. We can now make both a `Suboperator` and the `CompilationContext` with additional optimziation hints. This allows us to mark suboperator that generate prefetching code in a way that does not generate code for operator-fusing codegen. The prefetching calls are now only emitted for functions in the vectorized backend, but do not generate code for compiled execution. In general, the prefetching for operator fusing code is not important, as we will do a lookup on the same tuple right after, which will then cause the respective cache miss. As a result, prefetching only generates more instructions and function calls. --- src/algebra/CompilationContext.cpp | 25 ++++++++++++---- src/algebra/CompilationContext.h | 30 +++++++++++++++---- .../suboperators/RuntimeFunctionSubop.cpp | 12 ++++---- .../suboperators/RuntimeFunctionSubop.h | 25 +++++++++------- src/algebra/suboperators/Suboperator.h | 18 +++++++++++ src/interpreter/FragmentGenerator.cpp | 13 ++++---- 6 files changed, 90 insertions(+), 33 deletions(-) diff --git a/src/algebra/CompilationContext.cpp b/src/algebra/CompilationContext.cpp index df842db..d86cd65 100644 --- a/src/algebra/CompilationContext.cpp +++ b/src/algebra/CompilationContext.cpp @@ -6,12 +6,12 @@ namespace inkfuse { -CompilationContext::CompilationContext(std::string program_name, const Pipeline& pipeline_) - : pipeline(pipeline_), program(std::make_shared(std::move(program_name), false)), fct_name("execute") { +CompilationContext::CompilationContext(std::string program_name, const Pipeline& pipeline_, OptimizationHints hints_) + : pipeline(pipeline_), program(std::make_shared(std::move(program_name), false)), fct_name("execute"), optimization_hints(hints_) { } -CompilationContext::CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_) - : pipeline(pipeline_), program(std::move(program_)), fct_name(std::move(fct_name_)) { +CompilationContext::CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_, OptimizationHints hints_) + : pipeline(pipeline_), program(std::move(program_)), fct_name(std::move(fct_name_)), optimization_hints(hints_) { } void CompilationContext::compile() { @@ -57,8 +57,17 @@ void CompilationContext::notifyIUsReady(Suboperator& op) { // Consume in the original requestor. requestor->consume(*iu, *this); if (++properties[requestor].serviced_requests == requestor->getNumSourceIUs()) { - // Consume in the original requestor notifying it that all children were produced successfuly. 
- requestor->consumeAllChildren(*this); + const bool generates_fusing = optimization_hints.mode == OptimizationHints::CodegenMode::OperatorFusing; + const bool only_generate_when_vectorized = requestor->getOptimizationProperties().ct_only_vectorized; + if (generates_fusing && only_generate_when_vectorized) { + // We don't need to generate any code for this suboperator. + // Directly mark the output IUs as ready (those are all pseudo IUs). + notifyIUsReady(*requestor); + } else { + // Consume in the original requestor notifying it that all children were produced successfuly. + // Actually let the consumer generate the required code. + requestor->consumeAllChildren(*this); + } } } @@ -139,6 +148,10 @@ IR::FunctionBuilder& CompilationContext::getFctBuilder() { return builder->fct_builder; } +const OptimizationHints& CompilationContext::getOptimizationHints() const { + return optimization_hints; +} + CompilationContext::Builder::Builder(IR::Program& program, std::string fct_name) : ir_builder(program.getIRBuilder()), fct_builder(createFctBuilder(ir_builder, std::move(fct_name))) { } diff --git a/src/algebra/CompilationContext.h b/src/algebra/CompilationContext.h index dd33b2c..7d46efb 100644 --- a/src/algebra/CompilationContext.h +++ b/src/algebra/CompilationContext.h @@ -7,22 +7,37 @@ #include "exec/FuseChunk.h" #include -#include -#include -#include #include +#include +#include #include +#include namespace inkfuse { struct Suboperator; +/// Hints that can be used during the code generation to generate more optimized +/// code. Examples: +/// +/// When we are generating `OperatorFusing` code, we do not issue prefetch instructions. +/// These are exclusively used in the vectorized backends to issue independent loads +/// and hide cache miss latency for followup operators. +struct OptimizationHints { + enum class CodegenMode { + OperatorFusing, + Vectorized, + }; + + CodegenMode mode = CodegenMode::OperatorFusing; +}; + /// Context for compiling a single pipeline. struct CompilationContext { /// Set up a compilation context for generating code for a full given pipeline. - CompilationContext(std::string program_name, const Pipeline& pipeline_); + CompilationContext(std::string program_name, const Pipeline& pipeline_, OptimizationHints hints_ = OptimizationHints{}); /// Set up a compilation context which will generate the code within a specific IR program for the full pipeline. - CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_); + CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_, OptimizationHints hints_ = OptimizationHints{}); /// Compile the code for this context. void compile(); @@ -50,6 +65,9 @@ struct CompilationContext { const IR::Program& getProgram(); IR::FunctionBuilder& getFctBuilder(); + /// Get the optimization hints for the generated program. + const OptimizationHints& getOptimizationHints() const; + private: static IR::FunctionBuilder createFctBuilder(IR::IRBuilder& program, std::string fct_name); @@ -73,6 +91,8 @@ struct CompilationContext { const std::string fct_name; /// The backing IR program. IR::ProgramArc program; + /// Optimization hints that can be used during code generation. + OptimizationHints optimization_hints; /// The function builder for the generated code. std::optional builder; /// Which sub-operators were computed already? Needed to prevent double-computation in DAGs. 
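To make the effect of these hints concrete, here is an illustrative sketch (not generated code, and not part of this patch) of the two probe shapes. The function names, the flat std::vector inputs and the use of AtomicHashTable<SimpleKeyComparator> are assumptions made for this example; only compute_hash, slot_prefetch and lookup(key, hash) come from the runtime interface added in the previous commit. Operator-fusing codegen hashes and probes one tuple at a time and emits no prefetch, while the vectorized backend runs the three primitives as separate passes over a chunk:

   #include "runtime/NewHashTables.h"
   #include <cstdint>
   #include <vector>
   using namespace inkfuse;

   // Operator-fusing mode: one pass per tuple, no prefetch instructions emitted.
   void probe_fused(const AtomicHashTable<SimpleKeyComparator>& ht,
                    const std::vector<const char*>& keys, std::vector<char*>& out) {
      for (size_t i = 0; i < keys.size(); ++i) {
         const uint64_t hash = ht.compute_hash(keys[i]);
         out[i] = ht.lookup(keys[i], hash);
      }
   }

   // Vectorized mode: the same work as three primitive passes over a small chunk, so
   // all prefetches of the chunk are in flight before the first lookup happens.
   void probe_vectorized(const AtomicHashTable<SimpleKeyComparator>& ht,
                         const std::vector<const char*>& keys,
                         std::vector<uint64_t>& hashes, std::vector<char*>& out) {
      for (size_t i = 0; i < keys.size(); ++i) {
         hashes[i] = ht.compute_hash(keys[i]);
      }
      for (size_t i = 0; i < keys.size(); ++i) {
         ht.slot_prefetch(hashes[i]);
      }
      for (size_t i = 0; i < keys.size(); ++i) {
         out[i] = ht.lookup(keys[i], hashes[i]);
      }
   }

This is why the prefetch suboperator is marked below as only generating code for the vectorized backend.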
diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.cpp b/src/algebra/suboperators/RuntimeFunctionSubop.cpp index 453a2fc..db0cb57 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.cpp +++ b/src/algebra/suboperators/RuntimeFunctionSubop.cpp @@ -99,14 +99,12 @@ void RuntimeFunctionSubop::consumeAllChildren(CompilationContext& context) { std::unordered_set provided; - // Declare the output IUs that are not pseudo IUs. + // Declare the output IUs. for (const IU* out_iu : provided_ius) { - if (out_iu->type->id() != "Void") { - provided.emplace(out_iu); - auto iu_name = context.buildIUIdentifier(*out_iu); - const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); - context.declareIU(*out_iu, declare); - } + provided.emplace(out_iu); + auto iu_name = context.buildIUIdentifier(*out_iu); + const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); + context.declareIU(*out_iu, declare); } // Assemble the input expressions. diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.h b/src/algebra/suboperators/RuntimeFunctionSubop.h index 794c422..6f9ddcd 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.h +++ b/src/algebra/suboperators/RuntimeFunctionSubop.h @@ -66,16 +66,21 @@ struct RuntimeFunctionSubop : public TemplatedSuboperator args{&hash_}; - return std::unique_ptr( - new RuntimeFunctionSubop( - source, - state_init_, - std::move(fct_name), - std::move(in_ius), - std::move(out_ius_), - std::move(args), - std::move(ref), - /* out = */ nullptr)); + std::unique_ptr result_subop{new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + /* out = */ nullptr)}; + // Prefetch instructions should never be generated in the operator-fusing code. + // When performing operator-fusing code generation, we are going through + // the code tuple-at-a time. As a result, the followup superator (e.g. HT lookup) + // will directly cause the cache miss anyways. + result_subop->optimization_properties.ct_only_vectorized = true; + return result_subop; } /// Build a hash table lookup function. diff --git a/src/algebra/suboperators/Suboperator.h b/src/algebra/suboperators/Suboperator.h index ce25e64..f74623d 100644 --- a/src/algebra/suboperators/Suboperator.h +++ b/src/algebra/suboperators/Suboperator.h @@ -116,6 +116,21 @@ struct Suboperator { const std::vector& getSourceIUs() const { return source_ius; } const std::vector& getIUs() const { return provided_ius; } + /// Properties that can influence runtime and code generation behaviour of suboperators. + /// These allow improving the performance of the system. + struct OptimizationProperties { + /// Compile-time property. When set, the suboperator does not generate code + /// for when compiled for operator-fusing code generation. + bool ct_only_vectorized = false; + /// Runtime property. If a chunk size preference is set, the vectorized backend + /// will try to break a morsel into smaller chunk that respect the chunk size + /// preference. This allows us to e.g. perform hash table lookups in a highly + /// optimized way. We can go to smaller chunk sizes that will have better cache + /// locality. This matters as we split prefetching and lookups into two phases. 
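+   /// A preference of roughly 256 tuples is the intended sweet spot: large enough to
+   /// overlap many misses, small enough that a chunk's prefetched slots are still in
+   /// L1/L2 when the lookup pass reaches them.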
+ std::optional rt_chunk_size_prefeference = std::nullopt; + }; + const OptimizationProperties& getOptimizationProperties() const { return optimization_properties; }; + protected: /// The operator which decayed into this Suboperator. const RelAlgOp* source; @@ -127,6 +142,9 @@ struct Suboperator { /// interpreting an operator to make sure that the input columns are extracted in /// the right order. std::vector source_ius; + + /// Optimization properties that can be used to improve suboperator performance. + OptimizationProperties optimization_properties; }; /// Empty state which can be used in the templated suboperators. diff --git a/src/interpreter/FragmentGenerator.cpp b/src/interpreter/FragmentGenerator.cpp index 579a29b..3c0fef4 100644 --- a/src/interpreter/FragmentGenerator.cpp +++ b/src/interpreter/FragmentGenerator.cpp @@ -6,10 +6,10 @@ #include "interpreter/CountingSinkFragmentizer.h" #include "interpreter/ExpressionFragmentizer.h" #include "interpreter/HashTableSourceFragmentizer.h" +#include "interpreter/KeyPackingFragmentizer.h" #include "interpreter/RuntimeExpressionFragmentizer.h" #include "interpreter/RuntimeFunctionSubopFragmentizer.h" #include "interpreter/RuntimeKeyExpressionFragmentizer.h" -#include "interpreter/KeyPackingFragmentizer.h" #include "interpreter/TScanFragmentizer.h" namespace inkfuse { @@ -53,7 +53,7 @@ TypeDecorator& TypeDecorator::attachFloatingPoints() { TypeDecorator& TypeDecorator::attachNumeric() { attachIntegers(); attachFloatingPoints(); - // We also count dates as numeric types. This is because a date internally + // We also count dates as numeric types. This is because a date internally // is represented as a 4 byte signed integer (day offset to the epoch). types.push_back(IR::Date::build()); return *this; @@ -66,8 +66,7 @@ TypeDecorator& TypeDecorator::attachTypes() { return *this; } -TypeDecorator& TypeDecorator::attachStringType() -{ +TypeDecorator& TypeDecorator::attachStringType() { types.push_back(IR::String::build()); return *this; } @@ -104,6 +103,10 @@ IR::ProgramArc FragmentGenerator::build() { // Create the IR program. auto program = std::make_shared("fragments", false); + // Custom optimization hints that indicate that we generate vectorized code. + OptimizationHints hints{ + .mode = OptimizationHints::CodegenMode::Vectorized, + }; // And generate the code for all fragments. for (auto& fragmentizer : fragmentizers) { for (const auto& [name, pipe] : fragmentizer->getFragments()) { @@ -111,7 +114,7 @@ IR::ProgramArc FragmentGenerator::build() { // the right fuse-chunk input and output operators which are needed in the actual fragment. // This in turn means that sub-operators don't have to create fuse chunk sources and sinks themselves. auto repiped = pipe.repipeAll(0, pipe.getSubops().size()); - CompilationContext context(program, name, *repiped); + CompilationContext context(program, name, *repiped, hints); context.compile(); } } From 98cac9e37b624d3bdae7731c326b79b8ba36ff8e Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 29 Oct 2023 11:37:22 +0100 Subject: [PATCH 3/4] Vectorize Join Hash Table Build When building a hash table during runtime we can apply the same tricks we know from how to make vectorized hash tables fast. We split the building into batches of 256 tuples. This allows for higher insert throughput on large hash tables. 
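Sketch of the pattern (simplified: a flat array of fixed-size tuples instead of the TupleMaterializer chunks the real code walks, an invented helper name, the simple-key table chosen for concreteness, insert's template arguments and error handling omitted):

    #include "runtime/NewHashTables.h"
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Batched build: hash -> prefetch -> insert, 256 tuples at a time.
    void buildBatched(inkfuse::AtomicHashTable<inkfuse::SimpleKeyComparator>& ht,
                      const char* tuples, size_t num_tuples, size_t slot_size) {
       const size_t batch_size = 256;
       std::vector<uint64_t> hashes(batch_size);
       for (size_t begin = 0; begin < num_tuples; begin += batch_size) {
          const size_t n = std::min(batch_size, num_tuples - begin);
          const char* batch_start = tuples + begin * slot_size;
          // Pass 1: hash every key in the batch.
          for (size_t i = 0; i < n; ++i) {
             hashes[i] = ht.compute_hash(batch_start + i * slot_size);
          }
          // Pass 2: issue independent prefetches for all target slots.
          for (size_t i = 0; i < n; ++i) {
             ht.slot_prefetch(hashes[i]);
          }
          // Pass 3: insert into slots that should now be cache resident.
          for (size_t i = 0; i < n; ++i) {
             ht.insert(batch_start + i * slot_size, hashes[i]);
          }
       }
    }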
--- src/algebra/Join.cpp | 20 +++++++++++++++++--- src/runtime/NewHashTables.cpp | 16 ++++++++++++++-- src/runtime/NewHashTables.h | 3 +++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/algebra/Join.cpp b/src/algebra/Join.cpp index 43de72c..a895ed6 100644 --- a/src/algebra/Join.cpp +++ b/src/algebra/Join.cpp @@ -41,16 +41,30 @@ void materializedTupleToHashTable( assert(ht_state.hash_table); assert(!mat.handles.empty()); assert(mat.handles.size() == mat.materializers.size()); + const size_t batch_size = 256; + std::vector hashes(batch_size); for (auto& read_handle : mat.handles) { // Pick morsels from the read handle. while (const TupleMaterializer::MatChunk* chunk = read_handle->pullChunk()) { // Materialize all tuples from the chunk. + // We traverse the materialized tuple in batches of 256 similar as a vectorized + // engine would. For large hash tables this increases throughput significantly. const char* curr_tuple = reinterpret_cast(chunk->data.get()); while (curr_tuple < chunk->end_ptr) { - // Copy over the whole tuple into the hash table. - ht_state.hash_table->insert(curr_tuple); + size_t curr_batch_size = std::min(batch_size, (chunk->end_ptr - curr_tuple) / slot_size); + const char* curr_tuple_hash_it = curr_tuple; + for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) { + hashes[batch_idx] = ht_state.hash_table->compute_hash(curr_tuple_hash_it); + curr_tuple_hash_it += slot_size; + } + for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) { + ht_state.hash_table->slot_prefetch(hashes[batch_idx]); + } + for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) { + ht_state.hash_table->insert(curr_tuple, hashes[batch_idx]); + curr_tuple += slot_size; + } // Move to the next tuple. - curr_tuple += slot_size; } } } diff --git a/src/runtime/NewHashTables.cpp b/src/runtime/NewHashTables.cpp index 43882f7..995e4ef 100644 --- a/src/runtime/NewHashTables.cpp +++ b/src/runtime/NewHashTables.cpp @@ -174,9 +174,8 @@ char* AtomicHashTable::lookupDisable(const char* key) { template template -char* AtomicHashTable::insert(const char* key) { +char* AtomicHashTable::insert(const char* key, uint64_t hash) { // Look up the initial slot in the linear probing chain . 
- const uint64_t hash = comp.hash(key); const auto idx = hash & mod_mask; IteratorState it{ .idx = idx, @@ -203,6 +202,13 @@ char* AtomicHashTable::insert(const char* key) { } } +template +template +char* AtomicHashTable::insert(const char* key) { + const uint64_t hash = comp.hash(key); + return insert(key, hash); +} + template typename AtomicHashTable::IteratorState AtomicHashTable::itStart() const { IteratorState it; @@ -257,10 +263,16 @@ template class AtomicHashTable; template char* AtomicHashTable::insert(const char* key); template char* AtomicHashTable::insert(const char* key); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); + template class AtomicHashTable; template char* AtomicHashTable::insert(const char* key); template char* AtomicHashTable::insert(const char* key); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); + template class ExclusiveHashTable; template class ExclusiveHashTable; diff --git a/src/runtime/NewHashTables.h b/src/runtime/NewHashTables.h index 6fb2e70..446e6c1 100644 --- a/src/runtime/NewHashTables.h +++ b/src/runtime/NewHashTables.h @@ -65,6 +65,9 @@ struct AtomicHashTable { /// the payload as well. template char* insert(const char* key); + /// Insert variation when we already computed the hash. + template + char* insert(const char* key, uint64_t hash); private: /// An iterator within the atomic hash table. From 08b3b677916e09166e58a132d570e239715a55e9 Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 29 Oct 2023 13:17:43 +0100 Subject: [PATCH 4/4] Fix CI The CI suddenly started breaking. Make it more robust by: - Fixing Ubuntu 22.04 (this alone is not enough) - Fixing libc++ as the C++ standard library - Work around https://github.com/llvm/llvm-project/issues/59432 The second seemed to be the actual failure. It seems like we were calling into the include headers of libstdc++ from a system GCC installation and that was causing build issues. This then caused ASAN failures coming from Ubuntu packaging issues which are fixed by running the tests with disabled ASAN alloc/dealloc mismatch warnings. 
--- .github/workflows/build_test.yml | 7 +++++-- CMakeLists.txt | 2 +- thirdparty/PerfEvent.hpp | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 0da00fb..ab67487 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -8,7 +8,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 # Run builds and tests both in Debug and RelWithDebInfo strategy: matrix: @@ -21,5 +21,8 @@ jobs: build-type: ${{ matrix.build-type }} - name: Test working-directory: ${{github.workspace}}/build - run: ./tester + # Unfortunately we're running into https://github.com/llvm/llvm-project/issues/59432 + # This is some Ubuntu packaging issue that causes alloc/dealloc mismatches when asan + # is enabled with libc++ + run: ASAN_OPTIONS=alloc_dealloc_mismatch=0 ./tester diff --git a/CMakeLists.txt b/CMakeLists.txt index 912a6bd..0477fd2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -stdlib=libc++") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -rdynamic -g -O0 -fsanitize=address") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") diff --git a/thirdparty/PerfEvent.hpp b/thirdparty/PerfEvent.hpp index 9f8cde3..502db9e 100644 --- a/thirdparty/PerfEvent.hpp +++ b/thirdparty/PerfEvent.hpp @@ -34,6 +34,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include #include #include +#include #include #include