From 000174be47f98d0c263adaba08707f5b6fdabc18 Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 22 Oct 2023 14:10:08 +0200 Subject: [PATCH 1/4] Add Prefetching to Hash Join This commit adds prefetching to our hash joins. Rather than doing a single lookup call, we now split the logic into three calls: 1. A hash call -> Performs the key hash on the hash table 2. A prefetch call -> Prefetches the respective slots from the hash table 2. The actual lookup call -> Does the key lookup, on the slot that was previously prefetched This will be interesting for our vectorized backend, as we can issue many independent memory loads in short succession. In the next commits we will: 1. Allow disabling the prefetch calls for JIT compiled code. They are rather pointless as the lookup right after will load the data into cache anyways. 2. Perform dynamic chunking in the vectorized code. At the end of the pipeline when we move into hash table operations, we will dynamically reduce the chunk size to ~256 in order to make sure the prefetching only fills the L1/L2 caches. If the prefetching range becomes too large, then we start evicting parts of the hash table from cache again. After this, we should have a significantly faster vectorized execution backend. --- bench/ht_benchmark.cpp | 3 +- bench/vectorized_ht.cpp | 72 +++++++++++++++++- src/algebra/Join.cpp | 16 +++- src/algebra/Join.h | 5 ++ .../suboperators/RuntimeFunctionSubop.cpp | 35 ++------- .../suboperators/RuntimeFunctionSubop.h | 76 ++++++++++++++++++- .../RuntimeFunctionSubopFragmentizer.cpp | 34 ++++++++- src/runtime/HashTableRuntime.cpp | 68 +++++++++++++++++ src/runtime/HashTableRuntime.h | 12 ++- src/runtime/NewHashTables.cpp | 40 ++++++++-- src/runtime/NewHashTables.h | 12 +++ test/runtime/test_atomic_hash_table.cpp | 9 ++- .../test_atomic_hash_table_complex_key.cpp | 8 +- 13 files changed, 334 insertions(+), 56 deletions(-) diff --git a/bench/ht_benchmark.cpp b/bench/ht_benchmark.cpp index c831ac1..6227565 100644 --- a/bench/ht_benchmark.cpp +++ b/bench/ht_benchmark.cpp @@ -229,8 +229,7 @@ BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{ BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{1'000, 100'000, 10'000'000, 50'000'000}}); BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{1'000, 100'000, 10'000'000, 50'000'000}}); BENCHMARK(ht_lookup_unordered_map_nomatch>)->ArgsProduct({{1'000, 100'000, 10'000'000, 50'000'000}}); -*/ - +*/ } } diff --git a/bench/vectorized_ht.cpp b/bench/vectorized_ht.cpp index dc28c7a..a2823f8 100644 --- a/bench/vectorized_ht.cpp +++ b/bench/vectorized_ht.cpp @@ -1,4 +1,5 @@ #include "benchmark/benchmark.h" +#include "runtime/NewHashTables.h" #include "xxhash.h" #include #include @@ -6,6 +7,8 @@ #include #include +using namespace inkfuse; + /** * Microbenchmarks inspired by Peter's feedback: In vectorized engines, * parallel hash table access can be made more efficient than in a tuple-at-a time @@ -48,7 +51,6 @@ * BM_ht_perf_vectorized/524288/256 10098416 ns 10093265 ns 72 items_per_second=51.9443M/s * BM_ht_perf_vectorized/33554432/256 971872286 ns 971838853 ns 1 items_per_second=34.5267M/s * BM_ht_perf_vectorized/1073741824/256 51425526675 ns 51422464322 ns 1 items_per_second=20.8808M/s - * BM_ht_perf_vectorized/33554432/256 933936147 ns 933873161 ns 1 items_per_second=35.9304M/s * */ namespace { @@ -188,10 +190,72 @@ void BM_ht_perf_vectorized(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * num_elems); } +void BM_ht_perf_tat_inkfuse(benchmark::State& state) 
{ + const uint64_t num_elems = state.range(0); + inkfuse::SimpleKeyComparator comp{8}; + AtomicHashTable ht{comp, 16, 2 * num_elems}; + for (uint64_t k = 1; k <= num_elems; ++k) { + const uint64_t key = 7 * k; + char* value = ht.insert(reinterpret_cast(&key)); + reinterpret_cast(value)[1] = k; + } + for (auto _ : state) { + for (uint64_t k = 1; k <= num_elems; ++k) { + const uint64_t key = 7 * k; + char* res = ht.lookup(reinterpret_cast(&key)); + if (reinterpret_cast(res)[1] > num_elems) { + throw std::runtime_error("bad ht lookup for " + std::to_string(k)); + } + } + } + state.SetItemsProcessed(state.iterations() * num_elems); +} + +void BM_ht_perf_vectorized_inkfuse(benchmark::State& state) { + const uint64_t num_elems = state.range(0); + const uint64_t batch_size = state.range(1); + inkfuse::SimpleKeyComparator comp{8}; + AtomicHashTable ht{comp, 16, 2 * num_elems}; + for (uint64_t k = 1; k <= num_elems; ++k) { + const uint64_t key = 7 * k; + char* value = ht.insert(reinterpret_cast(&key)); + reinterpret_cast(value)[1] = k; + } + std::vector keys(batch_size); + std::vector hashes(batch_size); + for (auto _ : state) { + // Lookup every key again. + for (uint64_t k = 1; k <= num_elems; k += batch_size) { + const auto curr_batch = std::min(batch_size, num_elems - k + 1); + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + keys[tid] = 7 * (k + tid); + } + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + hashes[tid] = ht.compute_hash(reinterpret_cast(&keys[tid])); + } + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + ht.slot_prefetch(hashes[tid]); + } + for (uint64_t tid = 0; tid < curr_batch; ++tid) { + const auto* res = ht.lookup(reinterpret_cast(&keys[tid]), hashes[tid]); + if (reinterpret_cast(res)[1] > num_elems) { + throw std::runtime_error("bad ht lookup for " + std::to_string(k)); + } + } + } + } + state.SetItemsProcessed(state.iterations() * num_elems); +} + BENCHMARK(BM_ht_perf_tat)->Arg(1 << 9)->Arg(1 << 13)->Arg(1 << 15)->Arg(1 << 19)->Arg(1 << 25)->Arg(1 << 30); -// Different hash table sizes. +BENCHMARK(BM_ht_perf_tat_inkfuse)->Arg(1 << 9)->Arg(1 << 13)->Arg(1 << 15)->Arg(1 << 19)->Arg(1 << 25)->Arg(1 << 30); + BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 9, 256)->ArgPair(1 << 13, 256)->ArgPair(1 << 15, 256)->ArgPair(1 << 19, 256)->ArgPair(1 << 25, 256)->ArgPair(1 << 30, 256); -// Different internal batch sizes. -BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1<<25, 16192); +// Different internal batch sizes. 256 is a good value. +BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1 << 25, 16192); + +BENCHMARK(BM_ht_perf_vectorized_inkfuse)->ArgPair(1 << 9, 256)->ArgPair(1 << 13, 256)->ArgPair(1 << 15, 256)->ArgPair(1 << 19, 256)->ArgPair(1 << 25, 256)->ArgPair(1 << 30, 256); +// Different internal batch sizes. 256 is a good value. 
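+// Rough intuition for the batch-size sweep below: the batch must be large enough to
+// keep many independent loads in flight, but small enough that the prefetched slots
+// are still resident in L1/L2 once the lookup pass touches them. Much larger batches
+// start evicting earlier prefetches before they are used.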
+BENCHMARK(BM_ht_perf_vectorized_inkfuse)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1 << 25, 16192); } // namespacf diff --git a/src/algebra/Join.cpp b/src/algebra/Join.cpp index c5e3609..43de72c 100644 --- a/src/algebra/Join.cpp +++ b/src/algebra/Join.cpp @@ -131,6 +131,11 @@ void Join::plan() { lookup_left.emplace(IR::Pointer::build(IR::Char::build())); lookup_right.emplace(IR::Pointer::build(IR::Char::build())); filter_pseudo_iu.emplace(IR::Void::build()); + + // The probe hash is always a unit64_t. + hash_right.emplace(IR::UnsignedInt::build(8)); + // Pseudo IU for making sure we prefetch before we probe. + prefetch_pseudo.emplace(IR::Void::build()); } void Join::decay(inkfuse::PipelineDAG& dag) const { @@ -245,12 +250,19 @@ void Join::decayPkJoin(inkfuse::PipelineDAG& dag) const { pseudo.push_back(&pseudo_iu); } + // 2.2.1 Compute the hash. + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htHash>(this, *hash_right, *scratch_pad_right, std::move(pseudo), &ht_state)); + + // 2.2.2 Prefetch the slot. + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htPrefetch>(this, &*prefetch_pseudo, *hash_right, &ht_state)); + + // 2.2.3 Perfom the lookup. if (type == JoinType::LeftSemi) { // Lookup on a slot disables the slot, giving semi-join behaviour. - probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupDisable(this, *lookup_right, *scratch_pad_right, std::move(pseudo), &ht_state)); + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, true>(this, *lookup_right, *scratch_pad_right, *hash_right, &*prefetch_pseudo, &ht_state)); } else { // Regular lookup that does not disable slots. - probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookup>(this, *lookup_right, *scratch_pad_right, std::move(pseudo), &ht_state)); + probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, false>(this, *lookup_right, *scratch_pad_right, *hash_right, &*prefetch_pseudo, &ht_state)); } // 2.3 Filter on probe matches. diff --git a/src/algebra/Join.h b/src/algebra/Join.h index 332ec31..547646d 100644 --- a/src/algebra/Join.h +++ b/src/algebra/Join.h @@ -62,6 +62,11 @@ struct Join : public RelAlgOp { /// Packed scratch pad IU right. std::optional scratch_pad_right; + /// Computed hash on the probe side. + std::optional hash_right; + /// Prefetch pseudo IU - ensures that we prefetch before probing. + std::optional prefetch_pseudo; + /// Lookup result left. std::optional lookup_left; /// Lookup result right. diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.cpp b/src/algebra/suboperators/RuntimeFunctionSubop.cpp index 0ba7167..453a2fc 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.cpp +++ b/src/algebra/suboperators/RuntimeFunctionSubop.cpp @@ -53,29 +53,6 @@ std::unique_ptr RuntimeFunctionSubop::htInsert(const inkfu pointers_)); } -std::unique_ptr RuntimeFunctionSubop::htLookupDisable(const RelAlgOp* source, const IU& pointers_, const IU& keys_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_) { - std::string fct_name = "ht_at_sk_lookup_disable"; - std::vector in_ius{&keys_}; - for (auto pseudo : pseudo_ius_) { - // Pseudo IUs are used as input IUs in the backing graph, but do not influence arguments. 
- in_ius.push_back(pseudo); - } - std::vector ref{keys_.type->id() != "ByteArray" && keys_.type->id() != "Ptr_Char"}; - std::vector out_ius_{&pointers_}; - std::vector args{&keys_}; - const IU* out = &pointers_; - return std::unique_ptr( - new RuntimeFunctionSubop( - source, - state_init_, - std::move(fct_name), - std::move(in_ius), - std::move(out_ius_), - std::move(args), - std::move(ref), - out)); -} - std::unique_ptr RuntimeFunctionSubop::htNoKeyLookup(const RelAlgOp* source, const IU& pointers_, const IU& input_dependency, DefferredStateInitializer* state_init_) { std::string fct_name = "ht_nk_lookup"; std::vector in_ius{&input_dependency}; @@ -122,12 +99,14 @@ void RuntimeFunctionSubop::consumeAllChildren(CompilationContext& context) { std::unordered_set provided; - // Declare the output IUs. + // Declare the output IUs that are not pseudo IUs. for (const IU* out_iu : provided_ius) { - provided.emplace(out_iu); - auto iu_name = context.buildIUIdentifier(*out_iu); - const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); - context.declareIU(*out_iu, declare); + if (out_iu->type->id() != "Void") { + provided.emplace(out_iu); + auto iu_name = context.buildIUIdentifier(*out_iu); + const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); + context.declareIU(*out_iu, declare); + } } // Assemble the input expressions. diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.h b/src/algebra/suboperators/RuntimeFunctionSubop.h index 4b6a4d8..794c422 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.h +++ b/src/algebra/suboperators/RuntimeFunctionSubop.h @@ -30,8 +30,80 @@ struct RuntimeFunctionSubop : public TemplatedSuboperator htInsert(const RelAlgOp* source, const IU* pointers_, const IU& key_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr); - /// Build a hash table lookup function that disables every found slot. - static std::unique_ptr htLookupDisable(const RelAlgOp* source, const IU& pointers_, const IU& key_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr); + /// Hash a key with the hash table's hash function. + template + static std::unique_ptr htHash(const RelAlgOp* source, const IU& hash_, const IU& key_, std::vector pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr) { + std::string fct_name = "ht_" + HashTable::ID + "_compute_hash"; + std::vector in_ius{&key_}; + for (auto pseudo : pseudo_ius_) { + // Pseudo IUs are used as input IUs in the backing graph, but do not influence arguments. + in_ius.push_back(pseudo); + } + std::vector ref{key_.type->id() != "ByteArray" && key_.type->id() != "Ptr_Char"}; + std::vector out_ius_{&hash_}; + std::vector args{&key_}; + const IU* out = &hash_; + return std::unique_ptr( + new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + out)); + } + + /// Hash a key with the hash table's hash function. 
+ template + static std::unique_ptr htPrefetch(const RelAlgOp* source, const IU* prefetch_pseudo, const IU& hash_, DefferredStateInitializer* state_init_ = nullptr) { + std::string fct_name = "ht_" + HashTable::ID + "_slot_prefetch"; + std::vector in_ius{&hash_}; + std::vector ref{false}; + std::vector out_ius_{}; + if (prefetch_pseudo) { + out_ius_.push_back(prefetch_pseudo); + } + std::vector args{&hash_}; + return std::unique_ptr( + new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + /* out = */ nullptr)); + } + + /// Build a hash table lookup function. + template + static std::unique_ptr htLookupWithHash(const RelAlgOp* source, const IU& pointers_, const IU& key_, const IU& hash_, const IU* prefetch_pseudo_, DefferredStateInitializer* state_init_ = nullptr) { + std::string fct_name = "ht_" + HashTable::ID + "_lookup_with_hash"; + if constexpr (disable_slot) { + fct_name += "_disable"; + } + std::vector in_ius{&key_, &hash_}; + if (prefetch_pseudo_) { + in_ius.push_back(prefetch_pseudo_); + } + std::vector ref{key_.type->id() != "ByteArray" && key_.type->id() != "Ptr_Char", false}; + std::vector out_ius_{&pointers_}; + std::vector args{&key_, &hash_}; + const IU* out = &pointers_; + return std::unique_ptr( + new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + out)); + } /// Build a hash table lookup function. template diff --git a/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp b/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp index 3fb62de..c55c78c 100644 --- a/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp +++ b/src/interpreter/RuntimeFunctionSubopFragmentizer.cpp @@ -38,13 +38,31 @@ RuntimeFunctionSubopFragmentizer::RuntimeFunctionSubopFragmentizer() { name = op.id(); } - // Fragmentize hash table lookup that disables the slot (for left semi joins). + // Fragmentize Vectorized Hash Table Primitives { + // Hash: auto& [name, pipe] = pipes.emplace_back(); const auto& key = generated_ius.emplace_back(in_type); - const auto& result_ptr = generated_ius.emplace_back(IR::Pointer::build(IR::Char::build())); - // No pseudo-IU inputs, these only matter for more complex DAGs. 
- const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htLookupDisable(nullptr, result_ptr, key, {})); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htHash>(nullptr, hash, key, {})); + name = op.id(); + } + { + // Lookup don't disable slot: + auto& [name, pipe] = pipes.emplace_back(); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& key = generated_ius.emplace_back(in_type); + const auto& result = generated_ius.emplace_back(IR::Pointer::build(IR::Char::build())); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, false>(nullptr, result, key, hash, nullptr)); + name = op.id(); + } + { + // Lookup disable slot: + auto& [name, pipe] = pipes.emplace_back(); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& key = generated_ius.emplace_back(in_type); + const auto& result = generated_ius.emplace_back(IR::Pointer::build(IR::Char::build())); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash, true>(nullptr, result, key, hash, nullptr)); name = op.id(); } @@ -88,6 +106,14 @@ RuntimeFunctionSubopFragmentizer::RuntimeFunctionSubopFragmentizer() { } } + // Fragmentize Prefetch. + { + auto& [name, pipe] = pipes.emplace_back(); + const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8)); + const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htPrefetch>(nullptr, nullptr, hash)); + name = op.id(); + } + // Fragmentize tuple materialization. { auto& [name, pipe] = pipes.emplace_back(); diff --git a/src/runtime/HashTableRuntime.cpp b/src/runtime/HashTableRuntime.cpp index 21c1f96..149197f 100644 --- a/src/runtime/HashTableRuntime.cpp +++ b/src/runtime/HashTableRuntime.cpp @@ -58,6 +58,38 @@ extern "C" void HashTableRuntime::ht_dl_it_advance(void* table, char** it_data, } // Atomic hash table. 
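+// The _sk_ entry points below operate on the simple-key comparator variant of the
+// atomic hash table, the _ck_ entry points on the complex-key variant. Each variant
+// now exposes the probe phases separately, i.e. compute_hash, slot_prefetch and
+// lookup_with_hash (plus the _disable flavour used for left semi joins), mirroring
+// the hash -> prefetch -> lookup split of the join suboperators.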
+extern "C" uint64_t HashTableRuntime::ht_at_sk_compute_hash(void* table, char* key) { + return reinterpret_cast*>(table)->compute_hash(key); +} + +extern "C" void HashTableRuntime::ht_at_sk_slot_prefetch(void* table, uint64_t hash) { + reinterpret_cast*>(table)->slot_prefetch(hash); +} + +extern "C" char* HashTableRuntime::ht_at_sk_lookup_with_hash(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookup(key, hash); +} + +extern "C" char* HashTableRuntime::ht_at_sk_lookup_with_hash_disable(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookupDisable(key, hash); +} + +extern "C" uint64_t HashTableRuntime::ht_at_ck_compute_hash(void* table, char* key) { + return reinterpret_cast*>(table)->compute_hash(key); +} + +extern "C" void HashTableRuntime::ht_at_ck_slot_prefetch(void* table, uint64_t hash) { + reinterpret_cast*>(table)->slot_prefetch(hash); +} + +extern "C" char* HashTableRuntime::ht_at_ck_lookup_with_hash(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookup(key, hash); +} + +extern "C" char* HashTableRuntime::ht_at_ck_lookup_with_hash_disable(void* table, char* key, uint64_t hash) { + return reinterpret_cast*>(table)->lookupDisable(key, hash); +} + extern "C" char* HashTableRuntime::ht_at_sk_lookup(void* table, char* key) { return reinterpret_cast*>(table)->lookup(key); } @@ -139,6 +171,42 @@ void HashTableRuntime::registerRuntime() { RuntimeFunctionBuilder("ht_at_ck_lookup", IR::Pointer::build(IR::Char::build())) .addArg("table", IR::Pointer::build(IR::Void::build())) .addArg("key", IR::Pointer::build(IR::Char::build()), true); + + RuntimeFunctionBuilder("ht_at_sk_compute_hash", IR::UnsignedInt::build(8)) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build()), true); + + RuntimeFunctionBuilder("ht_at_sk_slot_prefetch", IR::Void::build()) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_sk_lookup_with_hash", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_sk_lookup_with_hash_disable", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_ck_compute_hash", IR::UnsignedInt::build(8)) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build()), true); + + RuntimeFunctionBuilder("ht_at_ck_slot_prefetch", IR::Void::build()) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_ck_lookup_with_hash", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); + + RuntimeFunctionBuilder("ht_at_ck_lookup_with_hash_disable", IR::Pointer::build(IR::Char::build())) + .addArg("table", IR::Pointer::build(IR::Void::build())) + .addArg("key", IR::Pointer::build(IR::Char::build())) + .addArg("hash", IR::UnsignedInt::build(8), true); } } diff --git a/src/runtime/HashTableRuntime.h b/src/runtime/HashTableRuntime.h 
index 8fa418d..b9c041f 100644 --- a/src/runtime/HashTableRuntime.h +++ b/src/runtime/HashTableRuntime.h @@ -29,13 +29,21 @@ extern "C" char* ht_at_sk_lookup(void* table, char* key); extern "C" char* ht_at_sk_lookup_disable(void* table, char* key); extern "C" char* ht_at_ck_lookup(void* table, char* key); +extern "C" uint64_t ht_at_sk_compute_hash(void* table, char* key); +extern "C" void ht_at_sk_slot_prefetch(void* table, uint64_t hash); +extern "C" char* ht_at_sk_lookup_with_hash(void* table, char* key, uint64_t hash); +extern "C" char* ht_at_sk_lookup_with_hash_disable(void* table, char* key, uint64_t hash); + +extern "C" uint64_t ht_at_ck_compute_hash(void* table, char* key); +extern "C" void ht_at_ck_slot_prefetch(void* table, uint64_t hash); +extern "C" char* ht_at_ck_lookup_with_hash(void* table, char* key, uint64_t hash); +extern "C" char* ht_at_ck_lookup_with_hash_disable(void* table, char* key, uint64_t hash); + /// Special lookup function if we know we have a 0-byte key. extern "C" char* ht_nk_lookup(void* table); void registerRuntime(); }; - - } #endif //INKFUSE_HASHTABLERUNTIME_H diff --git a/src/runtime/NewHashTables.cpp b/src/runtime/NewHashTables.cpp index ad3b3cd..43882f7 100644 --- a/src/runtime/NewHashTables.cpp +++ b/src/runtime/NewHashTables.cpp @@ -83,14 +83,27 @@ AtomicHashTable::AtomicHashTable(Comparator comp_, uint16_t total_sl } template -char* AtomicHashTable::lookup(const char* key) const { +uint64_t AtomicHashTable::compute_hash(const char* key) const { + return comp.hash(key); +} + +template +void AtomicHashTable::slot_prefetch(uint64_t hash) const { + const uint64_t slot_id = hash & mod_mask; + // Prefetch the actual data array. + __builtin_prefetch(&data[slot_id * total_slot_size]); + // Prefetch the bitmask slot. + __builtin_prefetch(&tags[slot_id]); +} + +template +char* AtomicHashTable::lookup(const char* key, uint64_t hash) const { + const uint64_t slot_id = hash & mod_mask; // Look up the initial slot in the linear probing chain. - const uint64_t hash = comp.hash(key); - const auto idx = hash & mod_mask; IteratorState it{ - .idx = idx, - .data_ptr = &data[idx * total_slot_size], - .tag_ptr = &tags[idx], + .idx = slot_id, + .data_ptr = &data[slot_id * total_slot_size], + .tag_ptr = &tags[slot_id], }; // The tag we are looking for. const uint8_t target_tag = tag_fill_mask | static_cast(hash >> 56ul); @@ -111,9 +124,14 @@ char* AtomicHashTable::lookup(const char* key) const { } template -char* AtomicHashTable::lookupDisable(const char* key) { - // Look up the initial slot in the linear probing chain. +char* AtomicHashTable::lookup(const char* key) const { const uint64_t hash = comp.hash(key); + return lookup(key, hash); +} + +template +char* AtomicHashTable::lookupDisable(const char* key, uint64_t hash) { + // Look up the initial slot in the linear probing chain. 
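+   // As in lookup() above: the low hash bits (masked with mod_mask) select the starting
+   // slot of the probe chain, while the top eight bits (hash >> 56) become the tag that
+   // is checked against the tag array, so non-matching slots can usually be rejected
+   // without comparing full keys.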
const auto idx = hash & mod_mask; IteratorState it{ .idx = idx, @@ -148,6 +166,12 @@ char* AtomicHashTable::lookupDisable(const char* key) { return it.data_ptr; } +template +char* AtomicHashTable::lookupDisable(const char* key) { + const uint64_t hash = comp.hash(key); + return lookupDisable(key, hash); +} + template template char* AtomicHashTable::insert(const char* key) { diff --git a/src/runtime/NewHashTables.h b/src/runtime/NewHashTables.h index de265cd..6fb2e70 100644 --- a/src/runtime/NewHashTables.h +++ b/src/runtime/NewHashTables.h @@ -43,6 +43,18 @@ struct AtomicHashTable { AtomicHashTable(Comparator comp_, uint16_t total_slot_size_, size_t num_slots_); + /// Compute the hash for a given key. + uint64_t compute_hash(const char* key) const; + /// Prefetch the tag and data slots for a specific hash. + void slot_prefetch(uint64_t hash) const; + /// Get the pointer to a given key, or nullptr if the group does not exist. + /// Already requires the hash was computed. + char* lookup(const char* key, uint64_t hash) const; + /// Get the pointer to a given key, or nullptr if the group does not exist. + /// If it finds a slot, disables it. Needed for e.g. left semi joins. + /// Already requires the hash was computed. + char* lookupDisable(const char* key, uint64_t hash); + /// Get the pointer to a given key, or nullptr if the group does not exist. char* lookup(const char* key) const; /// Get the pointer to a given key, or nullptr if the group does not exist. diff --git a/test/runtime/test_atomic_hash_table.cpp b/test/runtime/test_atomic_hash_table.cpp index 9fbcf3a..3dd7b30 100644 --- a/test/runtime/test_atomic_hash_table.cpp +++ b/test/runtime/test_atomic_hash_table.cpp @@ -62,7 +62,9 @@ struct AtomicHashTableTestT : public ::testing::TestWithParam { void checkContains(const RandomDataResult& data, size_t idx) { const char* key_ptr = &data.keys[idx * std::get<0>(GetParam())]; const char* payload_ptr = &data.payloads[idx * 16]; - auto slot_lookup = ht.lookup(key_ptr); + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot_lookup = ht.lookup(key_ptr, hash); ASSERT_NE(slot_lookup, nullptr); // Check that key was serialized properly. EXPECT_EQ(std::memcmp(slot_lookup, key_ptr, std::get<0>(GetParam())), 0); @@ -71,7 +73,10 @@ struct AtomicHashTableTestT : public ::testing::TestWithParam { } void checkNotContains(const RandomDataResult& data, size_t idx) { - auto slot = ht.lookup(&data.keys[idx * std::get<0>(GetParam())]); + const char* key_ptr = &data.keys[idx * std::get<0>(GetParam())]; + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot = ht.lookup(key_ptr, hash); EXPECT_EQ(slot, nullptr); } diff --git a/test/runtime/test_atomic_hash_table_complex_key.cpp b/test/runtime/test_atomic_hash_table_complex_key.cpp index 4f68792..2cb2fb8 100644 --- a/test/runtime/test_atomic_hash_table_complex_key.cpp +++ b/test/runtime/test_atomic_hash_table_complex_key.cpp @@ -63,7 +63,9 @@ struct AtomicComplexHashTableTestT : public ::testing::TestWithParam { void checkContains(const std::vector& data, size_t idx) { const char* raw_string = data[idx].data(); const char* key_ptr = reinterpret_cast(&raw_string); - auto slot_lookup = ht.lookup(key_ptr); + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot_lookup = ht.lookup(key_ptr, hash); ASSERT_NE(slot_lookup, nullptr); // Check that key was serialized properly. 
EXPECT_EQ(std::strcmp(*reinterpret_cast(slot_lookup), *reinterpret_cast(key_ptr)), 0); @@ -75,7 +77,9 @@ struct AtomicComplexHashTableTestT : public ::testing::TestWithParam { if (std::find(data_exists.begin(), data_exists.end(), str) == data_exists.end()) { const char* raw_string = str.data(); const char* key_ptr = reinterpret_cast(&raw_string); - auto slot = ht.lookup(key_ptr); + const auto hash = ht.compute_hash(key_ptr); + ht.slot_prefetch(hash); + const auto slot = ht.lookup(key_ptr, hash); EXPECT_EQ(slot, nullptr); } } From ce337a6f6db53a63bd51ccabd07e3a4eae8f5e95 Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 22 Oct 2023 14:58:43 +0200 Subject: [PATCH 2/4] Allow Supberators to Only Generate Code for the Vectorized Backend This commit is the next one in the chain to generate a faster vectorized backend. We can now make both a `Suboperator` and the `CompilationContext` with additional optimziation hints. This allows us to mark suboperator that generate prefetching code in a way that does not generate code for operator-fusing codegen. The prefetching calls are now only emitted for functions in the vectorized backend, but do not generate code for compiled execution. In general, the prefetching for operator fusing code is not important, as we will do a lookup on the same tuple right after, which will then cause the respective cache miss. As a result, prefetching only generates more instructions and function calls. --- src/algebra/CompilationContext.cpp | 25 ++++++++++++---- src/algebra/CompilationContext.h | 30 +++++++++++++++---- .../suboperators/RuntimeFunctionSubop.cpp | 12 ++++---- .../suboperators/RuntimeFunctionSubop.h | 25 +++++++++------- src/algebra/suboperators/Suboperator.h | 18 +++++++++++ src/interpreter/FragmentGenerator.cpp | 13 ++++---- 6 files changed, 90 insertions(+), 33 deletions(-) diff --git a/src/algebra/CompilationContext.cpp b/src/algebra/CompilationContext.cpp index df842db..d86cd65 100644 --- a/src/algebra/CompilationContext.cpp +++ b/src/algebra/CompilationContext.cpp @@ -6,12 +6,12 @@ namespace inkfuse { -CompilationContext::CompilationContext(std::string program_name, const Pipeline& pipeline_) - : pipeline(pipeline_), program(std::make_shared(std::move(program_name), false)), fct_name("execute") { +CompilationContext::CompilationContext(std::string program_name, const Pipeline& pipeline_, OptimizationHints hints_) + : pipeline(pipeline_), program(std::make_shared(std::move(program_name), false)), fct_name("execute"), optimization_hints(hints_) { } -CompilationContext::CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_) - : pipeline(pipeline_), program(std::move(program_)), fct_name(std::move(fct_name_)) { +CompilationContext::CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_, OptimizationHints hints_) + : pipeline(pipeline_), program(std::move(program_)), fct_name(std::move(fct_name_)), optimization_hints(hints_) { } void CompilationContext::compile() { @@ -57,8 +57,17 @@ void CompilationContext::notifyIUsReady(Suboperator& op) { // Consume in the original requestor. requestor->consume(*iu, *this); if (++properties[requestor].serviced_requests == requestor->getNumSourceIUs()) { - // Consume in the original requestor notifying it that all children were produced successfuly. 
- requestor->consumeAllChildren(*this); + const bool generates_fusing = optimization_hints.mode == OptimizationHints::CodegenMode::OperatorFusing; + const bool only_generate_when_vectorized = requestor->getOptimizationProperties().ct_only_vectorized; + if (generates_fusing && only_generate_when_vectorized) { + // We don't need to generate any code for this suboperator. + // Directly mark the output IUs as ready (those are all pseudo IUs). + notifyIUsReady(*requestor); + } else { + // Consume in the original requestor notifying it that all children were produced successfuly. + // Actually let the consumer generate the required code. + requestor->consumeAllChildren(*this); + } } } @@ -139,6 +148,10 @@ IR::FunctionBuilder& CompilationContext::getFctBuilder() { return builder->fct_builder; } +const OptimizationHints& CompilationContext::getOptimizationHints() const { + return optimization_hints; +} + CompilationContext::Builder::Builder(IR::Program& program, std::string fct_name) : ir_builder(program.getIRBuilder()), fct_builder(createFctBuilder(ir_builder, std::move(fct_name))) { } diff --git a/src/algebra/CompilationContext.h b/src/algebra/CompilationContext.h index dd33b2c..7d46efb 100644 --- a/src/algebra/CompilationContext.h +++ b/src/algebra/CompilationContext.h @@ -7,22 +7,37 @@ #include "exec/FuseChunk.h" #include -#include -#include -#include #include +#include +#include #include +#include namespace inkfuse { struct Suboperator; +/// Hints that can be used during the code generation to generate more optimized +/// code. Examples: +/// +/// When we are generating `OperatorFusing` code, we do not issue prefetch instructions. +/// These are exclusively used in the vectorized backends to issue independent loads +/// and hide cache miss latency for followup operators. +struct OptimizationHints { + enum class CodegenMode { + OperatorFusing, + Vectorized, + }; + + CodegenMode mode = CodegenMode::OperatorFusing; +}; + /// Context for compiling a single pipeline. struct CompilationContext { /// Set up a compilation context for generating code for a full given pipeline. - CompilationContext(std::string program_name, const Pipeline& pipeline_); + CompilationContext(std::string program_name, const Pipeline& pipeline_, OptimizationHints hints_ = OptimizationHints{}); /// Set up a compilation context which will generate the code within a specific IR program for the full pipeline. - CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_); + CompilationContext(IR::ProgramArc program_, std::string fct_name_, const Pipeline& pipeline_, OptimizationHints hints_ = OptimizationHints{}); /// Compile the code for this context. void compile(); @@ -50,6 +65,9 @@ struct CompilationContext { const IR::Program& getProgram(); IR::FunctionBuilder& getFctBuilder(); + /// Get the optimization hints for the generated program. + const OptimizationHints& getOptimizationHints() const; + private: static IR::FunctionBuilder createFctBuilder(IR::IRBuilder& program, std::string fct_name); @@ -73,6 +91,8 @@ struct CompilationContext { const std::string fct_name; /// The backing IR program. IR::ProgramArc program; + /// Optimization hints that can be used during code generation. + OptimizationHints optimization_hints; /// The function builder for the generated code. std::optional builder; /// Which sub-operators were computed already? Needed to prevent double-computation in DAGs. 
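To make the effect of these hints concrete, here is an illustrative sketch (not generated code, and not part of this patch) of the two probe shapes. The function names, the flat std::vector inputs and the use of AtomicHashTable<SimpleKeyComparator> are assumptions made for this example; only compute_hash, slot_prefetch and lookup(key, hash) come from the runtime interface added in the previous commit. Operator-fusing codegen hashes and probes one tuple at a time and emits no prefetch, while the vectorized backend runs the three primitives as separate passes over a chunk:

   #include "runtime/NewHashTables.h"
   #include <cstdint>
   #include <vector>
   using namespace inkfuse;

   // Operator-fusing mode: one pass per tuple, no prefetch instructions emitted.
   void probe_fused(const AtomicHashTable<SimpleKeyComparator>& ht,
                    const std::vector<const char*>& keys, std::vector<char*>& out) {
      for (size_t i = 0; i < keys.size(); ++i) {
         const uint64_t hash = ht.compute_hash(keys[i]);
         out[i] = ht.lookup(keys[i], hash);
      }
   }

   // Vectorized mode: the same work as three primitive passes over a small chunk, so
   // all prefetches of the chunk are in flight before the first lookup happens.
   void probe_vectorized(const AtomicHashTable<SimpleKeyComparator>& ht,
                         const std::vector<const char*>& keys,
                         std::vector<uint64_t>& hashes, std::vector<char*>& out) {
      for (size_t i = 0; i < keys.size(); ++i) {
         hashes[i] = ht.compute_hash(keys[i]);
      }
      for (size_t i = 0; i < keys.size(); ++i) {
         ht.slot_prefetch(hashes[i]);
      }
      for (size_t i = 0; i < keys.size(); ++i) {
         out[i] = ht.lookup(keys[i], hashes[i]);
      }
   }

This is why the prefetch suboperator is marked below as only generating code for the vectorized backend.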
diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.cpp b/src/algebra/suboperators/RuntimeFunctionSubop.cpp index 453a2fc..db0cb57 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.cpp +++ b/src/algebra/suboperators/RuntimeFunctionSubop.cpp @@ -99,14 +99,12 @@ void RuntimeFunctionSubop::consumeAllChildren(CompilationContext& context) { std::unordered_set provided; - // Declare the output IUs that are not pseudo IUs. + // Declare the output IUs. for (const IU* out_iu : provided_ius) { - if (out_iu->type->id() != "Void") { - provided.emplace(out_iu); - auto iu_name = context.buildIUIdentifier(*out_iu); - const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); - context.declareIU(*out_iu, declare); - } + provided.emplace(out_iu); + auto iu_name = context.buildIUIdentifier(*out_iu); + const auto& declare = builder.appendStmt(IR::DeclareStmt::build(std::move(iu_name), out_iu->type)); + context.declareIU(*out_iu, declare); } // Assemble the input expressions. diff --git a/src/algebra/suboperators/RuntimeFunctionSubop.h b/src/algebra/suboperators/RuntimeFunctionSubop.h index 794c422..6f9ddcd 100644 --- a/src/algebra/suboperators/RuntimeFunctionSubop.h +++ b/src/algebra/suboperators/RuntimeFunctionSubop.h @@ -66,16 +66,21 @@ struct RuntimeFunctionSubop : public TemplatedSuboperator args{&hash_}; - return std::unique_ptr( - new RuntimeFunctionSubop( - source, - state_init_, - std::move(fct_name), - std::move(in_ius), - std::move(out_ius_), - std::move(args), - std::move(ref), - /* out = */ nullptr)); + std::unique_ptr result_subop{new RuntimeFunctionSubop( + source, + state_init_, + std::move(fct_name), + std::move(in_ius), + std::move(out_ius_), + std::move(args), + std::move(ref), + /* out = */ nullptr)}; + // Prefetch instructions should never be generated in the operator-fusing code. + // When performing operator-fusing code generation, we are going through + // the code tuple-at-a time. As a result, the followup superator (e.g. HT lookup) + // will directly cause the cache miss anyways. + result_subop->optimization_properties.ct_only_vectorized = true; + return result_subop; } /// Build a hash table lookup function. diff --git a/src/algebra/suboperators/Suboperator.h b/src/algebra/suboperators/Suboperator.h index ce25e64..f74623d 100644 --- a/src/algebra/suboperators/Suboperator.h +++ b/src/algebra/suboperators/Suboperator.h @@ -116,6 +116,21 @@ struct Suboperator { const std::vector& getSourceIUs() const { return source_ius; } const std::vector& getIUs() const { return provided_ius; } + /// Properties that can influence runtime and code generation behaviour of suboperators. + /// These allow improving the performance of the system. + struct OptimizationProperties { + /// Compile-time property. When set, the suboperator does not generate code + /// for when compiled for operator-fusing code generation. + bool ct_only_vectorized = false; + /// Runtime property. If a chunk size preference is set, the vectorized backend + /// will try to break a morsel into smaller chunk that respect the chunk size + /// preference. This allows us to e.g. perform hash table lookups in a highly + /// optimized way. We can go to smaller chunk sizes that will have better cache + /// locality. This matters as we split prefetching and lookups into two phases. 
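+   /// A preference of roughly 256 tuples is the intended sweet spot: large enough to
+   /// overlap many misses, small enough that a chunk's prefetched slots are still in
+   /// L1/L2 when the lookup pass reaches them.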
+ std::optional rt_chunk_size_prefeference = std::nullopt; + }; + const OptimizationProperties& getOptimizationProperties() const { return optimization_properties; }; + protected: /// The operator which decayed into this Suboperator. const RelAlgOp* source; @@ -127,6 +142,9 @@ struct Suboperator { /// interpreting an operator to make sure that the input columns are extracted in /// the right order. std::vector source_ius; + + /// Optimization properties that can be used to improve suboperator performance. + OptimizationProperties optimization_properties; }; /// Empty state which can be used in the templated suboperators. diff --git a/src/interpreter/FragmentGenerator.cpp b/src/interpreter/FragmentGenerator.cpp index 579a29b..3c0fef4 100644 --- a/src/interpreter/FragmentGenerator.cpp +++ b/src/interpreter/FragmentGenerator.cpp @@ -6,10 +6,10 @@ #include "interpreter/CountingSinkFragmentizer.h" #include "interpreter/ExpressionFragmentizer.h" #include "interpreter/HashTableSourceFragmentizer.h" +#include "interpreter/KeyPackingFragmentizer.h" #include "interpreter/RuntimeExpressionFragmentizer.h" #include "interpreter/RuntimeFunctionSubopFragmentizer.h" #include "interpreter/RuntimeKeyExpressionFragmentizer.h" -#include "interpreter/KeyPackingFragmentizer.h" #include "interpreter/TScanFragmentizer.h" namespace inkfuse { @@ -53,7 +53,7 @@ TypeDecorator& TypeDecorator::attachFloatingPoints() { TypeDecorator& TypeDecorator::attachNumeric() { attachIntegers(); attachFloatingPoints(); - // We also count dates as numeric types. This is because a date internally + // We also count dates as numeric types. This is because a date internally // is represented as a 4 byte signed integer (day offset to the epoch). types.push_back(IR::Date::build()); return *this; @@ -66,8 +66,7 @@ TypeDecorator& TypeDecorator::attachTypes() { return *this; } -TypeDecorator& TypeDecorator::attachStringType() -{ +TypeDecorator& TypeDecorator::attachStringType() { types.push_back(IR::String::build()); return *this; } @@ -104,6 +103,10 @@ IR::ProgramArc FragmentGenerator::build() { // Create the IR program. auto program = std::make_shared("fragments", false); + // Custom optimization hints that indicate that we generate vectorized code. + OptimizationHints hints{ + .mode = OptimizationHints::CodegenMode::Vectorized, + }; // And generate the code for all fragments. for (auto& fragmentizer : fragmentizers) { for (const auto& [name, pipe] : fragmentizer->getFragments()) { @@ -111,7 +114,7 @@ IR::ProgramArc FragmentGenerator::build() { // the right fuse-chunk input and output operators which are needed in the actual fragment. // This in turn means that sub-operators don't have to create fuse chunk sources and sinks themselves. auto repiped = pipe.repipeAll(0, pipe.getSubops().size()); - CompilationContext context(program, name, *repiped); + CompilationContext context(program, name, *repiped, hints); context.compile(); } } From 98cac9e37b624d3bdae7731c326b79b8ba36ff8e Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 29 Oct 2023 11:37:22 +0100 Subject: [PATCH 3/4] Vectorize Join Hash Table Build When building a hash table during runtime we can apply the same tricks we know from how to make vectorized hash tables fast. We split the building into batches of 256 tuples. This allows for higher insert throughput on large hash tables. 
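Sketch of the pattern (simplified: a flat array of fixed-size tuples instead of the TupleMaterializer chunks the real code walks, an invented helper name, the simple-key table chosen for concreteness, insert's template arguments and error handling omitted):

    #include "runtime/NewHashTables.h"
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Batched build: hash -> prefetch -> insert, 256 tuples at a time.
    void buildBatched(inkfuse::AtomicHashTable<inkfuse::SimpleKeyComparator>& ht,
                      const char* tuples, size_t num_tuples, size_t slot_size) {
       const size_t batch_size = 256;
       std::vector<uint64_t> hashes(batch_size);
       for (size_t begin = 0; begin < num_tuples; begin += batch_size) {
          const size_t n = std::min(batch_size, num_tuples - begin);
          const char* batch_start = tuples + begin * slot_size;
          // Pass 1: hash every key in the batch.
          for (size_t i = 0; i < n; ++i) {
             hashes[i] = ht.compute_hash(batch_start + i * slot_size);
          }
          // Pass 2: issue independent prefetches for all target slots.
          for (size_t i = 0; i < n; ++i) {
             ht.slot_prefetch(hashes[i]);
          }
          // Pass 3: insert into slots that should now be cache resident.
          for (size_t i = 0; i < n; ++i) {
             ht.insert(batch_start + i * slot_size, hashes[i]);
          }
       }
    }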
--- src/algebra/Join.cpp | 20 +++++++++++++++++--- src/runtime/NewHashTables.cpp | 16 ++++++++++++++-- src/runtime/NewHashTables.h | 3 +++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/algebra/Join.cpp b/src/algebra/Join.cpp index 43de72c..a895ed6 100644 --- a/src/algebra/Join.cpp +++ b/src/algebra/Join.cpp @@ -41,16 +41,30 @@ void materializedTupleToHashTable( assert(ht_state.hash_table); assert(!mat.handles.empty()); assert(mat.handles.size() == mat.materializers.size()); + const size_t batch_size = 256; + std::vector hashes(batch_size); for (auto& read_handle : mat.handles) { // Pick morsels from the read handle. while (const TupleMaterializer::MatChunk* chunk = read_handle->pullChunk()) { // Materialize all tuples from the chunk. + // We traverse the materialized tuple in batches of 256 similar as a vectorized + // engine would. For large hash tables this increases throughput significantly. const char* curr_tuple = reinterpret_cast(chunk->data.get()); while (curr_tuple < chunk->end_ptr) { - // Copy over the whole tuple into the hash table. - ht_state.hash_table->insert(curr_tuple); + size_t curr_batch_size = std::min(batch_size, (chunk->end_ptr - curr_tuple) / slot_size); + const char* curr_tuple_hash_it = curr_tuple; + for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) { + hashes[batch_idx] = ht_state.hash_table->compute_hash(curr_tuple_hash_it); + curr_tuple_hash_it += slot_size; + } + for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) { + ht_state.hash_table->slot_prefetch(hashes[batch_idx]); + } + for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) { + ht_state.hash_table->insert(curr_tuple, hashes[batch_idx]); + curr_tuple += slot_size; + } // Move to the next tuple. - curr_tuple += slot_size; } } } diff --git a/src/runtime/NewHashTables.cpp b/src/runtime/NewHashTables.cpp index 43882f7..995e4ef 100644 --- a/src/runtime/NewHashTables.cpp +++ b/src/runtime/NewHashTables.cpp @@ -174,9 +174,8 @@ char* AtomicHashTable::lookupDisable(const char* key) { template template -char* AtomicHashTable::insert(const char* key) { +char* AtomicHashTable::insert(const char* key, uint64_t hash) { // Look up the initial slot in the linear probing chain . 
- const uint64_t hash = comp.hash(key); const auto idx = hash & mod_mask; IteratorState it{ .idx = idx, @@ -203,6 +202,13 @@ char* AtomicHashTable::insert(const char* key) { } } +template +template +char* AtomicHashTable::insert(const char* key) { + const uint64_t hash = comp.hash(key); + return insert(key, hash); +} + template typename AtomicHashTable::IteratorState AtomicHashTable::itStart() const { IteratorState it; @@ -257,10 +263,16 @@ template class AtomicHashTable; template char* AtomicHashTable::insert(const char* key); template char* AtomicHashTable::insert(const char* key); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); + template class AtomicHashTable; template char* AtomicHashTable::insert(const char* key); template char* AtomicHashTable::insert(const char* key); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); +template char* AtomicHashTable::insert(const char* key, uint64_t hash); + template class ExclusiveHashTable; template class ExclusiveHashTable; diff --git a/src/runtime/NewHashTables.h b/src/runtime/NewHashTables.h index 6fb2e70..446e6c1 100644 --- a/src/runtime/NewHashTables.h +++ b/src/runtime/NewHashTables.h @@ -65,6 +65,9 @@ struct AtomicHashTable { /// the payload as well. template char* insert(const char* key); + /// Insert variation when we already computed the hash. + template + char* insert(const char* key, uint64_t hash); private: /// An iterator within the atomic hash table. From 08b3b677916e09166e58a132d570e239715a55e9 Mon Sep 17 00:00:00 2001 From: Benjamin Wagner Date: Sun, 29 Oct 2023 13:17:43 +0100 Subject: [PATCH 4/4] Fix CI The CI suddenly started breaking. Make it more robust by: - Fixing Ubuntu 22.04 (this alone is not enough) - Fixing libc++ as the C++ standard library - Work around https://github.com/llvm/llvm-project/issues/59432 The second seemed to be the actual failure. It seems like we were calling into the include headers of libstdc++ from a system GCC installation and that was causing build issues. This then caused ASAN failures coming from Ubuntu packaging issues which are fixed by running the tests with disabled ASAN alloc/dealloc mismatch warnings. 
--- .github/workflows/build_test.yml | 7 +++++-- CMakeLists.txt | 2 +- thirdparty/PerfEvent.hpp | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 0da00fb..ab67487 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -8,7 +8,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 # Run builds and tests both in Debug and RelWithDebInfo strategy: matrix: @@ -21,5 +21,8 @@ jobs: build-type: ${{ matrix.build-type }} - name: Test working-directory: ${{github.workspace}}/build - run: ./tester + # Unfortunately we're running into https://github.com/llvm/llvm-project/issues/59432 + # This is some Ubuntu packaging issue that causes alloc/dealloc mismatches when asan + # is enabled with libc++ + run: ASAN_OPTIONS=alloc_dealloc_mismatch=0 ./tester diff --git a/CMakeLists.txt b/CMakeLists.txt index 912a6bd..0477fd2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -stdlib=libc++") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -rdynamic -g -O0 -fsanitize=address") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") diff --git a/thirdparty/PerfEvent.hpp b/thirdparty/PerfEvent.hpp index 9f8cde3..502db9e 100644 --- a/thirdparty/PerfEvent.hpp +++ b/thirdparty/PerfEvent.hpp @@ -34,6 +34,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include #include #include +#include #include #include