Vectorize Join Hash Table Build

When building a join hash table at runtime, we can apply the same tricks that make vectorized hash tables fast. We split the build into batches of 256 tuples: for each batch we first compute all hashes, then prefetch the corresponding slots, and only then perform the inserts. This allows for higher insert throughput on large hash tables.
wagjamin · Oct 29, 2023 · 98cac9e · 98cac9e
1 parent ce337a6
commit 98cac9e
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 5 deletions.
diff --git a/src/algebra/Join.cpp b/src/algebra/Join.cpp
@@ -41,16 +41,30 @@ void materializedTupleToHashTable(
    assert(ht_state.hash_table);
    assert(!mat.handles.empty());
    assert(mat.handles.size() == mat.materializers.size());
+   const size_t batch_size = 256;
+   std::vector<uint64_t> hashes(batch_size);
    for (auto& read_handle : mat.handles) {
       // Pick morsels from the read handle.
       while (const TupleMaterializer::MatChunk* chunk = read_handle->pullChunk()) {
          // Materialize all tuples from the chunk.
+         // We traverse the materialized tuples in batches of 256, similar to what a vectorized
+         // engine would do. For large hash tables this increases throughput significantly.
          const char* curr_tuple = reinterpret_cast<const char*>(chunk->data.get());
          while (curr_tuple < chunk->end_ptr) {
-            // Copy over the whole tuple into the hash table.
-            ht_state.hash_table->insert<false>(curr_tuple);
+            size_t curr_batch_size = std::min(batch_size, (chunk->end_ptr - curr_tuple) / slot_size);
+            const char* curr_tuple_hash_it = curr_tuple;
+            for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) {
+               hashes[batch_idx] = ht_state.hash_table->compute_hash(curr_tuple_hash_it);
+               curr_tuple_hash_it += slot_size;
+            }
+            for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) {
+               ht_state.hash_table->slot_prefetch(hashes[batch_idx]);
+            }
+            for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) {
+               ht_state.hash_table->insert<false>(curr_tuple, hashes[batch_idx]);
+               curr_tuple += slot_size;
+            }
             // Move to the next tuple.
-            curr_tuple += slot_size;
          }
       }
    }

diff --git a/src/runtime/NewHashTables.cpp b/src/runtime/NewHashTables.cpp
@@ -174,9 +174,8 @@ char* AtomicHashTable<Comparator>::lookupDisable(const char* key) {
 
 template <class Comparator>
 template <bool copy_only_key>
-char* AtomicHashTable<Comparator>::insert(const char* key) {
+char* AtomicHashTable<Comparator>::insert(const char* key, uint64_t hash) {
    // Look up the initial slot in the linear probing chain .
-   const uint64_t hash = comp.hash(key);
    const auto idx = hash & mod_mask;
    IteratorState it{
       .idx = idx,
@@ -203,6 +202,13 @@ char* AtomicHashTable<Comparator>::insert(const char* key) {
    }
 }
 
+/// Convenience overload of insert: hashes `key` through the comparator and then
+/// delegates to the overload that takes a precomputed hash.
+template <class Comparator>
+template <bool copy_only_key>
+char* AtomicHashTable<Comparator>::insert(const char* key) {
+   const uint64_t hash = comp.hash(key);
+   // Forward copy_only_key explicitly. It cannot be deduced from the arguments, so a
+   // plain insert(key, hash) would fall back to the declaration's default template
+   // argument and insert<false>(key) would end up running insert<true>(key, hash).
+   return insert<copy_only_key>(key, hash);
+}
+
 template <class Comparator>
 typename AtomicHashTable<Comparator>::IteratorState AtomicHashTable<Comparator>::itStart() const {
    IteratorState it;
@@ -257,10 +263,16 @@ template class AtomicHashTable<SimpleKeyComparator>;
 template char* AtomicHashTable<SimpleKeyComparator>::insert<true>(const char* key);
 template char* AtomicHashTable<SimpleKeyComparator>::insert<false>(const char* key);
 
+template char* AtomicHashTable<SimpleKeyComparator>::insert<true>(const char* key, uint64_t hash);
+template char* AtomicHashTable<SimpleKeyComparator>::insert<false>(const char* key, uint64_t hash);
+
 template class AtomicHashTable<ComplexKeyComparator>;
 template char* AtomicHashTable<ComplexKeyComparator>::insert<true>(const char* key);
 template char* AtomicHashTable<ComplexKeyComparator>::insert<false>(const char* key);
 
+template char* AtomicHashTable<ComplexKeyComparator>::insert<true>(const char* key, uint64_t hash);
+template char* AtomicHashTable<ComplexKeyComparator>::insert<false>(const char* key, uint64_t hash);
+
 template class ExclusiveHashTable<SimpleKeyComparator>;
 template class ExclusiveHashTable<ComplexKeyComparator>;
 

diff --git a/src/runtime/NewHashTables.h b/src/runtime/NewHashTables.h
@@ -65,6 +65,9 @@ struct AtomicHashTable {
    ///                       the payload as well.
    template <bool copy_only_key = true>
    char* insert(const char* key);
+   /// Insert variant taking a precomputed `hash` for `key`; it is used instead of calling comp.hash(key).
+   template <bool copy_only_key = true>
+   char* insert(const char* key, uint64_t hash);
 
    private:
    /// An iterator within the atomic hash table.