Improve Prefetching Primitives
At the moment our ROF backend doesn't perform better than the regular
JIT backend. We need to get to a point where the vectorized hash table
lookups actually help our overall performance. This commit makes a first
step in this direction:

Improve hash/prefetch primitives:
   Until now we had one primitive that would hash, and then a separate
   primitive that would prefetch based on the hash. The ROF paper's
   prefetching strategy is different: it fuses hashing and prefetching
   into a single primitive. This overlaps memory loads with computation,
   leading to better CPU utilization. We now do the same and have a
   fused hash/prefetch primitive.
wagjamin committed Nov 3, 2023
1 parent 111f77a commit cb9a38e
Showing 21 changed files with 124 additions and 109 deletions.
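
To make the change concrete, here is a minimal sketch of the old split primitives versus the new fused one. It is illustrative only: the Table struct, hash_key, and probe_batch names are assumptions for this example, not the actual inkfuse API.

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative stand-in for a hash table; not inkfuse's AtomicHashTable API.
struct Table {
   std::vector<uint64_t> slots; // capacity is a power of two
   uint64_t mod_mask = 0;       // capacity - 1, so (hash & mod_mask) selects a slot

   static uint64_t hash_key(uint64_t key) { return key * 0x9E3779B97F4A7C15ull; }

   // Old scheme: two primitives, run as two separate passes over a batch.
   uint64_t compute_hash(uint64_t key) const { return hash_key(key); }
   void slot_prefetch(uint64_t hash) const { __builtin_prefetch(&slots[hash & mod_mask]); }

   // New scheme: one fused primitive. The prefetch for key i is issued right
   // after its hash, so the memory load overlaps with hashing the next keys.
   uint64_t compute_hash_and_prefetch(uint64_t key) const {
      const uint64_t hash = hash_key(key);
      __builtin_prefetch(&slots[hash & mod_mask]);
      return hash;
   }
};

// ROF-style vectorized probe: one pass that hashes and prefetches, then a
// second pass that performs the lookups on (hopefully) cache-resident slots.
inline void probe_batch(const Table& ht, const uint64_t* keys, uint64_t* hashes, size_t n) {
   for (size_t i = 0; i < n; ++i) {
      hashes[i] = ht.compute_hash_and_prefetch(keys[i]);
   }
   for (size_t i = 0; i < n; ++i) {
      // ht.lookup(keys[i], hashes[i]) would run here; by the time a slot is
      // touched, its cache line has usually already arrived.
   }
}

The diffs below apply this restructuring to the real code: AtomicHashTable::compute_hash_and_prefetch in src/runtime/NewHashTables.cpp additionally prefetches the tag array, and the separate htPrefetch suboperator and its fragmentized primitive are removed.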
5 changes: 2 additions & 3 deletions CMakeLists.txt
@@ -12,10 +12,9 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -stdlib=libc++")
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -rdynamic -g -O0 -fsanitize=address")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic -stdlib=libc++ -gdwarf-4")
# Generate DWARF 4 in debug to work on older GDB versions
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -rdynamic -g -gdwarf-4 -O0 -fsanitize=address")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0 -fsanitize=address")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")

# ---------------------------------------------------------------------------
2 changes: 1 addition & 1 deletion bench/benchmarks.cpp
@@ -1,3 +1,3 @@
#include <benchmark/benchmark.h>

BENCHMARK_MAIN();
BENCHMARK_MAIN();
3 changes: 2 additions & 1 deletion bench/compiler_invoke.cpp
@@ -1,5 +1,6 @@
#include "benchmark/benchmark.h"
#include "exec/InterruptableJob.h"
#include <array>
#include <fstream>

/// The benchmarks in this file test the overhead of invoking the
@@ -84,4 +85,4 @@ BENCHMARK(invoke_gcc_direct)->Arg(0)->Arg(1);

}

}
}
54 changes: 49 additions & 5 deletions bench/vectorized_ht.cpp
@@ -107,6 +107,13 @@ struct BenchmarkHashTable {
__builtin_prefetch(&entries[slot_idx]);
};

const uint64_t vec_slot_and_load(const KeyType& key) const {
const auto hash = XXH3_64bits(&key, key_size);
const auto slot = hash % capacity;
__builtin_prefetch(&entries[slot]);
return slot;
};

const Entry* vec_lookup(const KeyType& key, uint64_t slot_idx) const {
const Entry* entry = &entries[slot_idx];
while (entry->key != 0) {
@@ -190,6 +197,42 @@ void BM_ht_perf_vectorized(benchmark::State& state) {
state.SetItemsProcessed(state.iterations() * num_elems);
}

/**
* Vectorized hash table as in the ROF paper. Fused prefetching & hash
* computation to overlap loads and computation nicely.
*/
void BM_ht_perf_vectorized_rof(benchmark::State& state) {
const uint64_t num_elems = state.range(0);
const uint64_t batch_size = state.range(1);
BenchmarkHashTable<uint64_t, uint64_t> ht{static_cast<size_t>(num_elems) * 2, 8};
for (uint64_t k = 1; k <= num_elems; ++k) {
ht.tat_insert(7 * k, k);
}
std::vector<uint64_t> keys(batch_size);
std::vector<uint64_t> slots(batch_size);
for (auto _ : state) {
// Lookup every key again.
for (uint64_t k = 1; k <= num_elems; k += batch_size) {
const auto curr_batch = std::min(batch_size, num_elems - k + 1);
for (uint64_t tid = 0; tid < curr_batch; ++tid) {
keys[tid] = 7 * (k + tid);
}
for (uint64_t tid = 0; tid < curr_batch; ++tid) {
slots[tid] = ht.vec_slot_and_load(keys[tid]);
}
for (uint64_t tid = 0; tid < curr_batch; ++tid) {
const auto* res = ht.vec_lookup(keys[tid], slots[tid]);
// We have to do something with the result, otherwise the compiler is smart enough
// to optimize the memory accesses away.
if (res->value > num_elems) {
throw std::runtime_error("bad ht lookup for " + std::to_string(k));
}
}
}
}
state.SetItemsProcessed(state.iterations() * num_elems);
}

void BM_ht_perf_tat_inkfuse(benchmark::State& state) {
const uint64_t num_elems = state.range(0);
inkfuse::SimpleKeyComparator comp{8};
@@ -231,10 +274,7 @@ void BM_ht_perf_vectorized_inkfuse(benchmark::State& state) {
keys[tid] = 7 * (k + tid);
}
for (uint64_t tid = 0; tid < curr_batch; ++tid) {
hashes[tid] = ht.compute_hash(reinterpret_cast<const char*>(&keys[tid]));
}
for (uint64_t tid = 0; tid < curr_batch; ++tid) {
ht.slot_prefetch(hashes[tid]);
hashes[tid] = ht.compute_hash_and_prefetch(reinterpret_cast<const char*>(&keys[tid]));
}
for (uint64_t tid = 0; tid < curr_batch; ++tid) {
const auto* res = ht.lookup(reinterpret_cast<const char*>(&keys[tid]), hashes[tid]);
@@ -254,8 +294,12 @@ BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 9, 256)->ArgPair(1 << 13, 256)->A
// Different internal batch sizes. 256 is a good value.
BENCHMARK(BM_ht_perf_vectorized)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1 << 25, 16192);

BENCHMARK(BM_ht_perf_vectorized_rof)->ArgPair(1 << 9, 256)->ArgPair(1 << 13, 256)->ArgPair(1 << 15, 256)->ArgPair(1 << 19, 256)->ArgPair(1 << 25, 256)->ArgPair(1 << 30, 256);
// Different internal batch sizes. 256 is a good value.
BENCHMARK(BM_ht_perf_vectorized_rof)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1 << 25, 16192);

BENCHMARK(BM_ht_perf_vectorized_inkfuse)->ArgPair(1 << 9, 256)->ArgPair(1 << 13, 256)->ArgPair(1 << 15, 256)->ArgPair(1 << 19, 256)->ArgPair(1 << 25, 256)->ArgPair(1 << 30, 256);
// Different internal batch sizes. 256 is a good value.
BENCHMARK(BM_ht_perf_vectorized_inkfuse)->ArgPair(1 << 25, 64)->ArgPair(1 << 25, 128)->ArgPair(1 << 25, 256)->ArgPair(1 << 25, 512)->ArgPair(1 << 25, 1024)->ArgPair(1 << 25, 2024)->ArgPair(1 << 25, 4048)->ArgPair(1 << 25, 8096)->ArgPair(1 << 25, 16192);

} // namespacf
} // namespace
18 changes: 6 additions & 12 deletions src/algebra/Join.cpp
@@ -54,12 +54,9 @@ void materializedTupleToHashTable(
size_t curr_batch_size = std::min(batch_size, (chunk->end_ptr - curr_tuple) / slot_size);
const char* curr_tuple_hash_it = curr_tuple;
for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) {
hashes[batch_idx] = ht_state.hash_table->compute_hash(curr_tuple_hash_it);
hashes[batch_idx] = ht_state.hash_table->compute_hash_and_prefetch(curr_tuple_hash_it);
curr_tuple_hash_it += slot_size;
}
for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) {
ht_state.hash_table->slot_prefetch(hashes[batch_idx]);
}
for (size_t batch_idx = 0; batch_idx < curr_batch_size; ++batch_idx) {
ht_state.hash_table->insert<false>(curr_tuple, hashes[batch_idx]);
curr_tuple += slot_size;
@@ -269,19 +266,16 @@ void Join::decayPkJoin(inkfuse::PipelineDAG& dag) const {
pseudo.push_back(&pseudo_iu);
}

// 2.2.1 Compute the hash.
probe_pipe.attachSuboperator(RuntimeFunctionSubop::htHash<AtomicHashTable<SimpleKeyComparator>>(this, *hash_right, *scratch_pad_right, std::move(pseudo), &ht_state));

// 2.2.2 Prefetch the slot.
probe_pipe.attachSuboperator(RuntimeFunctionSubop::htPrefetch<AtomicHashTable<SimpleKeyComparator>>(this, &*prefetch_pseudo, *hash_right, &ht_state));
// 2.2.1 Compute the hash and prefetch the slot.
probe_pipe.attachSuboperator(RuntimeFunctionSubop::htHashAndPrefetch<AtomicHashTable<SimpleKeyComparator>>(this, *hash_right, *scratch_pad_right, std::move(pseudo), &ht_state));

// 2.2.3 Perform the lookup.
// 2.2.2 Perform the lookup.
if (type == JoinType::LeftSemi) {
// Lookup on a slot disables the slot, giving semi-join behaviour.
probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash<AtomicHashTable<SimpleKeyComparator>, true>(this, *lookup_right, *scratch_pad_right, *hash_right, &*prefetch_pseudo, &ht_state));
probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash<AtomicHashTable<SimpleKeyComparator>, true>(this, *lookup_right, *scratch_pad_right, *hash_right, /* prefetch_pseudo = */ nullptr, &ht_state));
} else {
// Regular lookup that does not disable slots.
probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash<AtomicHashTable<SimpleKeyComparator>, false>(this, *lookup_right, *scratch_pad_right, *hash_right, &*prefetch_pseudo, &ht_state));
probe_pipe.attachSuboperator(RuntimeFunctionSubop::htLookupWithHash<AtomicHashTable<SimpleKeyComparator>, false>(this, *lookup_right, *scratch_pad_right, *hash_right, /* prefetch_pseudo = */ nullptr, &ht_state));
}
}

34 changes: 3 additions & 31 deletions src/algebra/suboperators/RuntimeFunctionSubop.h
@@ -30,10 +30,10 @@ struct RuntimeFunctionSubop : public TemplatedSuboperator<RuntimeFunctionSubopSt
/// Build an insert function for a hash table.
static std::unique_ptr<RuntimeFunctionSubop> htInsert(const RelAlgOp* source, const IU* pointers_, const IU& key_, std::vector<const IU*> pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr);

/// Hash a key with the hash table's hash function.
/// Hash a key with the hash table's hash function and prefetch the corresponding slot.
template <class HashTable>
static std::unique_ptr<RuntimeFunctionSubop> htHash(const RelAlgOp* source, const IU& hash_, const IU& key_, std::vector<const IU*> pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr) {
std::string fct_name = "ht_" + HashTable::ID + "_compute_hash";
static std::unique_ptr<RuntimeFunctionSubop> htHashAndPrefetch(const RelAlgOp* source, const IU& hash_, const IU& key_, std::vector<const IU*> pseudo_ius_, DefferredStateInitializer* state_init_ = nullptr) {
std::string fct_name = "ht_" + HashTable::ID + "_compute_hash_and_prefetch";
std::vector<const IU*> in_ius{&key_};
for (auto pseudo : pseudo_ius_) {
// Pseudo IUs are used as input IUs in the backing graph, but do not influence arguments.
@@ -55,34 +55,6 @@ struct RuntimeFunctionSubop : public TemplatedSuboperator<RuntimeFunctionSubopSt
out));
}

/// Hash a key with the hash table's hash function.
template <class HashTable>
static std::unique_ptr<RuntimeFunctionSubop> htPrefetch(const RelAlgOp* source, const IU* prefetch_pseudo, const IU& hash_, DefferredStateInitializer* state_init_ = nullptr) {
std::string fct_name = "ht_" + HashTable::ID + "_slot_prefetch";
std::vector<const IU*> in_ius{&hash_};
std::vector<bool> ref{false};
std::vector<const IU*> out_ius_{};
if (prefetch_pseudo) {
out_ius_.push_back(prefetch_pseudo);
}
std::vector<const IU*> args{&hash_};
std::unique_ptr<RuntimeFunctionSubop> result_subop{new RuntimeFunctionSubop(
source,
state_init_,
std::move(fct_name),
std::move(in_ius),
std::move(out_ius_),
std::move(args),
std::move(ref),
/* out = */ nullptr)};
// Prefetch instructions should never be generated in the operator-fusing code.
// When performing operator-fusing code generation, we are going through
// the code tuple-at-a-time. As a result, the follow-up suboperator (e.g. HT lookup)
// will directly cause the cache miss anyways.
result_subop->optimization_properties.ct_only_vectorized = true;
return result_subop;
}

/// Build a hash table lookup function.
template <class HashTable, bool disable_slot>
static std::unique_ptr<RuntimeFunctionSubop> htLookupWithHash(const RelAlgOp* source, const IU& pointers_, const IU& key_, const IU& hash_, const IU* prefetch_pseudo_, DefferredStateInitializer* state_init_ = nullptr) {
12 changes: 2 additions & 10 deletions src/interpreter/RuntimeFunctionSubopFragmentizer.cpp
@@ -40,11 +40,11 @@ RuntimeFunctionSubopFragmentizer::RuntimeFunctionSubopFragmentizer() {

// Fragmentize Vectorized Hash Table Primitives
{
// Hash:
// Hash and prefetch:
auto& [name, pipe] = pipes.emplace_back();
const auto& key = generated_ius.emplace_back(in_type);
const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8));
const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htHash<AtomicHashTable<SimpleKeyComparator>>(nullptr, hash, key, {}));
const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htHashAndPrefetch<AtomicHashTable<SimpleKeyComparator>>(nullptr, hash, key, {}));
name = op.id();
}
{
@@ -106,14 +106,6 @@ RuntimeFunctionSubopFragmentizer::RuntimeFunctionSubopFragmentizer() {
}
}

// Fragmentize Prefetch.
{
auto& [name, pipe] = pipes.emplace_back();
const auto& hash = generated_ius.emplace_back(IR::UnsignedInt::build(8));
const auto& op = pipe.attachSuboperator(RuntimeFunctionSubop::htPrefetch<AtomicHashTable<SimpleKeyComparator>>(nullptr, nullptr, hash));
name = op.id();
}

// Fragmentize tuple materialization.
{
auto& [name, pipe] = pipes.emplace_back();
12 changes: 6 additions & 6 deletions src/runtime/HashTableRuntime.cpp
@@ -58,8 +58,8 @@ extern "C" void HashTableRuntime::ht_dl_it_advance(void* table, char** it_data,
}

// Atomic hash table.
extern "C" uint64_t HashTableRuntime::ht_at_sk_compute_hash(void* table, char* key) {
return reinterpret_cast<AtomicHashTable<SimpleKeyComparator>*>(table)->compute_hash(key);
extern "C" uint64_t HashTableRuntime::ht_at_sk_compute_hash_and_prefetch(void* table, char* key) {
return reinterpret_cast<AtomicHashTable<SimpleKeyComparator>*>(table)->compute_hash_and_prefetch(key);
}

extern "C" void HashTableRuntime::ht_at_sk_slot_prefetch(void* table, uint64_t hash) {
@@ -74,8 +74,8 @@ extern "C" char* HashTableRuntime::ht_at_sk_lookup_with_hash_disable(void* table
return reinterpret_cast<AtomicHashTable<SimpleKeyComparator>*>(table)->lookupDisable(key, hash);
}

extern "C" uint64_t HashTableRuntime::ht_at_ck_compute_hash(void* table, char* key) {
return reinterpret_cast<AtomicHashTable<ComplexKeyComparator>*>(table)->compute_hash(key);
extern "C" uint64_t HashTableRuntime::ht_at_ck_compute_hash_and_prefetch(void* table, char* key) {
return reinterpret_cast<AtomicHashTable<ComplexKeyComparator>*>(table)->compute_hash_and_prefetch(key);
}

extern "C" void HashTableRuntime::ht_at_ck_slot_prefetch(void* table, uint64_t hash) {
@@ -172,7 +172,7 @@ void HashTableRuntime::registerRuntime() {
.addArg("table", IR::Pointer::build(IR::Void::build()))
.addArg("key", IR::Pointer::build(IR::Char::build()), true);

RuntimeFunctionBuilder("ht_at_sk_compute_hash", IR::UnsignedInt::build(8))
RuntimeFunctionBuilder("ht_at_sk_compute_hash_and_prefetch", IR::UnsignedInt::build(8))
.addArg("table", IR::Pointer::build(IR::Void::build()))
.addArg("key", IR::Pointer::build(IR::Char::build()), true);

@@ -190,7 +190,7 @@ void HashTableRuntime::registerRuntime() {
.addArg("key", IR::Pointer::build(IR::Char::build()))
.addArg("hash", IR::UnsignedInt::build(8), true);

RuntimeFunctionBuilder("ht_at_ck_compute_hash", IR::UnsignedInt::build(8))
RuntimeFunctionBuilder("ht_at_ck_compute_hash_and_prefetch", IR::UnsignedInt::build(8))
.addArg("table", IR::Pointer::build(IR::Void::build()))
.addArg("key", IR::Pointer::build(IR::Char::build()), true);

4 changes: 2 additions & 2 deletions src/runtime/HashTableRuntime.h
@@ -29,12 +29,12 @@ extern "C" char* ht_at_sk_lookup(void* table, char* key);
extern "C" char* ht_at_sk_lookup_disable(void* table, char* key);
extern "C" char* ht_at_ck_lookup(void* table, char* key);

extern "C" uint64_t ht_at_sk_compute_hash(void* table, char* key);
extern "C" uint64_t ht_at_sk_compute_hash_and_prefetch(void* table, char* key);
extern "C" void ht_at_sk_slot_prefetch(void* table, uint64_t hash);
extern "C" char* ht_at_sk_lookup_with_hash(void* table, char* key, uint64_t hash);
extern "C" char* ht_at_sk_lookup_with_hash_disable(void* table, char* key, uint64_t hash);

extern "C" uint64_t ht_at_ck_compute_hash(void* table, char* key);
extern "C" uint64_t ht_at_ck_compute_hash_and_prefetch(void* table, char* key);
extern "C" void ht_at_ck_slot_prefetch(void* table, uint64_t hash);
extern "C" char* ht_at_ck_lookup_with_hash(void* table, char* key, uint64_t hash);
extern "C" char* ht_at_ck_lookup_with_hash_disable(void* table, char* key, uint64_t hash);
10 changes: 8 additions & 2 deletions src/runtime/NewHashTables.cpp
@@ -83,8 +83,14 @@ AtomicHashTable<Comparator>::AtomicHashTable(Comparator comp_, uint16_t total_sl
}

template <class Comparator>
uint64_t AtomicHashTable<Comparator>::compute_hash(const char* key) const {
return comp.hash(key);
uint64_t AtomicHashTable<Comparator>::compute_hash_and_prefetch(const char* key) const {
uint64_t hash = comp.hash(key);
const uint64_t slot_id = hash & mod_mask;
// Prefetch the actual data array.
__builtin_prefetch(&data[slot_id * total_slot_size]);
// Prefetch the bitmask slot.
__builtin_prefetch(&tags[slot_id]);
return hash;
}

template <class Comparator>
4 changes: 2 additions & 2 deletions src/runtime/NewHashTables.h
@@ -43,8 +43,8 @@ struct AtomicHashTable {

AtomicHashTable(Comparator comp_, uint16_t total_slot_size_, size_t num_slots_);

/// Compute the hash for a given key.
uint64_t compute_hash(const char* key) const;
/// Compute the hash for a given key and prefetch the corresponding hash table slot.
uint64_t compute_hash_and_prefetch(const char* key) const;
/// Prefetch the tag and data slots for a specific hash.
void slot_prefetch(uint64_t hash) const;
/// Get the pointer to a given key, or nullptr if the group does not exist.
7 changes: 4 additions & 3 deletions test/operators/test_expression.cpp
@@ -4,6 +4,7 @@
#include "algebra/RelAlgOp.h"
#include "codegen/Value.h"
#include "codegen/backend_c/BackendC.h"
#include "exec/FuseChunk.h"
#include "exec/PipelineExecutor.h"
#include <gtest/gtest.h>

@@ -167,8 +168,8 @@ TEST_P(ExpressionTParametrized, hash) {
auto& ctx = exec.getExecutionContext();
auto& c_in1 = ctx.getColumn(source, 0);

c_in1.size = 1000;
for (uint16_t k = 0; k < 1000; ++k) {
c_in1.size = DEFAULT_CHUNK_SIZE;
for (uint16_t k = 0; k < DEFAULT_CHUNK_SIZE; ++k) {
reinterpret_cast<uint64_t*>(c_in1.raw_data)[k] = k;
}

@@ -180,7 +181,7 @@
std::unordered_set<uint64_t> seen;
// This set should have no hash collisions.
auto& hash_col = ctx.getColumn(hash_iu, 0);
for (uint16_t k = 0; k < 1000; ++k) {
for (uint16_t k = 0; k < DEFAULT_CHUNK_SIZE; ++k) {
auto elem = reinterpret_cast<uint64_t*>(hash_col.raw_data)[k];
EXPECT_EQ(seen.count(elem), 0);
seen.insert(elem);
12 changes: 6 additions & 6 deletions test/operators/test_table_scan.cpp
@@ -4,6 +4,7 @@
#include "algebra/TableScan.h"
#include "algebra/suboperators/sinks/FuseChunkSink.h"
#include "codegen/backend_c/BackendC.h"
#include "exec/FuseChunk.h"
#include "exec/PipelineExecutor.h"
#include <gtest/gtest.h>

@@ -15,9 +16,9 @@ TEST(test_table_scan, scan_1) {
StoredRelation rel;
auto& col_1 = rel.attachPODColumn("col_1", IR::UnsignedInt::build(8));
auto& storage = col_1.getStorage();
storage.resize(8 * 1000);
for (uint64_t k = 0; k < 1000; ++k)
{
// two full fuse chunks in the source table
storage.resize(8 * 2 * DEFAULT_CHUNK_SIZE);
for (uint64_t k = 0; k < 2 * DEFAULT_CHUNK_SIZE; ++k) {
reinterpret_cast<uint64_t*>(storage.data())[k] = k;
}

@@ -40,9 +41,8 @@ TEST(test_table_scan, scan_1) {
EXPECT_NO_THROW(exec.runPipeline());
auto& col = exec.getExecutionContext().getColumn(tscan_iu, 0);

for (uint64_t k = 0; k < 1000; ++k)
{
EXPECT_EQ(reinterpret_cast<uint64_t*>(col.raw_data)[k], k);
for (uint64_t k = 0; k < DEFAULT_CHUNK_SIZE; ++k) {
EXPECT_EQ(reinterpret_cast<uint64_t*>(col.raw_data)[k], DEFAULT_CHUNK_SIZE + k);
}
}
