From 71525b85503b7cf4c564996e0e6135e225477084 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 20 May 2025 10:37:32 +0800 Subject: [PATCH 01/12] FixedBlockPool --- .../SynchronizedShardedMap.h | 30 +- .../dram_kv_embedding_cache.h | 61 ++-- .../fixed_block_pool.h | 128 ++++++++ .../store_value_utils.h | 81 +++++ .../dram_kv_embedding_cache/CMakeLists.txt | 14 + .../fixed_block_pool_test.cpp | 301 ++++++++++++++++++ .../sharded_map_test.cpp | 164 ++++++++++ .../store_value_utils_test.cpp | 86 +++++ 8 files changed, 835 insertions(+), 30 deletions(-) create mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h create mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index 12d8be97b5..3cd4c61c6f 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -8,8 +8,10 @@ #pragma once +#include #include -#include "folly/Synchronized.h" + +#include "fixed_block_pool.h" namespace kv_mem { @@ -29,18 +31,30 @@ class SynchronizedShardedMap { public: using iterator = typename folly::F14FastMap::const_iterator; - explicit SynchronizedShardedMap(std::size_t numShards) : shards_(numShards) {} + explicit SynchronizedShardedMap(std::size_t numShards, + std::size_t block_size, + std::size_t block_alignment, + std::size_t blocks_per_chunk = 8192) + : shards_(numShards), mempools_(numShards) { + // Init mempools_ + for (auto& pool : mempools_) { + pool = std::make_unique( + block_size, block_alignment, blocks_per_chunk); + } + } // Get shard map by index - auto& by(int index) { - return shards_.at(index % shards_.size()); - } + auto& by(int index) { return shards_.at(index % shards_.size()); } - auto getNumShards() { - return shards_.size(); + // Get shard pool by index + auto* pool_by(int index) { + return mempools_.at(index % shards_.size()).get(); } + auto getNumShards() { return shards_.size(); } + private: std::vector, M>> shards_; + std::vector> mempools_; }; -} // namespace kv_mem +} // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 0b195b695b..23460c41bb 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -15,7 +15,7 @@ #include "SynchronizedShardedMap.h" #include "deeplearning/fbgemm/fbgemm_gpu/src/ssd_split_embeddings_cache/initializer.h" -#include "store_value.h" +#include "store_value_utils.h" #include #include @@ -70,8 +70,13 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { max_D_(max_D), num_shards_(num_shards), weight_ttl_in_hours_(weight_ttl_in_hours), - kv_store_(SynchronizedShardedMap>( - num_shards_)), + block_size_(StoreValueUtils::calculate_block_size(max_D)), + block_alignment_(StoreValueUtils::calculate_block_alignment()), + kv_store_(SynchronizedShardedMap( + num_shards_, + block_size_, + block_alignment_, + /*blocks_per_chunk=*/8192)), elem_size_(row_storage_bitwidth 
/ 8) {
    executor_ = std::make_unique(std::max(
        num_threads, facebook::Proc::getCpuInfo().numCpuCores));
@@ -185,20 +190,31 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
           CHECK_EQ(indices.size(0), weights.size(0));
           {
             auto wlmap = kv_store_.by(shard_id).wlock();
-
+            auto* pool = kv_store_.pool_by(shard_id);
             for (auto index_iter = indexes.begin();
                  index_iter != indexes.end();
                  index_iter++) {
               const auto& id_index = *index_iter;
               auto id = indices[id_index].template item();
-              wlmap->try_emplace(
-                  id,
-                  StoreValue(std::vector(
-                      weights[id_index]
-                          .template data_ptr(),
-                      weights[id_index]
-                          .template data_ptr() +
-                          weights[id_index].numel())));
+
+              // use mempool
+              weight_type* block = nullptr;
+              // First check if the key already exists
+              auto it = wlmap->find(id);
+              if (it != wlmap->end()) {
+                block = it->second;
+              } else {
+                // Key doesn't exist, allocate new block and insert.
+                block = StoreValueUtils::allocate(
+                    block_size_, block_alignment_, pool);
+                wlmap->insert({id, block});
+              }
+              StoreValueUtils::update_timestamp(block);
+              auto* data_ptr = StoreValueUtils::data_ptr(block);
+              std::copy(
+                  weights[id_index].template data_ptr(),
+                  weights[id_index].template data_ptr() +
+                      weights[id_index].numel(),
+                  data_ptr);
             }
           }
         });
@@ -265,6 +281,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
                   weights.data_ptr();
               auto id = indices[id_index].template item();
               auto wlmap = kv_store_.by(shard_id).wlock();
+              auto* pool = kv_store_.pool_by(shard_id);
               const auto cached_iter = wlmap->find(id);
               if (cached_iter == wlmap->end()) {
                 fill_from_row_storage(
@@ -276,16 +293,13 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
                     row_storage_data_ptr));
                 continue;
               }
-              const auto& cache_results =
-                  cached_iter->second.getValueAndPromote();
-              CHECK_EQ(cache_results.size(), max_D_);
+              // use mempool
+              const auto* data_ptr = StoreValueUtils::data_ptr(cached_iter->second);
+              StoreValueUtils::update_timestamp(cached_iter->second);
               std::copy(
-                  reinterpret_cast(
-                      &(cache_results[0])),
-                  reinterpret_cast(
-                      &(cache_results[max_D_])),
-                  &(weights_data_ptr
-                        [id_index * max_D_])); // dst_start
+                  data_ptr,
+                  data_ptr + max_D_,
+                  &weights_data[index * max_D_]); // dst_start
             }
           }
         });
@@ -368,7 +382,10 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
   int64_t max_D_;
   int64_t num_shards_;
   int64_t weight_ttl_in_hours_;
-  SynchronizedShardedMap> kv_store_;
+  // mempool params
+  size_t block_size_;
+  size_t block_alignment_;
+  SynchronizedShardedMap kv_store_;
   std::atomic_bool is_eviction_ongoing_ = false;
   std::vector> initializers_;
   int64_t elem_size_;
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
new file mode 100644
index 0000000000..a474b367c3
--- /dev/null
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+
+namespace kv_mem {
+class FixedBlockPool : public std::pmr::memory_resource {
+ public:
+  explicit FixedBlockPool(
+      std::size_t block_size, // Size of each memory block
+      std::size_t block_alignment, // Memory block alignment requirement
+      std::size_t blocks_per_chunk = 8192, // Number of blocks per chunk
+      std::pmr::memory_resource* upstream = std::pmr::new_delete_resource())
+      // Minimum block size is 8 bytes
+      : block_size_(std::max(block_size, sizeof(void*))),
+        block_alignment_(block_alignment),
+        blocks_per_chunk_(blocks_per_chunk),
+        upstream_(upstream),
+        chunks_(upstream) {
+    // Validate
minimum data size, whether it's less than 8 bytes + // half type, 2 bytes, minimum embedding length 4 + // float type, 4 bytes, minimum embedding length 2 + // Large objects use memory pool, small objects are placed directly in the + // hashtable + if (block_size < sizeof(void*)) { + // Block size must be at least able to store a pointer (for free list) + throw std::invalid_argument("Block size must be at least sizeof(void*)"); + } + + // Validate that alignment requirement is a power of 2 + if ((block_alignment_ & (block_alignment_ - 1)) != 0) { + throw std::invalid_argument("Alignment must be power of two"); + } + + // Validate that block size is a multiple of alignment + if (block_size_ % block_alignment_ != 0) { + throw std::invalid_argument("Block size must align with alignment"); + } + + // Ensure block size is at least 1 + if (block_size_ < 1) { + throw std::invalid_argument("Block size must be at least 1"); + } + } + + // Release all allocated memory during destruction + ~FixedBlockPool() override { + for (auto&& chunk : chunks_) { + upstream_->deallocate(chunk.ptr, chunk.size, chunk.alignment); + } + } + + protected: + // Core allocation function + void* do_allocate(std::size_t bytes, std::size_t alignment) override { + // Only handle matching block size and alignment requirements + if (bytes != block_size_ || alignment != block_alignment_) { + throw std::bad_alloc(); + } + + // Allocate a new chunk when no blocks are available + if (!free_list_) { + allocate_chunk(); + } + + // Take a block from the head of the free list + void* result = free_list_; + free_list_ = *static_cast(free_list_); + return result; + } + + // Core deallocation function + void do_deallocate(void* p, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment) override { + // Insert memory block back to the head of free list + *static_cast(p) = free_list_; + free_list_ = p; + } + + // Resource equality comparison (only the same object is equal) + [[nodiscard]] bool do_is_equal( + const std::pmr::memory_resource& other) const noexcept override { + return this == &other; + } + + private: + // Chunk metadata + struct chunk_info { + void* ptr; // Memory block pointer + std::size_t size; // Total size + std::size_t alignment; + }; + + // Allocate a new memory chunk + void allocate_chunk() { + const std::size_t chunk_size = block_size_ * blocks_per_chunk_; + + // Allocate aligned memory through upstream resource + void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_); + + // Record chunk information for later release + chunks_.push_back({chunk_ptr, chunk_size, block_alignment_}); + + // Initialize free list: link blocks in reverse order from chunk end to + // beginning (improves locality) + char* current = static_cast(chunk_ptr) + chunk_size; + for (std::size_t i = 0; i < blocks_per_chunk_; ++i) { + current -= block_size_; + *reinterpret_cast(current) = free_list_; + free_list_ = current; + } + } + + // Member variables + const std::size_t block_size_; // Block size (not less than pointer size) + const std::size_t block_alignment_; // Block alignment requirement + const std::size_t blocks_per_chunk_; // Number of blocks per chunk + std::pmr::memory_resource* upstream_; // Upstream memory resource + std::pmr::vector chunks_{ + 1024}; // Records of all allocated chunks + void* free_list_ = nullptr; // Free block list head pointer +}; +} // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h 
b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h new file mode 100644 index 0000000000..c10c318621 --- /dev/null +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h @@ -0,0 +1,81 @@ +#pragma once +#include + +#include "fixed_block_pool.h" + +namespace kv_mem { + +class StoreValueUtils { + public: + // Metadata structure (publicly accessible) + struct alignas(8) MetaHeader { + int64_t timestamp; // 8 bytes + // Can be extended with other fields: uint32_t counter, uint64_t key, etc. + }; + + // Create memory block with metadata + template + static scalar_t* allocate(size_t& block_size, + size_t& alignment, + FixedBlockPool* pool) { + return reinterpret_cast(pool->allocate(block_size, alignment)); + } + + // Destroy memory block + template + static void deallocate(scalar_t* block, + size_t& block_size, + size_t& alignment, + FixedBlockPool* pool) { + pool->deallocate(block, block_size, alignment); + } + + // Calculate storage size + template + static size_t calculate_block_size(size_t dimension) { + return sizeof(MetaHeader) + dimension * sizeof(scalar_t); + } + + // Calculate alignment requirements + template + static size_t calculate_block_alignment() { + return std::max(alignof(MetaHeader), alignof(scalar_t)); + } + + // Metadata operations + template + static int64_t get_timestamp(const scalar_t* block) { + return reinterpret_cast(block)->timestamp; + } + + template + static void set_timestamp(scalar_t* block, int64_t ts) { + reinterpret_cast(block)->timestamp = ts; + } + + template + static void update_timestamp(scalar_t* block) { + reinterpret_cast(block)->timestamp = current_timestamp(); + } + + // Data pointer retrieval + template + static scalar_t* data_ptr(scalar_t* block) { + return reinterpret_cast(reinterpret_cast(block) + + sizeof(MetaHeader)); + } + + template + static const scalar_t* data_ptr(const scalar_t* block) { + return reinterpret_cast( + reinterpret_cast(block) + sizeof(MetaHeader)); + } + + static int64_t current_timestamp() { + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + // facebook::WallClockUtil::NowInUsecFast(); + } +}; +} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt new file mode 100644 index 0000000000..e9a41eac51 --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt @@ -0,0 +1,14 @@ +add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp) +target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17) +target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR}) +target_link_libraries(fixed_block_pool_test gtest gtest_main) + +add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp) +target_compile_features(sharded_map_test PUBLIC cxx_std_17) +target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR}) +target_link_libraries(sharded_map_test gtest gtest_main Folly::folly) + +add_executable(store_value_utils_test ${CMAKE_CURRENT_SOURCE_DIR}/store_value_utils_test.cpp) +target_compile_features(store_value_utils_test PUBLIC cxx_std_17) +target_include_directories(store_value_utils_test PUBLIC ${FBGEMM_SOURCE_DIR}) +target_link_libraries(store_value_utils_test gtest gtest_main Folly::folly) \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp 
b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp new file mode 100644 index 0000000000..606819ad8d --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp @@ -0,0 +1,301 @@ +#include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" + +#include +#include +#include +#include +#include + +#include + +#include "fixed_block_pool.h" +namespace kv_mem { + +double test_std_vector(size_t vector_size, size_t repeat_count) { + float sum = 0.0f; // Prevent optimization + std::vector> + all_vectors; // Store all vectors to prevent release + all_vectors.reserve(repeat_count); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < repeat_count; ++i) { + all_vectors.emplace_back(vector_size); + auto& vec = all_vectors.back(); + + for (size_t j = 0; j < vector_size; ++j) { + vec[j] = static_cast(j); + } + + // Simple usage to prevent optimization + sum += vec[0]; + } + + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - start).count(); +} + +// Testing memory pool allocation +double test_pool_vector(size_t vector_size, size_t repeat_count) { + // Create a memory pool large enough + FixedBlockPool pool(vector_size * sizeof(float), alignof(float), 8092); + std::pmr::polymorphic_allocator alloc(&pool); + + auto start = std::chrono::high_resolution_clock::now(); + float sum = 0.0f; // Prevent optimization + for (size_t i = 0; i < repeat_count; ++i) { + float* arr = alloc.allocate(vector_size); + + for (size_t j = 0; j < vector_size; ++j) { + arr[j] = static_cast(j); + } + + // Simple usage to prevent optimization + sum += arr[0]; + + // Removed deallocate statement, no longer releasing memory to avoid memory + // reuse + // alloc.deallocate(arr, dim); + } + + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - start).count(); +} + +void benchmark_memory_allocators() { + std::cout << "====== Testing performance difference between memory pool and " + "native vector allocation for 10 million " + "times ======" + << std::endl; + + // Vector sizes to test (in number of float elements) + std::vector vector_sizes = {4, 8, 16, 32, 64, 128, 256}; + + // Repeat count (10 million times) + const size_t repeat_count = 10'000'000; + + for (const auto& size : vector_sizes) { + std::cout << "Vector size: " << size << " floats (" + << (size * sizeof(float)) << " bytes)" << std::endl; + + // Testing standard vector + double std_time = test_std_vector(size, repeat_count); + std::cout << " Standard vector: " << std::fixed << std::setprecision(2) + << std_time << " ms" << std::endl; + + // Testing memory pool + double pool_time = test_pool_vector(size, repeat_count); + std::cout << " Memory pool: " << std::fixed << std::setprecision(2) + << pool_time << " ms" << std::endl; + + // Calculate speed improvement + double speedup = std_time / pool_time; + std::cout << " Speed improvement: " << std::fixed << std::setprecision(2) + << speedup << "x" << std::endl; + + std::cout << std::endl; + std::cout << "============================" << std::endl; + } +} + +// Basic functionality test: Integer keys +TEST(FixedBlockPoolTest, benchmark_memory_allocators) { + benchmark_memory_allocators(); +} + +// Test constructor normal case +TEST(FixedBlockPoolTest, ConstructorNormal) { + EXPECT_NO_THROW({ kv_mem::FixedBlockPool pool(16, 8); }); +} + +// Test constructor exception cases +TEST(FixedBlockPoolTest, ConstructorExceptions) { + // Block size smaller than pointer size + 
EXPECT_THROW({ kv_mem::FixedBlockPool pool(1, 1); }, std::invalid_argument); + + // Alignment not a power of 2 + EXPECT_THROW({ kv_mem::FixedBlockPool pool(16, 3); }, std::invalid_argument); + + // Block size not a multiple of alignment + EXPECT_THROW({ kv_mem::FixedBlockPool pool(10, 8); }, std::invalid_argument); +} + +// Test basic memory allocation and deallocation +TEST(FixedBlockPoolTest, BasicAllocation) { + const size_t block_size = 16; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + + // Verify allocated memory is usable + std::memset(p, 0xAB, block_size); + + pool.deallocate(p, block_size, alignment); +} + +// Test multiple allocations and deallocations +TEST(FixedBlockPoolTest, MultipleAllocations) { + const size_t block_size = 32; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + std::vector blocks; + const int NUM_BLOCKS = 100; + + // Allocate multiple blocks + for (int i = 0; i < NUM_BLOCKS; ++i) { + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + // Write some data + *static_cast(p) = i; + blocks.push_back(p); + } + + // Verify data + for (int i = 0; i < NUM_BLOCKS; ++i) { + EXPECT_EQ(*static_cast(blocks[i]), i); + } + + // Release all blocks + for (auto p : blocks) { + pool.deallocate(p, block_size, alignment); + } +} + +// Test cross-chunk allocation (each chunk has only 10 blocks) +TEST(FixedBlockPoolTest, CrossChunkAllocation) { + const size_t block_size = 16; + const size_t alignment = 8; + const size_t blocks_per_chunk = 10; + kv_mem::FixedBlockPool pool(block_size, alignment, blocks_per_chunk); + + std::vector blocks; + const int NUM_BLOCKS = 25; // Exceeds 2 chunks + + // Allocate blocks beyond a single chunk capacity + for (int i = 0; i < NUM_BLOCKS; ++i) { + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + blocks.push_back(p); + } + + // Release all blocks + for (auto p : blocks) { + pool.deallocate(p, block_size, alignment); + } +} + +// Test memory alignment +TEST(FixedBlockPoolTest, MemoryAlignment) { + const size_t block_size = 64; + const size_t alignment = 32; + kv_mem::FixedBlockPool pool(block_size, alignment); + + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + + // Verify address is aligned to specified alignment + uintptr_t addr = reinterpret_cast(p); + EXPECT_EQ(addr % alignment, 0); + + pool.deallocate(p, block_size, alignment); +} + +// Test error handling - allocating blocks with mismatched size or alignment +TEST(FixedBlockPoolTest, ErrorHandling) { + const size_t block_size = 16; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + // Try to allocate memory with incorrect size + EXPECT_THROW( + { [[maybe_unused]] void* p = pool.allocate(block_size * 2, alignment); }, + std::bad_alloc); + + // Try to allocate memory with incorrect alignment + EXPECT_THROW( + { [[maybe_unused]] void* p = pool.allocate(block_size, alignment * 2); }, + std::bad_alloc); +} + +// Test memory reuse after deallocation +TEST(FixedBlockPoolTest, ReuseAfterDeallocation) { + const size_t block_size = 16; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + void* p1 = pool.allocate(block_size, alignment); + void* p2 = pool.allocate(block_size, alignment); + + // Release the first block + pool.deallocate(p1, block_size, alignment); + + // Reallocate, should get the recently freed 
block (due to LIFO order)
+  void* p3 = pool.allocate(block_size, alignment);
+  EXPECT_EQ(p3, p1);
+
+  // Cleanup
+  pool.deallocate(p2, block_size, alignment);
+  pool.deallocate(p3, block_size, alignment);
+}
+
+// Test custom upstream memory resource
+TEST(FixedBlockPoolTest, CustomUpstreamResource) {
+  const size_t block_size = 16;
+  const size_t alignment = 8;
+
+  // Use custom memory resource that tracks allocations
+  int allocate_count = 0;
+  int deallocate_count = 0;
+
+  class CountingResource : public std::pmr::memory_resource {
+   public:
+    CountingResource(int& alloc_count, int& dealloc_count)
+        : alloc_count_(alloc_count), dealloc_count_(dealloc_count) {}
+
+   protected:
+    void* do_allocate(size_t bytes, size_t alignment) override {
+      ++alloc_count_;
+      return std::pmr::new_delete_resource()->allocate(bytes, alignment);
+    }
+
+    void do_deallocate(void* p, size_t bytes, size_t alignment) override {
+      ++dealloc_count_;
+      std::pmr::new_delete_resource()->deallocate(p, bytes, alignment);
+    }
+
+    bool do_is_equal(
+        const std::pmr::memory_resource& other) const noexcept override {
+      return this == &other;
+    }
+
+   private:
+    int& alloc_count_;
+    int& dealloc_count_;
+  };
+
+  CountingResource upstream(allocate_count, deallocate_count);
+  {
+    kv_mem::FixedBlockPool pool(block_size, alignment, 1024, &upstream);
+
+    // Allocate some blocks to trigger chunk allocation
+    std::vector blocks;
+    for (int i = 0; i < 10; ++i) {
+      blocks.push_back(pool.allocate(block_size, alignment));
+    }
+
+    // Verify upstream resource was called
+    EXPECT_GT(allocate_count, 0);
+    EXPECT_EQ(deallocate_count, 0);
+
+    // Release all blocks
+    for (auto p : blocks) {
+      pool.deallocate(p, block_size, alignment);
+    }
+  }
+  // Destructor should release all chunks
+  EXPECT_GT(deallocate_count, 0);
+}
+
+} // namespace kv_mem
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
new file mode 100644
index 0000000000..2189b55009
--- /dev/null
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
@@ -0,0 +1,164 @@
+#include
+#include
+
+#include
+#include
+#include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h"
+#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h"
+
+namespace kv_mem {
+std::vector generateFixedEmbedding(int dimension) {
+  return std::vector(dimension, 1.0);
+}
+
+void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) {
+  const size_t numShards = 1;
+
+  // Initialize the hash map backed by the memory pool
+  SynchronizedShardedMap embeddingMap(
+      numShards,
+      dimension * sizeof(float), // block_size
+      alignof(float), // block_alignment
+      8192); // blocks_per_chunk
+  double insertTime, lookupTime;
+  {
+    std::vector fixedEmbedding = generateFixedEmbedding(dimension);
+
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+    std::pmr::polymorphic_allocator alloc(pool);
+
+    auto startInsert = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numInserts; i++) {
+      float* arr = alloc.allocate(dimension);
+      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), arr);
+      wlock->insert_or_assign(i, arr);
+    }
+    auto endInsert = std::chrono::high_resolution_clock::now();
+    insertTime =
+        std::chrono::duration(endInsert - startInsert)
+            .count();
+  }
+
+  std::vector lookEmbedding(dimension);
+  size_t hitCount = 0;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    auto startLookup = std::chrono::high_resolution_clock::now();
+    for (size_t i
= 0; i < numLookups; i++) {
+      auto it = rlock->find(i % numInserts);
+      if (it != rlock->end()) {
+        hitCount++;
+        std::copy(it->second, it->second + dimension, lookEmbedding.data());
+      }
+    }
+    auto endLookup = std::chrono::high_resolution_clock::now();
+    lookupTime =
+        std::chrono::duration(endLookup - startLookup)
+            .count();
+  }
+
+  std::cout << std::left << std::setw(20) << dimension;
+  std::cout << std::fixed << std::setprecision(2);
+  std::cout << std::setw(20) << insertTime;
+  std::cout << std::setw(20) << lookupTime;
+  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
+  std::cout << std::endl;
+}
+
+void memPoolEmbeddingWithTime(int dimension,
+                              size_t numInserts,
+                              size_t numLookups) {
+  const size_t numShards = 1;
+  size_t block_size = StoreValueUtils::calculate_block_size(dimension);
+  size_t block_alignment = StoreValueUtils::calculate_block_alignment();
+
+  // Initialize the hash map backed by the memory pool
+  SynchronizedShardedMap embeddingMap(
+      numShards,
+      block_size, // block_size
+      block_alignment, // block_alignment
+      8192); // blocks_per_chunk
+  double insertTime, lookupTime;
+  // Measure insert performance
+  {
+    std::vector fixedEmbedding = generateFixedEmbedding(dimension);
+
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+
+    auto startInsert = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numInserts; i++) {
+      auto* block =
+          StoreValueUtils::allocate(block_size, block_alignment, pool);
+      auto* data_ptr = StoreValueUtils::data_ptr(block);
+      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
+      wlock->insert_or_assign(i, block);
+    }
+    auto endInsert = std::chrono::high_resolution_clock::now();
+    insertTime =
+        std::chrono::duration(endInsert - startInsert)
+            .count();
+  }
+
+  std::vector lookEmbedding(dimension);
+  size_t hitCount = 0;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    auto startLookup = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numLookups; i++) {
+      auto it = rlock->find(i % numInserts);
+      if (it != rlock->end()) {
+        hitCount++;
+        const float* data_ptr = StoreValueUtils::data_ptr(it->second);
+        // update timestamp
+        StoreValueUtils::update_timestamp(it->second);
+        std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
+      }
+    }
+    auto endLookup = std::chrono::high_resolution_clock::now();
+    lookupTime =
+        std::chrono::duration(endLookup - startLookup)
+            .count();
+  }
+
+  std::cout << std::left << std::setw(20) << dimension;
+  std::cout << std::fixed << std::setprecision(2);
+  std::cout << std::setw(20) << insertTime;
+  std::cout << std::setw(20) << lookupTime;
+  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
+  std::cout << std::endl;
+}
+
+int benchmark() {
+  std::vector dimensions = {4, 8, 16, 32, 64};
+  // Number of operations
+  const size_t numInserts = 1'000'000; // 1 million insert
+  const size_t numLookups = 1'000'000; // 1 million find
+
+  std::cout
+      << "======================= mempool ===================================="
+      << std::endl;
+  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
+            << "insert time (ms)" << std::setw(20) << "find time (ms)"
+            << std::setw(20) << "hit rate (%)" << std::endl;
+  for (int dim : dimensions) {
+    memPoolEmbedding(dim, numInserts, numLookups);
+  }
+  std::cout << std::endl << std ::endl;
+
+  std::cout << "======================= mempool with time "
+               "===================================="
+            << std::endl;
+  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
+            << "insert time (ms)" <<
std::setw(20) << "find time (ms)" + << std::setw(20) << "hit rate (%)" << std::endl; + for (int dim : dimensions) { + memPoolEmbeddingWithTime(dim, numInserts, numLookups); + } + std::cout << std::endl << std ::endl; + return 0; +} +TEST(SynchronizedShardedMap, benchmark) { benchmark(); } + +} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp new file mode 100644 index 0000000000..c1506c16e1 --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp @@ -0,0 +1,86 @@ +#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h" + +#include "gtest/gtest.h" +namespace kv_mem { + +TEST(StoreValueUtils, BasicFunctionality) { + constexpr int dim = 4; + size_t block_size = StoreValueUtils::calculate_block_size(dim); + size_t alignment = StoreValueUtils::calculate_block_alignment(); + + // Initialize memory pool + FixedBlockPool pool(block_size, alignment, 1024); + + // Test memory allocation + float* block = StoreValueUtils::allocate(block_size, alignment, &pool); + StoreValueUtils::update_timestamp(block); + ASSERT_NE(block, nullptr); + + // Verify metadata header + int64_t ts1 = StoreValueUtils::get_timestamp(block); + EXPECT_LE(StoreValueUtils::current_timestamp(), ts1); + + // Test data pointer offset + float* data = StoreValueUtils::data_ptr(block); + ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block), sizeof(StoreValueUtils::MetaHeader)); + + // Test timestamp update + StoreValueUtils::update_timestamp(block); + int64_t ts2 = StoreValueUtils::get_timestamp(block); + EXPECT_GE(ts2, ts1); // New timestamp should be greater or equal + + // Test memory deallocation + EXPECT_NO_THROW(StoreValueUtils::deallocate(block, block_size, alignment, &pool)); +} + +TEST(StoreValueUtils, MultiDimensionTest) { + // Test memory alignment for different dimensions + const std::vector test_dims = {1, 4, 16, 64, 256}; + for (int dim : test_dims) { + size_t block_size = StoreValueUtils::calculate_block_size(dim); + size_t alignment = StoreValueUtils::calculate_block_alignment(); + + // Verify alignment requirements + EXPECT_EQ(alignment % alignof(StoreValueUtils::MetaHeader), 0); + EXPECT_EQ(alignment % alignof(float), 0); + + // Verify block size calculation + const size_t expected_size = sizeof(StoreValueUtils::MetaHeader) + dim * sizeof(float); + EXPECT_EQ(block_size, expected_size); + } +} + +TEST(StoreValueUtils, TimestampPrecision) { + // Test timestamp precision accuracy + constexpr int test_iterations = 1000; + int64_t prev_ts = StoreValueUtils::current_timestamp(); + + for (int i = 0; i < test_iterations; ++i) { + int64_t curr_ts = StoreValueUtils::current_timestamp(); + EXPECT_GE(curr_ts, prev_ts); // Timestamps should be monotonically increasing + prev_ts = curr_ts; + } +} + +TEST(StoreValueUtils, DataIntegrity) { + // Test data storage integrity + constexpr int dim = 8; + std::vector src_data(dim, 3.14f); + + size_t block_size = StoreValueUtils::calculate_block_size(dim); + size_t alignment = StoreValueUtils::calculate_block_alignment(); + FixedBlockPool pool(block_size, alignment, 1024); + + // Allocate and write data + float* block = StoreValueUtils::allocate(block_size, alignment, &pool); + float* data_ptr = StoreValueUtils::data_ptr(block); + std::copy(src_data.begin(), src_data.end(), data_ptr); + + // Verify data consistency + for (int i = 0; i < dim; ++i) { + EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]); + } 
+ + StoreValueUtils::deallocate(block, block_size, alignment, &pool); +} +} // namespace kv_mem \ No newline at end of file From f712994af21571553d09d0ddb7ea42a99b48acc9 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 20 May 2025 12:40:39 +0800 Subject: [PATCH 02/12] use weights_data_ptr --- .../src/dram_kv_embedding_cache/dram_kv_embedding_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 4f2d90a9ca..4507e2060f 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -300,7 +300,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { std::copy( data_ptr, data_ptr + max_D_, - &weights_data[index * max_D_]); // dst_start + &(weights_data_ptr[index * max_D_])); // dst_start } } }); From 9c209a2eb3070139443dd8843d43b50e88e5c379 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 20 May 2025 13:44:03 +0800 Subject: [PATCH 03/12] update some annotate --- fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h | 1 + fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h index c10c318621..e2ac55acc9 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h @@ -8,6 +8,7 @@ namespace kv_mem { class StoreValueUtils { public: // Metadata structure (publicly accessible) + // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small. struct alignas(8) MetaHeader { int64_t timestamp; // 8 bytes // Can be extended with other fields: uint32_t counter, uint64_t key, etc. 
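For orientation between these hunks: the layout that store_value_utils.h manages is [MetaHeader | payload]. The following is a minimal usage sketch of the helpers above; it is not part of the patch, the dimension and fill value are illustrative, and the explicit <float> instantiations are assumed to match the template declarations whose arguments were lost in extraction.

```cpp
#include <algorithm>
#include <cstdint>

#include "fixed_block_pool.h"
#include "store_value_utils.h"

using namespace kv_mem;

int main() {
  constexpr size_t dim = 16;  // illustrative embedding dimension
  size_t block_size = StoreValueUtils::calculate_block_size<float>(dim);
  size_t alignment = StoreValueUtils::calculate_block_alignment<float>();
  FixedBlockPool pool(block_size, alignment, /*blocks_per_chunk=*/1024);

  // Each block is [MetaHeader | dim floats]; the helpers hide the offset math.
  float* block = StoreValueUtils::allocate<float>(block_size, alignment, &pool);
  StoreValueUtils::update_timestamp(block);  // stamp last-access time
  float* data = StoreValueUtils::data_ptr<float>(block);
  std::fill(data, data + dim, 1.0f);  // write the embedding payload

  int64_t last_seen = StoreValueUtils::get_timestamp(block);
  (void)last_seen;
  StoreValueUtils::deallocate<float>(block, block_size, alignment, &pool);
  return 0;
}
```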
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
index 2189b55009..6aa9373ac4 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
@@ -14,7 +14,6 @@ std::vector generateFixedEmbedding(int dimension) {
 void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) {
   const size_t numShards = 1;
 
-  // Initialize the hash map backed by the memory pool
   SynchronizedShardedMap embeddingMap(
       numShards,
       dimension * sizeof(float), // block_size
       alignof(float), // block_alignment
@@ -73,14 +72,12 @@ void memPoolEmbeddingWithTime(int dimension,
   size_t block_size = StoreValueUtils::calculate_block_size(dimension);
   size_t block_alignment = StoreValueUtils::calculate_block_alignment();
 
-  // Initialize the hash map backed by the memory pool
   SynchronizedShardedMap embeddingMap(
       numShards,
       block_size, // block_size
       block_alignment, // block_alignment
       8192); // blocks_per_chunk
   double insertTime, lookupTime;
-  // Measure insert performance
   {
     std::vector fixedEmbedding = generateFixedEmbedding(dimension);
 
@@ -132,7 +129,6 @@ void memPoolEmbeddingWithTime(int dimension,
 int benchmark() {
   std::vector dimensions = {4, 8, 16, 32, 64};
-  // Number of operations
   const size_t numInserts = 1'000'000; // 1 million insert
   const size_t numLookups = 1'000'000; // 1 million find
 
From bdec70251a08b9a4fb0c9a626dc01d8d704fa7d6 Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Mon, 26 May 2025 14:23:47 +0800
Subject: [PATCH 04/12] update MetaHeader

---
 .../fixed_block_pool.h                        | 163 ++++++++++++--
 .../src/dram_kv_embedding_cache/store_value.h |  56 -----
 .../store_value_utils.h                       |  82 -------
 .../dram_kv_embedding_cache/CMakeLists.txt    |  19 +-
 .../feature_evict_test.cpp                    | 202 ++++++++++++++++++
 .../fixed_block_pool_test.cpp                 |  86 +++++++-
 .../sharded_map_test.cpp                      |  15 +-
 7 files changed, 458 insertions(+), 165 deletions(-)
 delete mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h
 delete mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h
 create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index a474b367c3..bcc1e36fca 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include
 #include
 #include
 #include
@@ -10,6 +11,130 @@
 namespace kv_mem {
 class FixedBlockPool : public std::pmr::memory_resource {
  public:
+  // Chunk metadata
+  struct ChunkInfo {
+    void* ptr; // Memory block pointer
+    std::size_t size; // Total size
+    std::size_t alignment;
+  };
+
+  // Metadata structure (publicly accessible)
+  // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small.
+  struct alignas(8) MetaHeader {
+    uint64_t key; // 8 bytes
+    int32_t score; // 4 bytes
+    bool used; // 1 byte
+  };
+
+  // Metadata operations
+
+  // Key operations
+  static uint64_t get_key(const void* block) {
+    return reinterpret_cast(block)->key;
+  }
+  static void set_key(void* block, uint64_t key) {
+    reinterpret_cast(block)->key = key;
+  }
+
+  // used operations
+  static bool get_used(const void* block) {
+    return reinterpret_cast(block)->used;
+  }
+  static void set_used(void* block, bool used) {
+    reinterpret_cast(block)->used = used;
+  }
+
+  // Score operations
+  static int32_t get_score(const void* block) {
+    return reinterpret_cast(block)->score;
+  }
+  static void set_score(void* block, int32_t score) {
+    reinterpret_cast(block)->score = score;
+  }
+  static void update_score(void* block) {
+    auto& score = reinterpret_cast(block)->score;
+    // Saturate instead of wrapping, to avoid counter overflow
+    if (score < std::numeric_limits::max()) {
+      score++;
+    }
+  }
+  // timestamp operations
+  static void update_timestamp(void* block) {
+    reinterpret_cast(block)->score = current_timestamp();
+  }
+  static int32_t current_timestamp() {
+    auto stamp = std::chrono::duration_cast(
+                     std::chrono::system_clock::now().time_since_epoch())
+                     .count();
+    return static_cast(stamp);
+    // facebook::WallClockUtil::NowInUsecFast();
+  }
+
+  // Type-dependent helpers
+  // Calculate storage size
+  template
+  static size_t calculate_block_size(size_t dimension) {
+    return sizeof(FixedBlockPool::MetaHeader) + dimension * sizeof(scalar_t);
+  }
+
+  // Calculate alignment requirements
+  template
+  static size_t calculate_block_alignment() {
+    return std::max(alignof(FixedBlockPool::MetaHeader), alignof(scalar_t));
+  }
+
+  // Data pointer retrieval
+  template
+  static scalar_t* data_ptr(scalar_t* block) {
+    return reinterpret_cast(reinterpret_cast(block) +
+                            sizeof(FixedBlockPool::MetaHeader));
+  }
+
+  template
+  static const scalar_t* data_ptr(const scalar_t* block) {
+    return reinterpret_cast(
+        reinterpret_cast(block) +
+        sizeof(FixedBlockPool::MetaHeader));
+  }
+
+  // Create memory block with metadata
+  template
+  static scalar_t* allocate_t(size_t& block_size,
+                              size_t& alignment,
+                              FixedBlockPool* pool) {
+    auto* block =
+        reinterpret_cast(pool->allocate(block_size, alignment));
+    return block;
+  }
+
+  // Destroy memory block
+  template
+  static void deallocate_t(scalar_t* block,
+                           size_t& block_size,
+                           size_t& alignment,
+                           FixedBlockPool* pool) {
+    pool->deallocate(block, block_size, alignment);
+  }
+
+  // Usage example
+  template
+  static void get_keys_with_low_score(FixedBlockPool* pool,
+                                      int32_t threshold,
+                                      float decay,
+                                      std::vector& result) {
+    pool->for_each_block([&decay, &threshold, &result](void* block) {
+      if (FixedBlockPool::get_used(block)) {
+        auto score = FixedBlockPool::get_score(static_cast(block));
+        score = score * decay;
+        FixedBlockPool::set_score(static_cast(block), score);
+        if (score < threshold) {
+          result.push_back(
+              FixedBlockPool::get_key(static_cast(block)));
+        }
+      }
+    });
+  }
+
   explicit FixedBlockPool(
       std::size_t block_size, // Size of each memory block
       std::size_t block_alignment, // Memory block alignment requirement
@@ -54,6 +179,21 @@
   }
 
+  // New interface: access chunk metadata
+  [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
+
+  // New interface: iterate over all blocks
+  template
+  void for_each_block(Func&& func) const {
+    for (const auto& chunk : chunks_) {
+      char* current = static_cast(chunk.ptr);
+      for (size_t i = 0; i < blocks_per_chunk_; ++i) {
+        func(current);
+        current +=
block_size_; + } + } + } + protected: // Core allocation function void* do_allocate(std::size_t bytes, std::size_t alignment) override { @@ -70,6 +210,7 @@ class FixedBlockPool : public std::pmr::memory_resource { // Take a block from the head of the free list void* result = free_list_; free_list_ = *static_cast(free_list_); + FixedBlockPool::set_used(result, true); return result; } @@ -80,6 +221,7 @@ class FixedBlockPool : public std::pmr::memory_resource { // Insert memory block back to the head of free list *static_cast(p) = free_list_; free_list_ = p; + FixedBlockPool::set_used(free_list_, false); } // Resource equality comparison (only the same object is equal) @@ -89,13 +231,6 @@ class FixedBlockPool : public std::pmr::memory_resource { } private: - // Chunk metadata - struct chunk_info { - void* ptr; // Memory block pointer - std::size_t size; // Total size - std::size_t alignment; - }; - // Allocate a new memory chunk void allocate_chunk() { const std::size_t chunk_size = block_size_ * blocks_per_chunk_; @@ -103,6 +238,9 @@ class FixedBlockPool : public std::pmr::memory_resource { // Allocate aligned memory through upstream resource void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_); + // Block used flag set false. + FixedBlockPool::set_used(chunk_ptr, false); + // Record chunk information for later release chunks_.push_back({chunk_ptr, chunk_size, block_alignment_}); @@ -118,11 +256,10 @@ class FixedBlockPool : public std::pmr::memory_resource { // Member variables const std::size_t block_size_; // Block size (not less than pointer size) - const std::size_t block_alignment_; // Block alignment requirement - const std::size_t blocks_per_chunk_; // Number of blocks per chunk - std::pmr::memory_resource* upstream_; // Upstream memory resource - std::pmr::vector chunks_{ - 1024}; // Records of all allocated chunks - void* free_list_ = nullptr; // Free block list head pointer + const std::size_t block_alignment_; // Block alignment requirement + const std::size_t blocks_per_chunk_; // Number of blocks per chunk + std::pmr::memory_resource* upstream_; // Upstream memory resource + std::pmr::vector chunks_{1024}; // Records of all allocated chunks + void* free_list_ = nullptr; // Free block list head pointer }; } // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h deleted file mode 100644 index 375c63ce46..0000000000 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -#include -#include "common/time/Time.h" - -namespace kv_mem { - -/// @ingroup embedding-dram-kvstore -/// -/// @brief data structure to store tensor value and it's timestamp -// -template -class StoreValue { - public: - explicit StoreValue(std::vector&& value) { - value_ = std::move(value); - timestamp_ = facebook::WallClockUtil::NowInUsecFast(); - } - - explicit StoreValue(StoreValue&& pv) noexcept { - timestamp_ = facebook::WallClockUtil::NowInUsecFast(); - value_ = std::move(pv.value_); - } - - int64_t getTimestamp() const { - return timestamp_; - } - - const std::vector& getValue() const { - return value_; - } - - const std::vector& getValueAndPromote() { - timestamp_ = facebook::WallClockUtil::NowInUsecFast(); - return value_; - } - - private: - StoreValue& operator=(const StoreValue&) = delete; - StoreValue& operator=(const StoreValue&&) = delete; - StoreValue(const StoreValue& other) = delete; - - // cached tensor value - std::vector value_; - - // last visit timestamp - int64_t timestamp_; -}; -} // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h deleted file mode 100644 index e2ac55acc9..0000000000 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once -#include - -#include "fixed_block_pool.h" - -namespace kv_mem { - -class StoreValueUtils { - public: - // Metadata structure (publicly accessible) - // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small. - struct alignas(8) MetaHeader { - int64_t timestamp; // 8 bytes - // Can be extended with other fields: uint32_t counter, uint64_t key, etc. - }; - - // Create memory block with metadata - template - static scalar_t* allocate(size_t& block_size, - size_t& alignment, - FixedBlockPool* pool) { - return reinterpret_cast(pool->allocate(block_size, alignment)); - } - - // Destroy memory block - template - static void deallocate(scalar_t* block, - size_t& block_size, - size_t& alignment, - FixedBlockPool* pool) { - pool->deallocate(block, block_size, alignment); - } - - // Calculate storage size - template - static size_t calculate_block_size(size_t dimension) { - return sizeof(MetaHeader) + dimension * sizeof(scalar_t); - } - - // Calculate alignment requirements - template - static size_t calculate_block_alignment() { - return std::max(alignof(MetaHeader), alignof(scalar_t)); - } - - // Metadata operations - template - static int64_t get_timestamp(const scalar_t* block) { - return reinterpret_cast(block)->timestamp; - } - - template - static void set_timestamp(scalar_t* block, int64_t ts) { - reinterpret_cast(block)->timestamp = ts; - } - - template - static void update_timestamp(scalar_t* block) { - reinterpret_cast(block)->timestamp = current_timestamp(); - } - - // Data pointer retrieval - template - static scalar_t* data_ptr(scalar_t* block) { - return reinterpret_cast(reinterpret_cast(block) + - sizeof(MetaHeader)); - } - - template - static const scalar_t* data_ptr(const scalar_t* block) { - return reinterpret_cast( - reinterpret_cast(block) + sizeof(MetaHeader)); - } - - static int64_t current_timestamp() { - return std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - // facebook::WallClockUtil::NowInUsecFast(); - } -}; -} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt 
b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
index e9a41eac51..9bf610f50d 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
@@ -1,14 +1,21 @@
+find_package(folly REQUIRED)
+find_package(gflags REQUIRED)
+
+include_directories(
+  ${FBGEMM_SOURCE_DIR}
+)
+
 add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp)
 target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17)
-target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR})
+target_compile_options(fixed_block_pool_test PUBLIC "-O3")
 target_link_libraries(fixed_block_pool_test gtest gtest_main)
 
 add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp)
 target_compile_features(sharded_map_test PUBLIC cxx_std_17)
-target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR})
+target_compile_options(sharded_map_test PUBLIC "-O3")
 target_link_libraries(sharded_map_test gtest gtest_main Folly::folly)
 
-add_executable(store_value_utils_test ${CMAKE_CURRENT_SOURCE_DIR}/store_value_utils_test.cpp)
-target_compile_features(store_value_utils_test PUBLIC cxx_std_17)
-target_include_directories(store_value_utils_test PUBLIC ${FBGEMM_SOURCE_DIR})
-target_link_libraries(store_value_utils_test gtest gtest_main Folly::folly)
\ No newline at end of file
+add_executable(feature_evict_test ${CMAKE_CURRENT_SOURCE_DIR}/feature_evict_test.cpp)
+target_compile_features(feature_evict_test PUBLIC cxx_std_17)
+target_compile_options(feature_evict_test PUBLIC "-O3")
+target_link_libraries(feature_evict_test gtest gtest_main Folly::folly)
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
new file mode 100644
index 0000000000..6ae9631ce8
--- /dev/null
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -0,0 +1,202 @@
+//
+// Created by arron on 2025/5/22.
+// +#include +#include +#include + +#include +#include +#include + +#include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h" + +namespace kv_mem { + +// Zipf分布生成器实现 +// alpha=1.3 → 约90%重复率 +// alpha=1.5 → 约95%重复率 +// alpha=2.0 → 约99%重复率 +class ZipfGenerator { + public: + ZipfGenerator(double alpha, unsigned long n) : alpha_(alpha), n_(n), dist_(0.0, 1.0) { + // 预计算调和数 + c_ = 0.0; + for (unsigned long i = 1; i <= n_; ++i) c_ += 1.0 / std::pow(i, alpha_); + c_ = 1.0 / c_; + } + + template + unsigned long operator()(Generator& gen) { + while (true) { + double u = dist_(gen); + double v = dist_(gen); + unsigned long k = static_cast(std::floor(std::pow(u, -1.0 / (alpha_ - 1.0)))); + if (k > n_) continue; + double T = std::pow((k + 1.0) / k, alpha_ - 1.0); + double accept_prob = (std::pow(k, -alpha_)) / (c_ * v * (T - 1.0) * k / n_); + if (accept_prob >= 1.0 || dist_(gen) < accept_prob) { + return k; + } + } + } + + private: + double alpha_; // 分布参数(>1.0) + unsigned long n_; // 元素总数 + double c_; // 归一化常数 + std::uniform_real_distribution dist_; +}; + +std::vector generateFixedEmbedding(int dimension) { return std::vector(dimension, 1.0); } + +void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) { + const size_t numShards = 1; + size_t block_size = FixedBlockPool::calculate_block_size(dimension); + size_t block_alignment = FixedBlockPool::calculate_block_alignment(); + + const size_t TOTAL_KEYS = 1'000'000; // 1百万个可能的键 + const double ZIPF_ALPHA = 1.5; // 调整这个参数控制热点程度 + + ZipfGenerator zipf(ZIPF_ALPHA, TOTAL_KEYS); + std::random_device rd; + std::mt19937 gen(rd()); + + SynchronizedShardedMap embeddingMap(numShards, + block_size, // block_size + block_alignment, // block_alignment + 8192); // blocks_per_chunk + double insertTime, lookupTime; + { + std::vector fixedEmbedding = generateFixedEmbedding(dimension); + + auto wlmap = embeddingMap.by(0).wlock(); + auto* pool = embeddingMap.pool_by(0); + + auto startInsert = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < numInserts; i++) { + auto id = zipf(gen); + // use mempool + float* block = nullptr; + // First check if the key already exists + auto it = wlmap->find(id); + if (it != wlmap->end()) { + block = it->second; + } else { + // Key doesn't exist, allocate new block and insert. 
+        block = FixedBlockPool::allocate_t(block_size, block_alignment, pool);
+        FixedBlockPool::set_key(block, id);
+        FixedBlockPool::set_score(block, 0);
+        FixedBlockPool::set_used(block, true);
+
+        wlmap->insert({id, block});
+      }
+      FixedBlockPool::update_score(block);
+      auto* data_ptr = FixedBlockPool::data_ptr(block);
+      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
+    }
+    auto endInsert = std::chrono::high_resolution_clock::now();
+    insertTime = std::chrono::duration(endInsert - startInsert).count();
+  }
+
+  std::vector lookEmbedding(dimension);
+  size_t hitCount = 0;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    auto startLookup = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numLookups; i++) {
+      auto id = zipf(gen);
+      auto it = rlock->find(id);
+      if (it != rlock->end()) {
+        hitCount++;
+        const float* data_ptr = FixedBlockPool::data_ptr(it->second);
+        std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
+      }
+    }
+    auto endLookup = std::chrono::high_resolution_clock::now();
+    lookupTime = std::chrono::duration(endLookup - startLookup).count();
+  }
+
+  {
+    size_t score_sum = 0;
+    auto rlock = embeddingMap.by(0).rlock();
+    for (const auto& [key, block] : *rlock) {
+      score_sum += FixedBlockPool::get_score(block);
+    }
+    ASSERT_EQ(score_sum, numInserts);
+  }
+
+  // Walk the chunks to find the keys to evict;
+  // hold a read lock on the map while collecting them
+  std::vector low_keys;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    std::cout << "map num:" << rlock->size() << std::endl;
+    auto* pool = embeddingMap.pool_by(0);
+    FixedBlockPool::get_keys_with_low_score(pool, 1, 0.99, low_keys);
+    std::cout << "low key num:" << low_keys.size() << std::endl;
+  }
+
+  // Take the write lock, erase from the map, and free the pool memory
+  {
+    // Acquire the write lock for map erasure and pool deallocation
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+
+    for (auto& key : low_keys) {
+      // 1. Look up the key and fetch its block pointer from the map
+      auto it = wlock->find(key);
+      if (it != wlock->end()) {
+        float* block = it->second;
+        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
+        // 3. Remove the key-value pair from the map
+        wlock->erase(it);
+      }
+    }
+    std::cout << "after delete, map size:" << wlock->size() << std::endl;
+  }
+
+  // Deletion phase: process in batches of 1000 keys
+  const size_t batch_size = 1000;
+  for (size_t i = 0; i < low_keys.size(); i += batch_size) {
+    auto start = low_keys.begin() + i;
+    auto end = (i + batch_size < low_keys.size()) ?
low_keys.begin() + i + batch_size : low_keys.end();
+    std::vector batch(start, end);
+
+    // Take the write lock and process the current batch
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+
+    for (auto key : batch) {
+      auto it = wlock->find(key);
+      if (it != wlock->end()) {
+        float* block = it->second;
+        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
+        wlock->erase(it);
+      }
+    }
+    std::cout << "after delete, map size:" << wlock->size() << std::endl;
+  }
+
+  std::cout << std::left << std::setw(20) << dimension;
+  std::cout << std::fixed << std::setprecision(2);
+  std::cout << std::setw(20) << insertTime;
+  std::cout << std::setw(20) << lookupTime;
+  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
+  std::cout << std::endl;
+}
+
+int benchmark() {
+  std::vector dimensions = {4};
+  const size_t numInserts = 1'000'000; // 1 million insert
+  const size_t numLookups = 1'000'000; // 1 million find
+
+  std::cout << "======================= mempool ====================================" << std::endl;
+  std::cout << std::left << std::setw(20) << "dim" << std::setw(20) << "insert time (ms)" << std::setw(20) << "find time (ms)" << std::setw(20) << "hit rate (%)" << std::endl;
+  for (int dim : dimensions) {
+    memPoolEmbeddingWithTime(dim, numInserts, numLookups);
+  }
+  return 0;
+}
+TEST(Evict, benchmark) { benchmark(); }
+} // namespace kv_mem
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
index 606819ad8d..8d7767c879 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
@@ -8,7 +8,6 @@
 #include
 
-#include "fixed_block_pool.h"
 namespace kv_mem {
 
 double test_std_vector(size_t vector_size, size_t repeat_count) {
@@ -298,4 +297,89 @@ TEST(FixedBlockPoolTest, CustomUpstreamResource) {
   EXPECT_GT(deallocate_count, 0);
 }
 
+TEST(FixedBlockPool, BasicFunctionality) {
+  constexpr int dim = 4;
+  size_t block_size = FixedBlockPool::calculate_block_size(dim);
+  size_t alignment = FixedBlockPool::calculate_block_alignment();
+
+  // Initialize memory pool
+  FixedBlockPool pool(block_size, alignment, 1024);
+
+  // Test memory allocation
+  auto* block = FixedBlockPool::allocate_t(block_size, alignment, &pool);
+  FixedBlockPool::update_timestamp(block);
+  ASSERT_NE(block, nullptr);
+
+  // Verify metadata header
+  int64_t ts1 = FixedBlockPool::get_score(block);
+  EXPECT_LE(FixedBlockPool::current_timestamp(), ts1);
+
+  // Test data pointer offset
+  float* data = FixedBlockPool::data_ptr(block);
+  ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block),
+            sizeof(FixedBlockPool::MetaHeader));
+
+  // Test timestamp update
+  FixedBlockPool::update_timestamp(block);
+  int64_t ts2 = FixedBlockPool::get_score(block);
+  EXPECT_GE(ts2, ts1); // New timestamp should be greater or equal
+
+  // Test memory deallocation
+  EXPECT_NO_THROW(
+      FixedBlockPool::deallocate_t(block, block_size, alignment, &pool));
+}
+
+TEST(FixedBlockPool, MultiDimensionTest) {
+  // Test memory alignment for different dimensions
+  const std::vector test_dims = {1, 4, 16, 64, 256};
+  for (int dim : test_dims) {
+    size_t block_size = FixedBlockPool::calculate_block_size(dim);
+    size_t alignment = FixedBlockPool::calculate_block_alignment();
+
+    // Verify alignment requirements
+    EXPECT_EQ(alignment % alignof(FixedBlockPool::MetaHeader), 0);
+    EXPECT_EQ(alignment % alignof(float), 0);
+
+    // Verify block size
calculation + const size_t expected_size = + sizeof(FixedBlockPool::MetaHeader) + dim * sizeof(float); + EXPECT_EQ(block_size, expected_size); + } +} + +TEST(FixedBlockPool, TimestampPrecision) { + // Test timestamp precision accuracy + constexpr int test_iterations = 1000; + int64_t prev_ts = FixedBlockPool::current_timestamp(); + + for (int i = 0; i < test_iterations; ++i) { + int64_t curr_ts = FixedBlockPool::current_timestamp(); + EXPECT_GE(curr_ts, + prev_ts); // Timestamps should be monotonically increasing + prev_ts = curr_ts; + } +} + +TEST(FixedBlockPool, DataIntegrity) { + // Test data storage integrity + constexpr int dim = 8; + std::vector src_data(dim, 3.14f); + + size_t block_size = FixedBlockPool::calculate_block_size(dim); + size_t alignment = FixedBlockPool::calculate_block_alignment(); + FixedBlockPool pool(block_size, alignment, 1024); + + // Allocate and write data + auto* block = FixedBlockPool::allocate_t(block_size, alignment, &pool); + auto* data_ptr = FixedBlockPool::data_ptr(block); + std::copy(src_data.begin(), src_data.end(), data_ptr); + + // Verify data consistency + for (int i = 0; i < dim; ++i) { + EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]); + } + + FixedBlockPool::deallocate_t(block, block_size, alignment, &pool); +} + } // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index 6aa9373ac4..f19e1e219a 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -3,8 +3,9 @@ #include #include + #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h" -#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h" +#include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" namespace kv_mem { std::vector generateFixedEmbedding(int dimension) { @@ -69,8 +70,8 @@ void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) { const size_t numShards = 1; - size_t block_size = StoreValueUtils::calculate_block_size(dimension); - size_t block_alignment = StoreValueUtils::calculate_block_alignment(); + size_t block_size = MemPoolUtils::calculate_block_size(dimension); + size_t block_alignment = MemPoolUtils::calculate_block_alignment(); SynchronizedShardedMap embeddingMap( numShards, @@ -87,8 +88,8 @@ void memPoolEmbeddingWithTime(int dimension, auto startInsert = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < numInserts; i++) { auto* block = - StoreValueUtils::allocate(block_size, block_alignment, pool); - auto* data_ptr = StoreValueUtils::data_ptr(block); + MemPoolUtils::allocate(block_size, block_alignment, pool); + auto* data_ptr = MemPoolUtils::data_ptr(block); std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr); wlock->insert_or_assign(i, block); } @@ -107,9 +108,9 @@ void memPoolEmbeddingWithTime(int dimension, auto it = rlock->find(i % numInserts); if (it != rlock->end()) { hitCount++; - const float* data_ptr = StoreValueUtils::data_ptr(it->second); + const float* data_ptr = MemPoolUtils::data_ptr(it->second); // update timestamp - StoreValueUtils::update_timestamp(it->second); + FixedBlockPool::update_timestamp(it->second); std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data()); } } From a2cab1b59b495af7df87301643eb9a18a2b2d648 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 27 May 2025 10:11:47 +0800 Subject: [PATCH 05/12] FeatureEvict --- 
 .../dram_kv_embedding_cache/feature_evict.h   | 199 +++++++++++++++
 .../fixed_block_pool.h                        | 159 +++++-------
 .../feature_evict_test.cpp                    | 237 +++++-------------
 .../fixed_block_pool_test.cpp                 |  14 +-
 .../sharded_map_test.cpp                      |  11 +-
 .../store_value_utils_test.cpp                |  86 -------
 6 files changed, 329 insertions(+), 377 deletions(-)
 create mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
 delete mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
new file mode 100644
index 0000000000..c531d60966
--- /dev/null
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
@@ -0,0 +1,199 @@
+//
+// Created by root on 25-5-26.
+//
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include <folly/Unit.h>
+#include <folly/executors/CPUThreadPoolExecutor.h>
+#include <folly/futures/Future.h>
+
+#include "SynchronizedShardedMap.h"
+
+namespace kv_mem {
+
+class FeatureEvictBase {
+ public:
+  FeatureEvictBase(folly::CPUThreadPoolExecutor* executor,
+                   SynchronizedShardedMap<int64_t, float*>& kv_store)
+      : executor_(executor),
+        kv_store_(kv_store),
+        evict_flag_(false),
+        evict_interrupt_(false),
+        num_shards_(kv_store.getNumShards()) {
+    init_shard_status();
+    // evict_flag_ indicates whether a task is in progress
+    // evict_interrupt_ indicates whether the task has been interrupted
+  }
+
+  virtual ~FeatureEvictBase() {
+    // On destruction, wait for running tasks to finish
+    wait_completion();  // wait for all asynchronous tasks to complete
+  };
+
+  // Trigger asynchronous eviction
+  // If a task is already running, return immediately to prevent repeated triggers
+  // Otherwise, initialize the task state
+  void trigger_evict() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (evict_flag_.exchange(true)) return;
+    prepare_evict();
+  }
+
+  // Resume task execution; returns true if a task is in progress, false otherwise
+  bool resume() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!evict_flag_.load()) return false;
+    evict_interrupt_.store(false);
+    for (int shard_id = 0; shard_id < num_shards_; ++shard_id) {
+      submit_shard_task(shard_id);
+    }
+    return true;
+  };
+
+  // Pause the eviction process; returns true if a task is in progress, false otherwise
+  // While paused, check whether the eviction has completed
+  bool pause() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!evict_flag_.load()) return false;
+    evict_interrupt_.store(true);
+    check_and_reset_evict_flag();
+    wait_completion();
+    return true;
+  }
+
+  // Check whether eviction is in progress
+  bool is_evicting() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    check_and_reset_evict_flag();
+    return evict_flag_.load();
+  }
+
+ protected:
+  void init_shard_status() {
+    block_cursors_.resize(num_shards_);
+    block_nums_snapshot_.resize(num_shards_);
+    shards_finished_.clear();
+    for (int i = 0; i < num_shards_; ++i) {
+      block_cursors_[i] = 0;
+      block_nums_snapshot_[i] = 0;
+      shards_finished_.emplace_back(std::make_unique<std::atomic<bool>>(false));
+    }
+  }
+
+  // Initialize per-shard state
+  void prepare_evict() {
+    for (int shard_id = 0; shard_id < num_shards_; ++shard_id) {
+      auto rlmap = kv_store_.by(shard_id).rlock();
+      auto* mempool = kv_store_.pool_by(shard_id);
+      block_nums_snapshot_[shard_id] =
+          mempool->get_chunks().size() * mempool->get_blocks_per_chunk();
+      block_cursors_[shard_id] = 0;
+      shards_finished_[shard_id]->store(false);
+    }
+  }
+
+  void submit_shard_task(int shard_id) {
+    if (shards_finished_[shard_id]->load()) return;
+    futures_.emplace_back(folly::via(executor_).thenValue(
+        [this, shard_id](auto&&) { process_shard(shard_id); }));
+  }
+
+  void process_shard(int shard_id) {
+    auto wlock = kv_store_.by(shard_id).wlock();
+    auto* pool = kv_store_.pool_by(shard_id);
+    while (!evict_interrupt_.load() &&
+           block_cursors_[shard_id] < block_nums_snapshot_[shard_id]) {
+      auto* block = pool->get_block<float>(block_cursors_[shard_id]++);
+      if (block && evict_block(block)) {
+        int64_t key = FixedBlockPool::get_key(block);
+        auto it = wlock->find(key);
+        if (it != wlock->end() && block == it->second) {
+          wlock->erase(key);
+          pool->deallocate_t(block);
+        }
+      }
+    }
+
+    // Check whether the loop ended normally
+    if (block_cursors_[shard_id] >= block_nums_snapshot_[shard_id]) {
+      shards_finished_[shard_id]->store(true);
+    }
+  }
+
+  virtual bool evict_block(float* block) = 0;
+
+  void wait_completion() {
+    folly::collectAll(futures_).wait();
+    futures_.clear();
+  }
+
+  // Check and reset
+  void check_and_reset_evict_flag() {
+    bool all_finished = true;
+    for (int i = 0; i < num_shards_; ++i) {
+      if (!shards_finished_[i]->load()) all_finished = false;
+    }
+    if (all_finished) evict_flag_.store(false);
+  }
+
+  folly::CPUThreadPoolExecutor* executor_;  // thread pool
+  SynchronizedShardedMap<int64_t, float*>& kv_store_;  // shard map
+  std::vector<std::size_t> block_cursors_;  // index of processed blocks
+  std::vector<std::size_t> block_nums_snapshot_;  // total number of blocks recorded when eviction is triggered
+  std::vector<std::unique_ptr<std::atomic<bool>>>
+      shards_finished_;  // per-shard completion flags
+  std::atomic<bool> evict_flag_;  // whether an eviction task is in progress
+  std::atomic<bool> evict_interrupt_;  // whether the eviction task is paused
+  std::vector<folly::Future<folly::Unit>> futures_;  // per-shard task records
+  std::mutex mutex_;  // interface lock keeping the public interface thread-safe
+  int num_shards_;  // number of concurrent tasks
+};
+
+class CounterBasedEvict : public FeatureEvictBase {
+ public:
+  CounterBasedEvict(folly::CPUThreadPoolExecutor* executor,
+                    SynchronizedShardedMap<int64_t, float*>& kv_store,
+                    float decay_rate,
+                    int threshold)
+      : FeatureEvictBase(executor, kv_store),
+        decay_rate_(decay_rate),
+        threshold_(threshold) {}
+
+ protected:
+  bool evict_block(float* block) override {
+    // Apply decay and check the threshold
+    auto current_count = FixedBlockPool::get_count(block);
+    current_count *= decay_rate_;
+    FixedBlockPool::set_count(block, current_count);
+    return current_count < threshold_;
+  }
+
+ private:
+  float decay_rate_;
+  uint32_t threshold_;
+};
+
+class TimeBasedEvict : public FeatureEvictBase {
+ public:
+  TimeBasedEvict(folly::CPUThreadPoolExecutor* executor,
+                 SynchronizedShardedMap<int64_t, float*>& kv_store,
+                 uint32_t ttl)
+      : FeatureEvictBase(executor, kv_store), ttl_(ttl) {}
+
+ protected:
+  bool evict_block(float* block) override {
+    auto current_time = FixedBlockPool::current_timestamp();
+    return current_time - FixedBlockPool::get_timestamp(block) > ttl_;
+  }
+
+ private:
+  uint32_t ttl_;
+};
+}  // namespace kv_mem
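The public interface above implies a cooperative driving pattern: trigger once, then alternate resume()/pause() so that the shard write locks are held only while the caller is otherwise idle. A minimal sketch of such a driver, assuming an already-constructed evictor; the helper and variable names here are illustrative, not part of the patch:

    #include <chrono>
    #include <thread>
    #include "feature_evict.h"

    // Hypothetical helper: drain one eviction pass between training batches.
    void drain_between_batches(kv_mem::CounterBasedEvict& evictor) {
      evictor.trigger_evict();  // returns immediately if a pass is already running
      while (evictor.is_evicting()) {
        evictor.resume();       // fan the per-shard tasks out to the executor
        std::this_thread::sleep_for(std::chrono::milliseconds(1));  // foreground work would go here
        evictor.pause();        // interrupt the tasks and release the shard write locks
      }
    }

The same interleaving appears in the tests later in this series, so the sketch mirrors the intended use rather than inventing one.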
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index bcc1e36fca..0ffb9e40e5 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -9,6 +9,8 @@
 #include <memory_resource>
 
 namespace kv_mem {
+static constexpr uint32_t kMaxInt31Counter = 2147483647;
+
 class FixedBlockPool : public std::pmr::memory_resource {
  public:
   // Chunk metadata
@@ -20,54 +22,41 @@
 
   // Metadata structure (publicly accessible)
   // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small.
-  struct alignas(8) MetaHeader {
-    uint64_t key;   // 8 bytes
-    int32_t score;  // 4 bytes
-    bool used;      // 1 byte
+  // Metadata structure (publicly accessible)
+  struct alignas(8) MetaHeader {  // 16 bytes
+    int64_t key;          // feature key, 8 bytes
+    uint32_t timestamp;   // 4 bytes, in seconds; uint32 covers a range of over 120 years
+    uint32_t count : 31;  // only 31 bits are used; max value is 2147483647
+    bool used : 1;        // marks whether this block is in use, for memory-pool traversal
+    // Can be extended with other fields: uint32_t click, etc.
   };
 
   // Metadata operations
   // Key operations
-  static uint64_t get_key(const void* block) {
-    return reinterpret_cast<const MetaHeader*>(block)->key;
-  }
-  static void set_key(void* block, uint64_t key) {
-    reinterpret_cast<MetaHeader*>(block)->key = key;
-  }
+  static uint64_t get_key(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->key; }
+  static void set_key(void* block, uint64_t key) { reinterpret_cast<MetaHeader*>(block)->key = key; }
 
   // used operations
-  static bool get_used(const void* block) {
-    return reinterpret_cast<const MetaHeader*>(block)->used;
-  }
-  static void set_used(void* block, bool used) {
-    reinterpret_cast<MetaHeader*>(block)->used = used;
-  }
+  static bool get_used(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->used; }
+  static void set_used(void* block, bool used) { reinterpret_cast<MetaHeader*>(block)->used = used; }
 
   // Score operations
-  static int32_t get_score(const void* block) {
-    return reinterpret_cast<const MetaHeader*>(block)->score;
-  }
-  static void set_score(void* block, int32_t score) {
-    reinterpret_cast<MetaHeader*>(block)->score = score;
-  }
-  static void update_score(void* block) {
-    auto& score = reinterpret_cast<MetaHeader*>(block)->score;
+  static uint32_t get_count(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->count; }
+  static void set_count(void* block, uint32_t count) { reinterpret_cast<MetaHeader*>(block)->count = count; }
+  static void update_count(void* block) {
     // Avoid counter overflow
-    if (score < std::numeric_limits<int32_t>::max()) {
-      score++;
+    if (reinterpret_cast<MetaHeader*>(block)->count < kMaxInt31Counter) {
+      reinterpret_cast<MetaHeader*>(block)->count++;
     }
   }
 
   // timestamp operations
-  static void update_timestamp(void* block) {
-    reinterpret_cast<MetaHeader*>(block)->score = current_timestamp();
-  }
-  static int32_t current_timestamp() {
-    auto stamp = std::chrono::duration_cast<std::chrono::seconds>(
-                     std::chrono::system_clock::now().time_since_epoch())
-                     .count();
-    return static_cast<int32_t>(stamp);
+  static uint32_t get_timestamp(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->timestamp; }
+  static void update_timestamp(void* block) { reinterpret_cast<MetaHeader*>(block)->timestamp = current_timestamp(); }
+  static uint32_t current_timestamp() {
+    // std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+    // facebook::WallClockUtil::NowInUsecFast();
+    return std::time(nullptr);
   }
 
   // Type-dependent
   // Calculate storage size
   template <typename scalar_t>
   static size_t calculate_block_size(size_t dimension) {
@@ -86,60 +75,29 @@
   // Data pointer retrieval
   template <typename scalar_t>
   static scalar_t* data_ptr(scalar_t* block) {
-    return reinterpret_cast<scalar_t*>(reinterpret_cast<char*>(block) +
-                                       sizeof(FixedBlockPool::MetaHeader));
+    return reinterpret_cast<scalar_t*>(reinterpret_cast<char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
   template <typename scalar_t>
   static const scalar_t* data_ptr(const scalar_t* block) {
-    return reinterpret_cast<const scalar_t*>(
-        reinterpret_cast<const char*>(block) +
-        sizeof(FixedBlockPool::MetaHeader));
-  }
-
-  // Create memory block with metadata
-  template <typename scalar_t>
-  static scalar_t* allocate_t(size_t& block_size,
-                              size_t& alignment,
-                              FixedBlockPool* pool) {
-    auto* block =
-        reinterpret_cast<scalar_t*>(pool->allocate(block_size, alignment));
-    return block;
+    return reinterpret_cast<const scalar_t*>(reinterpret_cast<const char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
-  // Destroy memory block
-  template <typename scalar_t>
-  static void deallocate_t(scalar_t* block,
-                           size_t& block_size,
-                           size_t& alignment,
-                           FixedBlockPool* pool) {
-    pool->deallocate(block, block_size, alignment);
-  }
-
-  // Usage example
-  template <typename scalar_t>
-  static void get_keys_with_low_score(FixedBlockPool* pool,
-                                      int32_t threshold,
-                                      float decay,
-                                      std::vector<int64_t>& result) {
-    pool->for_each_block([&decay, &threshold, &result](void* block) {
-      if (FixedBlockPool::get_used(block)) {
-        auto score = FixedBlockPool::get_score(static_cast<scalar_t*>(block));
-        score = score * decay;
-        FixedBlockPool::set_score(static_cast<scalar_t*>(block), score);
-        if (score < threshold) {
-          result.push_back(
-              FixedBlockPool::get_key(static_cast<scalar_t*>(block)));
-        }
-      }
-    });
-  }
+  template <typename scalar_t>
+  scalar_t* get_block(size_t index) {
+    char* current_chunk = static_cast<char*>(chunks_[index / blocks_per_chunk_].ptr);
+    char* block = current_chunk + block_size_ * (index % blocks_per_chunk_);
+    if (FixedBlockPool::get_used(block)) {
+      return reinterpret_cast<scalar_t*>(block);
+    } else {
+      return nullptr;
+    }
+  };
 
-  explicit FixedBlockPool(
-      std::size_t block_size,  // Size of each memory block
-      std::size_t block_alignment,  // Memory block alignment requirement
-      std::size_t blocks_per_chunk = 8192,  // Number of blocks per chunk
-      std::pmr::memory_resource* upstream = std::pmr::new_delete_resource())
+  explicit FixedBlockPool(std::size_t block_size,  // Size of each memory block
+                          std::size_t block_alignment,  // Memory block alignment requirement
+                          std::size_t blocks_per_chunk = 8192,  // Number of blocks per chunk
+                          std::pmr::memory_resource* upstream = std::pmr::new_delete_resource())
       // Minimum block size is 8 bytes
       : block_size_(std::max(block_size, sizeof(void*))),
         block_alignment_(block_alignment),
@@ -179,21 +137,23 @@
     }
   }
 
-  // New interface for retrieving chunk information
-  [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
+  // Create memory block with metadata
+  template <typename scalar_t>
+  scalar_t* allocate_t() {
+    return reinterpret_cast<scalar_t*>(this->allocate(block_size_, block_alignment_));
+  }
 
-  // New interface for iterating over all blocks
-  template <typename Func>
-  void for_each_block(Func&& func) const {
-    for (const auto& chunk : chunks_) {
-      char* current = static_cast<char*>(chunk.ptr);
-      for (size_t i = 0; i < blocks_per_chunk_; ++i) {
-        func(current);
-        current += block_size_;
-      }
-    }
+  // Destroy memory block
+  template <typename scalar_t>
+  void deallocate_t(scalar_t* block) {
+    this->deallocate(block, block_size_, block_alignment_);
   }
 
+  [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
+  [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; }
+  [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; }
+  [[nodiscard]] std::size_t get_blocks_per_chunk() const noexcept { return blocks_per_chunk_; }
+
  protected:
   // Core allocation function
   void* do_allocate(std::size_t bytes, std::size_t alignment) override {
@@ -215,9 +175,7 @@
   }
 
   // Core deallocation function
-  void do_deallocate(void* p,
-                     [[maybe_unused]] std::size_t bytes,
-                     [[maybe_unused]] std::size_t alignment) override {
+  void do_deallocate(void* p, [[maybe_unused]] std::size_t bytes, [[maybe_unused]] std::size_t alignment) override {
     // Insert memory block back to the head of free list
     *static_cast<void**>(p) = free_list_;
     free_list_ = p;
@@ -225,10 +183,7 @@
   }
 
   // Resource equality comparison (only the same object is equal)
-  [[nodiscard]] bool do_is_equal(
-      const std::pmr::memory_resource& other) const noexcept override {
-    return this == &other;
-  }
+  [[nodiscard]] bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { return this == &other; }
 
  private:
   // Allocate a new memory chunk
@@ -238,9 +193,6 @@
 
     // Allocate aligned memory through upstream resource
     void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_);
 
-    // Block used flag set false.
-    FixedBlockPool::set_used(chunk_ptr, false);
-
     // Record chunk information for later release
     chunks_.push_back({chunk_ptr, chunk_size, block_alignment_});
 
@@ -250,12 +202,13 @@
     for (std::size_t i = 0; i < blocks_per_chunk_; ++i) {
       current -= block_size_;
       *reinterpret_cast<void**>(current) = free_list_;
+      FixedBlockPool::set_used(current, false);
       free_list_ = current;
     }
   }
 
   // Member variables
-  const std::size_t block_size_;  // Block size (not less than pointer size)
+  const std::size_t block_size_;       // Block size (not less than pointer size)
   const std::size_t block_alignment_;  // Block alignment requirement
   const std::size_t blocks_per_chunk_;  // Number of blocks per chunk
   std::pmr::memory_resource* upstream_;  // Upstream memory resource
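Since calculate_block_size() adds sizeof(MetaHeader) to the payload, the 16-byte figure in the struct comment is load-bearing. A couple of compile-time checks make that assumption explicit; this is an illustrative aside assuming the usual Itanium-ABI bit-field packing (GCC/Clang), not code from the patch:

    #include "fixed_block_pool.h"

    // key (8 B) + timestamp (4 B) + count:31/used:1 packed into one 4-byte unit
    static_assert(sizeof(kv_mem::FixedBlockPool::MetaHeader) == 16,
                  "header layout changed; recheck calculate_block_size()");
    static_assert(alignof(kv_mem::FixedBlockPool::MetaHeader) == 8,
                  "alignas(8) keeps every block at least pointer-sized");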
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
index 6ae9631ce8..464ed6294f 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -1,202 +1,91 @@
 //
 // Created by arron on 2025/5/22.
 //
+#include "fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h"
+
 #include <gtest/gtest.h>
 #include <chrono>
-#include <cmath>
 #include <iostream>
-#include <random>
 #include <thread>
 #include <vector>
 
 #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h"
 
 namespace kv_mem {
-
-// Zipf distribution generator
-// alpha=1.3 → ~90% repetition rate
-// alpha=1.5 → ~95% repetition rate
-// alpha=2.0 → ~99% repetition rate
-class ZipfGenerator {
- public:
-  ZipfGenerator(double alpha, unsigned long n) : alpha_(alpha), n_(n), dist_(0.0, 1.0) {
-    // Precompute the harmonic normalization constant
-    c_ = 0.0;
-    for (unsigned long i = 1; i <= n_; ++i) c_ += 1.0 / std::pow(i, alpha_);
-    c_ = 1.0 / c_;
-  }
-
-  template <typename Generator>
-  unsigned long operator()(Generator& gen) {
-    while (true) {
-      double u = dist_(gen);
-      double v = dist_(gen);
-      unsigned long k = static_cast<unsigned long>(std::floor(std::pow(u, -1.0 / (alpha_ - 1.0))));
-      if (k > n_) continue;
-      double T = std::pow((k + 1.0) / k, alpha_ - 1.0);
-      double accept_prob = (std::pow(k, -alpha_)) / (c_ * v * (T - 1.0) * k / n_);
-      if (accept_prob >= 1.0 || dist_(gen) < accept_prob) {
-        return k;
-      }
+class FeatureEvictTest : public ::testing::Test {
+ protected:
+  static constexpr int NUM_SHARDS = 4;
+  static constexpr int DIMENSION = 128;
+  size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size<float>(DIMENSION);
+  size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment<float>();
+
+  void SetUp() override {
+    executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(4);
+    kv_store_ = std::make_unique<SynchronizedShardedMap<int64_t, float*>>(
+        NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT);
+
+    // Insert test data
+
+    for (int i = 0; i < 1000; ++i) {
+      int shard_id = i % NUM_SHARDS;
+      auto wlock = kv_store_->by(shard_id).wlock();
+      auto* pool = kv_store_->pool_by(shard_id);
+      float* block = pool->allocate_t<float>();
+      FixedBlockPool::set_key(block, i);
+      FixedBlockPool::set_count(block, 1);  // initial score
+      FixedBlockPool::set_used(block, true);
+      wlock->insert({i, block});
     }
-  }
-
- private:
-  double alpha_;  // distribution parameter (>1.0)
-  unsigned long n_;  // total number of elements
-  double c_;  // normalization constant
-  std::uniform_real_distribution<double> dist_;
-};
-
-std::vector<float> generateFixedEmbedding(int dimension) { return std::vector<float>(dimension, 1.0); }
 
-void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) {
-  const size_t numShards = 1;
-  size_t block_size = FixedBlockPool::calculate_block_size<float>(dimension);
-  size_t block_alignment = FixedBlockPool::calculate_block_alignment<float>();
-
-  const size_t TOTAL_KEYS = 1'000'000;  // one million possible keys
-  const double ZIPF_ALPHA = 1.5;        // tune this parameter to control hot-spot skew
-
-  ZipfGenerator zipf(ZIPF_ALPHA, TOTAL_KEYS);
-  std::random_device rd;
-  std::mt19937 gen(rd());
-
-  SynchronizedShardedMap<int64_t, float*> embeddingMap(numShards,
-                                                       block_size,       // block_size
-                                                       block_alignment,  // block_alignment
-                                                       8192);            // blocks_per_chunk
-  double insertTime, lookupTime;
-  {
-    std::vector<float> fixedEmbedding = generateFixedEmbedding(dimension);
-
-    auto wlmap = embeddingMap.by(0).wlock();
-    auto* pool = embeddingMap.pool_by(0);
-
-    auto startInsert = std::chrono::high_resolution_clock::now();
-    for (size_t i = 0; i < numInserts; i++) {
-      auto id = zipf(gen);
-      // use mempool
-      float* block = nullptr;
-      // First check if the key already exists
-      auto it = wlmap->find(id);
-      if (it != wlmap->end()) {
-        block = it->second;
-      } else {
-        // Key doesn't exist, allocate new block and insert.
-        block = FixedBlockPool::allocate_t<float>(block_size, block_alignment, pool);
-        FixedBlockPool::set_key(block, id);
-        FixedBlockPool::set_score(block, 0);
-        FixedBlockPool::set_used(block, true);
-
-        wlmap->insert({id, block});
-      }
-      FixedBlockPool::update_score(block);
-      auto* data_ptr = FixedBlockPool::data_ptr(block);
-      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
+    for (int i = 1000; i < 2000; ++i) {
+      int shard_id = i % NUM_SHARDS;
+      auto wlock = kv_store_->by(shard_id).wlock();
+      auto* pool = kv_store_->pool_by(shard_id);
+      float* block = pool->allocate_t<float>();
+      FixedBlockPool::set_key(block, i);
+      FixedBlockPool::set_count(block, 2);  // initial score
+      FixedBlockPool::set_used(block, true);
+      wlock->insert({i, block});
     }
-    auto endInsert = std::chrono::high_resolution_clock::now();
-    insertTime = std::chrono::duration<double, std::milli>(endInsert - startInsert).count();
   }
 
-  std::vector<float> lookEmbedding(dimension);
-  size_t hitCount = 0;
-  {
-    auto rlock = embeddingMap.by(0).rlock();
-    auto startLookup = std::chrono::high_resolution_clock::now();
-    for (size_t i = 0; i < numLookups; i++) {
-      auto id = zipf(gen);
-      auto it = rlock->find(id);
-      if (it != rlock->end()) {
-        hitCount++;
-        const float* data_ptr = FixedBlockPool::data_ptr(it->second);
-        std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
-      }
-    }
-    auto endLookup = std::chrono::high_resolution_clock::now();
-    lookupTime = std::chrono::duration<double, std::milli>(endLookup - startLookup).count();
-  }
+  std::unique_ptr<folly::CPUThreadPoolExecutor> executor_;
+  std::unique_ptr<SynchronizedShardedMap<int64_t, float*>> kv_store_;
+};
 
-  {
-    size_t score_sum = 0;
-    auto rlock = embeddingMap.by(0).rlock();
-    for (const auto& [key, block] : *rlock) {
-      score_sum += FixedBlockPool::get_score(block);
-    }
-    ASSERT_EQ(score_sum, numInserts);
-  }
+TEST_F(FeatureEvictTest, BasicEviction) {
+  CounterBasedEvict evictor(executor_.get(), *kv_store_.get(), 0.5f, 1);
 
-  // Traverse the chunks to find the keys to evict
-  // Lock the map and release the resources
-  std::vector<int64_t> low_keys;
-  {
-    auto rlock = embeddingMap.by(0).rlock();
-    std::cout << "map num:" << rlock->size() << std::endl;
-    auto* pool = embeddingMap.pool_by(0);
-    FixedBlockPool::get_keys_with_low_score<float>(pool, 1, 0.99, low_keys);
-    std::cout << "low key num:" << low_keys.size() << std::endl;
+  // Initial check
+  size_t total_blocks = 0;
+  for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
+    auto rlock = kv_store_->by(shard_id).rlock();
+    total_blocks += rlock->size();
   }
+  ASSERT_EQ(total_blocks, 2000);
 
-  // Acquire the write lock, erase from the map, and free pool memory
-  {
-    // Acquire the write lock; erase entries from the map and free the pool memory
-    auto wlock = embeddingMap.by(0).wlock();
-    auto* pool = embeddingMap.pool_by(0);
+  // Run eviction
+  evictor.trigger_evict();
 
-    for (auto& key : low_keys) {
-      // 1. Look up the block pointer for this key in the map
-      auto it = wlock->find(key);
-      if (it != wlock->end()) {
-        float* block = it->second;
-        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
-        // 3. Remove the key-value pair from the map
-        wlock->erase(it);
-      }
-    }
-    std::cout << "after delete, map size:" << wlock->size() << std::endl;
+  // Check the eviction process
+  while (evictor.is_evicting()) {
+    evictor.resume();
+    std::this_thread::sleep_for(std::chrono::microseconds(5));
+    evictor.pause();
   }
 
-  // Deletion phase: process in batches of 1000 keys
-  const size_t batch_size = 1000;
-  for (size_t i = 0; i < low_keys.size(); i += batch_size) {
-    auto start = low_keys.begin() + i;
-    auto end = (i + batch_size < low_keys.size()) ? low_keys.begin() + i + batch_size : low_keys.end();
-    std::vector<int64_t> batch(start, end);
-
-    // Acquire the write lock to process the current batch
-    auto wlock = embeddingMap.by(0).wlock();
-    auto* pool = embeddingMap.pool_by(0);
-
-    for (auto key : batch) {
-      auto it = wlock->find(key);
-      if (it != wlock->end()) {
-        float* block = it->second;
-        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
-        wlock->erase(it);
-      }
+  // Check results
+  size_t remaining = 0;
+  for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
+    auto rlock = kv_store_->by(shard_id).rlock();
+    remaining += rlock->size();
+    // Check the score decay
+    for (const auto& [key, block] : *rlock) {
+      ASSERT_EQ(FixedBlockPool::get_count(block), 1);
     }
-    std::cout << "after delete, map size:" << wlock->size() << std::endl;
-  }
-
-  std::cout << std::left << std::setw(20) << dimension;
-  std::cout << std::fixed << std::setprecision(2);
-  std::cout << std::setw(20) << insertTime;
-  std::cout << std::setw(20) << lookupTime;
-  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
-  std::cout << std::endl;
-}
-
-int benchmark() {
-  std::vector<int> dimensions = {4};
-  const size_t numInserts = 1'000'000;  // 1 million insert
-  const size_t numLookups = 1'000'000;  // 1 million find
-
-  std::cout << "======================= mempool ====================================" << std::endl;
-  std::cout << std::left << std::setw(20) << "dim" << std::setw(20) << "insert time (ms)" << std::setw(20) << "find time (ms)" << std::setw(20) << "hit rate (%)" << std::endl;
-  for (int dim : dimensions) {
-    memPoolEmbeddingWithTime(dim, numInserts, numLookups);
   }
-  return 0;
+  std::cout << "remaining: " << remaining << std::endl;
+  ASSERT_EQ(remaining, 1000);
 }
-TEST(Evict, benchmark) { benchmark(); }
 }  // namespace kv_mem
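The numbers in BasicEviction follow from integer truncation in CounterBasedEvict::evict_block(): the 31-bit counter is multiplied by a float and the product truncates on the way back into the bit-field. A standalone reproduction of that arithmetic, not patch code:

    #include <cstdint>

    // Mirrors the decay in evict_block(): uint32 *= float truncates.
    uint32_t decay(uint32_t count, float rate) { return count * rate; }

    // decay(1, 0.5f) == 0 -> 0 < 1, evicted  (the 1000 keys inserted with count 1)
    // decay(2, 0.5f) == 1 -> 1 !< 1, kept    (count becomes 1, matching the ASSERT_EQ)

Hence exactly the 1000 count-2 entries survive, each with its counter decayed to 1.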
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
index 8d7767c879..47ef59d2de 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
@@ -306,12 +306,12 @@
   FixedBlockPool pool(block_size, alignment, 1024);
 
   // Test memory allocation
-  auto* block = FixedBlockPool::allocate_t<float>(block_size, alignment, &pool);
+  auto* block = pool.allocate_t<float>();
   FixedBlockPool::update_timestamp(block);
   ASSERT_NE(block, nullptr);
 
   // Verify metadata header
-  int64_t ts1 = FixedBlockPool::get_score(block);
+  int64_t ts1 = FixedBlockPool::get_timestamp(block);
   EXPECT_LE(FixedBlockPool::current_timestamp(), ts1);
 
   // Test data pointer offset
@@ -321,12 +321,11 @@
 
   // Test timestamp update
   FixedBlockPool::update_timestamp(block);
-  int64_t ts2 = FixedBlockPool::get_score(block);
+  int64_t ts2 = FixedBlockPool::get_timestamp(block);
   EXPECT_GE(ts2, ts1);  // New timestamp should be greater or equal
 
   // Test memory deallocation
-  EXPECT_NO_THROW(
-      FixedBlockPool::deallocate_t(block, block_size, alignment, &pool));
+  EXPECT_NO_THROW(pool.deallocate_t(block));
 }
 
 TEST(FixedBlockPool, MultiDimensionTest) {
@@ -370,7 +369,7 @@
   FixedBlockPool pool(block_size, alignment, 1024);
 
   // Allocate and write data
-  auto* block = FixedBlockPool::allocate_t<float>(block_size, alignment, &pool);
+  auto* block = pool.allocate_t<float>();
   auto* data_ptr = FixedBlockPool::data_ptr(block);
   std::copy(src_data.begin(), src_data.end(), data_ptr);
 
@@ -378,8 +377,7 @@
   for (int i = 0; i < dim; ++i) {
     EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]);
   }
-
-  FixedBlockPool::deallocate_t(block, block_size, alignment, &pool);
+  pool.deallocate_t(block);
 }
 }  // namespace kv_mem
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
index f19e1e219a..a2f0dcfb1a 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
@@ -70,8 +70,8 @@ void memPoolEmbeddingWithTime(int dimension,
                               size_t numInserts,
                               size_t numLookups) {
   const size_t numShards = 1;
-  size_t block_size = MemPoolUtils::calculate_block_size(dimension);
-  size_t block_alignment = MemPoolUtils::calculate_block_alignment();
+  size_t block_size = FixedBlockPool::calculate_block_size<float>(dimension);
+  size_t block_alignment = FixedBlockPool::calculate_block_alignment<float>();
 
   SynchronizedShardedMap<int64_t, float*> embeddingMap(
       numShards,
@@ -87,9 +87,8 @@
 
   auto startInsert = std::chrono::high_resolution_clock::now();
   for (size_t i = 0; i < numInserts; i++) {
-    auto* block =
-        MemPoolUtils::allocate(block_size, block_alignment, pool);
-    auto* data_ptr = MemPoolUtils::data_ptr(block);
+    auto* block = pool->allocate_t<float>();
+    auto* data_ptr = FixedBlockPool::data_ptr(block);
     std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
     wlock->insert_or_assign(i, block);
   }
@@ -108,7 +107,7 @@
     auto it = rlock->find(i % numInserts);
     if (it != rlock->end()) {
       hitCount++;
-      const float* data_ptr = MemPoolUtils::data_ptr(it->second);
+      const float* data_ptr = FixedBlockPool::data_ptr(it->second);
       // update timestamp
       FixedBlockPool::update_timestamp(it->second);
       std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp
deleted file mode 100644
index c1506c16e1..0000000000
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h"
-
-#include "gtest/gtest.h"
-namespace kv_mem {
-
-TEST(StoreValueUtils, BasicFunctionality) {
-  constexpr int dim = 4;
-  size_t block_size = StoreValueUtils::calculate_block_size(dim);
-  size_t alignment = StoreValueUtils::calculate_block_alignment();
-
-  // Initialize memory pool
-  FixedBlockPool pool(block_size, alignment, 1024);
-
-  // Test memory allocation
-  float* block = StoreValueUtils::allocate(block_size, alignment, &pool);
-  StoreValueUtils::update_timestamp(block);
-  ASSERT_NE(block, nullptr);
-
-  // Verify metadata header
-  int64_t ts1 = StoreValueUtils::get_timestamp(block);
-  EXPECT_LE(StoreValueUtils::current_timestamp(), ts1);
-
-  // Test data pointer offset
-  float* data = StoreValueUtils::data_ptr(block);
-  ASSERT_EQ(reinterpret_cast<char*>(data) - reinterpret_cast<char*>(block), sizeof(StoreValueUtils::MetaHeader));
-
-  // Test timestamp update
-  StoreValueUtils::update_timestamp(block);
-  int64_t ts2 = StoreValueUtils::get_timestamp(block);
-  EXPECT_GE(ts2, ts1);  // New timestamp should be greater or equal
-
-  // Test memory deallocation
-  EXPECT_NO_THROW(StoreValueUtils::deallocate(block, block_size, alignment, &pool));
-}
-
-TEST(StoreValueUtils, MultiDimensionTest) {
-  // Test memory alignment for different dimensions
-  const std::vector<int> test_dims = {1, 4, 16, 64, 256};
-  for (int dim : test_dims) {
-    size_t block_size = StoreValueUtils::calculate_block_size(dim);
-    size_t alignment = StoreValueUtils::calculate_block_alignment();
-
-    // Verify alignment requirements
-    EXPECT_EQ(alignment % alignof(StoreValueUtils::MetaHeader), 0);
-    EXPECT_EQ(alignment % alignof(float), 0);
-
-    // Verify block size calculation
-    const size_t expected_size = sizeof(StoreValueUtils::MetaHeader) + dim * sizeof(float);
-    EXPECT_EQ(block_size, expected_size);
-  }
-}
-
-TEST(StoreValueUtils, TimestampPrecision) {
-  // Test timestamp precision accuracy
-  constexpr int test_iterations = 1000;
-  int64_t prev_ts = StoreValueUtils::current_timestamp();
-
-  for (int i = 0; i < test_iterations; ++i) {
-    int64_t curr_ts = StoreValueUtils::current_timestamp();
-    EXPECT_GE(curr_ts, prev_ts);  // Timestamps should be monotonically increasing
-    prev_ts = curr_ts;
-  }
-}
-
-TEST(StoreValueUtils, DataIntegrity) {
-  // Test data storage integrity
-  constexpr int dim = 8;
-  std::vector<float> src_data(dim, 3.14f);
-
-  size_t block_size = StoreValueUtils::calculate_block_size(dim);
-  size_t alignment = StoreValueUtils::calculate_block_alignment();
-  FixedBlockPool pool(block_size, alignment, 1024);
-
-  // Allocate and write data
-  float* block = StoreValueUtils::allocate(block_size, alignment, &pool);
-  float* data_ptr = StoreValueUtils::data_ptr(block);
-  std::copy(src_data.begin(), src_data.end(), data_ptr);
-
-  // Verify data consistency
-  for (int i = 0; i < dim; ++i) {
-    EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]);
-  }
-
-  StoreValueUtils::deallocate(block, block_size, alignment, &pool);
-}
-}  // namespace kv_mem
\ No newline at end of file

From 9e842adfd46735b36adb958b8f61e5ded89382cc Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Tue, 27 May 2025 15:14:22 +0800
Subject: [PATCH 06/12] feature evict add fmt log

---
 .../dram_kv_embedding_cache/feature_evict.h   | 125 +++++++++-------
 .../fixed_block_pool.h                        |  26 ++--
 .../dram_kv_embedding_cache/CMakeLists.txt    |  10 +-
 .../feature_evict_test.cpp                    | 135 ++++++++++------
 .../fixed_block_pool_test.cpp                 |  58 +++-----
 .../sharded_map_test.cpp                      |  90 +++++------
 6 files changed, 229 insertions(+), 215 deletions(-)

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
index c531d60966..a7c42b291e 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
@@ -1,6 +1,3 @@
-//
-// Created by root on 25-5-26.
-//
 #pragma once
 
 #include <atomic>
 #include <chrono>
 #include <cstddef>
 #include <memory>
 #include <mutex>
 #include <vector>
 
+#include <fmt/core.h>
+#include <fmt/format.h>
 #include <folly/Unit.h>
 #include <folly/executors/CPUThreadPoolExecutor.h>
 #include <folly/futures/Future.h>
 
 #include "SynchronizedShardedMap.h"
 
 namespace kv_mem {
 
+template <typename weight_type>
 class FeatureEvictBase {
  public:
-  FeatureEvictBase(folly::CPUThreadPoolExecutor* executor,
-                   SynchronizedShardedMap<int64_t, float*>& kv_store)
+  FeatureEvictBase(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap<int64_t, weight_type*>& kv_store)
       : executor_(executor),
         kv_store_(kv_store),
         evict_flag_(false),
         evict_interrupt_(false),
         num_shards_(kv_store.getNumShards()) {
     init_shard_status();
-    // evict_flag_ indicates whether a task is in progress
-    // evict_interrupt_ indicates whether the task has been interrupted
   }
 
   virtual ~FeatureEvictBase() {
-    // On destruction, wait for running tasks to finish
-    wait_completion();  // wait for all asynchronous tasks to complete
+    wait_completion();  // Wait for all asynchronous tasks to complete.
   };
 
-  // Trigger asynchronous eviction
-  // If a task is already running, return immediately to prevent repeated triggers
-  // Otherwise, initialize the task state
+  // Trigger asynchronous eviction.
+  // If there is an ongoing task, return directly to prevent multiple triggers.
+  // If there is no ongoing task, initialize the task state.
   void trigger_evict() {
     std::lock_guard<std::mutex> lock(mutex_);
     if (evict_flag_.exchange(true)) return;
+    fmt::print("Starting new eviction process...\n");
     prepare_evict();
   }
 
-  // Resume task execution; returns true if a task is in progress, false otherwise
+  // Resume task execution. Returns true if there is an ongoing task, false otherwise.
   bool resume() {
     std::lock_guard<std::mutex> lock(mutex_);
     if (!evict_flag_.load()) return false;
@@ -57,8 +54,8 @@
     return true;
   };
 
-  // Pause the eviction process; returns true if a task is in progress, false otherwise
-  // While paused, check whether the eviction has completed
+  // Pause the eviction process. Returns true if there is an ongoing task, false otherwise.
+  // During the pause phase, check whether the eviction is complete.
   bool pause() {
     std::lock_guard<std::mutex> lock(mutex_);
     if (!evict_flag_.load()) return false;
@@ -68,7 +65,7 @@
     return true;
   }
 
-  // Check whether eviction is in progress
+  // Check whether eviction is ongoing.
   bool is_evicting() {
     std::lock_guard<std::mutex> lock(mutex_);
     check_and_reset_evict_flag();
@@ -87,13 +84,12 @@
     }
   }
 
-  // Initialize per-shard state
+  // Initialize shard state.
   void prepare_evict() {
     for (int shard_id = 0; shard_id < num_shards_; ++shard_id) {
       auto rlmap = kv_store_.by(shard_id).rlock();
       auto* mempool = kv_store_.pool_by(shard_id);
-      block_nums_snapshot_[shard_id] =
-          mempool->get_chunks().size() * mempool->get_blocks_per_chunk();
+      block_nums_snapshot_[shard_id] = mempool->get_chunks().size() * mempool->get_blocks_per_chunk();
       block_cursors_[shard_id] = 0;
       shards_finished_[shard_id]->store(false);
     }
@@ -101,40 +97,60 @@
 
   void submit_shard_task(int shard_id) {
     if (shards_finished_[shard_id]->load()) return;
-    futures_.emplace_back(folly::via(executor_).thenValue(
-        [this, shard_id](auto&&) { process_shard(shard_id); }));
+    futures_.emplace_back(folly::via(executor_).thenValue([this, shard_id](auto&&) { process_shard(shard_id); }));
   }
 
   void process_shard(int shard_id) {
+    auto start_time = std::chrono::high_resolution_clock::now();
+    size_t evicted_count = 0;
+    size_t processed_count = 0;
+
     auto wlock = kv_store_.by(shard_id).wlock();
     auto* pool = kv_store_.pool_by(shard_id);
-    while (!evict_interrupt_.load() &&
-           block_cursors_[shard_id] < block_nums_snapshot_[shard_id]) {
-      auto* block = pool->get_block<float>(block_cursors_[shard_id]++);
+
+    while (!evict_interrupt_.load() && block_cursors_[shard_id] < block_nums_snapshot_[shard_id]) {
+      auto* block = pool->template get_block<weight_type>(block_cursors_[shard_id]++);
+      processed_count++;
       if (block && evict_block(block)) {
         int64_t key = FixedBlockPool::get_key(block);
         auto it = wlock->find(key);
         if (it != wlock->end() && block == it->second) {
           wlock->erase(key);
-          pool->deallocate_t(block);
+          pool->template deallocate_t<weight_type>(block);
+          evicted_count++;
         }
       }
     }
 
-    // Check whether the loop ended normally
+    // Check whether the loop ends normally.
     if (block_cursors_[shard_id] >= block_nums_snapshot_[shard_id]) {
       shards_finished_[shard_id]->store(true);
    }
+
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+
+    fmt::print(
+        "Shard {} completed: \n"
+        "  - Time taken: {}ms\n"
+        "  - Total blocks processed: {}\n"
+        "  - Blocks evicted: {}\n"
+        "  - Eviction rate: {:.2f}%\n",
+        shard_id,
+        duration.count(),
+        processed_count,
+        evicted_count,
+        (evicted_count * 100.0f) / processed_count);
   }
 
-  virtual bool evict_block(float* block) = 0;
+  virtual bool evict_block(weight_type* block) = 0;
 
   void wait_completion() {
     folly::collectAll(futures_).wait();
     futures_.clear();
   }
 
-  // Check and reset
+  // Check and reset the eviction flag.
   void check_and_reset_evict_flag() {
     bool all_finished = true;
     for (int i = 0; i < num_shards_; ++i) {
@@ -143,32 +159,30 @@
     if (all_finished) evict_flag_.store(false);
   }
 
-  folly::CPUThreadPoolExecutor* executor_;  // thread pool
-  SynchronizedShardedMap<int64_t, float*>& kv_store_;  // shard map
-  std::vector<std::size_t> block_cursors_;  // index of processed blocks
-  std::vector<std::size_t> block_nums_snapshot_;  // total number of blocks recorded when eviction is triggered
-  std::vector<std::unique_ptr<std::atomic<bool>>>
-      shards_finished_;  // per-shard completion flags
-  std::atomic<bool> evict_flag_;  // whether an eviction task is in progress
-  std::atomic<bool> evict_interrupt_;  // whether the eviction task is paused
-  std::vector<folly::Future<folly::Unit>> futures_;  // per-shard task records
-  std::mutex mutex_;  // interface lock keeping the public interface thread-safe
-  int num_shards_;  // number of concurrent tasks
+  folly::CPUThreadPoolExecutor* executor_;  // Thread pool.
+  SynchronizedShardedMap<int64_t, weight_type*>& kv_store_;  // Sharded map.
+  std::vector<std::size_t> block_cursors_;  // Index of processed blocks.
+  std::vector<std::size_t> block_nums_snapshot_;  // Snapshot of total blocks at eviction trigger.
+  std::vector<std::unique_ptr<std::atomic<bool>>> shards_finished_;  // Flags indicating whether shards are finished.
+  std::atomic<bool> evict_flag_;  // Indicates whether an eviction task is ongoing.
+  std::atomic<bool> evict_interrupt_;  // Indicates whether the eviction task is paused.
+  std::vector<folly::Future<folly::Unit>> futures_;  // Records of shard tasks.
+  std::mutex mutex_;  // Interface lock to ensure thread safety for public methods.
+  int num_shards_;  // Number of concurrent tasks.
 };
 
-class CounterBasedEvict : public FeatureEvictBase {
+template <typename weight_type>
+class CounterBasedEvict : public FeatureEvictBase<weight_type> {
  public:
   CounterBasedEvict(folly::CPUThreadPoolExecutor* executor,
-                    SynchronizedShardedMap<int64_t, float*>& kv_store,
+                    SynchronizedShardedMap<int64_t, weight_type*>& kv_store,
                     float decay_rate,
-                    int threshold)
-      : FeatureEvictBase(executor, kv_store),
-        decay_rate_(decay_rate),
-        threshold_(threshold) {}
+                    uint32_t threshold)
+      : FeatureEvictBase<weight_type>(executor, kv_store), decay_rate_(decay_rate), threshold_(threshold) {}
 
  protected:
-  bool evict_block(float* block) override {
-    // Apply decay and check the threshold
+  bool evict_block(weight_type* block) override {
+    // Apply decay and check the threshold.
     auto current_count = FixedBlockPool::get_count(block);
     current_count *= decay_rate_;
     FixedBlockPool::set_count(block, current_count);
     return current_count < threshold_;
   }
 
  private:
-  float decay_rate_;
-  uint32_t threshold_;
+  float decay_rate_;    // Decay rate for the block count.
+  uint32_t threshold_;  // Threshold for eviction.
 };
 
-class TimeBasedEvict : public FeatureEvictBase {
+template <typename weight_type>
+class TimeBasedEvict : public FeatureEvictBase<weight_type> {
  public:
   TimeBasedEvict(folly::CPUThreadPoolExecutor* executor,
-                 SynchronizedShardedMap<int64_t, float*>& kv_store,
+                 SynchronizedShardedMap<int64_t, weight_type*>& kv_store,
                  uint32_t ttl)
-      : FeatureEvictBase(executor, kv_store), ttl_(ttl) {}
+      : FeatureEvictBase<weight_type>(executor, kv_store), ttl_(ttl) {}
 
  protected:
-  bool evict_block(float* block) override {
+  bool evict_block(weight_type* block) override {
     auto current_time = FixedBlockPool::current_timestamp();
     return current_time - FixedBlockPool::get_timestamp(block) > ttl_;
   }
 
  private:
-  uint32_t ttl_;
+  uint32_t ttl_;  // Time-to-live for eviction.
 };
-}  // namespace kv_mem
+}  // namespace kv_mem
\ No newline at end of file
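With the class template in place, the element type is spelled out at the call site. A hypothetical wiring of both policies for float blocks; the function name, thresholds, and TTL here are illustrative, not part of the patch:

    #include <folly/executors/CPUThreadPoolExecutor.h>
    #include "SynchronizedShardedMap.h"
    #include "feature_evict.h"

    void make_evictors(folly::CPUThreadPoolExecutor* ex,
                       kv_mem::SynchronizedShardedMap<int64_t, float*>& store) {
      // Counter policy: halve each counter per pass, evict once it falls below 1.
      kv_mem::CounterBasedEvict<float> by_count(ex, store, /*decay_rate=*/0.5f, /*threshold=*/1);
      // TTL policy: evict blocks untouched for more than a day (timestamps are in seconds).
      kv_mem::TimeBasedEvict<float> by_age(ex, store, /*ttl=*/24 * 3600);
      by_count.trigger_evict();
      by_count.resume();
    }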
-  // Metadata structure (publicly accessible)
   struct alignas(8) MetaHeader {  // 16 bytes
     int64_t key;          // feature key, 8 bytes
     uint32_t timestamp;   // 4 bytes, in seconds; uint32 covers a range of over 120 years
     uint32_t count : 31;  // only 31 bits are used; max value is 2147483647
     bool used : 1;        // marks whether this block is in use, for memory-pool traversal
     // Can be extended with other fields: uint32_t click, etc.
@@ -53,12 +53,9 @@
   static uint32_t get_timestamp(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->timestamp; }
   static void update_timestamp(void* block) { reinterpret_cast<MetaHeader*>(block)->timestamp = current_timestamp(); }
   static uint32_t current_timestamp() {
-    // std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    // facebook::WallClockUtil::NowInUsecFast();
     return std::time(nullptr);
   }
 
-  // Type-dependent
   // Calculate storage size
   template <typename scalar_t>
   static size_t calculate_block_size(size_t dimension) {
@@ -83,17 +79,6 @@
     return reinterpret_cast<const scalar_t*>(reinterpret_cast<const char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
-  template <typename scalar_t>
-  scalar_t* get_block(size_t index) {
-    char* current_chunk = static_cast<char*>(chunks_[index / blocks_per_chunk_].ptr);
-    char* block = current_chunk + block_size_ * (index % blocks_per_chunk_);
-    if (FixedBlockPool::get_used(block)) {
-      return reinterpret_cast<scalar_t*>(block);
-    } else {
-      return nullptr;
-    }
-  };
-
   explicit FixedBlockPool(std::size_t block_size,  // Size of each memory block
                           std::size_t block_alignment,  // Memory block alignment requirement
                           std::size_t blocks_per_chunk = 8192,  // Number of blocks per chunk
@@ -149,6 +134,17 @@
     this->deallocate(block, block_size_, block_alignment_);
   }
 
+  template <typename scalar_t>
+  scalar_t* get_block(size_t index) {
+    char* current_chunk = static_cast<char*>(chunks_[index / blocks_per_chunk_].ptr);
+    char* block = current_chunk + block_size_ * (index % blocks_per_chunk_);
+    if (FixedBlockPool::get_used(block)) {
+      return reinterpret_cast<scalar_t*>(block);
+    } else {
+      return nullptr;
+    }
+  };
+
   [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
   [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; }
   [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; }
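The relocated get_block() is what lets a shard sweep walk the pool with a single flat cursor: the index splits into a chunk number and an offset inside that chunk, and blocks whose `used` bit is clear come back as nullptr. The same arithmetic in isolation, for illustration only:

    #include <cstddef>

    struct BlockAddr { std::size_t chunk; std::size_t offset; };

    // A flat cursor maps to (cursor / blocks_per_chunk, cursor % blocks_per_chunk).
    BlockAddr locate(std::size_t cursor, std::size_t blocks_per_chunk) {
      return {cursor / blocks_per_chunk, cursor % blocks_per_chunk};
    }
    // e.g. with blocks_per_chunk = 8192, cursor 8195 lands in chunk 1 at offset 3.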
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
index 9bf610f50d..d7566c00d9 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
@@ -8,14 +8,14 @@ include_directories(
 add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp)
 target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17)
 target_compile_options(fixed_block_pool_test PUBLIC "-O3")
-target_link_libraries(fixed_block_pool_test gtest gtest_main)
+target_link_libraries(fixed_block_pool_test gtest gtest_main Folly::folly)
 
 add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp)
 target_compile_features(sharded_map_test PUBLIC cxx_std_17)
 target_compile_options(sharded_map_test PUBLIC "-O3")
 target_link_libraries(sharded_map_test gtest gtest_main Folly::folly)
 
-add_executable(evict_test ${CMAKE_CURRENT_SOURCE_DIR}/evict_test.cpp)
-target_compile_features(evict_test PUBLIC cxx_std_17)
-target_compile_options(evict_test PUBLIC "-O3")
-target_link_libraries(evict_test gtest gtest_main Folly::folly)
\ No newline at end of file
+add_executable(feature_evict_test ${CMAKE_CURRENT_SOURCE_DIR}/feature_evict_test.cpp)
+target_compile_features(feature_evict_test PUBLIC cxx_std_17)
+target_compile_options(feature_evict_test PUBLIC "-O3")
+target_link_libraries(feature_evict_test gtest gtest_main Folly::folly)
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
index 464ed6294f..72dce3093d 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -1,63 +1,51 @@
-//
-// Created by arron on 2025/5/22.
-//
 #include "fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h"
 
 #include <gtest/gtest.h>
 #include <chrono>
 #include <iostream>
+#include <fmt/format.h>
 #include <thread>
 #include <vector>
 
 #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h"
 
 namespace kv_mem {
-class FeatureEvictTest : public ::testing::Test {
- protected:
-  static constexpr int NUM_SHARDS = 4;
-  static constexpr int DIMENSION = 128;
-  size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size<float>(DIMENSION);
-  size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment<float>();
+static constexpr int DIMENSION = 128;
+size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size<float>(DIMENSION);
+size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment<float>();
 
-  void SetUp() override {
-    executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(4);
-    kv_store_ = std::make_unique<SynchronizedShardedMap<int64_t, float*>>(
-        NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT);
+TEST(FeatureEvictTest, BasicEviction) {
+  static constexpr int NUM_SHARDS = 8;
+  auto executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(4);
+  auto kv_store_ = std::make_unique<SynchronizedShardedMap<int64_t, float*>>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT);
 
-    // Insert test data
-
-    for (int i = 0; i < 1000; ++i) {
-      int shard_id = i % NUM_SHARDS;
-      auto wlock = kv_store_->by(shard_id).wlock();
-      auto* pool = kv_store_->pool_by(shard_id);
-      float* block = pool->allocate_t<float>();
-      FixedBlockPool::set_key(block, i);
-      FixedBlockPool::set_count(block, 1);  // initial score
-      FixedBlockPool::set_used(block, true);
-      wlock->insert({i, block});
+  // Insert test data
+  for (int i = 0; i < 1000; ++i) {
+    int shard_id = i % NUM_SHARDS;
+    auto wlock = kv_store_->by(shard_id).wlock();
+    auto* pool = kv_store_->pool_by(shard_id);
+    auto* block = pool->allocate_t<float>();
+    FixedBlockPool::set_key(block, i);
+    FixedBlockPool::set_count(block, 1);  // Initial score
+    FixedBlockPool::set_used(block, true);
+    wlock->insert({i, block});
   }
 
-    for (int i = 1000; i < 2000; ++i) {
-      int shard_id = i % NUM_SHARDS;
-      auto wlock = kv_store_->by(shard_id).wlock();
-      auto* pool = kv_store_->pool_by(shard_id);
-      float* block = pool->allocate_t<float>();
-      FixedBlockPool::set_key(block, i);
-      FixedBlockPool::set_count(block, 2);  // initial score
-      FixedBlockPool::set_used(block, true);
-      wlock->insert({i, block});
-    }
+  for (int i = 1000; i < 2000; ++i) {
+    int shard_id = i % NUM_SHARDS;
+    auto wlock = kv_store_->by(shard_id).wlock();
+    auto* pool = kv_store_->pool_by(shard_id);
+    auto* block = pool->allocate_t<float>();
+    FixedBlockPool::set_key(block, i);
+    FixedBlockPool::set_count(block, 2);  // Initial score
+    FixedBlockPool::set_used(block, true);
+    wlock->insert({i, block});
   }
 
-  std::unique_ptr<folly::CPUThreadPoolExecutor> executor_;
-  std::unique_ptr<SynchronizedShardedMap<int64_t, float*>> kv_store_;
-};
+  CounterBasedEvict<float> evictor(executor_.get(), *kv_store_.get(), 0.5f, 1);
 
-TEST_F(FeatureEvictTest, BasicEviction) {
-  CounterBasedEvict evictor(executor_.get(), *kv_store_.get(), 0.5f, 1);
-
-  // Initial check
+  // Initial validation
   size_t total_blocks = 0;
   for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
     auto rlock = kv_store_->by(shard_id).rlock();
@@ -65,22 +53,22 @@
   }
   ASSERT_EQ(total_blocks, 2000);
 
-  // Run eviction
+  // Perform eviction
   evictor.trigger_evict();
 
-  // Check the eviction process
+  // Validate eviction process
   while (evictor.is_evicting()) {
     evictor.resume();
     std::this_thread::sleep_for(std::chrono::microseconds(5));
    evictor.pause();
   }
 
-  // Check results
+  // Validate results
   size_t remaining = 0;
   for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
     auto rlock = kv_store_->by(shard_id).rlock();
     remaining += rlock->size();
-    // Check the score decay
+    // Validate score decay
     for (const auto& [key, block] : *rlock) {
       ASSERT_EQ(FixedBlockPool::get_count(block), 1);
     }
@@ -88,4 +76,59 @@
   std::cout << "remaining: " << remaining << std::endl;
   ASSERT_EQ(remaining, 1000);
 }
+
+TEST(FeatureEvictTest, PerformanceTest) {
+  static constexpr int NUM_SHARDS = 1;
+  // Test configurations
+  const std::vector<size_t> test_sizes = {100'000, 500'000, 1'000'000, 5'000'000, 10'000'000};
+
+  fmt::print("\nPerformance Test Results:\n");
+  fmt::print("{:<15} {:<15} {:<15}\n", "Size", "Time(ms)", "Evict rate");
+  fmt::print("{:-<45}\n", "");  // separator line
+
+  for (const auto& size : test_sizes) {
+    // Create executor and store for each test size
+    auto executor = std::make_unique<folly::CPUThreadPoolExecutor>(8);
+    auto kv_store =
+        std::make_unique<SynchronizedShardedMap<int64_t, float*>>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT, 1000);
+
+    // Insert test data with different initial scores
+    for (int i = 0; i < size; ++i) {
+      int shard_id = i % NUM_SHARDS;
+      auto wlock = kv_store->by(shard_id).wlock();
+      auto* pool = kv_store->pool_by(shard_id);
+      auto* block = pool->allocate_t<float>();
+      FixedBlockPool::set_key(block, i);
+      FixedBlockPool::set_count(block, (i % 2) ? 1 : 2);  // Alternate between scores
+      FixedBlockPool::set_used(block, true);
+      wlock->insert({i, block});
+    }
+
+    // Measure eviction time
+    std::vector<double> execution_times;
+    CounterBasedEvict<float> evictor(executor.get(), *kv_store.get(), 0.5f, 1);
+
+    auto start_time = std::chrono::high_resolution_clock::now();
+
+    // Perform eviction
+    evictor.trigger_evict();
+    evictor.resume();
+    while (evictor.is_evicting()) {
+      std::this_thread::sleep_for(std::chrono::microseconds(1));
+    }
+
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+
+    std::size_t current_size = 0;
+    for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
+      auto wlock = kv_store->by(shard_id).wlock();
+      current_size += wlock->size();
+    }
+    double eviction_rate = static_cast<double>(size - current_size) / static_cast<double>(size);
+
+    // Print results
+    fmt::print("{:<15d} {:<15d} {:<15.2f}\n", size, duration, eviction_rate);
+  }
+}
+}  // namespace kv_mem
\ No newline at end of file
between memory pool and " - "native vector allocation for 10 million " - "times ======" - << std::endl; + fmt::print( + "====== Testing performance difference between memory pool and " + "native vector allocation for 10 million times ======\n"); // Vector sizes to test (in number of float elements) std::vector vector_sizes = {4, 8, 16, 32, 64, 128, 256}; @@ -73,33 +72,24 @@ void benchmark_memory_allocators() { const size_t repeat_count = 10'000'000; for (const auto& size : vector_sizes) { - std::cout << "Vector size: " << size << " floats (" - << (size * sizeof(float)) << " bytes)" << std::endl; - + fmt::print("Vector size: {} floats ({} bytes)\n", size, size * sizeof(float)); // Testing standard vector double std_time = test_std_vector(size, repeat_count); - std::cout << " Standard vector: " << std::fixed << std::setprecision(2) - << std_time << " ms" << std::endl; + fmt::print(" Standard vector: {:.2f} ms\n", std_time); // Testing memory pool double pool_time = test_pool_vector(size, repeat_count); - std::cout << " Memory pool: " << std::fixed << std::setprecision(2) - << pool_time << " ms" << std::endl; + fmt::print(" Memory pool: {:.2f} ms\n", pool_time); // Calculate speed improvement double speedup = std_time / pool_time; - std::cout << " Speed improvement: " << std::fixed << std::setprecision(2) - << speedup << "x" << std::endl; - - std::cout << std::endl; - std::cout << "============================" << std::endl; + fmt::print(" Speed improvement: {:.2f}x\n\n", speedup); + fmt::print("============================\n"); } } // Basic functionality test: Integer keys -TEST(FixedBlockPoolTest, benchmark_memory_allocators) { - benchmark_memory_allocators(); -} +TEST(FixedBlockPoolTest, benchmark_memory_allocators) { benchmark_memory_allocators(); } // Test constructor normal case TEST(FixedBlockPoolTest, ConstructorNormal) { @@ -208,14 +198,10 @@ TEST(FixedBlockPoolTest, ErrorHandling) { kv_mem::FixedBlockPool pool(block_size, alignment); // Try to allocate memory with incorrect size - EXPECT_THROW( - { [[maybe_unused]] void* p = pool.allocate(block_size * 2, alignment); }, - std::bad_alloc); + EXPECT_THROW({ [[maybe_unused]] void* p = pool.allocate(block_size * 2, alignment); }, std::bad_alloc); // Try to allocate memory with incorrect alignment - EXPECT_THROW( - { [[maybe_unused]] void* p = pool.allocate(block_size, alignment * 2); }, - std::bad_alloc); + EXPECT_THROW({ [[maybe_unused]] void* p = pool.allocate(block_size, alignment * 2); }, std::bad_alloc); } // Test memory reuse after deallocation @@ -250,8 +236,7 @@ TEST(FixedBlockPoolTest, CustomUpstreamResource) { class CountingResource : public std::pmr::memory_resource { public: - CountingResource(int& alloc_count, int& dealloc_count) - : alloc_count_(alloc_count), dealloc_count_(dealloc_count) {} + CountingResource(int& alloc_count, int& dealloc_count) : alloc_count_(alloc_count), dealloc_count_(dealloc_count) {} protected: void* do_allocate(size_t bytes, size_t alignment) override { @@ -264,10 +249,7 @@ TEST(FixedBlockPoolTest, CustomUpstreamResource) { std::pmr::new_delete_resource()->deallocate(p, bytes, alignment); } - bool do_is_equal( - const std::pmr::memory_resource& other) const noexcept override { - return this == &other; - } + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { return this == &other; } private: int& alloc_count_; @@ -316,8 +298,7 @@ TEST(FixedBlockPool, BasicFunctionality) { // Test data pointer offset float* data = FixedBlockPool::data_ptr(block); - 
ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block), - sizeof(FixedBlockPool::MetaHeader)); + ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block), sizeof(FixedBlockPool::MetaHeader)); // Test timestamp update FixedBlockPool::update_timestamp(block); @@ -340,8 +321,7 @@ TEST(FixedBlockPool, MultiDimensionTest) { EXPECT_EQ(alignment % alignof(float), 0); // Verify block size calculation - const size_t expected_size = - sizeof(FixedBlockPool::MetaHeader) + dim * sizeof(float); + const size_t expected_size = sizeof(FixedBlockPool::MetaHeader) + dim * sizeof(float); EXPECT_EQ(block_size, expected_size); } } diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index a2f0dcfb1a..5e4b59e206 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -2,24 +2,23 @@ #include #include +#include +#include #include #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h" #include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" namespace kv_mem { -std::vector generateFixedEmbedding(int dimension) { - return std::vector(dimension, 1.0); -} +std::vector generateFixedEmbedding(int dimension) { return std::vector(dimension, 1.0); } void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) { const size_t numShards = 1; - SynchronizedShardedMap embeddingMap( - numShards, - dimension * sizeof(float), // block_size - alignof(float), // block_alignment - 8192); // blocks_per_chunk + SynchronizedShardedMap embeddingMap(numShards, + dimension * sizeof(float), // block_size + alignof(float), // block_alignment + 8192); // blocks_per_chunk double insertTime, lookupTime; { std::vector fixedEmbedding = generateFixedEmbedding(dimension); @@ -35,9 +34,7 @@ void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) { wlock->insert_or_assign(i, arr); } auto endInsert = std::chrono::high_resolution_clock::now(); - insertTime = - std::chrono::duration(endInsert - startInsert) - .count(); + insertTime = std::chrono::duration(endInsert - startInsert).count(); } std::vector lookEmbedding(dimension); @@ -53,31 +50,25 @@ void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) { } } auto endLookup = std::chrono::high_resolution_clock::now(); - lookupTime = - std::chrono::duration(endLookup - startLookup) - .count(); + lookupTime = std::chrono::duration(endLookup - startLookup).count(); } - std::cout << std::left << std::setw(20) << dimension; - std::cout << std::fixed << std::setprecision(2); - std::cout << std::setw(20) << insertTime; - std::cout << std::setw(20) << lookupTime; - std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups); - std::cout << std::endl; + fmt::print("{:<20}{:<20.2f}{:<20.2f}{:<20.2f}\n", + dimension, + insertTime, + lookupTime, + 100.0 * static_cast(hitCount) / static_cast(numLookups)); } -void memPoolEmbeddingWithTime(int dimension, - size_t numInserts, - size_t numLookups) { +void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) { const size_t numShards = 1; size_t block_size = FixedBlockPool::calculate_block_size(dimension); size_t block_alignment = FixedBlockPool::calculate_block_alignment(); - SynchronizedShardedMap embeddingMap( - numShards, - block_size, // block_size - block_alignment, // block_alignment - 8192); // blocks_per_chunk + SynchronizedShardedMap embeddingMap(numShards, + 
block_size, // block_size
+                                              block_alignment, // block_alignment
+                                              8192); // blocks_per_chunk
   double insertTime, lookupTime;
   {
     std::vector<float> fixedEmbedding = generateFixedEmbedding(dimension);
@@ -93,9 +84,7 @@ void memPoolEmbeddingWithTime(int dimension,
       wlock->insert_or_assign(i, block);
     }
     auto endInsert = std::chrono::high_resolution_clock::now();
-    insertTime =
-        std::chrono::duration<double, std::milli>(endInsert - startInsert)
-            .count();
+    insertTime = std::chrono::duration<double, std::milli>(endInsert - startInsert).count();
   }
 
   std::vector<float> lookEmbedding(dimension);
@@ -114,17 +103,15 @@ void memPoolEmbeddingWithTime(int dimension,
       }
     }
     auto endLookup = std::chrono::high_resolution_clock::now();
-    lookupTime =
-        std::chrono::duration<double, std::milli>(endLookup - startLookup)
-            .count();
+    lookupTime = std::chrono::duration<double, std::milli>(endLookup - startLookup).count();
   }
 
-  std::cout << std::left << std::setw(20) << dimension;
-  std::cout << std::fixed << std::setprecision(2);
-  std::cout << std::setw(20) << insertTime;
-  std::cout << std::setw(20) << lookupTime;
-  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
-  std::cout << std::endl;
+  // print the results with fmt
+  fmt::print("{:<20}{:<20.2f}{:<20.2f}{:<20.2f}\n",
+             dimension,
+             insertTime,
+             lookupTime,
+             100.0 * static_cast<double>(hitCount) / static_cast<double>(numLookups));
 }
 
 int benchmark() {
@@ -132,27 +119,20 @@ int benchmark() {
   const size_t numInserts = 1'000'000; // 1 million insert
   const size_t numLookups = 1'000'000; // 1 million find
 
-  std::cout
-      << "======================= mempool ===================================="
-      << std::endl;
-  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
-            << "insert time (ms)" << std::setw(20) << "find time (ms)"
-            << std::setw(20) << "hit rate (%)" << std::endl;
+  fmt::print("======================= mempool ====================================\n");
+  fmt::print("{:<20}{:<20}{:<20}{:<20}\n", "dim", "insert time (ms)", "find time (ms)", "hit rate (%)");
   for (int dim : dimensions) {
     memPoolEmbedding(dim, numInserts, numLookups);
   }
-  std::cout << std::endl << std ::endl;
-
-  std::cout << "======================= mempool with time "
-               "===================================="
-            << std::endl;
-  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
-            << "insert time (ms)" << std::setw(20) << "find time (ms)"
-            << std::setw(20) << "hit rate (%)" << std::endl;
+  fmt::print("\n\n");
+  std::fflush(stdout);
+
+  fmt::print("======================= mempool with time ====================================\n");
+  fmt::print("{:<20}{:<20}{:<20}{:<20}\n", "dim", "insert time (ms)", "find time (ms)", "hit rate (%)");
   for (int dim : dimensions) {
     memPoolEmbeddingWithTime(dim, numInserts, numLookups);
   }
-  std::cout << std::endl << std ::endl;
+  fmt::print("\n\n");
   return 0;
 }
 TEST(SynchronizedShardedMap, benchmark) { benchmark(); }

From 99c14d090799bd5020c05a465b8f7a68d5a1d2e8 Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Tue, 27 May 2025 19:34:34 +0800
Subject: [PATCH 07/12] QuantUtilsTest add static_cast

---
 test/QuantUtilsTest.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/QuantUtilsTest.cc b/test/QuantUtilsTest.cc
index fdd9af4ebd..6ea7dd12aa 100644
--- a/test/QuantUtilsTest.cc
+++ b/test/QuantUtilsTest.cc
@@ -560,7 +560,7 @@ class EmbeddingQuantizeFixedNumberTest : public testing::TestWithParam {
       1, 1, 1, 1, // All the same. Range: 0, min: 1
       -64, -2.75, 61.625, 191, // Range: 255, min: -64. Picking 61.625 because it differs under FP16 (will become 61.5).
};
-  assert(float_test_input.size() == row * col);
+  assert(float_test_input.size() == static_cast<size_t>(row * col));
 
   float16_test_input.resize(float_test_input.size());
   std::transform(

From ffc0333d44dee192eb27cdb966ad8266cfb9055a Mon Sep 17 00:00:00 2001
From: WP
Date: Tue, 27 May 2025 20:24:00 +0800
Subject: [PATCH 08/12] FeatureEvict supplement

---
 .../dram_kv_embedding_cache.h | 56 +++-
 .../dram_kv_embedding_cache_wrapper.h | 30 +-
 .../dram_kv_embedding_cache/feature_evict.h | 152 +++++++++-
 .../fixed_block_pool.h | 10 +
 .../feature_evict_test.cpp | 259 +++++++++++++++++-
 5 files changed, 480 insertions(+), 27 deletions(-)

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
index 4507e2060f..5ceae13127 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
@@ -15,7 +15,8 @@
 #include "SynchronizedShardedMap.h"
 
 #include "deeplearning/fbgemm/fbgemm_gpu/src/ssd_split_embeddings_cache/initializer.h"
-#include "store_value_utils.h"
+#include "fixed_block_pool.h"
+#include "feature_evict.h"
 
 #include
 #include
@@ -46,6 +47,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
   /// @param max_D the maximum dimension of the embedding tensor
   /// @param uniform_init_lower the lower bound of the uniform distribution
   /// @param uniform_init_upper the upper bound of the uniform distribution
+  /// @param feature_evict_config configuration for feature eviction
   /// @param num_shards number of shards for the kvstore. This is to improve
   /// parallelization. Each key value pair will be sharded into one shard.
   /// @param num_threads num of threads that kvstore needs to be run upon for
@@ -59,6 +61,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
       int64_t max_D,
       double uniform_init_lower,
       double uniform_init_upper,
+      FeatureEvictConfig feature_evict_config,
       int64_t num_shards = 8,
       int64_t num_threads = 32,
       int64_t row_storage_bitwidth = 32,
@@ -68,10 +71,11 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
             max_D,
             0), // l2_cache_size_gb =0 to disable l2 cache
         max_D_(max_D),
+        feature_evict_config_(feature_evict_config),
         num_shards_(num_shards),
         weight_ttl_in_hours_(weight_ttl_in_hours),
-        block_size_(StoreValueUtils::calculate_block_size(max_D)),
-        block_alignment_(StoreValueUtils::calculate_block_alignment()),
+        block_size_(FixedBlockPool::calculate_block_size(max_D)),
+        block_alignment_(FixedBlockPool::calculate_block_alignment()),
         kv_store_(SynchronizedShardedMap<int64_t, weight_type*>(
             num_shards_,
             block_size_,
@@ -86,6 +90,9 @@
         uniform_init_lower,
         uniform_init_upper,
         row_storage_bitwidth);
+    if (feature_evict_config_.trigger_mode != EvictTriggerMode::DISABLED) {
+      feature_evict_ = create_feature_evict(feature_evict_config_, executor_.get(), kv_store_, max_D);
+    }
   }
 
   void initialize_initializers(
@@ -205,12 +212,13 @@
           block = it->second;
         } else {
           // Key doesn't exist, allocate new block and insert.
- block = StoreValueUtils::allocate( - block_size_, block_alignment_, pool); + block = pool->allocate_t(); wlmap->insert({id, block}); } - StoreValueUtils::update_timestamp(block); - auto* data_ptr = StoreValueUtils::data_ptr(block); + if (feature_evict_) { + feature_evict_->update_feature_statistics(block); + } + auto* data_ptr = FixedBlockPool::data_ptr(block); std::copy(weights[id_index] .template data_ptr(), weights[id_index] @@ -295,12 +303,11 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { continue; } // use mempool - const auto* data_ptr = StoreValueUtils::data_ptr(cached_iter->second); - StoreValueUtils::update_timestamp(cached_iter->second); + const auto* data_ptr = FixedBlockPool::data_ptr(cached_iter->second); std::copy( data_ptr, data_ptr + max_D_, - &(weights_data_ptr[index * max_D_])); // dst_start + &(weights_data_ptr[id_index * max_D_])); // dst_start } } }); @@ -322,6 +329,32 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { void compact() override {} + void trigger_feature_evict() { + if (feature_evict_) { + feature_evict_->trigger_evict(); + } + } + + void feature_evict_resume() { + if (feature_evict_) { + feature_evict_->resume(); + } + } + + void feature_evict_pause() { + if (feature_evict_) { + feature_evict_->pause(); + } + } + + void maybe_evict_by_step() { + if (feature_evict_config_.trigger_mode == EvictTriggerMode::ITERATION && + feature_evict_config_.trigger_step_interval > 0 && + ++current_iter_ % feature_evict_config_.trigger_step_interval == 0) { + trigger_feature_evict(); + } + } + private: void fill_from_row_storage( int shard_id, @@ -390,6 +423,9 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { std::atomic_bool is_eviction_ongoing_ = false; std::vector> initializers_; int64_t elem_size_; + FeatureEvictConfig feature_evict_config_; + std::unique_ptr> feature_evict_; + int current_iter_ = 0; }; // class DramKVEmbeddingCache } // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h index 0b915e50ba..9dc1be091b 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h @@ -26,15 +26,34 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { int64_t max_D, double uniform_init_lower, double uniform_init_upper, + int evict_trigger_mode, + int evict_trigger_strategy, + int64_t trigger_step_interval, + uint32_t ttl, + uint32_t count_threshold, + float count_decay_rate, + double l2_weight_threshold, int64_t num_shards = 8, int64_t num_threads = 32, int64_t row_storage_bitwidth = 32, int64_t weight_ttl_in_hours = 2) { + + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.trigger_step_interval = trigger_step_interval; + feature_evict_config.ttl = ttl; + feature_evict_config.count_threshold = count_threshold; + feature_evict_config.count_decay_rate = count_decay_rate; + feature_evict_config.l2_weight_threshold = l2_weight_threshold; + if (row_storage_bitwidth == 16) { impl_ = std::make_shared>( max_D, uniform_init_lower, uniform_init_upper, + feature_evict_config, num_shards, num_threads, row_storage_bitwidth, @@ -44,6 +63,7 @@ class DramKVEmbeddingCacheWrapper : public 
torch::jit::CustomClassHolder { max_D, uniform_init_lower, uniform_init_upper, + feature_evict_config, num_shards, num_threads, row_storage_bitwidth, @@ -67,7 +87,11 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { } void set(at::Tensor indices, at::Tensor weights, at::Tensor count) { - return impl_->set(indices, weights, count); + impl_->feature_evict_pause(); + impl_->set(indices, weights, count); + // when use ITERATION EvictTriggerMode, trigger evict by step + impl_->maybe_evict_by_step(); + impl_->feature_evict_resume(); } void flush() { @@ -86,7 +110,9 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { at::Tensor weights, at::Tensor count, int64_t sleep_ms) { - return impl_->get(indices, weights, count, sleep_ms); + impl_->feature_evict_pause(); + impl_->get(indices, weights, count, sleep_ms); + impl_->feature_evict_resume(); } void wait_util_filling_work_done() { diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h index a7c42b291e..8a384b3f55 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h @@ -17,10 +17,28 @@ namespace kv_mem { +enum class EvictTriggerMode { + DISABLED, // Do not use feature evict + ITERATION, // Trigger based on iteration steps + MANUAL // Manually triggered by upstream +}; + +enum class EvictTriggerStrategy { BY_TIMESTAMP, BY_COUNTER, BY_TIMESTAMP_AND_COUNTER, BY_L2WEIGHT }; + +struct FeatureEvictConfig { + EvictTriggerStrategy trigger_strategy; + EvictTriggerMode trigger_mode; + int64_t trigger_step_interval; + uint32_t ttl; + uint32_t count_threshold; + float count_decay_rate; + double l2_weight_threshold; +}; + template -class FeatureEvictBase { +class FeatureEvict { public: - FeatureEvictBase(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store) + FeatureEvict(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store) : executor_(executor), kv_store_(kv_store), evict_flag_(false), @@ -29,7 +47,7 @@ class FeatureEvictBase { init_shard_status(); } - virtual ~FeatureEvictBase() { + virtual ~FeatureEvict() { wait_completion(); // Wait for all asynchronous tasks to complete. 
}; @@ -39,7 +57,6 @@ class FeatureEvictBase { void trigger_evict() { std::lock_guard lock(mutex_); if (evict_flag_.exchange(true)) return; - fmt::print("Starting new eviction process...\n"); prepare_evict(); } @@ -72,6 +89,8 @@ class FeatureEvictBase { return evict_flag_.load(); } + virtual void update_feature_statistics(weight_type* block) = 0; + protected: void init_shard_status() { block_cursors_.resize(num_shards_); @@ -172,13 +191,15 @@ class FeatureEvictBase { }; template -class CounterBasedEvict : public FeatureEvictBase { +class CounterBasedEvict : public FeatureEvict { public: CounterBasedEvict(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store, float decay_rate, uint32_t threshold) - : FeatureEvictBase(executor, kv_store), decay_rate_(decay_rate), threshold_(threshold) {} + : FeatureEvict(executor, kv_store), decay_rate_(decay_rate), threshold_(threshold) {} + + void update_feature_statistics(weight_type* block) override { FixedBlockPool::update_count(block); } protected: bool evict_block(weight_type* block) override { @@ -195,12 +216,14 @@ class CounterBasedEvict : public FeatureEvictBase { }; template -class TimeBasedEvict : public FeatureEvictBase { +class TimeBasedEvict : public FeatureEvict { public: TimeBasedEvict(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store, uint32_t ttl) - : FeatureEvictBase(executor, kv_store), ttl_(ttl) {} + : FeatureEvict(executor, kv_store), ttl_(ttl) {} + + void update_feature_statistics(weight_type* block) override { FixedBlockPool::update_timestamp(block); } protected: bool evict_block(weight_type* block) override { @@ -211,4 +234,115 @@ class TimeBasedEvict : public FeatureEvictBase { private: uint32_t ttl_; // Time-to-live for eviction. }; -} // namespace kv_mem \ No newline at end of file + +template +class TimeCounterBasedEvict : public FeatureEvict { + public: + TimeCounterBasedEvict(folly::CPUThreadPoolExecutor* executor, + SynchronizedShardedMap& kv_store, + uint32_t ttl, + float decay_rate, + uint32_t threshold) + : FeatureEvict(executor, kv_store), ttl_(ttl), decay_rate_(decay_rate), threshold_(threshold) {} + + void update_feature_statistics(weight_type* block) override { + FixedBlockPool::update_timestamp(block); + FixedBlockPool::update_count(block); + } + + protected: + bool evict_block(weight_type* block) override { + // Apply decay and check the count threshold and ttl. + auto current_time = FixedBlockPool::current_timestamp(); + auto current_count = FixedBlockPool::get_count(block); + current_count *= decay_rate_; + FixedBlockPool::set_count(block, current_count); + return (current_time - FixedBlockPool::get_timestamp(block) > ttl_) && (current_count < threshold_); + } + + private: + uint32_t ttl_; // Time-to-live for eviction. + float decay_rate_; // Decay rate for the block count. + uint32_t threshold_; // Count threshold for eviction. +}; + +template +class L2WeightBasedEvict : public FeatureEvict { + public: + L2WeightBasedEvict(folly::CPUThreadPoolExecutor* executor, + SynchronizedShardedMap& kv_store, + double threshold, + size_t dimension) + : FeatureEvict(executor, kv_store), threshold_(threshold), dimension_(dimension) {} + + void update_feature_statistics([[maybe_unused]] weight_type* block) override {} + + protected: + bool evict_block(weight_type* block) override { + auto l2weight = FixedBlockPool::get_l2weight(block, dimension_); + return l2weight < threshold_; + } + + private: + double threshold_; // L2 weight threshold for eviction. 
+  size_t dimension_; // Embedding dimension
+};
+
+template <typename weight_type>
+std::unique_ptr<FeatureEvict<weight_type>> create_feature_evict(
+    const FeatureEvictConfig& config,
+    folly::CPUThreadPoolExecutor* executor,
+    SynchronizedShardedMap<int64_t, weight_type*>& kv_store,
+    size_t dimension) {
+  if (executor == nullptr) {
+    throw std::invalid_argument("executor cannot be null");
+  }
+
+  switch (config.trigger_strategy) {
+    case EvictTriggerStrategy::BY_TIMESTAMP: {
+      if (config.ttl <= 0) {
+        throw std::invalid_argument("ttl must be positive");
+      }
+      return std::make_unique<TimeBasedEvict<weight_type>>(executor, kv_store, config.ttl);
+    }
+
+    case EvictTriggerStrategy::BY_COUNTER: {
+      if (config.count_decay_rate <= 0 || config.count_decay_rate > 1) {
+        throw std::invalid_argument("count_decay_rate must be in range (0,1]");
+      }
+      if (config.count_threshold <= 0) {
+        throw std::invalid_argument("count_threshold must be positive");
+      }
+      return std::make_unique<CounterBasedEvict<weight_type>>(
+          executor, kv_store, config.count_decay_rate, config.count_threshold);
+    }
+
+    case EvictTriggerStrategy::BY_TIMESTAMP_AND_COUNTER: {
+      if (config.ttl <= 0) {
+        throw std::invalid_argument("ttl must be positive");
+      }
+      if (config.count_decay_rate <= 0 || config.count_decay_rate > 1) {
+        throw std::invalid_argument("count_decay_rate must be in range (0,1]");
+      }
+      if (config.count_threshold <= 0) {
+        throw std::invalid_argument("count_threshold must be positive");
+      }
+      return std::make_unique<TimeCounterBasedEvict<weight_type>>(
+          executor, kv_store, config.ttl, config.count_decay_rate, config.count_threshold);
+    }
+
+    case EvictTriggerStrategy::BY_L2WEIGHT: {
+      if (config.l2_weight_threshold <= 0) {
+        throw std::invalid_argument("l2_weight_threshold must be positive");
+      }
+      // TODO: optimizer parameters should not be included in dimension
+      return std::make_unique<L2WeightBasedEvict<weight_type>>(
+          executor, kv_store, config.l2_weight_threshold, dimension);
+    }
+
+    default:
+      throw std::runtime_error("Unknown evict trigger strategy");
+  }
+}
+
+} // namespace kv_mem
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index f3f238d674..54f7c402a0 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -5,6 +5,8 @@
 #include
 #include
 #include
+#include <cmath>
+#include <numeric>
 
 #include
 
@@ -79,6 +81,14 @@ class FixedBlockPool : public std::pmr::memory_resource {
     return reinterpret_cast<scalar_t*>(reinterpret_cast<char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
+  template <typename scalar_t>
+  static scalar_t get_l2weight(scalar_t* block, size_t dimension) {
+    scalar_t* data = FixedBlockPool::data_ptr(block);
+    return std::sqrt(
+        std::accumulate(data, data + dimension, scalar_t(0),
+                        [](scalar_t sum, scalar_t val) { return sum + val * val; }));
+  }
+
   explicit FixedBlockPool(std::size_t block_size, // Size of each memory block
                           std::size_t block_alignment, // Memory block alignment requirement
                           std::size_t blocks_per_chunk = 8192, // Number of blocks per chunk
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
index 72dce3093d..48a39d8b45 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -15,7 +15,7 @@ static constexpr int DIMENSION = 128;
 size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size(DIMENSION);
 size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment();
 
-TEST(FeatureEvictTest, BasicEviction) {
+TEST(FeatureEvictTest, CounterBasedEviction) {
   static constexpr int
NUM_SHARDS = 8; auto executor_ = std::make_unique(4); auto kv_store_ = std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); @@ -43,7 +43,21 @@ TEST(FeatureEvictTest, BasicEviction) { wlock->insert({i, block}); } - CounterBasedEvict evictor(executor_.get(), *kv_store_.get(), 0.5f, 1); + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 1; + uint32_t count_threshold = 1; + float count_decay_rate = 0.5; + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.count_threshold = count_threshold; + feature_evict_config.count_decay_rate = count_decay_rate; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), 4); + } // Initial validation size_t total_blocks = 0; @@ -54,13 +68,13 @@ TEST(FeatureEvictTest, BasicEviction) { ASSERT_EQ(total_blocks, 2000); // Perform eviction - evictor.trigger_evict(); + feature_evict->trigger_evict(); // Validate eviction process - while (evictor.is_evicting()) { - evictor.resume(); + while (feature_evict->is_evicting()) { + feature_evict->resume(); std::this_thread::sleep_for(std::chrono::microseconds(5)); - evictor.pause(); + feature_evict->pause(); } // Validate results @@ -77,6 +91,239 @@ TEST(FeatureEvictTest, BasicEviction) { ASSERT_EQ(remaining, 1000); } +TEST(FeatureEvictTest, TimeBasedEviction) { + static constexpr int NUM_SHARDS = 8; + auto executor_ = std::make_unique(4); + auto kv_store_ = std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); + + // Insert test data + for (int i = 0; i < 1000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + std::this_thread::sleep_for(std::chrono::seconds(5)); + + for (int i = 1000; i < 2000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 0; + uint32_t ttl = 4; + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.ttl = ttl; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), 4); + } + + // Initial validation + size_t total_blocks = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + total_blocks += rlock->size(); + } + ASSERT_EQ(total_blocks, 2000); + + // Perform eviction + feature_evict->trigger_evict(); + + // Validate eviction process + while (feature_evict->is_evicting()) { + feature_evict->resume(); + 
std::this_thread::sleep_for(std::chrono::microseconds(5)); + feature_evict->pause(); + } + + // Validate results + size_t remaining = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + remaining += rlock->size(); + } + std::cout << "remaining: " << remaining << std::endl; + ASSERT_EQ(remaining, 1000); +} + +TEST(FeatureEvictTest, TimeCounterBasedEviction) { + static constexpr int NUM_SHARDS = 8; + auto executor_ = std::make_unique(4); + auto kv_store_ = std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); + + // Insert test data + for (int i = 0; i < 500; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_count(block, 1); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + std::this_thread::sleep_for(std::chrono::seconds(5)); + for (int i = 500; i < 1000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_count(block, 1); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + for (int i = 1000; i < 2000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_count(block, 2); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 2; + uint32_t ttl = 4; + uint32_t count_threshold = 1; + float count_decay_rate = 0.5; + + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.ttl = ttl; + feature_evict_config.count_threshold = count_threshold; + feature_evict_config.count_decay_rate = count_decay_rate; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), 4); + } + + // Initial validation + size_t total_blocks = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + total_blocks += rlock->size(); + } + ASSERT_EQ(total_blocks, 2000); + + // Perform eviction + feature_evict->trigger_evict(); + + // Validate eviction process + while (feature_evict->is_evicting()) { + feature_evict->resume(); + std::this_thread::sleep_for(std::chrono::microseconds(5)); + feature_evict->pause(); + } + + // Validate results + size_t remaining = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + remaining += rlock->size(); + } + std::cout << "remaining: " << remaining << std::endl; + ASSERT_EQ(remaining, 1500); +} + +TEST(FeatureEvictTest, L2WeightBasedEviction) { + static constexpr int NUM_SHARDS = 8; + auto executor_ = std::make_unique(4); + auto kv_store_ = 
std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); + int dim = 4; + std::vector weight1(dim, 1.0); + // Insert test data + for (int i = 0; i < 1000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + FixedBlockPool::set_key(block, i); + std::copy(weight1.begin(), weight1.end(), data_ptr); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + std::vector weight2(dim, 2.0); + for (int i = 1000; i < 2000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + FixedBlockPool::set_key(block, i); + std::copy(weight2.begin(), weight2.end(), data_ptr); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 3; + double l2_weight_threshold = 3.0; + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.l2_weight_threshold = l2_weight_threshold; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), dim); + } + + // Initial validation + size_t total_blocks = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + total_blocks += rlock->size(); + } + ASSERT_EQ(total_blocks, 2000); + + // Perform eviction + feature_evict->trigger_evict(); + + // Validate eviction process + while (feature_evict->is_evicting()) { + feature_evict->resume(); + std::this_thread::sleep_for(std::chrono::microseconds(5)); + feature_evict->pause(); + } + + // Validate results + size_t remaining = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + remaining += rlock->size(); + } + std::cout << "remaining: " << remaining << std::endl; + ASSERT_EQ(remaining, 1000); +} + TEST(FeatureEvictTest, PerformanceTest) { static constexpr int NUM_SHARDS = 1; // Test configurations From e094e47076ca26cf9798800c06d408711d159fef Mon Sep 17 00:00:00 2001 From: WP Date: Wed, 28 May 2025 11:27:21 +0800 Subject: [PATCH 09/12] add Memory statistics --- .../SynchronizedShardedMap.h | 11 ++++++ .../dram_kv_embedding_cache.h | 4 +++ .../dram_kv_embedding_cache_wrapper.h | 4 +++ .../sharded_map_test.cpp | 36 +++++++++++++++++++ 4 files changed, 55 insertions(+) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index 3cd4c61c6f..e773a068ec 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -53,6 +53,17 @@ class SynchronizedShardedMap { auto getNumShards() { return shards_.size(); } + auto getUsedMemSize() { + size_t used_mem_size = 0; + size_t block_size = mempools_[0]->get_block_size(); + for (size_t i = 0; i < shards_.size(); ++i) { + auto rlmap = shards_[i].rlock(); + // only calculate the sizes of K, V and block that are used + used_mem_size += rlmap->size() * 
(sizeof(K) + sizeof(V) + block_size); + } + return used_mem_size; + } + private: std::vector, M>> shards_; std::vector> mempools_; diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 5ceae13127..3749b3a81b 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -355,6 +355,10 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { } } + size_t get_map_used_memsize() { + return kv_store_.getUsedMemSize(); + } + private: void fill_from_row_storage( int shard_id, diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h index 9dc1be091b..fe6a345a0c 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h @@ -123,6 +123,10 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { return impl_->get_keys_in_range(start, end); } + size_t get_map_used_memsize() { + return impl_->get_map_used_memsize(); + } + private: // friend class EmbeddingRocksDBWrapper; friend class ssd::KVTensorWrapper; diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index 5e4b59e206..d84acbb1d8 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -114,6 +114,36 @@ void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookup 100.0 * static_cast(hitCount) / static_cast(numLookups)); } +void memPoolEmbeddingMemSize(int dimension, size_t numInserts) { + const size_t numShards = 4; + size_t block_size = FixedBlockPool::calculate_block_size(dimension); + size_t block_alignment = FixedBlockPool::calculate_block_alignment(); + + SynchronizedShardedMap embeddingMap(numShards, + block_size, // block_size + block_alignment, // block_alignment + 8192); // blocks_per_chunk + { + std::vector fixedEmbedding = generateFixedEmbedding(dimension); + + auto wlock = embeddingMap.by(0).wlock(); + auto* pool = embeddingMap.pool_by(0); + + for (size_t i = 0; i < numInserts; i++) { + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr); + wlock->insert_or_assign(i, block); + } + } + size_t totalMemory = embeddingMap.getUsedMemSize(); + fmt::print("{:<20}{:<20}{:<20.2f}\n", + dimension, + numInserts, + static_cast(totalMemory) / (1024 * 1024)); // MB + +} + int benchmark() { std::vector dimensions = {4, 8, 16, 32, 64}; const size_t numInserts = 1'000'000; // 1 million insert @@ -133,6 +163,12 @@ int benchmark() { memPoolEmbeddingWithTime(dim, numInserts, numLookups); } fmt::print("\n\n"); + + fmt::print("======================= memory usage statistics ====================================\n"); + fmt::print("{:<20}{:<20}{:<20}\n","dim", "numInserts", "total memory (MB)"); + for (int dim : dimensions) { + memPoolEmbeddingMemSize(dim, numInserts); + } return 0; } TEST(SynchronizedShardedMap, benchmark) { benchmark(); } From 33d7bb982e8741227cd496c6c54f217f68fa0412 Mon Sep 17 00:00:00 2001 From: WP Date: Wed, 28 May 2025 11:50:53 +0800 Subject: [PATCH 10/12] concern block align --- 
.../src/dram_kv_embedding_cache/SynchronizedShardedMap.h | 2 +- fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index e773a068ec..2583ee03d3 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -55,7 +55,7 @@ class SynchronizedShardedMap { auto getUsedMemSize() { size_t used_mem_size = 0; - size_t block_size = mempools_[0]->get_block_size(); + size_t block_size = mempools_[0]->get_aligned_block_size(); for (size_t i = 0; i < shards_.size(); ++i) { auto rlmap = shards_[i].rlock(); // only calculate the sizes of K, V and block that are used diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h index 54f7c402a0..ff1adaaf82 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h @@ -159,6 +159,9 @@ class FixedBlockPool : public std::pmr::memory_resource { [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; } [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; } [[nodiscard]] std::size_t get_blocks_per_chunk() const noexcept { return blocks_per_chunk_; } + [[nodiscard]] std::size_t get_aligned_block_size() const noexcept { + return (block_size_ + block_alignment_ - 1) / block_alignment_ * block_alignment_; + } protected: // Core allocation function From 7e69add99d0c20a361690f593efe9e9a7ac27cf8 Mon Sep 17 00:00:00 2001 From: WP Date: Wed, 28 May 2025 11:59:57 +0800 Subject: [PATCH 11/12] add const --- fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h | 2 +- .../src/dram_kv_embedding_cache/dram_kv_embedding_cache.h | 2 +- .../dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index 2583ee03d3..1948b42c8d 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -53,7 +53,7 @@ class SynchronizedShardedMap { auto getNumShards() { return shards_.size(); } - auto getUsedMemSize() { + auto getUsedMemSize() const { size_t used_mem_size = 0; size_t block_size = mempools_[0]->get_aligned_block_size(); for (size_t i = 0; i < shards_.size(); ++i) { diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 3749b3a81b..32225fe059 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -355,7 +355,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { } } - size_t get_map_used_memsize() { + size_t get_map_used_memsize() const { return kv_store_.getUsedMemSize(); } diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h index fe6a345a0c..2543091d6e 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h +++ 
b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h
@@ -123,7 +123,7 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder {
     return impl_->get_keys_in_range(start, end);
   }
 
-  size_t get_map_used_memsize() {
+  size_t get_map_used_memsize() const {
     return impl_->get_map_used_memsize();
   }
 
From 97639674e76e83284c905baac779a545206a5e96 Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Wed, 28 May 2025 16:34:10 +0800
Subject: [PATCH 12/12] hashtable save and load

---
 .../SynchronizedShardedMap.h | 48 +++++-
 .../fixed_block_pool.h | 108 +++++++++++-
 .../dram_kv_embedding_cache/CMakeLists.txt | 27 +--
 .../fixed_block_pool_saver_test.cpp | 157 ++++++++++++++++++
 .../sharded_map_test.cpp | 53 ++++++
 5 files changed, 378 insertions(+), 15 deletions(-)
 create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h
index 1948b42c8d..0dda6e6fcb 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h
@@ -51,7 +51,7 @@ class SynchronizedShardedMap {
     return mempools_.at(index % shards_.size()).get();
   }
 
-  auto getNumShards() { return shards_.size(); }
+  auto getNumShards() const { return shards_.size(); }
 
   auto getUsedMemSize() const {
     size_t used_mem_size = 0;
@@ -64,6 +64,52 @@ class SynchronizedShardedMap {
     return used_mem_size;
   }
 
+  void save(const std::string& filename) const {
+    std::ofstream out(filename, std::ios::binary);
+    if (!out) {
+      throw std::runtime_error("Failed to open file for writing");
+    }
+
+    const std::size_t num_shards = getNumShards();
+    out.write(reinterpret_cast<const char*>(&num_shards), sizeof(num_shards));
+    out.close();
+
+    // save every mempool to its own file, holding the shard write lock while serializing
+    for (std::size_t shard_id = 0; shard_id < getNumShards(); ++shard_id) {
+      std::string pool_filename = filename + ".pool." + std::to_string(shard_id);
+      auto wlock = shards_[shard_id].wlock();
+      mempools_[shard_id]->serialize(pool_filename);
+    }
+  }
+
+  void load(const std::string& filename) {
+    std::ifstream in(filename, std::ios::binary);
+    if (!in) {
+      throw std::runtime_error("Failed to open file for reading");
+    }
+
+    size_t num_shards;
+    in.read(reinterpret_cast<char*>(&num_shards), sizeof(num_shards));
+    in.close();
+
+    if (num_shards != getNumShards()) {
+      throw std::runtime_error("Shard count mismatch between file and map");
+    }
+
+    for (std::size_t shard_id = 0; shard_id < getNumShards(); ++shard_id) {
+      std::string pool_filename = filename + ".pool." + std::to_string(shard_id);
+      auto wlock = shards_[shard_id].wlock();
+      // first deserialize the mempool
+      mempools_[shard_id]->deserialize(pool_filename);
+      // then rebuild the map from the mempool's in-use blocks
+      wlock->clear();
+      mempools_[shard_id]->for_each_block([&wlock](void* block) {
+        auto key = FixedBlockPool::get_key(block);
+        wlock->emplace(key, reinterpret_cast<V>(block));
+      });
+    }
+  }
+
 private:
  std::vector<folly::Synchronized<folly::F14FastMap<K, V>, M>> shards_;
  std::vector<std::unique_ptr<FixedBlockPool>> mempools_;
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index ff1adaaf82..f8acbffb09 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -2,13 +2,15 @@
 #include
 #include
+#include
 #include
+#include
 #include
 #include
-#include
-#include
 #include
+#include
+#include
 
 namespace kv_mem {
 static constexpr uint32_t kMaxInt31Counter = 2147483647;
@@ -155,6 +157,108 @@ class FixedBlockPool : public std::pmr::memory_resource {
   }
 };
 
+  template <typename Func>
+  void for_each_block(Func&& func) const {
+    for (const auto& chunk : chunks_) {
+      char* current = static_cast<char*>(chunk.ptr);
+      for (size_t i = 0; i < blocks_per_chunk_; ++i) {
+        if (FixedBlockPool::get_used(current)) {
+          func(current);
+        }
+        current += block_size_;
+      }
+    }
+  }
+
+  void serialize(const std::string& filename) const {
+    auto start = std::chrono::high_resolution_clock::now();
+
+    std::ofstream out(filename, std::ios::binary);
+    if (!out) {
+      throw std::runtime_error("Failed to open file for writing");
+    }
+    // Write metadata
+    out.write(reinterpret_cast<const char*>(&block_size_), sizeof(block_size_));
+    out.write(reinterpret_cast<const char*>(&block_alignment_), sizeof(block_alignment_));
+    out.write(reinterpret_cast<const char*>(&blocks_per_chunk_), sizeof(blocks_per_chunk_));
+    const size_t num_chunks = chunks_.size();
+    out.write(reinterpret_cast<const char*>(&num_chunks), sizeof(num_chunks));
+
+    // Write data for each chunk
+    for (const auto& chunk : chunks_) {
+      assert(chunk.size == block_size_ * blocks_per_chunk_);
+      out.write(static_cast<const char*>(chunk.ptr), static_cast<std::streamsize>(chunk.size));
+    }
+    out.flush();
+    out.close();
+    double data_size_mb = static_cast<double>((block_size_ * chunks_.size() * blocks_per_chunk_)) / (1024.0 * 1024.0);
+
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration<double>(end - start).count();
+
+    fmt::print("Serialized {}: size={:.3f}MB, time={}s, throughput={:.3f}MB/s\n",
+               filename,
+               data_size_mb,
+               duration,
+               (data_size_mb / duration));
+  }
+
+  void deserialize(const std::string& filename) {
+    auto start = std::chrono::high_resolution_clock::now();
+
+    std::ifstream in(filename, std::ios::binary);
+    if (!in) {
+      throw std::runtime_error("Failed to open file for reading");
+    }
+
+    // Read metadata
+    std::size_t block_size, block_alignment, blocks_per_chunk, num_chunks;
+    in.read(reinterpret_cast<char*>(&block_size), sizeof(block_size));
+    in.read(reinterpret_cast<char*>(&block_alignment), sizeof(block_alignment));
+    in.read(reinterpret_cast<char*>(&blocks_per_chunk), sizeof(blocks_per_chunk));
+    in.read(reinterpret_cast<char*>(&num_chunks), sizeof(num_chunks));
+
+    // Validate parameters
+    if (block_size != block_size_) {
+      throw std::invalid_argument("Invalid block_size in file");
+    }
+    if (block_alignment != block_alignment_) {
+      throw std::invalid_argument("Invalid block_alignment in file");
+    }
+    if (blocks_per_chunk != blocks_per_chunk_) {
+      throw std::invalid_argument("Invalid blocks_per_chunk in file");
+    }
+
+    // Read data for each chunk and rebuild memory structure
+    const std::size_t chunk_size
= block_size_ * blocks_per_chunk_; + for (size_t i = 0; i < num_chunks; ++i) { + void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_); + in.read(static_cast(chunk_ptr), static_cast(chunk_size)); + // Add chunk to memory pool + chunks_.push_back({chunk_ptr, chunk_size, block_alignment}); + // Rebuild free_list_ + char* current = static_cast(chunk_ptr); + for (size_t j = 0; j < blocks_per_chunk; ++j) { + void* block = current + j * block_size; + if (!get_used(block)) { + do_deallocate(block, block_size_, block_alignment_); + } + } + } + in.close(); + + double data_size_mb = static_cast((block_size_ * chunks_.size() * blocks_per_chunk_)) / (1024.0 * 1024.0); + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration(end - start).count(); + + fmt::print("Deserialized {}: size={:.3f}MB, time={}s, throughput={:.3f}MB/s\n", + filename, + data_size_mb, + duration, + (data_size_mb / duration)); + } + [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; } [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; } [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; } diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt index d7566c00d9..9ab483eab3 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt @@ -5,17 +5,20 @@ include_directories( ${FBGEMM_SOURCE_DIR} ) -add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp) -target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17) -target_compile_options(fixed_block_pool_test PUBLIC "-O3") -target_link_libraries(fixed_block_pool_test gtest gtest_main Folly::folly) +set(COMMON_COMPILE_FEATURES cxx_std_17) +set(COMMON_COMPILE_OPTIONS "-O3") +set(COMMON_LINK_LIBRARIES gtest gtest_main Folly::folly) -add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp) -target_compile_features(sharded_map_test PUBLIC cxx_std_17) -target_compile_options(sharded_map_test PUBLIC "-O3") -target_link_libraries(sharded_map_test gtest gtest_main Folly::folly) +set(TEST_TARGETS + fixed_block_pool_test + fixed_block_pool_saver_test + sharded_map_test + feature_evict_test +) -add_executable(feature_evict_test ${CMAKE_CURRENT_SOURCE_DIR}/feature_evict_test.cpp) -target_compile_features(feature_evict_test PUBLIC cxx_std_17) -target_compile_options(feature_evict_test PUBLIC "-O3") -target_link_libraries(feature_evict_test gtest gtest_main Folly::folly) \ No newline at end of file +foreach (target ${TEST_TARGETS}) + add_executable(${target} ${CMAKE_CURRENT_SOURCE_DIR}/${target}.cpp) + target_compile_features(${target} PUBLIC ${COMMON_COMPILE_FEATURES}) + target_compile_options(${target} PUBLIC ${COMMON_COMPILE_OPTIONS}) + target_link_libraries(${target} ${COMMON_LINK_LIBRARIES}) +endforeach () \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp new file mode 100644 index 0000000000..44ef79bc0a --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp @@ -0,0 +1,157 @@ +#include +#include +#include +#include + +#include + +#include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" + +namespace kv_mem { +void removeFileIfExists(const std::string& filename) { + if 
(std::filesystem::exists(filename)) { + std::filesystem::remove(filename); + } +} +class FixedBlockPoolTest : public ::testing::Test { + protected: + static constexpr size_t kDimension = 128; // embedding dimension + using scalar_t = float; // data type + + void SetUp() override { + block_size_ = kv_mem::FixedBlockPool::calculate_block_size(kDimension); + block_alignment_ = kv_mem::FixedBlockPool::calculate_block_alignment(); + pool_ = std::make_unique(block_size_, block_alignment_); + } + + // Generate random data + void generateRandomData(std::size_t num_blocks) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution key_dist(1, UINT64_MAX); + std::uniform_real_distribution val_dist(-1.0, 1.0); + + for (size_t i = 0; i < num_blocks; ++i) { + auto* block = pool_->allocate_t(); + uint64_t key = key_dist(gen); + + // Set metadata + kv_mem::FixedBlockPool::set_key(block, key); + kv_mem::FixedBlockPool::set_count(block, i % 100); + kv_mem::FixedBlockPool::update_timestamp(block); + + // Set embedding data + auto* data = kv_mem::FixedBlockPool::data_ptr(block); + for (size_t j = 0; j < kDimension; ++j) { + data[j] = val_dist(gen); + } + + // Record for verification + original_data_[key] = std::vector(data, data + kDimension); + } + } + + // Verify data correctness + bool verifyData() { + size_t verified_count = 0; + + // Traverse all chunks to verify data + for (const auto& chunk : pool_->get_chunks()) { + char* current = static_cast(chunk.ptr); + size_t blocks_in_chunk = chunk.size / block_size_; + + for (size_t i = 0; i < blocks_in_chunk; ++i) { + void* block = current + i * block_size_; + if (kv_mem::FixedBlockPool::get_used(block)) { + uint64_t key = kv_mem::FixedBlockPool::get_key(block); + auto* data = kv_mem::FixedBlockPool::data_ptr(reinterpret_cast(block)); + + // Find and compare original data + auto it = original_data_.find(key); + if (it == original_data_.end()) { + return false; + } + + if (!std::equal(data, data + kDimension, it->second.begin())) { + return false; + } + + verified_count++; + } + } + } + + return verified_count == original_data_.size(); + } + + // Performance test helper function + template + double measureTime(Func&& func) { + auto start = std::chrono::high_resolution_clock::now(); + func(); + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - start).count(); + } + + std::unique_ptr pool_; + size_t block_size_{}; + size_t block_alignment_{}; + std::unordered_map> original_data_; +}; + +// Correctness test +TEST_F(FixedBlockPoolTest, SerializationCorrectness) { + // 1. Generate random data + generateRandomData(1000); + + // 2. Serialize + const std::string filename = "test_pool.bin"; + pool_->serialize(filename); + + // 3. Create a new memory pool and deserialize + auto new_pool = std::make_unique(block_size_, block_alignment_); + new_pool->deserialize(filename); + + // 4. Verify data + pool_ = std::move(new_pool); + EXPECT_TRUE(verifyData()); +} + +// Edge case test +TEST_F(FixedBlockPoolTest, SerializationEdgeCases) { + // 1. Empty pool serialization test + const std::string empty_filename = "empty_pool.bin"; + pool_->serialize(empty_filename); + + auto new_pool = std::make_unique(block_size_, block_alignment_); + EXPECT_NO_THROW(new_pool->deserialize(empty_filename)); + + // 2. File not found test + EXPECT_THROW(pool_->deserialize("nonexistent_file.bin"), std::runtime_error); + + // 3. 
Parameter mismatch test + generateRandomData(1000); + const std::string filename = "test_pool.bin"; + pool_->serialize(filename); + + auto wrong_pool = std::make_unique(block_size_ * 2, // Incorrect block size + block_alignment_); + EXPECT_THROW(wrong_pool->deserialize(filename), std::invalid_argument); +} + +// Performance test +TEST_F(FixedBlockPoolTest, SerializationPerformance) { + const std::size_t num_blocks = 20'000'000; + generateRandomData(num_blocks); + const std::string filename = "test_pool.bin"; + removeFileIfExists(filename); + + pool_->serialize(filename); + + auto new_pool = std::make_unique(block_size_, block_alignment_); + new_pool->deserialize(filename); + + std::remove(filename.c_str()); +} + +} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index d84acbb1d8..4445d1d4a2 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -171,6 +171,59 @@ int benchmark() { } return 0; } + +void save_and_restore() { + const int numShards = 4; + const std::size_t dimension = 32; + const std::size_t block_size = FixedBlockPool::calculate_block_size(dimension); + const std::size_t block_alignment = FixedBlockPool::calculate_block_alignment(); + const int numItems = 1'000'000; + const std::string filename = "test_map.bin"; + + SynchronizedShardedMap original_map(numShards, block_size, block_alignment); + + std::vector test_embedding = generateFixedEmbedding(dimension); + for (int i = 0; i < numItems; ++i) { + int shard_id = i % numShards; + auto wlock = original_map.by(shard_id).wlock(); + auto* pool = original_map.pool_by(shard_id); + + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + std::copy(test_embedding.begin(), test_embedding.end(), data_ptr); + + FixedBlockPool::set_key(block, i); + wlock->insert({i, block}); + } + + original_map.save(filename); + + SynchronizedShardedMap restored_map(numShards, block_size, block_alignment); + restored_map.load(filename); + + for (int64_t i = 0; i < numItems; ++i) { + int shard_id = i % numShards; + auto rlock = restored_map.by(shard_id).rlock(); + + auto it = rlock->find(i); + ASSERT_NE(it, rlock->end()) << "Key " << i << " not found after load"; + + float* block = it->second; + ASSERT_EQ(FixedBlockPool::get_key(block), i); + + const float* data_ptr = FixedBlockPool::data_ptr(block); + for (std::size_t j = 0; j < dimension; ++j) { + ASSERT_FLOAT_EQ(data_ptr[j], test_embedding[j]) << "Data mismatch at position " << j << " for key " << i; + } + } + + std::remove(filename.c_str()); + for (int i = 0; i < numShards; ++i) { + std::remove((filename + ".pool." + std::to_string(i)).c_str()); + } +}; + +TEST(SynchronizedShardedMap, save_and_restore) { save_and_restore(); } TEST(SynchronizedShardedMap, benchmark) { benchmark(); } } // namespace kv_mem \ No newline at end of file