From 71525b85503b7cf4c564996e0e6135e225477084 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 20 May 2025 10:37:32 +0800 Subject: [PATCH 01/12] FixedBlockPool --- .../SynchronizedShardedMap.h | 30 +- .../dram_kv_embedding_cache.h | 61 ++-- .../fixed_block_pool.h | 128 ++++++++ .../store_value_utils.h | 81 +++++ .../dram_kv_embedding_cache/CMakeLists.txt | 14 + .../fixed_block_pool_test.cpp | 301 ++++++++++++++++++ .../sharded_map_test.cpp | 164 ++++++++++ .../store_value_utils_test.cpp | 86 +++++ 8 files changed, 835 insertions(+), 30 deletions(-) create mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h create mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index 12d8be97b5..3cd4c61c6f 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -8,8 +8,10 @@ #pragma once +#include #include -#include "folly/Synchronized.h" + +#include "fixed_block_pool.h" namespace kv_mem { @@ -29,18 +31,30 @@ class SynchronizedShardedMap { public: using iterator = typename folly::F14FastMap::const_iterator; - explicit SynchronizedShardedMap(std::size_t numShards) : shards_(numShards) {} + explicit SynchronizedShardedMap(std::size_t numShards, + std::size_t block_size, + std::size_t block_alignment, + std::size_t blocks_per_chunk = 8192) + : shards_(numShards), mempools_(numShards) { + // Init mempools_ + for (auto& pool : mempools_) { + pool = std::make_unique( + block_size, block_alignment, blocks_per_chunk); + } + } // Get shard map by index - auto& by(int index) { - return shards_.at(index % shards_.size()); - } + auto& by(int index) { return shards_.at(index % shards_.size()); } - auto getNumShards() { - return shards_.size(); + // Get shard pool by index + auto* pool_by(int index) { + return mempools_.at(index % shards_.size()).get(); } + auto getNumShards() { return shards_.size(); } + private: std::vector, M>> shards_; + std::vector> mempools_; }; -} // namespace kv_mem +} // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 0b195b695b..23460c41bb 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -15,7 +15,7 @@ #include "SynchronizedShardedMap.h" #include "deeplearning/fbgemm/fbgemm_gpu/src/ssd_split_embeddings_cache/initializer.h" -#include "store_value.h" +#include "store_value_utils.h" #include #include @@ -70,8 +70,13 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { max_D_(max_D), num_shards_(num_shards), weight_ttl_in_hours_(weight_ttl_in_hours), - kv_store_(SynchronizedShardedMap>( - num_shards_)), + block_size_(StoreValueUtils::calculate_block_size(max_D)), + block_alignment_(StoreValueUtils::calculate_block_alignment()), + kv_store_(SynchronizedShardedMap( + num_shards_, + block_size_, + block_alignment_, + /*blocks_per_chunk=*/8192)), elem_size_(row_storage_bitwidth 
/ 8) {
    executor_ = std::make_unique(std::max(
        num_threads, facebook::Proc::getCpuInfo().numCpuCores));
@@ -185,20 +190,31 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
           CHECK_EQ(indices.size(0), weights.size(0));
           {
             auto wlmap = kv_store_.by(shard_id).wlock();
-
+            auto* pool = kv_store_.pool_by(shard_id);
             for (auto index_iter = indexes.begin();
                  index_iter != indexes.end();
                  index_iter++) {
               const auto& id_index = *index_iter;
               auto id = indices[id_index].template item();
-              wlmap->try_emplace(
-                  id,
-                  StoreValue(std::vector(
-                      weights[id_index]
-                          .template data_ptr(),
-                      weights[id_index]
-                          .template data_ptr() +
-                          weights[id_index].numel())));
+
+              // use mempool
+              weight_type* block = nullptr;
+              // First check if the key already exists
+              auto it = wlmap->find(id);
+              if (it != wlmap->end()) {
+                block = it->second;
+              } else {
+                // Key doesn't exist, allocate new block and insert.
+                block = StoreValueUtils::allocate(
+                    block_size_, block_alignment_, pool);
+                wlmap->insert({id, block});
+              }
+              StoreValueUtils::update_timestamp(block);
+              auto* data_ptr = StoreValueUtils::data_ptr(block);
+              std::copy(
+                  weights[id_index].template data_ptr(),
+                  weights[id_index].template data_ptr() +
+                      weights[id_index].numel(),
+                  data_ptr);
             }
           }
         });
@@ -265,6 +281,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
                   weights.data_ptr();
               auto id = indices[id_index].template item();
               auto wlmap = kv_store_.by(shard_id).wlock();
+              auto* pool = kv_store_.pool_by(shard_id);
               const auto cached_iter = wlmap->find(id);
               if (cached_iter == wlmap->end()) {
                 fill_from_row_storage(
@@ -276,16 +293,13 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
                     row_storage_data_ptr));
                 continue;
               }
-              const auto& cache_results =
-                  cached_iter->second.getValueAndPromote();
-              CHECK_EQ(cache_results.size(), max_D_);
+              // use mempool
+              const auto* data_ptr = StoreValueUtils::data_ptr(cached_iter->second);
+              StoreValueUtils::update_timestamp(cached_iter->second);
               std::copy(
-                  reinterpret_cast(
-                      &(cache_results[0])),
-                  reinterpret_cast(
-                      &(cache_results[max_D_])),
-                  &(weights_data_ptr
-                        [id_index * max_D_])); // dst_start
+                  data_ptr,
+                  data_ptr + max_D_,
+                  &weights_data[index * max_D_]); // dst_start
             }
           }
         });
@@ -368,7 +382,10 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
   int64_t max_D_;
   int64_t num_shards_;
   int64_t weight_ttl_in_hours_;
-  SynchronizedShardedMap> kv_store_;
+  // mempool params
+  size_t block_size_;
+  size_t block_alignment_;
+  SynchronizedShardedMap kv_store_;
   std::atomic_bool is_eviction_ongoing_ = false;
   std::vector> initializers_;
   int64_t elem_size_;
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
new file mode 100644
index 0000000000..a474b367c3
--- /dev/null
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+
+namespace kv_mem {
+class FixedBlockPool : public std::pmr::memory_resource {
+ public:
+  explicit FixedBlockPool(
+      std::size_t block_size, // Size of each memory block
+      std::size_t block_alignment, // Memory block alignment requirement
+      std::size_t blocks_per_chunk = 8192, // Number of blocks per chunk
+      std::pmr::memory_resource* upstream = std::pmr::new_delete_resource())
+      // Minimum block size is 8 bytes
+      : block_size_(std::max(block_size, sizeof(void*))),
+        block_alignment_(block_alignment),
+        blocks_per_chunk_(blocks_per_chunk),
+        upstream_(upstream),
+        chunks_(upstream) {
+    // Validate
minimum data size, whether it's less than 8 bytes + // half type, 2 bytes, minimum embedding length 4 + // float type, 4 bytes, minimum embedding length 2 + // Large objects use memory pool, small objects are placed directly in the + // hashtable + if (block_size < sizeof(void*)) { + // Block size must be at least able to store a pointer (for free list) + throw std::invalid_argument("Block size must be at least sizeof(void*)"); + } + + // Validate that alignment requirement is a power of 2 + if ((block_alignment_ & (block_alignment_ - 1)) != 0) { + throw std::invalid_argument("Alignment must be power of two"); + } + + // Validate that block size is a multiple of alignment + if (block_size_ % block_alignment_ != 0) { + throw std::invalid_argument("Block size must align with alignment"); + } + + // Ensure block size is at least 1 + if (block_size_ < 1) { + throw std::invalid_argument("Block size must be at least 1"); + } + } + + // Release all allocated memory during destruction + ~FixedBlockPool() override { + for (auto&& chunk : chunks_) { + upstream_->deallocate(chunk.ptr, chunk.size, chunk.alignment); + } + } + + protected: + // Core allocation function + void* do_allocate(std::size_t bytes, std::size_t alignment) override { + // Only handle matching block size and alignment requirements + if (bytes != block_size_ || alignment != block_alignment_) { + throw std::bad_alloc(); + } + + // Allocate a new chunk when no blocks are available + if (!free_list_) { + allocate_chunk(); + } + + // Take a block from the head of the free list + void* result = free_list_; + free_list_ = *static_cast(free_list_); + return result; + } + + // Core deallocation function + void do_deallocate(void* p, + [[maybe_unused]] std::size_t bytes, + [[maybe_unused]] std::size_t alignment) override { + // Insert memory block back to the head of free list + *static_cast(p) = free_list_; + free_list_ = p; + } + + // Resource equality comparison (only the same object is equal) + [[nodiscard]] bool do_is_equal( + const std::pmr::memory_resource& other) const noexcept override { + return this == &other; + } + + private: + // Chunk metadata + struct chunk_info { + void* ptr; // Memory block pointer + std::size_t size; // Total size + std::size_t alignment; + }; + + // Allocate a new memory chunk + void allocate_chunk() { + const std::size_t chunk_size = block_size_ * blocks_per_chunk_; + + // Allocate aligned memory through upstream resource + void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_); + + // Record chunk information for later release + chunks_.push_back({chunk_ptr, chunk_size, block_alignment_}); + + // Initialize free list: link blocks in reverse order from chunk end to + // beginning (improves locality) + char* current = static_cast(chunk_ptr) + chunk_size; + for (std::size_t i = 0; i < blocks_per_chunk_; ++i) { + current -= block_size_; + *reinterpret_cast(current) = free_list_; + free_list_ = current; + } + } + + // Member variables + const std::size_t block_size_; // Block size (not less than pointer size) + const std::size_t block_alignment_; // Block alignment requirement + const std::size_t blocks_per_chunk_; // Number of blocks per chunk + std::pmr::memory_resource* upstream_; // Upstream memory resource + std::pmr::vector chunks_{ + 1024}; // Records of all allocated chunks + void* free_list_ = nullptr; // Free block list head pointer +}; +} // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h 
b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h new file mode 100644 index 0000000000..c10c318621 --- /dev/null +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h @@ -0,0 +1,81 @@ +#pragma once +#include + +#include "fixed_block_pool.h" + +namespace kv_mem { + +class StoreValueUtils { + public: + // Metadata structure (publicly accessible) + struct alignas(8) MetaHeader { + int64_t timestamp; // 8 bytes + // Can be extended with other fields: uint32_t counter, uint64_t key, etc. + }; + + // Create memory block with metadata + template + static scalar_t* allocate(size_t& block_size, + size_t& alignment, + FixedBlockPool* pool) { + return reinterpret_cast(pool->allocate(block_size, alignment)); + } + + // Destroy memory block + template + static void deallocate(scalar_t* block, + size_t& block_size, + size_t& alignment, + FixedBlockPool* pool) { + pool->deallocate(block, block_size, alignment); + } + + // Calculate storage size + template + static size_t calculate_block_size(size_t dimension) { + return sizeof(MetaHeader) + dimension * sizeof(scalar_t); + } + + // Calculate alignment requirements + template + static size_t calculate_block_alignment() { + return std::max(alignof(MetaHeader), alignof(scalar_t)); + } + + // Metadata operations + template + static int64_t get_timestamp(const scalar_t* block) { + return reinterpret_cast(block)->timestamp; + } + + template + static void set_timestamp(scalar_t* block, int64_t ts) { + reinterpret_cast(block)->timestamp = ts; + } + + template + static void update_timestamp(scalar_t* block) { + reinterpret_cast(block)->timestamp = current_timestamp(); + } + + // Data pointer retrieval + template + static scalar_t* data_ptr(scalar_t* block) { + return reinterpret_cast(reinterpret_cast(block) + + sizeof(MetaHeader)); + } + + template + static const scalar_t* data_ptr(const scalar_t* block) { + return reinterpret_cast( + reinterpret_cast(block) + sizeof(MetaHeader)); + } + + static int64_t current_timestamp() { + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + // facebook::WallClockUtil::NowInUsecFast(); + } +}; +} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt new file mode 100644 index 0000000000..e9a41eac51 --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt @@ -0,0 +1,14 @@ +add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp) +target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17) +target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR}) +target_link_libraries(fixed_block_pool_test gtest gtest_main) + +add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp) +target_compile_features(sharded_map_test PUBLIC cxx_std_17) +target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR}) +target_link_libraries(sharded_map_test gtest gtest_main Folly::folly) + +add_executable(store_value_utils_test ${CMAKE_CURRENT_SOURCE_DIR}/store_value_utils_test.cpp) +target_compile_features(store_value_utils_test PUBLIC cxx_std_17) +target_include_directories(store_value_utils_test PUBLIC ${FBGEMM_SOURCE_DIR}) +target_link_libraries(store_value_utils_test gtest gtest_main Folly::folly) \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp 
b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp new file mode 100644 index 0000000000..606819ad8d --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp @@ -0,0 +1,301 @@ +#include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" + +#include +#include +#include +#include +#include + +#include + +#include "fixed_block_pool.h" +namespace kv_mem { + +double test_std_vector(size_t vector_size, size_t repeat_count) { + float sum = 0.0f; // Prevent optimization + std::vector> + all_vectors; // Store all vectors to prevent release + all_vectors.reserve(repeat_count); + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < repeat_count; ++i) { + all_vectors.emplace_back(vector_size); + auto& vec = all_vectors.back(); + + for (size_t j = 0; j < vector_size; ++j) { + vec[j] = static_cast(j); + } + + // Simple usage to prevent optimization + sum += vec[0]; + } + + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - start).count(); +} + +// Testing memory pool allocation +double test_pool_vector(size_t vector_size, size_t repeat_count) { + // Create a memory pool large enough + FixedBlockPool pool(vector_size * sizeof(float), alignof(float), 8092); + std::pmr::polymorphic_allocator alloc(&pool); + + auto start = std::chrono::high_resolution_clock::now(); + float sum = 0.0f; // Prevent optimization + for (size_t i = 0; i < repeat_count; ++i) { + float* arr = alloc.allocate(vector_size); + + for (size_t j = 0; j < vector_size; ++j) { + arr[j] = static_cast(j); + } + + // Simple usage to prevent optimization + sum += arr[0]; + + // Removed deallocate statement, no longer releasing memory to avoid memory + // reuse + // alloc.deallocate(arr, dim); + } + + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - start).count(); +} + +void benchmark_memory_allocators() { + std::cout << "====== Testing performance difference between memory pool and " + "native vector allocation for 10 million " + "times ======" + << std::endl; + + // Vector sizes to test (in number of float elements) + std::vector vector_sizes = {4, 8, 16, 32, 64, 128, 256}; + + // Repeat count (10 million times) + const size_t repeat_count = 10'000'000; + + for (const auto& size : vector_sizes) { + std::cout << "Vector size: " << size << " floats (" + << (size * sizeof(float)) << " bytes)" << std::endl; + + // Testing standard vector + double std_time = test_std_vector(size, repeat_count); + std::cout << " Standard vector: " << std::fixed << std::setprecision(2) + << std_time << " ms" << std::endl; + + // Testing memory pool + double pool_time = test_pool_vector(size, repeat_count); + std::cout << " Memory pool: " << std::fixed << std::setprecision(2) + << pool_time << " ms" << std::endl; + + // Calculate speed improvement + double speedup = std_time / pool_time; + std::cout << " Speed improvement: " << std::fixed << std::setprecision(2) + << speedup << "x" << std::endl; + + std::cout << std::endl; + std::cout << "============================" << std::endl; + } +} + +// Basic functionality test: Integer keys +TEST(FixedBlockPoolTest, benchmark_memory_allocators) { + benchmark_memory_allocators(); +} + +// Test constructor normal case +TEST(FixedBlockPoolTest, ConstructorNormal) { + EXPECT_NO_THROW({ kv_mem::FixedBlockPool pool(16, 8); }); +} + +// Test constructor exception cases +TEST(FixedBlockPoolTest, ConstructorExceptions) { + // Block size smaller than pointer size + 
EXPECT_THROW({ kv_mem::FixedBlockPool pool(1, 1); }, std::invalid_argument); + + // Alignment not a power of 2 + EXPECT_THROW({ kv_mem::FixedBlockPool pool(16, 3); }, std::invalid_argument); + + // Block size not a multiple of alignment + EXPECT_THROW({ kv_mem::FixedBlockPool pool(10, 8); }, std::invalid_argument); +} + +// Test basic memory allocation and deallocation +TEST(FixedBlockPoolTest, BasicAllocation) { + const size_t block_size = 16; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + + // Verify allocated memory is usable + std::memset(p, 0xAB, block_size); + + pool.deallocate(p, block_size, alignment); +} + +// Test multiple allocations and deallocations +TEST(FixedBlockPoolTest, MultipleAllocations) { + const size_t block_size = 32; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + std::vector blocks; + const int NUM_BLOCKS = 100; + + // Allocate multiple blocks + for (int i = 0; i < NUM_BLOCKS; ++i) { + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + // Write some data + *static_cast(p) = i; + blocks.push_back(p); + } + + // Verify data + for (int i = 0; i < NUM_BLOCKS; ++i) { + EXPECT_EQ(*static_cast(blocks[i]), i); + } + + // Release all blocks + for (auto p : blocks) { + pool.deallocate(p, block_size, alignment); + } +} + +// Test cross-chunk allocation (each chunk has only 10 blocks) +TEST(FixedBlockPoolTest, CrossChunkAllocation) { + const size_t block_size = 16; + const size_t alignment = 8; + const size_t blocks_per_chunk = 10; + kv_mem::FixedBlockPool pool(block_size, alignment, blocks_per_chunk); + + std::vector blocks; + const int NUM_BLOCKS = 25; // Exceeds 2 chunks + + // Allocate blocks beyond a single chunk capacity + for (int i = 0; i < NUM_BLOCKS; ++i) { + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + blocks.push_back(p); + } + + // Release all blocks + for (auto p : blocks) { + pool.deallocate(p, block_size, alignment); + } +} + +// Test memory alignment +TEST(FixedBlockPoolTest, MemoryAlignment) { + const size_t block_size = 64; + const size_t alignment = 32; + kv_mem::FixedBlockPool pool(block_size, alignment); + + void* p = pool.allocate(block_size, alignment); + EXPECT_NE(p, nullptr); + + // Verify address is aligned to specified alignment + uintptr_t addr = reinterpret_cast(p); + EXPECT_EQ(addr % alignment, 0); + + pool.deallocate(p, block_size, alignment); +} + +// Test error handling - allocating blocks with mismatched size or alignment +TEST(FixedBlockPoolTest, ErrorHandling) { + const size_t block_size = 16; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + // Try to allocate memory with incorrect size + EXPECT_THROW( + { [[maybe_unused]] void* p = pool.allocate(block_size * 2, alignment); }, + std::bad_alloc); + + // Try to allocate memory with incorrect alignment + EXPECT_THROW( + { [[maybe_unused]] void* p = pool.allocate(block_size, alignment * 2); }, + std::bad_alloc); +} + +// Test memory reuse after deallocation +TEST(FixedBlockPoolTest, ReuseAfterDeallocation) { + const size_t block_size = 16; + const size_t alignment = 8; + kv_mem::FixedBlockPool pool(block_size, alignment); + + void* p1 = pool.allocate(block_size, alignment); + void* p2 = pool.allocate(block_size, alignment); + + // Release the first block + pool.deallocate(p1, block_size, alignment); + + // Reallocate, should get the recently freed 
block (due to LIFO order)
+  void* p3 = pool.allocate(block_size, alignment);
+  EXPECT_EQ(p3, p1);
+
+  // Cleanup
+  pool.deallocate(p2, block_size, alignment);
+  pool.deallocate(p3, block_size, alignment);
+}
+
+// Test custom upstream memory resource
+TEST(FixedBlockPoolTest, CustomUpstreamResource) {
+  const size_t block_size = 16;
+  const size_t alignment = 8;
+
+  // Use custom memory resource that tracks allocations
+  int allocate_count = 0;
+  int deallocate_count = 0;
+
+  class CountingResource : public std::pmr::memory_resource {
+   public:
+    CountingResource(int& alloc_count, int& dealloc_count)
+        : alloc_count_(alloc_count), dealloc_count_(dealloc_count) {}
+
+   protected:
+    void* do_allocate(size_t bytes, size_t alignment) override {
+      ++alloc_count_;
+      return std::pmr::new_delete_resource()->allocate(bytes, alignment);
+    }
+
+    void do_deallocate(void* p, size_t bytes, size_t alignment) override {
+      ++dealloc_count_;
+      std::pmr::new_delete_resource()->deallocate(p, bytes, alignment);
+    }
+
+    bool do_is_equal(
+        const std::pmr::memory_resource& other) const noexcept override {
+      return this == &other;
+    }
+
+   private:
+    int& alloc_count_;
+    int& dealloc_count_;
+  };
+
+  CountingResource upstream(allocate_count, deallocate_count);
+  {
+    kv_mem::FixedBlockPool pool(block_size, alignment, 1024, &upstream);
+
+    // Allocate some blocks to trigger chunk allocation
+    std::vector blocks;
+    for (int i = 0; i < 10; ++i) {
+      blocks.push_back(pool.allocate(block_size, alignment));
+    }
+
+    // Verify upstream resource was called
+    EXPECT_GT(allocate_count, 0);
+    EXPECT_EQ(deallocate_count, 0);
+
+    // Release all blocks
+    for (auto p : blocks) {
+      pool.deallocate(p, block_size, alignment);
+    }
+  }
+  // Destructor should release all chunks
+  EXPECT_GT(deallocate_count, 0);
+}
+
+} // namespace kv_mem
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
new file mode 100644
index 0000000000..2189b55009
--- /dev/null
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
@@ -0,0 +1,164 @@
+#include
+#include
+
+#include
+#include
+#include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h"
+#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h"
+
+namespace kv_mem {
+std::vector generateFixedEmbedding(int dimension) {
+  return std::vector(dimension, 1.0);
+}
+
+void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) {
+  const size_t numShards = 1;
+
+  // Initialize the hash map backed by the memory pool
+  SynchronizedShardedMap embeddingMap(
+      numShards,
+      dimension * sizeof(float), // block_size
+      alignof(float), // block_alignment
+      8192); // blocks_per_chunk
+  double insertTime, lookupTime;
+  {
+    std::vector fixedEmbedding = generateFixedEmbedding(dimension);
+
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+    std::pmr::polymorphic_allocator alloc(pool);
+
+    auto startInsert = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numInserts; i++) {
+      float* arr = alloc.allocate(dimension);
+      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), arr);
+      wlock->insert_or_assign(i, arr);
+    }
+    auto endInsert = std::chrono::high_resolution_clock::now();
+    insertTime =
+        std::chrono::duration(endInsert - startInsert)
+            .count();
+  }
+
+  std::vector lookEmbedding(dimension);
+  size_t hitCount = 0;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    auto startLookup = std::chrono::high_resolution_clock::now();
+    for (size_t i
= 0; i < numLookups; i++) {
+      auto it = rlock->find(i % numInserts);
+      if (it != rlock->end()) {
+        hitCount++;
+        std::copy(it->second, it->second + dimension, lookEmbedding.data());
+      }
+    }
+    auto endLookup = std::chrono::high_resolution_clock::now();
+    lookupTime =
+        std::chrono::duration(endLookup - startLookup)
+            .count();
+  }
+
+  std::cout << std::left << std::setw(20) << dimension;
+  std::cout << std::fixed << std::setprecision(2);
+  std::cout << std::setw(20) << insertTime;
+  std::cout << std::setw(20) << lookupTime;
+  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
+  std::cout << std::endl;
+}
+
+void memPoolEmbeddingWithTime(int dimension,
+                              size_t numInserts,
+                              size_t numLookups) {
+  const size_t numShards = 1;
+  size_t block_size = StoreValueUtils::calculate_block_size(dimension);
+  size_t block_alignment = StoreValueUtils::calculate_block_alignment();
+
+  // Initialize the hash map backed by the memory pool
+  SynchronizedShardedMap embeddingMap(
+      numShards,
+      block_size, // block_size
+      block_alignment, // block_alignment
+      8192); // blocks_per_chunk
+  double insertTime, lookupTime;
+  // Measure insert performance
+  {
+    std::vector fixedEmbedding = generateFixedEmbedding(dimension);
+
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+
+    auto startInsert = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numInserts; i++) {
+      auto* block =
+          StoreValueUtils::allocate(block_size, block_alignment, pool);
+      auto* data_ptr = StoreValueUtils::data_ptr(block);
+      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
+      wlock->insert_or_assign(i, block);
+    }
+    auto endInsert = std::chrono::high_resolution_clock::now();
+    insertTime =
+        std::chrono::duration(endInsert - startInsert)
+            .count();
+  }
+
+  std::vector lookEmbedding(dimension);
+  size_t hitCount = 0;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    auto startLookup = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numLookups; i++) {
+      auto it = rlock->find(i % numInserts);
+      if (it != rlock->end()) {
+        hitCount++;
+        const float* data_ptr = StoreValueUtils::data_ptr(it->second);
+        // update timestamp
+        StoreValueUtils::update_timestamp(it->second);
+        std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
+      }
+    }
+    auto endLookup = std::chrono::high_resolution_clock::now();
+    lookupTime =
+        std::chrono::duration(endLookup - startLookup)
+            .count();
+  }
+
+  std::cout << std::left << std::setw(20) << dimension;
+  std::cout << std::fixed << std::setprecision(2);
+  std::cout << std::setw(20) << insertTime;
+  std::cout << std::setw(20) << lookupTime;
+  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
+  std::cout << std::endl;
+}
+
+int benchmark() {
+  std::vector dimensions = {4, 8, 16, 32, 64};
+  // Number of operations
+  const size_t numInserts = 1'000'000; // 1 million insert
+  const size_t numLookups = 1'000'000; // 1 million find
+
+  std::cout
+      << "======================= mempool ===================================="
+      << std::endl;
+  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
+            << "insert time (ms)" << std::setw(20) << "find time (ms)"
+            << std::setw(20) << "hit rate (%)" << std::endl;
+  for (int dim : dimensions) {
+    memPoolEmbedding(dim, numInserts, numLookups);
+  }
+  std::cout << std::endl << std ::endl;
+
+  std::cout << "======================= mempool with time "
+               "===================================="
+            << std::endl;
+  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
+            << "insert time (ms)" <<
std::setw(20) << "find time (ms)" + << std::setw(20) << "hit rate (%)" << std::endl; + for (int dim : dimensions) { + memPoolEmbeddingWithTime(dim, numInserts, numLookups); + } + std::cout << std::endl << std ::endl; + return 0; +} +TEST(SynchronizedShardedMap, benchmark) { benchmark(); } + +} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp new file mode 100644 index 0000000000..c1506c16e1 --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp @@ -0,0 +1,86 @@ +#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h" + +#include "gtest/gtest.h" +namespace kv_mem { + +TEST(StoreValueUtils, BasicFunctionality) { + constexpr int dim = 4; + size_t block_size = StoreValueUtils::calculate_block_size(dim); + size_t alignment = StoreValueUtils::calculate_block_alignment(); + + // Initialize memory pool + FixedBlockPool pool(block_size, alignment, 1024); + + // Test memory allocation + float* block = StoreValueUtils::allocate(block_size, alignment, &pool); + StoreValueUtils::update_timestamp(block); + ASSERT_NE(block, nullptr); + + // Verify metadata header + int64_t ts1 = StoreValueUtils::get_timestamp(block); + EXPECT_LE(StoreValueUtils::current_timestamp(), ts1); + + // Test data pointer offset + float* data = StoreValueUtils::data_ptr(block); + ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block), sizeof(StoreValueUtils::MetaHeader)); + + // Test timestamp update + StoreValueUtils::update_timestamp(block); + int64_t ts2 = StoreValueUtils::get_timestamp(block); + EXPECT_GE(ts2, ts1); // New timestamp should be greater or equal + + // Test memory deallocation + EXPECT_NO_THROW(StoreValueUtils::deallocate(block, block_size, alignment, &pool)); +} + +TEST(StoreValueUtils, MultiDimensionTest) { + // Test memory alignment for different dimensions + const std::vector test_dims = {1, 4, 16, 64, 256}; + for (int dim : test_dims) { + size_t block_size = StoreValueUtils::calculate_block_size(dim); + size_t alignment = StoreValueUtils::calculate_block_alignment(); + + // Verify alignment requirements + EXPECT_EQ(alignment % alignof(StoreValueUtils::MetaHeader), 0); + EXPECT_EQ(alignment % alignof(float), 0); + + // Verify block size calculation + const size_t expected_size = sizeof(StoreValueUtils::MetaHeader) + dim * sizeof(float); + EXPECT_EQ(block_size, expected_size); + } +} + +TEST(StoreValueUtils, TimestampPrecision) { + // Test timestamp precision accuracy + constexpr int test_iterations = 1000; + int64_t prev_ts = StoreValueUtils::current_timestamp(); + + for (int i = 0; i < test_iterations; ++i) { + int64_t curr_ts = StoreValueUtils::current_timestamp(); + EXPECT_GE(curr_ts, prev_ts); // Timestamps should be monotonically increasing + prev_ts = curr_ts; + } +} + +TEST(StoreValueUtils, DataIntegrity) { + // Test data storage integrity + constexpr int dim = 8; + std::vector src_data(dim, 3.14f); + + size_t block_size = StoreValueUtils::calculate_block_size(dim); + size_t alignment = StoreValueUtils::calculate_block_alignment(); + FixedBlockPool pool(block_size, alignment, 1024); + + // Allocate and write data + float* block = StoreValueUtils::allocate(block_size, alignment, &pool); + float* data_ptr = StoreValueUtils::data_ptr(block); + std::copy(src_data.begin(), src_data.end(), data_ptr); + + // Verify data consistency + for (int i = 0; i < dim; ++i) { + EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]); + } 
+ + StoreValueUtils::deallocate(block, block_size, alignment, &pool); +} +} // namespace kv_mem \ No newline at end of file From f712994af21571553d09d0ddb7ea42a99b48acc9 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 20 May 2025 12:40:39 +0800 Subject: [PATCH 02/12] use weights_data_ptr --- .../src/dram_kv_embedding_cache/dram_kv_embedding_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 4f2d90a9ca..4507e2060f 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -300,7 +300,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { std::copy( data_ptr, data_ptr + max_D_, - &weights_data[index * max_D_]); // dst_start + &(weights_data_ptr[index * max_D_])); // dst_start } } }); From 9c209a2eb3070139443dd8843d43b50e88e5c379 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 20 May 2025 13:44:03 +0800 Subject: [PATCH 03/12] update some annotate --- fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h | 1 + fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h index c10c318621..e2ac55acc9 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h @@ -8,6 +8,7 @@ namespace kv_mem { class StoreValueUtils { public: // Metadata structure (publicly accessible) + // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small. struct alignas(8) MetaHeader { int64_t timestamp; // 8 bytes // Can be extended with other fields: uint32_t counter, uint64_t key, etc. 
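For orientation between these hunks: the layout that store_value_utils.h manages is [MetaHeader | payload]. The following is a minimal usage sketch of the helpers above; it is not part of the patch, the dimension and fill value are illustrative, and the explicit <float> instantiations are assumed to match the template declarations whose arguments were lost in extraction.

```cpp
#include <algorithm>
#include <cstdint>

#include "fixed_block_pool.h"
#include "store_value_utils.h"

using namespace kv_mem;

int main() {
  constexpr size_t dim = 16;  // illustrative embedding dimension
  size_t block_size = StoreValueUtils::calculate_block_size<float>(dim);
  size_t alignment = StoreValueUtils::calculate_block_alignment<float>();
  FixedBlockPool pool(block_size, alignment, /*blocks_per_chunk=*/1024);

  // Each block is [MetaHeader | dim floats]; the helpers hide the offset math.
  float* block = StoreValueUtils::allocate<float>(block_size, alignment, &pool);
  StoreValueUtils::update_timestamp(block);  // stamp last-access time
  float* data = StoreValueUtils::data_ptr<float>(block);
  std::fill(data, data + dim, 1.0f);  // write the embedding payload

  int64_t last_seen = StoreValueUtils::get_timestamp(block);
  (void)last_seen;
  StoreValueUtils::deallocate<float>(block, block_size, alignment, &pool);
  return 0;
}
```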
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
index 2189b55009..6aa9373ac4 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
@@ -14,7 +14,6 @@ std::vector generateFixedEmbedding(int dimension) {
 void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) {
   const size_t numShards = 1;
 
-  // Initialize the hash map backed by the memory pool
   SynchronizedShardedMap embeddingMap(
       numShards,
       dimension * sizeof(float), // block_size
       alignof(float), // block_alignment
@@ -73,14 +72,12 @@ void memPoolEmbeddingWithTime(int dimension,
   size_t block_size = StoreValueUtils::calculate_block_size(dimension);
   size_t block_alignment = StoreValueUtils::calculate_block_alignment();
 
-  // Initialize the hash map backed by the memory pool
   SynchronizedShardedMap embeddingMap(
       numShards,
       block_size, // block_size
       block_alignment, // block_alignment
       8192); // blocks_per_chunk
   double insertTime, lookupTime;
-  // Measure insert performance
   {
     std::vector fixedEmbedding = generateFixedEmbedding(dimension);
 
@@ -132,7 +129,6 @@ void memPoolEmbeddingWithTime(int dimension,
 int benchmark() {
   std::vector dimensions = {4, 8, 16, 32, 64};
-  // Number of operations
   const size_t numInserts = 1'000'000; // 1 million insert
   const size_t numLookups = 1'000'000; // 1 million find
 
From bdec70251a08b9a4fb0c9a626dc01d8d704fa7d6 Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Mon, 26 May 2025 14:23:47 +0800
Subject: [PATCH 04/12] update MetaHeader

---
 .../fixed_block_pool.h                        | 163 ++++++++++++--
 .../src/dram_kv_embedding_cache/store_value.h |  56 -----
 .../store_value_utils.h                       |  82 -------
 .../dram_kv_embedding_cache/CMakeLists.txt    |  19 +-
 .../feature_evict_test.cpp                    | 202 ++++++++++++++++++
 .../fixed_block_pool_test.cpp                 |  86 +++++++-
 .../sharded_map_test.cpp                      |  15 +-
 7 files changed, 458 insertions(+), 165 deletions(-)
 delete mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h
 delete mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h
 create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index a474b367c3..bcc1e36fca 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include
 #include
 #include
 #include
@@ -10,6 +11,130 @@
 namespace kv_mem {
 class FixedBlockPool : public std::pmr::memory_resource {
  public:
+  // Chunk metadata
+  struct ChunkInfo {
+    void* ptr; // Memory block pointer
+    std::size_t size; // Total size
+    std::size_t alignment;
+  };
+
+  // Metadata structure (publicly accessible)
+  // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small.
+  struct alignas(8) MetaHeader {
+    uint64_t key; // 8 bytes
+    int32_t score; // 4 bytes
+    bool used; // 1 byte
+  };
+
+  // Metadata operations
+
+  // Key operations
+  static uint64_t get_key(const void* block) {
+    return reinterpret_cast(block)->key;
+  }
+  static void set_key(void* block, uint64_t key) {
+    reinterpret_cast(block)->key = key;
+  }
+
+  // used operations
+  static bool get_used(const void* block) {
+    return reinterpret_cast(block)->used;
+  }
+  static void set_used(void* block, bool used) {
+    reinterpret_cast(block)->used = used;
+  }
+
+  // Score operations
+  static int32_t get_score(const void* block) {
+    return reinterpret_cast(block)->score;
+  }
+  static void set_score(void* block, int32_t score) {
+    reinterpret_cast(block)->score = score;
+  }
+  static void update_score(void* block) {
+    auto& score = reinterpret_cast(block)->score;
+    // Saturate instead of wrapping, to avoid counter overflow
+    if (score < std::numeric_limits::max()) {
+      score++;
+    }
+  }
+  // timestamp operations
+  static void update_timestamp(void* block) {
+    reinterpret_cast(block)->score = current_timestamp();
+  }
+  static int32_t current_timestamp() {
+    auto stamp = std::chrono::duration_cast(
+                     std::chrono::system_clock::now().time_since_epoch())
+                     .count();
+    return static_cast(stamp);
+    // facebook::WallClockUtil::NowInUsecFast();
+  }
+
+  // Type-dependent helpers
+  // Calculate storage size
+  template
+  static size_t calculate_block_size(size_t dimension) {
+    return sizeof(FixedBlockPool::MetaHeader) + dimension * sizeof(scalar_t);
+  }
+
+  // Calculate alignment requirements
+  template
+  static size_t calculate_block_alignment() {
+    return std::max(alignof(FixedBlockPool::MetaHeader), alignof(scalar_t));
+  }
+
+  // Data pointer retrieval
+  template
+  static scalar_t* data_ptr(scalar_t* block) {
+    return reinterpret_cast(reinterpret_cast(block) +
+                            sizeof(FixedBlockPool::MetaHeader));
+  }
+
+  template
+  static const scalar_t* data_ptr(const scalar_t* block) {
+    return reinterpret_cast(
+        reinterpret_cast(block) +
+        sizeof(FixedBlockPool::MetaHeader));
+  }
+
+  // Create memory block with metadata
+  template
+  static scalar_t* allocate_t(size_t& block_size,
+                              size_t& alignment,
+                              FixedBlockPool* pool) {
+    auto* block =
+        reinterpret_cast(pool->allocate(block_size, alignment));
+    return block;
+  }
+
+  // Destroy memory block
+  template
+  static void deallocate_t(scalar_t* block,
+                           size_t& block_size,
+                           size_t& alignment,
+                           FixedBlockPool* pool) {
+    pool->deallocate(block, block_size, alignment);
+  }
+
+  // Usage example
+  template
+  static void get_keys_with_low_score(FixedBlockPool* pool,
+                                      int32_t threshold,
+                                      float decay,
+                                      std::vector& result) {
+    pool->for_each_block([&decay, &threshold, &result](void* block) {
+      if (FixedBlockPool::get_used(block)) {
+        auto score = FixedBlockPool::get_score(static_cast(block));
+        score = score * decay;
+        FixedBlockPool::set_score(static_cast(block), score);
+        if (score < threshold) {
+          result.push_back(
+              FixedBlockPool::get_key(static_cast(block)));
+        }
+      }
+    });
+  }
+
   explicit FixedBlockPool(
       std::size_t block_size, // Size of each memory block
       std::size_t block_alignment, // Memory block alignment requirement
@@ -54,6 +179,21 @@
   }
 
+  // New interface: access chunk metadata
+  [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
+
+  // New interface: iterate over all blocks
+  template
+  void for_each_block(Func&& func) const {
+    for (const auto& chunk : chunks_) {
+      char* current = static_cast(chunk.ptr);
+      for (size_t i = 0; i < blocks_per_chunk_; ++i) {
+        func(current);
+        current +=
block_size_; + } + } + } + protected: // Core allocation function void* do_allocate(std::size_t bytes, std::size_t alignment) override { @@ -70,6 +210,7 @@ class FixedBlockPool : public std::pmr::memory_resource { // Take a block from the head of the free list void* result = free_list_; free_list_ = *static_cast(free_list_); + FixedBlockPool::set_used(result, true); return result; } @@ -80,6 +221,7 @@ class FixedBlockPool : public std::pmr::memory_resource { // Insert memory block back to the head of free list *static_cast(p) = free_list_; free_list_ = p; + FixedBlockPool::set_used(free_list_, false); } // Resource equality comparison (only the same object is equal) @@ -89,13 +231,6 @@ class FixedBlockPool : public std::pmr::memory_resource { } private: - // Chunk metadata - struct chunk_info { - void* ptr; // Memory block pointer - std::size_t size; // Total size - std::size_t alignment; - }; - // Allocate a new memory chunk void allocate_chunk() { const std::size_t chunk_size = block_size_ * blocks_per_chunk_; @@ -103,6 +238,9 @@ class FixedBlockPool : public std::pmr::memory_resource { // Allocate aligned memory through upstream resource void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_); + // Block used flag set false. + FixedBlockPool::set_used(chunk_ptr, false); + // Record chunk information for later release chunks_.push_back({chunk_ptr, chunk_size, block_alignment_}); @@ -118,11 +256,10 @@ class FixedBlockPool : public std::pmr::memory_resource { // Member variables const std::size_t block_size_; // Block size (not less than pointer size) - const std::size_t block_alignment_; // Block alignment requirement - const std::size_t blocks_per_chunk_; // Number of blocks per chunk - std::pmr::memory_resource* upstream_; // Upstream memory resource - std::pmr::vector chunks_{ - 1024}; // Records of all allocated chunks - void* free_list_ = nullptr; // Free block list head pointer + const std::size_t block_alignment_; // Block alignment requirement + const std::size_t blocks_per_chunk_; // Number of blocks per chunk + std::pmr::memory_resource* upstream_; // Upstream memory resource + std::pmr::vector chunks_{1024}; // Records of all allocated chunks + void* free_list_ = nullptr; // Free block list head pointer }; } // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h deleted file mode 100644 index 375c63ce46..0000000000 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -#include -#include "common/time/Time.h" - -namespace kv_mem { - -/// @ingroup embedding-dram-kvstore -/// -/// @brief data structure to store tensor value and it's timestamp -// -template -class StoreValue { - public: - explicit StoreValue(std::vector&& value) { - value_ = std::move(value); - timestamp_ = facebook::WallClockUtil::NowInUsecFast(); - } - - explicit StoreValue(StoreValue&& pv) noexcept { - timestamp_ = facebook::WallClockUtil::NowInUsecFast(); - value_ = std::move(pv.value_); - } - - int64_t getTimestamp() const { - return timestamp_; - } - - const std::vector& getValue() const { - return value_; - } - - const std::vector& getValueAndPromote() { - timestamp_ = facebook::WallClockUtil::NowInUsecFast(); - return value_; - } - - private: - StoreValue& operator=(const StoreValue&) = delete; - StoreValue& operator=(const StoreValue&&) = delete; - StoreValue(const StoreValue& other) = delete; - - // cached tensor value - std::vector value_; - - // last visit timestamp - int64_t timestamp_; -}; -} // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h b/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h deleted file mode 100644 index e2ac55acc9..0000000000 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once -#include - -#include "fixed_block_pool.h" - -namespace kv_mem { - -class StoreValueUtils { - public: - // Metadata structure (publicly accessible) - // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small. - struct alignas(8) MetaHeader { - int64_t timestamp; // 8 bytes - // Can be extended with other fields: uint32_t counter, uint64_t key, etc. - }; - - // Create memory block with metadata - template - static scalar_t* allocate(size_t& block_size, - size_t& alignment, - FixedBlockPool* pool) { - return reinterpret_cast(pool->allocate(block_size, alignment)); - } - - // Destroy memory block - template - static void deallocate(scalar_t* block, - size_t& block_size, - size_t& alignment, - FixedBlockPool* pool) { - pool->deallocate(block, block_size, alignment); - } - - // Calculate storage size - template - static size_t calculate_block_size(size_t dimension) { - return sizeof(MetaHeader) + dimension * sizeof(scalar_t); - } - - // Calculate alignment requirements - template - static size_t calculate_block_alignment() { - return std::max(alignof(MetaHeader), alignof(scalar_t)); - } - - // Metadata operations - template - static int64_t get_timestamp(const scalar_t* block) { - return reinterpret_cast(block)->timestamp; - } - - template - static void set_timestamp(scalar_t* block, int64_t ts) { - reinterpret_cast(block)->timestamp = ts; - } - - template - static void update_timestamp(scalar_t* block) { - reinterpret_cast(block)->timestamp = current_timestamp(); - } - - // Data pointer retrieval - template - static scalar_t* data_ptr(scalar_t* block) { - return reinterpret_cast(reinterpret_cast(block) + - sizeof(MetaHeader)); - } - - template - static const scalar_t* data_ptr(const scalar_t* block) { - return reinterpret_cast( - reinterpret_cast(block) + sizeof(MetaHeader)); - } - - static int64_t current_timestamp() { - return std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - // facebook::WallClockUtil::NowInUsecFast(); - } -}; -} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt 
b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
index e9a41eac51..9bf610f50d 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
@@ -1,14 +1,21 @@
+find_package(folly REQUIRED)
+find_package(gflags REQUIRED)
+
+include_directories(
+  ${FBGEMM_SOURCE_DIR}
+)
+
 add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp)
 target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17)
-target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR})
+target_compile_options(fixed_block_pool_test PUBLIC "-O3")
 target_link_libraries(fixed_block_pool_test gtest gtest_main)
 
 add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp)
 target_compile_features(sharded_map_test PUBLIC cxx_std_17)
-target_include_directories(fixed_block_pool_test PUBLIC ${FBGEMM_SOURCE_DIR})
+target_compile_options(sharded_map_test PUBLIC "-O3")
 target_link_libraries(sharded_map_test gtest gtest_main Folly::folly)
 
-add_executable(store_value_utils_test ${CMAKE_CURRENT_SOURCE_DIR}/store_value_utils_test.cpp)
-target_compile_features(store_value_utils_test PUBLIC cxx_std_17)
-target_include_directories(store_value_utils_test PUBLIC ${FBGEMM_SOURCE_DIR})
-target_link_libraries(store_value_utils_test gtest gtest_main Folly::folly)
\ No newline at end of file
+add_executable(feature_evict_test ${CMAKE_CURRENT_SOURCE_DIR}/feature_evict_test.cpp)
+target_compile_features(feature_evict_test PUBLIC cxx_std_17)
+target_compile_options(feature_evict_test PUBLIC "-O3")
+target_link_libraries(feature_evict_test gtest gtest_main Folly::folly)
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
new file mode 100644
index 0000000000..6ae9631ce8
--- /dev/null
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -0,0 +1,202 @@
+//
+// Created by arron on 2025/5/22.
+// +#include +#include +#include + +#include +#include +#include + +#include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h" + +namespace kv_mem { + +// Zipf分布生成器实现 +// alpha=1.3 → 约90%重复率 +// alpha=1.5 → 约95%重复率 +// alpha=2.0 → 约99%重复率 +class ZipfGenerator { + public: + ZipfGenerator(double alpha, unsigned long n) : alpha_(alpha), n_(n), dist_(0.0, 1.0) { + // 预计算调和数 + c_ = 0.0; + for (unsigned long i = 1; i <= n_; ++i) c_ += 1.0 / std::pow(i, alpha_); + c_ = 1.0 / c_; + } + + template + unsigned long operator()(Generator& gen) { + while (true) { + double u = dist_(gen); + double v = dist_(gen); + unsigned long k = static_cast(std::floor(std::pow(u, -1.0 / (alpha_ - 1.0)))); + if (k > n_) continue; + double T = std::pow((k + 1.0) / k, alpha_ - 1.0); + double accept_prob = (std::pow(k, -alpha_)) / (c_ * v * (T - 1.0) * k / n_); + if (accept_prob >= 1.0 || dist_(gen) < accept_prob) { + return k; + } + } + } + + private: + double alpha_; // 分布参数(>1.0) + unsigned long n_; // 元素总数 + double c_; // 归一化常数 + std::uniform_real_distribution dist_; +}; + +std::vector generateFixedEmbedding(int dimension) { return std::vector(dimension, 1.0); } + +void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) { + const size_t numShards = 1; + size_t block_size = FixedBlockPool::calculate_block_size(dimension); + size_t block_alignment = FixedBlockPool::calculate_block_alignment(); + + const size_t TOTAL_KEYS = 1'000'000; // 1百万个可能的键 + const double ZIPF_ALPHA = 1.5; // 调整这个参数控制热点程度 + + ZipfGenerator zipf(ZIPF_ALPHA, TOTAL_KEYS); + std::random_device rd; + std::mt19937 gen(rd()); + + SynchronizedShardedMap embeddingMap(numShards, + block_size, // block_size + block_alignment, // block_alignment + 8192); // blocks_per_chunk + double insertTime, lookupTime; + { + std::vector fixedEmbedding = generateFixedEmbedding(dimension); + + auto wlmap = embeddingMap.by(0).wlock(); + auto* pool = embeddingMap.pool_by(0); + + auto startInsert = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < numInserts; i++) { + auto id = zipf(gen); + // use mempool + float* block = nullptr; + // First check if the key already exists + auto it = wlmap->find(id); + if (it != wlmap->end()) { + block = it->second; + } else { + // Key doesn't exist, allocate new block and insert. 
+        block = FixedBlockPool::allocate_t(block_size, block_alignment, pool);
+        FixedBlockPool::set_key(block, id);
+        FixedBlockPool::set_score(block, 0);
+        FixedBlockPool::set_used(block, true);
+
+        wlmap->insert({id, block});
+      }
+      FixedBlockPool::update_score(block);
+      auto* data_ptr = FixedBlockPool::data_ptr(block);
+      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
+    }
+    auto endInsert = std::chrono::high_resolution_clock::now();
+    insertTime = std::chrono::duration(endInsert - startInsert).count();
+  }
+
+  std::vector lookEmbedding(dimension);
+  size_t hitCount = 0;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    auto startLookup = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < numLookups; i++) {
+      auto id = zipf(gen);
+      auto it = rlock->find(id);
+      if (it != rlock->end()) {
+        hitCount++;
+        const float* data_ptr = FixedBlockPool::data_ptr(it->second);
+        std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
+      }
+    }
+    auto endLookup = std::chrono::high_resolution_clock::now();
+    lookupTime = std::chrono::duration(endLookup - startLookup).count();
+  }
+
+  {
+    size_t score_sum = 0;
+    auto rlock = embeddingMap.by(0).rlock();
+    for (const auto& [key, block] : *rlock) {
+      score_sum += FixedBlockPool::get_score(block);
+    }
+    ASSERT_EQ(score_sum, numInserts);
+  }
+
+  // Walk the chunks to find the keys to evict;
+  // hold a read lock on the map while collecting them
+  std::vector low_keys;
+  {
+    auto rlock = embeddingMap.by(0).rlock();
+    std::cout << "map num:" << rlock->size() << std::endl;
+    auto* pool = embeddingMap.pool_by(0);
+    FixedBlockPool::get_keys_with_low_score(pool, 1, 0.99, low_keys);
+    std::cout << "low key num:" << low_keys.size() << std::endl;
+  }
+
+  // Take the write lock, erase from the map, and free the pool memory
+  {
+    // Acquire the write lock for map erasure and pool deallocation
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+
+    for (auto& key : low_keys) {
+      // 1. Look up the key and fetch its block pointer from the map
+      auto it = wlock->find(key);
+      if (it != wlock->end()) {
+        float* block = it->second;
+        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
+        // 3. Remove the key-value pair from the map
+        wlock->erase(it);
+      }
+    }
+    std::cout << "after delete, map size:" << wlock->size() << std::endl;
+  }
+
+  // Deletion phase: process in batches of 1000 keys
+  const size_t batch_size = 1000;
+  for (size_t i = 0; i < low_keys.size(); i += batch_size) {
+    auto start = low_keys.begin() + i;
+    auto end = (i + batch_size < low_keys.size()) ?
low_keys.begin() + i + batch_size : low_keys.end();
+    std::vector batch(start, end);
+
+    // Take the write lock and process the current batch
+    auto wlock = embeddingMap.by(0).wlock();
+    auto* pool = embeddingMap.pool_by(0);
+
+    for (auto key : batch) {
+      auto it = wlock->find(key);
+      if (it != wlock->end()) {
+        float* block = it->second;
+        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
+        wlock->erase(it);
+      }
+    }
+    std::cout << "after delete, map size:" << wlock->size() << std::endl;
+  }
+
+  std::cout << std::left << std::setw(20) << dimension;
+  std::cout << std::fixed << std::setprecision(2);
+  std::cout << std::setw(20) << insertTime;
+  std::cout << std::setw(20) << lookupTime;
+  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
+  std::cout << std::endl;
+}
+
+int benchmark() {
+  std::vector dimensions = {4};
+  const size_t numInserts = 1'000'000; // 1 million insert
+  const size_t numLookups = 1'000'000; // 1 million find
+
+  std::cout << "======================= mempool ====================================" << std::endl;
+  std::cout << std::left << std::setw(20) << "dim" << std::setw(20) << "insert time (ms)" << std::setw(20) << "find time (ms)" << std::setw(20) << "hit rate (%)" << std::endl;
+  for (int dim : dimensions) {
+    memPoolEmbeddingWithTime(dim, numInserts, numLookups);
+  }
+  return 0;
+}
+TEST(Evict, benchmark) { benchmark(); }
+} // namespace kv_mem
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
index 606819ad8d..8d7767c879 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
@@ -8,7 +8,6 @@
 #include
 
-#include "fixed_block_pool.h"
 namespace kv_mem {
 
 double test_std_vector(size_t vector_size, size_t repeat_count) {
@@ -298,4 +297,89 @@ TEST(FixedBlockPoolTest, CustomUpstreamResource) {
   EXPECT_GT(deallocate_count, 0);
 }
 
+TEST(FixedBlockPool, BasicFunctionality) {
+  constexpr int dim = 4;
+  size_t block_size = FixedBlockPool::calculate_block_size(dim);
+  size_t alignment = FixedBlockPool::calculate_block_alignment();
+
+  // Initialize memory pool
+  FixedBlockPool pool(block_size, alignment, 1024);
+
+  // Test memory allocation
+  auto* block = FixedBlockPool::allocate_t(block_size, alignment, &pool);
+  FixedBlockPool::update_timestamp(block);
+  ASSERT_NE(block, nullptr);
+
+  // Verify metadata header
+  int64_t ts1 = FixedBlockPool::get_score(block);
+  EXPECT_LE(FixedBlockPool::current_timestamp(), ts1);
+
+  // Test data pointer offset
+  float* data = FixedBlockPool::data_ptr(block);
+  ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block),
+            sizeof(FixedBlockPool::MetaHeader));
+
+  // Test timestamp update
+  FixedBlockPool::update_timestamp(block);
+  int64_t ts2 = FixedBlockPool::get_score(block);
+  EXPECT_GE(ts2, ts1); // New timestamp should be greater or equal
+
+  // Test memory deallocation
+  EXPECT_NO_THROW(
+      FixedBlockPool::deallocate_t(block, block_size, alignment, &pool));
+}
+
+TEST(FixedBlockPool, MultiDimensionTest) {
+  // Test memory alignment for different dimensions
+  const std::vector test_dims = {1, 4, 16, 64, 256};
+  for (int dim : test_dims) {
+    size_t block_size = FixedBlockPool::calculate_block_size(dim);
+    size_t alignment = FixedBlockPool::calculate_block_alignment();
+
+    // Verify alignment requirements
+    EXPECT_EQ(alignment % alignof(FixedBlockPool::MetaHeader), 0);
+    EXPECT_EQ(alignment % alignof(float), 0);
+
+    // Verify block size
calculation + const size_t expected_size = + sizeof(FixedBlockPool::MetaHeader) + dim * sizeof(float); + EXPECT_EQ(block_size, expected_size); + } +} + +TEST(FixedBlockPool, TimestampPrecision) { + // Test timestamp precision accuracy + constexpr int test_iterations = 1000; + int64_t prev_ts = FixedBlockPool::current_timestamp(); + + for (int i = 0; i < test_iterations; ++i) { + int64_t curr_ts = FixedBlockPool::current_timestamp(); + EXPECT_GE(curr_ts, + prev_ts); // Timestamps should be monotonically increasing + prev_ts = curr_ts; + } +} + +TEST(FixedBlockPool, DataIntegrity) { + // Test data storage integrity + constexpr int dim = 8; + std::vector src_data(dim, 3.14f); + + size_t block_size = FixedBlockPool::calculate_block_size(dim); + size_t alignment = FixedBlockPool::calculate_block_alignment(); + FixedBlockPool pool(block_size, alignment, 1024); + + // Allocate and write data + auto* block = FixedBlockPool::allocate_t(block_size, alignment, &pool); + auto* data_ptr = FixedBlockPool::data_ptr(block); + std::copy(src_data.begin(), src_data.end(), data_ptr); + + // Verify data consistency + for (int i = 0; i < dim; ++i) { + EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]); + } + + FixedBlockPool::deallocate_t(block, block_size, alignment, &pool); +} + } // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index 6aa9373ac4..f19e1e219a 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -3,8 +3,9 @@ #include #include + #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h" -#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h" +#include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" namespace kv_mem { std::vector generateFixedEmbedding(int dimension) { @@ -69,8 +70,8 @@ void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) { const size_t numShards = 1; - size_t block_size = StoreValueUtils::calculate_block_size(dimension); - size_t block_alignment = StoreValueUtils::calculate_block_alignment(); + size_t block_size = MemPoolUtils::calculate_block_size(dimension); + size_t block_alignment = MemPoolUtils::calculate_block_alignment(); SynchronizedShardedMap embeddingMap( numShards, @@ -87,8 +88,8 @@ void memPoolEmbeddingWithTime(int dimension, auto startInsert = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < numInserts; i++) { auto* block = - StoreValueUtils::allocate(block_size, block_alignment, pool); - auto* data_ptr = StoreValueUtils::data_ptr(block); + MemPoolUtils::allocate(block_size, block_alignment, pool); + auto* data_ptr = MemPoolUtils::data_ptr(block); std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr); wlock->insert_or_assign(i, block); } @@ -107,9 +108,9 @@ void memPoolEmbeddingWithTime(int dimension, auto it = rlock->find(i % numInserts); if (it != rlock->end()) { hitCount++; - const float* data_ptr = StoreValueUtils::data_ptr(it->second); + const float* data_ptr = MemPoolUtils::data_ptr(it->second); // update timestamp - StoreValueUtils::update_timestamp(it->second); + FixedBlockPool::update_timestamp(it->second); std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data()); } } From a2cab1b59b495af7df87301643eb9a18a2b2d648 Mon Sep 17 00:00:00 2001 From: houzhenggang Date: Tue, 27 May 2025 10:11:47 +0800 Subject: [PATCH 05/12] FeatureEvict --- 
 .../dram_kv_embedding_cache/feature_evict.h   | 199 +++++++++++++++
 .../fixed_block_pool.h                        | 159 +++++-------
 .../feature_evict_test.cpp                    | 237 +++++-------------
 .../fixed_block_pool_test.cpp                 |  14 +-
 .../sharded_map_test.cpp                      |  11 +-
 .../store_value_utils_test.cpp                |  86 -------
 6 files changed, 329 insertions(+), 377 deletions(-)
 create mode 100644 fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
 delete mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
new file mode 100644
index 0000000000..c531d60966
--- /dev/null
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
@@ -0,0 +1,199 @@
+//
+// Created by root on 25-5-26.
+//
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include <folly/Unit.h>
+#include <folly/executors/CPUThreadPoolExecutor.h>
+#include <folly/futures/Future.h>
+
+#include "SynchronizedShardedMap.h"
+
+namespace kv_mem {
+
+class FeatureEvictBase {
+ public:
+  FeatureEvictBase(folly::CPUThreadPoolExecutor* executor,
+                   SynchronizedShardedMap<int64_t, float*>& kv_store)
+      : executor_(executor),
+        kv_store_(kv_store),
+        evict_flag_(false),
+        evict_interrupt_(false),
+        num_shards_(kv_store.getNumShards()) {
+    init_shard_status();
+    // evict_flag_ indicates whether a task is in progress
+    // evict_interrupt_ indicates whether the task has been interrupted
+  }
+
+  virtual ~FeatureEvictBase() {
+    // On destruction, wait for running tasks to finish
+    wait_completion();  // wait for all asynchronous tasks to complete
+  };
+
+  // Trigger asynchronous eviction
+  // If a task is already running, return immediately to prevent repeated triggers
+  // Otherwise, initialize the task state
+  void trigger_evict() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (evict_flag_.exchange(true)) return;
+    prepare_evict();
+  }
+
+  // Resume task execution; returns true if a task is in progress, false otherwise
+  bool resume() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!evict_flag_.load()) return false;
+    evict_interrupt_.store(false);
+    for (int shard_id = 0; shard_id < num_shards_; ++shard_id) {
+      submit_shard_task(shard_id);
+    }
+    return true;
+  };
+
+  // Pause the eviction process; returns true if a task is in progress, false otherwise
+  // While paused, check whether the eviction has completed
+  bool pause() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (!evict_flag_.load()) return false;
+    evict_interrupt_.store(true);
+    check_and_reset_evict_flag();
+    wait_completion();
+    return true;
+  }
+
+  // Check whether eviction is in progress
+  bool is_evicting() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    check_and_reset_evict_flag();
+    return evict_flag_.load();
+  }
+
+ protected:
+  void init_shard_status() {
+    block_cursors_.resize(num_shards_);
+    block_nums_snapshot_.resize(num_shards_);
+    shards_finished_.clear();
+    for (int i = 0; i < num_shards_; ++i) {
+      block_cursors_[i] = 0;
+      block_nums_snapshot_[i] = 0;
+      shards_finished_.emplace_back(std::make_unique<std::atomic<bool>>(false));
+    }
+  }
+
+  // Initialize per-shard state
+  void prepare_evict() {
+    for (int shard_id = 0; shard_id < num_shards_; ++shard_id) {
+      auto rlmap = kv_store_.by(shard_id).rlock();
+      auto* mempool = kv_store_.pool_by(shard_id);
+      block_nums_snapshot_[shard_id] =
+          mempool->get_chunks().size() * mempool->get_blocks_per_chunk();
+      block_cursors_[shard_id] = 0;
+      shards_finished_[shard_id]->store(false);
+    }
+  }
+
+  void submit_shard_task(int shard_id) {
+    if (shards_finished_[shard_id]->load()) return;
+    futures_.emplace_back(folly::via(executor_).thenValue(
+        [this, shard_id](auto&&) { process_shard(shard_id); }));
+  }
+
+  void process_shard(int shard_id) {
+    auto wlock = kv_store_.by(shard_id).wlock();
+    auto* pool = kv_store_.pool_by(shard_id);
+    while (!evict_interrupt_.load() &&
+           block_cursors_[shard_id] < block_nums_snapshot_[shard_id]) {
+      auto* block = pool->get_block<float>(block_cursors_[shard_id]++);
+      if (block && evict_block(block)) {
+        int64_t key = FixedBlockPool::get_key(block);
+        auto it = wlock->find(key);
+        if (it != wlock->end() && block == it->second) {
+          wlock->erase(key);
+          pool->deallocate_t(block);
+        }
+      }
+    }
+
+    // Check whether the loop ended normally
+    if (block_cursors_[shard_id] >= block_nums_snapshot_[shard_id]) {
+      shards_finished_[shard_id]->store(true);
+    }
+  }
+
+  virtual bool evict_block(float* block) = 0;
+
+  void wait_completion() {
+    folly::collectAll(futures_).wait();
+    futures_.clear();
+  }
+
+  // Check and reset
+  void check_and_reset_evict_flag() {
+    bool all_finished = true;
+    for (int i = 0; i < num_shards_; ++i) {
+      if (!shards_finished_[i]->load()) all_finished = false;
+    }
+    if (all_finished) evict_flag_.store(false);
+  }
+
+  folly::CPUThreadPoolExecutor* executor_;  // thread pool
+  SynchronizedShardedMap<int64_t, float*>& kv_store_;  // shard map
+  std::vector<std::size_t> block_cursors_;  // index of processed blocks
+  std::vector<std::size_t> block_nums_snapshot_;  // total number of blocks recorded when eviction is triggered
+  std::vector<std::unique_ptr<std::atomic<bool>>>
+      shards_finished_;  // per-shard completion flags
+  std::atomic<bool> evict_flag_;  // whether an eviction task is in progress
+  std::atomic<bool> evict_interrupt_;  // whether the eviction task is paused
+  std::vector<folly::Future<folly::Unit>> futures_;  // per-shard task records
+  std::mutex mutex_;  // interface lock keeping the public interface thread-safe
+  int num_shards_;  // number of concurrent tasks
+};
+
+class CounterBasedEvict : public FeatureEvictBase {
+ public:
+  CounterBasedEvict(folly::CPUThreadPoolExecutor* executor,
+                    SynchronizedShardedMap<int64_t, float*>& kv_store,
+                    float decay_rate,
+                    int threshold)
+      : FeatureEvictBase(executor, kv_store),
+        decay_rate_(decay_rate),
+        threshold_(threshold) {}
+
+ protected:
+  bool evict_block(float* block) override {
+    // Apply decay and check the threshold
+    auto current_count = FixedBlockPool::get_count(block);
+    current_count *= decay_rate_;
+    FixedBlockPool::set_count(block, current_count);
+    return current_count < threshold_;
+  }
+
+ private:
+  float decay_rate_;
+  uint32_t threshold_;
+};
+
+class TimeBasedEvict : public FeatureEvictBase {
+ public:
+  TimeBasedEvict(folly::CPUThreadPoolExecutor* executor,
+                 SynchronizedShardedMap<int64_t, float*>& kv_store,
+                 uint32_t ttl)
+      : FeatureEvictBase(executor, kv_store), ttl_(ttl) {}
+
+ protected:
+  bool evict_block(float* block) override {
+    auto current_time = FixedBlockPool::current_timestamp();
+    return current_time - FixedBlockPool::get_timestamp(block) > ttl_;
+  }
+
+ private:
+  uint32_t ttl_;
+};
+}  // namespace kv_mem
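The public interface above implies a cooperative driving pattern: trigger once, then alternate resume()/pause() so that the shard write locks are held only while the caller is otherwise idle. A minimal sketch of such a driver, assuming an already-constructed evictor; the helper and variable names here are illustrative, not part of the patch:

    #include <chrono>
    #include <thread>
    #include "feature_evict.h"

    // Hypothetical helper: drain one eviction pass between training batches.
    void drain_between_batches(kv_mem::CounterBasedEvict& evictor) {
      evictor.trigger_evict();  // returns immediately if a pass is already running
      while (evictor.is_evicting()) {
        evictor.resume();       // fan the per-shard tasks out to the executor
        std::this_thread::sleep_for(std::chrono::milliseconds(1));  // foreground work would go here
        evictor.pause();        // interrupt the tasks and release the shard write locks
      }
    }

The same interleaving appears in the tests later in this series, so the sketch mirrors the intended use rather than inventing one.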
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index bcc1e36fca..0ffb9e40e5 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -9,6 +9,8 @@
 #include <memory_resource>
 
 namespace kv_mem {
+static constexpr uint32_t kMaxInt31Counter = 2147483647;
+
 class FixedBlockPool : public std::pmr::memory_resource {
  public:
   // Chunk metadata
@@ -20,54 +22,41 @@
 
   // Metadata structure (publicly accessible)
   // alignas(8) MetaHeader >= sizeof(void*), avoid mempool block too small.
-  struct alignas(8) MetaHeader {
-    uint64_t key;   // 8 bytes
-    int32_t score;  // 4 bytes
-    bool used;      // 1 byte
+  // Metadata structure (publicly accessible)
+  struct alignas(8) MetaHeader {  // 16 bytes
+    int64_t key;          // feature key, 8 bytes
+    uint32_t timestamp;   // 4 bytes, in seconds; uint32 covers a range of over 120 years
+    uint32_t count : 31;  // only 31 bits are used; max value is 2147483647
+    bool used : 1;        // marks whether this block is in use, for memory-pool traversal
+    // Can be extended with other fields: uint32_t click, etc.
   };
 
   // Metadata operations
   // Key operations
-  static uint64_t get_key(const void* block) {
-    return reinterpret_cast<const MetaHeader*>(block)->key;
-  }
-  static void set_key(void* block, uint64_t key) {
-    reinterpret_cast<MetaHeader*>(block)->key = key;
-  }
+  static uint64_t get_key(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->key; }
+  static void set_key(void* block, uint64_t key) { reinterpret_cast<MetaHeader*>(block)->key = key; }
 
   // used operations
-  static bool get_used(const void* block) {
-    return reinterpret_cast<const MetaHeader*>(block)->used;
-  }
-  static void set_used(void* block, bool used) {
-    reinterpret_cast<MetaHeader*>(block)->used = used;
-  }
+  static bool get_used(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->used; }
+  static void set_used(void* block, bool used) { reinterpret_cast<MetaHeader*>(block)->used = used; }
 
   // Score operations
-  static int32_t get_score(const void* block) {
-    return reinterpret_cast<const MetaHeader*>(block)->score;
-  }
-  static void set_score(void* block, int32_t score) {
-    reinterpret_cast<MetaHeader*>(block)->score = score;
-  }
-  static void update_score(void* block) {
-    auto& score = reinterpret_cast<MetaHeader*>(block)->score;
+  static uint32_t get_count(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->count; }
+  static void set_count(void* block, uint32_t count) { reinterpret_cast<MetaHeader*>(block)->count = count; }
+  static void update_count(void* block) {
     // Avoid counter overflow
-    if (score < std::numeric_limits<int32_t>::max()) {
-      score++;
+    if (reinterpret_cast<MetaHeader*>(block)->count < kMaxInt31Counter) {
+      reinterpret_cast<MetaHeader*>(block)->count++;
     }
   }
 
   // timestamp operations
-  static void update_timestamp(void* block) {
-    reinterpret_cast<MetaHeader*>(block)->score = current_timestamp();
-  }
-  static int32_t current_timestamp() {
-    auto stamp = std::chrono::duration_cast<std::chrono::seconds>(
-                     std::chrono::system_clock::now().time_since_epoch())
-                     .count();
-    return static_cast<int32_t>(stamp);
+  static uint32_t get_timestamp(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->timestamp; }
+  static void update_timestamp(void* block) { reinterpret_cast<MetaHeader*>(block)->timestamp = current_timestamp(); }
+  static uint32_t current_timestamp() {
+    // std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+    // facebook::WallClockUtil::NowInUsecFast();
+    return std::time(nullptr);
   }
 
   // Type-dependent
   // Calculate storage size
   template <typename scalar_t>
   static size_t calculate_block_size(size_t dimension) {
@@ -86,60 +75,29 @@
   // Data pointer retrieval
   template <typename scalar_t>
   static scalar_t* data_ptr(scalar_t* block) {
-    return reinterpret_cast<scalar_t*>(reinterpret_cast<char*>(block) +
-                                       sizeof(FixedBlockPool::MetaHeader));
+    return reinterpret_cast<scalar_t*>(reinterpret_cast<char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
   template <typename scalar_t>
   static const scalar_t* data_ptr(const scalar_t* block) {
-    return reinterpret_cast<const scalar_t*>(
-        reinterpret_cast<const char*>(block) +
-        sizeof(FixedBlockPool::MetaHeader));
-  }
-
-  // Create memory block with metadata
-  template <typename scalar_t>
-  static scalar_t* allocate_t(size_t& block_size,
-                              size_t& alignment,
-                              FixedBlockPool* pool) {
-    auto* block =
-        reinterpret_cast<scalar_t*>(pool->allocate(block_size, alignment));
-    return block;
+    return reinterpret_cast<const scalar_t*>(reinterpret_cast<const char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
-  // Destroy memory block
-  template <typename scalar_t>
-  static void deallocate_t(scalar_t* block,
-                           size_t& block_size,
-                           size_t& alignment,
-                           FixedBlockPool* pool) {
-    pool->deallocate(block, block_size, alignment);
-  }
-
-  // Usage example
-  template <typename scalar_t>
-  static void get_keys_with_low_score(FixedBlockPool* pool,
-                                      int32_t threshold,
-                                      float decay,
-                                      std::vector<int64_t>& result) {
-    pool->for_each_block([&decay, &threshold, &result](void* block) {
-      if (FixedBlockPool::get_used(block)) {
-        auto score = FixedBlockPool::get_score(static_cast<scalar_t*>(block));
-        score = score * decay;
-        FixedBlockPool::set_score(static_cast<scalar_t*>(block), score);
-        if (score < threshold) {
-          result.push_back(
-              FixedBlockPool::get_key(static_cast<scalar_t*>(block)));
-        }
-      }
-    });
-  }
+  template <typename scalar_t>
+  scalar_t* get_block(size_t index) {
+    char* current_chunk = static_cast<char*>(chunks_[index / blocks_per_chunk_].ptr);
+    char* block = current_chunk + block_size_ * (index % blocks_per_chunk_);
+    if (FixedBlockPool::get_used(block)) {
+      return reinterpret_cast<scalar_t*>(block);
+    } else {
+      return nullptr;
+    }
+  };
 
-  explicit FixedBlockPool(
-      std::size_t block_size,  // Size of each memory block
-      std::size_t block_alignment,  // Memory block alignment requirement
-      std::size_t blocks_per_chunk = 8192,  // Number of blocks per chunk
-      std::pmr::memory_resource* upstream = std::pmr::new_delete_resource())
+  explicit FixedBlockPool(std::size_t block_size,  // Size of each memory block
+                          std::size_t block_alignment,  // Memory block alignment requirement
+                          std::size_t blocks_per_chunk = 8192,  // Number of blocks per chunk
+                          std::pmr::memory_resource* upstream = std::pmr::new_delete_resource())
       // Minimum block size is 8 bytes
       : block_size_(std::max(block_size, sizeof(void*))),
         block_alignment_(block_alignment),
@@ -179,21 +137,23 @@
     }
   }
 
-  // New interface for retrieving chunk information
-  [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
+  // Create memory block with metadata
+  template <typename scalar_t>
+  scalar_t* allocate_t() {
+    return reinterpret_cast<scalar_t*>(this->allocate(block_size_, block_alignment_));
+  }
 
-  // New interface for iterating over all blocks
-  template <typename Func>
-  void for_each_block(Func&& func) const {
-    for (const auto& chunk : chunks_) {
-      char* current = static_cast<char*>(chunk.ptr);
-      for (size_t i = 0; i < blocks_per_chunk_; ++i) {
-        func(current);
-        current += block_size_;
-      }
-    }
+  // Destroy memory block
+  template <typename scalar_t>
+  void deallocate_t(scalar_t* block) {
+    this->deallocate(block, block_size_, block_alignment_);
   }
 
+  [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
+  [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; }
+  [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; }
+  [[nodiscard]] std::size_t get_blocks_per_chunk() const noexcept { return blocks_per_chunk_; }
+
  protected:
   // Core allocation function
   void* do_allocate(std::size_t bytes, std::size_t alignment) override {
@@ -215,9 +175,7 @@
   }
 
   // Core deallocation function
-  void do_deallocate(void* p,
-                     [[maybe_unused]] std::size_t bytes,
-                     [[maybe_unused]] std::size_t alignment) override {
+  void do_deallocate(void* p, [[maybe_unused]] std::size_t bytes, [[maybe_unused]] std::size_t alignment) override {
     // Insert memory block back to the head of free list
     *static_cast<void**>(p) = free_list_;
     free_list_ = p;
@@ -225,10 +183,7 @@
   }
 
   // Resource equality comparison (only the same object is equal)
-  [[nodiscard]] bool do_is_equal(
-      const std::pmr::memory_resource& other) const noexcept override {
-    return this == &other;
-  }
+  [[nodiscard]] bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { return this == &other; }
 
  private:
   // Allocate a new memory chunk
@@ -238,9 +193,6 @@
 
     // Allocate aligned memory through upstream resource
     void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_);
 
-    // Block used flag set false.
-    FixedBlockPool::set_used(chunk_ptr, false);
-
     // Record chunk information for later release
     chunks_.push_back({chunk_ptr, chunk_size, block_alignment_});
 
@@ -250,12 +202,13 @@
     for (std::size_t i = 0; i < blocks_per_chunk_; ++i) {
       current -= block_size_;
       *reinterpret_cast<void**>(current) = free_list_;
+      FixedBlockPool::set_used(current, false);
       free_list_ = current;
     }
   }
 
   // Member variables
-  const std::size_t block_size_;  // Block size (not less than pointer size)
+  const std::size_t block_size_;       // Block size (not less than pointer size)
   const std::size_t block_alignment_;  // Block alignment requirement
   const std::size_t blocks_per_chunk_;  // Number of blocks per chunk
   std::pmr::memory_resource* upstream_;  // Upstream memory resource
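Since calculate_block_size() adds sizeof(MetaHeader) to the payload, the 16-byte figure in the struct comment is load-bearing. A couple of compile-time checks make that assumption explicit; this is an illustrative aside assuming the usual Itanium-ABI bit-field packing (GCC/Clang), not code from the patch:

    #include "fixed_block_pool.h"

    // key (8 B) + timestamp (4 B) + count:31/used:1 packed into one 4-byte unit
    static_assert(sizeof(kv_mem::FixedBlockPool::MetaHeader) == 16,
                  "header layout changed; recheck calculate_block_size()");
    static_assert(alignof(kv_mem::FixedBlockPool::MetaHeader) == 8,
                  "alignas(8) keeps every block at least pointer-sized");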
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
index 6ae9631ce8..464ed6294f 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -1,202 +1,91 @@
 //
 // Created by arron on 2025/5/22.
 //
+#include "fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h"
+
 #include <gtest/gtest.h>
 #include <chrono>
-#include <cmath>
 #include <iostream>
-#include <random>
 #include <thread>
 #include <vector>
 
 #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h"
 
 namespace kv_mem {
-
-// Zipf distribution generator
-// alpha=1.3 → ~90% repetition rate
-// alpha=1.5 → ~95% repetition rate
-// alpha=2.0 → ~99% repetition rate
-class ZipfGenerator {
- public:
-  ZipfGenerator(double alpha, unsigned long n) : alpha_(alpha), n_(n), dist_(0.0, 1.0) {
-    // Precompute the harmonic normalization constant
-    c_ = 0.0;
-    for (unsigned long i = 1; i <= n_; ++i) c_ += 1.0 / std::pow(i, alpha_);
-    c_ = 1.0 / c_;
-  }
-
-  template <typename Generator>
-  unsigned long operator()(Generator& gen) {
-    while (true) {
-      double u = dist_(gen);
-      double v = dist_(gen);
-      unsigned long k = static_cast<unsigned long>(std::floor(std::pow(u, -1.0 / (alpha_ - 1.0))));
-      if (k > n_) continue;
-      double T = std::pow((k + 1.0) / k, alpha_ - 1.0);
-      double accept_prob = (std::pow(k, -alpha_)) / (c_ * v * (T - 1.0) * k / n_);
-      if (accept_prob >= 1.0 || dist_(gen) < accept_prob) {
-        return k;
-      }
+class FeatureEvictTest : public ::testing::Test {
+ protected:
+  static constexpr int NUM_SHARDS = 4;
+  static constexpr int DIMENSION = 128;
+  size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size<float>(DIMENSION);
+  size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment<float>();
+
+  void SetUp() override {
+    executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(4);
+    kv_store_ = std::make_unique<SynchronizedShardedMap<int64_t, float*>>(
+        NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT);
+
+    // Insert test data
+
+    for (int i = 0; i < 1000; ++i) {
+      int shard_id = i % NUM_SHARDS;
+      auto wlock = kv_store_->by(shard_id).wlock();
+      auto* pool = kv_store_->pool_by(shard_id);
+      float* block = pool->allocate_t<float>();
+      FixedBlockPool::set_key(block, i);
+      FixedBlockPool::set_count(block, 1);  // initial score
+      FixedBlockPool::set_used(block, true);
+      wlock->insert({i, block});
     }
-  }
-
- private:
-  double alpha_;  // distribution parameter (>1.0)
-  unsigned long n_;  // total number of elements
-  double c_;  // normalization constant
-  std::uniform_real_distribution<double> dist_;
-};
-
-std::vector<float> generateFixedEmbedding(int dimension) { return std::vector<float>(dimension, 1.0); }
 
-void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) {
-  const size_t numShards = 1;
-  size_t block_size = FixedBlockPool::calculate_block_size<float>(dimension);
-  size_t block_alignment = FixedBlockPool::calculate_block_alignment<float>();
-
-  const size_t TOTAL_KEYS = 1'000'000;  // one million possible keys
-  const double ZIPF_ALPHA = 1.5;        // tune this parameter to control hot-spot skew
-
-  ZipfGenerator zipf(ZIPF_ALPHA, TOTAL_KEYS);
-  std::random_device rd;
-  std::mt19937 gen(rd());
-
-  SynchronizedShardedMap<int64_t, float*> embeddingMap(numShards,
-                                                       block_size,       // block_size
-                                                       block_alignment,  // block_alignment
-                                                       8192);            // blocks_per_chunk
-  double insertTime, lookupTime;
-  {
-    std::vector<float> fixedEmbedding = generateFixedEmbedding(dimension);
-
-    auto wlmap = embeddingMap.by(0).wlock();
-    auto* pool = embeddingMap.pool_by(0);
-
-    auto startInsert = std::chrono::high_resolution_clock::now();
-    for (size_t i = 0; i < numInserts; i++) {
-      auto id = zipf(gen);
-      // use mempool
-      float* block = nullptr;
-      // First check if the key already exists
-      auto it = wlmap->find(id);
-      if (it != wlmap->end()) {
-        block = it->second;
-      } else {
-        // Key doesn't exist, allocate new block and insert.
-        block = FixedBlockPool::allocate_t<float>(block_size, block_alignment, pool);
-        FixedBlockPool::set_key(block, id);
-        FixedBlockPool::set_score(block, 0);
-        FixedBlockPool::set_used(block, true);
-
-        wlmap->insert({id, block});
-      }
-      FixedBlockPool::update_score(block);
-      auto* data_ptr = FixedBlockPool::data_ptr(block);
-      std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
+    for (int i = 1000; i < 2000; ++i) {
+      int shard_id = i % NUM_SHARDS;
+      auto wlock = kv_store_->by(shard_id).wlock();
+      auto* pool = kv_store_->pool_by(shard_id);
+      float* block = pool->allocate_t<float>();
+      FixedBlockPool::set_key(block, i);
+      FixedBlockPool::set_count(block, 2);  // initial score
+      FixedBlockPool::set_used(block, true);
+      wlock->insert({i, block});
     }
-    auto endInsert = std::chrono::high_resolution_clock::now();
-    insertTime = std::chrono::duration<double, std::milli>(endInsert - startInsert).count();
   }
 
-  std::vector<float> lookEmbedding(dimension);
-  size_t hitCount = 0;
-  {
-    auto rlock = embeddingMap.by(0).rlock();
-    auto startLookup = std::chrono::high_resolution_clock::now();
-    for (size_t i = 0; i < numLookups; i++) {
-      auto id = zipf(gen);
-      auto it = rlock->find(id);
-      if (it != rlock->end()) {
-        hitCount++;
-        const float* data_ptr = FixedBlockPool::data_ptr(it->second);
-        std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
-      }
-    }
-    auto endLookup = std::chrono::high_resolution_clock::now();
-    lookupTime = std::chrono::duration<double, std::milli>(endLookup - startLookup).count();
-  }
+  std::unique_ptr<folly::CPUThreadPoolExecutor> executor_;
+  std::unique_ptr<SynchronizedShardedMap<int64_t, float*>> kv_store_;
+};
 
-  {
-    size_t score_sum = 0;
-    auto rlock = embeddingMap.by(0).rlock();
-    for (const auto& [key, block] : *rlock) {
-      score_sum += FixedBlockPool::get_score(block);
-    }
-    ASSERT_EQ(score_sum, numInserts);
-  }
+TEST_F(FeatureEvictTest, BasicEviction) {
+  CounterBasedEvict evictor(executor_.get(), *kv_store_.get(), 0.5f, 1);
 
-  // Traverse the chunks to find the keys to evict
-  // Lock the map and release the resources
-  std::vector<int64_t> low_keys;
-  {
-    auto rlock = embeddingMap.by(0).rlock();
-    std::cout << "map num:" << rlock->size() << std::endl;
-    auto* pool = embeddingMap.pool_by(0);
-    FixedBlockPool::get_keys_with_low_score<float>(pool, 1, 0.99, low_keys);
-    std::cout << "low key num:" << low_keys.size() << std::endl;
+  // Initial check
+  size_t total_blocks = 0;
+  for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
+    auto rlock = kv_store_->by(shard_id).rlock();
+    total_blocks += rlock->size();
   }
+  ASSERT_EQ(total_blocks, 2000);
 
-  // Acquire the write lock, erase from the map, and free pool memory
-  {
-    // Acquire the write lock; erase entries from the map and free the pool memory
-    auto wlock = embeddingMap.by(0).wlock();
-    auto* pool = embeddingMap.pool_by(0);
+  // Run eviction
+  evictor.trigger_evict();
 
-    for (auto& key : low_keys) {
-      // 1. Look up the block pointer for this key in the map
-      auto it = wlock->find(key);
-      if (it != wlock->end()) {
-        float* block = it->second;
-        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
-        // 3. Remove the key-value pair from the map
-        wlock->erase(it);
-      }
-    }
-    std::cout << "after delete, map size:" << wlock->size() << std::endl;
+  // Check the eviction process
+  while (evictor.is_evicting()) {
+    evictor.resume();
+    std::this_thread::sleep_for(std::chrono::microseconds(5));
+    evictor.pause();
   }
 
-  // Deletion phase: process in batches of 1000 keys
-  const size_t batch_size = 1000;
-  for (size_t i = 0; i < low_keys.size(); i += batch_size) {
-    auto start = low_keys.begin() + i;
-    auto end = (i + batch_size < low_keys.size()) ? low_keys.begin() + i + batch_size : low_keys.end();
-    std::vector<int64_t> batch(start, end);
-
-    // Acquire the write lock to process the current batch
-    auto wlock = embeddingMap.by(0).wlock();
-    auto* pool = embeddingMap.pool_by(0);
-
-    for (auto key : batch) {
-      auto it = wlock->find(key);
-      if (it != wlock->end()) {
-        float* block = it->second;
-        FixedBlockPool::deallocate_t(block, block_size, block_alignment, pool);
-        wlock->erase(it);
-      }
+  // Check results
+  size_t remaining = 0;
+  for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
+    auto rlock = kv_store_->by(shard_id).rlock();
+    remaining += rlock->size();
+    // Check the score decay
+    for (const auto& [key, block] : *rlock) {
+      ASSERT_EQ(FixedBlockPool::get_count(block), 1);
     }
-    std::cout << "after delete, map size:" << wlock->size() << std::endl;
-  }
-
-  std::cout << std::left << std::setw(20) << dimension;
-  std::cout << std::fixed << std::setprecision(2);
-  std::cout << std::setw(20) << insertTime;
-  std::cout << std::setw(20) << lookupTime;
-  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
-  std::cout << std::endl;
-}
-
-int benchmark() {
-  std::vector<int> dimensions = {4};
-  const size_t numInserts = 1'000'000;  // 1 million insert
-  const size_t numLookups = 1'000'000;  // 1 million find
-
-  std::cout << "======================= mempool ====================================" << std::endl;
-  std::cout << std::left << std::setw(20) << "dim" << std::setw(20) << "insert time (ms)" << std::setw(20) << "find time (ms)" << std::setw(20) << "hit rate (%)" << std::endl;
-  for (int dim : dimensions) {
-    memPoolEmbeddingWithTime(dim, numInserts, numLookups);
   }
-  return 0;
+  std::cout << "remaining: " << remaining << std::endl;
+  ASSERT_EQ(remaining, 1000);
 }
-TEST(Evict, benchmark) { benchmark(); }
 }  // namespace kv_mem
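The numbers in BasicEviction follow from integer truncation in CounterBasedEvict::evict_block(): the 31-bit counter is multiplied by a float and the product truncates on the way back into the bit-field. A standalone reproduction of that arithmetic, not patch code:

    #include <cstdint>

    // Mirrors the decay in evict_block(): uint32 *= float truncates.
    uint32_t decay(uint32_t count, float rate) { return count * rate; }

    // decay(1, 0.5f) == 0 -> 0 < 1, evicted  (the 1000 keys inserted with count 1)
    // decay(2, 0.5f) == 1 -> 1 !< 1, kept    (count becomes 1, matching the ASSERT_EQ)

Hence exactly the 1000 count-2 entries survive, each with its counter decayed to 1.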
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
index 8d7767c879..47ef59d2de 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_test.cpp
@@ -306,12 +306,12 @@
   FixedBlockPool pool(block_size, alignment, 1024);
 
   // Test memory allocation
-  auto* block = FixedBlockPool::allocate_t<float>(block_size, alignment, &pool);
+  auto* block = pool.allocate_t<float>();
   FixedBlockPool::update_timestamp(block);
   ASSERT_NE(block, nullptr);
 
   // Verify metadata header
-  int64_t ts1 = FixedBlockPool::get_score(block);
+  int64_t ts1 = FixedBlockPool::get_timestamp(block);
   EXPECT_LE(FixedBlockPool::current_timestamp(), ts1);
 
   // Test data pointer offset
@@ -321,12 +321,11 @@
 
   // Test timestamp update
   FixedBlockPool::update_timestamp(block);
-  int64_t ts2 = FixedBlockPool::get_score(block);
+  int64_t ts2 = FixedBlockPool::get_timestamp(block);
   EXPECT_GE(ts2, ts1);  // New timestamp should be greater or equal
 
   // Test memory deallocation
-  EXPECT_NO_THROW(
-      FixedBlockPool::deallocate_t(block, block_size, alignment, &pool));
+  EXPECT_NO_THROW(pool.deallocate_t(block));
 }
 
 TEST(FixedBlockPool, MultiDimensionTest) {
@@ -370,7 +369,7 @@
   FixedBlockPool pool(block_size, alignment, 1024);
 
   // Allocate and write data
-  auto* block = FixedBlockPool::allocate_t<float>(block_size, alignment, &pool);
+  auto* block = pool.allocate_t<float>();
   auto* data_ptr = FixedBlockPool::data_ptr(block);
   std::copy(src_data.begin(), src_data.end(), data_ptr);
 
@@ -378,8 +377,7 @@
   for (int i = 0; i < dim; ++i) {
     EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]);
   }
-
-  FixedBlockPool::deallocate_t(block, block_size, alignment, &pool);
+  pool.deallocate_t(block);
 }
 }  // namespace kv_mem
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
index f19e1e219a..a2f0dcfb1a 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp
@@ -70,8 +70,8 @@ void memPoolEmbeddingWithTime(int dimension,
                               size_t numInserts,
                               size_t numLookups) {
   const size_t numShards = 1;
-  size_t block_size = MemPoolUtils::calculate_block_size(dimension);
-  size_t block_alignment = MemPoolUtils::calculate_block_alignment();
+  size_t block_size = FixedBlockPool::calculate_block_size<float>(dimension);
+  size_t block_alignment = FixedBlockPool::calculate_block_alignment<float>();
 
   SynchronizedShardedMap<int64_t, float*> embeddingMap(
       numShards,
@@ -87,9 +87,8 @@
 
   auto startInsert = std::chrono::high_resolution_clock::now();
   for (size_t i = 0; i < numInserts; i++) {
-    auto* block =
-        MemPoolUtils::allocate(block_size, block_alignment, pool);
-    auto* data_ptr = MemPoolUtils::data_ptr(block);
+    auto* block = pool->allocate_t<float>();
+    auto* data_ptr = FixedBlockPool::data_ptr(block);
     std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr);
     wlock->insert_or_assign(i, block);
   }
@@ -108,7 +107,7 @@
     auto it = rlock->find(i % numInserts);
     if (it != rlock->end()) {
       hitCount++;
-      const float* data_ptr = MemPoolUtils::data_ptr(it->second);
+      const float* data_ptr = FixedBlockPool::data_ptr(it->second);
       // update timestamp
       FixedBlockPool::update_timestamp(it->second);
       std::copy(data_ptr, data_ptr + dimension, lookEmbedding.data());
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp
deleted file mode 100644
index c1506c16e1..0000000000
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/store_value_utils_test.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "fbgemm_gpu/src/dram_kv_embedding_cache/store_value_utils.h"
-
-#include "gtest/gtest.h"
-namespace kv_mem {
-
-TEST(StoreValueUtils, BasicFunctionality) {
-  constexpr int dim = 4;
-  size_t block_size = StoreValueUtils::calculate_block_size(dim);
-  size_t alignment = StoreValueUtils::calculate_block_alignment();
-
-  // Initialize memory pool
-  FixedBlockPool pool(block_size, alignment, 1024);
-
-  // Test memory allocation
-  float* block = StoreValueUtils::allocate(block_size, alignment, &pool);
-  StoreValueUtils::update_timestamp(block);
-  ASSERT_NE(block, nullptr);
-
-  // Verify metadata header
-  int64_t ts1 = StoreValueUtils::get_timestamp(block);
-  EXPECT_LE(StoreValueUtils::current_timestamp(), ts1);
-
-  // Test data pointer offset
-  float* data = StoreValueUtils::data_ptr(block);
-  ASSERT_EQ(reinterpret_cast<char*>(data) - reinterpret_cast<char*>(block), sizeof(StoreValueUtils::MetaHeader));
-
-  // Test timestamp update
-  StoreValueUtils::update_timestamp(block);
-  int64_t ts2 = StoreValueUtils::get_timestamp(block);
-  EXPECT_GE(ts2, ts1);  // New timestamp should be greater or equal
-
-  // Test memory deallocation
-  EXPECT_NO_THROW(StoreValueUtils::deallocate(block, block_size, alignment, &pool));
-}
-
-TEST(StoreValueUtils, MultiDimensionTest) {
-  // Test memory alignment for different dimensions
-  const std::vector<int> test_dims = {1, 4, 16, 64, 256};
-  for (int dim : test_dims) {
-    size_t block_size = StoreValueUtils::calculate_block_size(dim);
-    size_t alignment = StoreValueUtils::calculate_block_alignment();
-
-    // Verify alignment requirements
-    EXPECT_EQ(alignment % alignof(StoreValueUtils::MetaHeader), 0);
-    EXPECT_EQ(alignment % alignof(float), 0);
-
-    // Verify block size calculation
-    const size_t expected_size = sizeof(StoreValueUtils::MetaHeader) + dim * sizeof(float);
-    EXPECT_EQ(block_size, expected_size);
-  }
-}
-
-TEST(StoreValueUtils, TimestampPrecision) {
-  // Test timestamp precision accuracy
-  constexpr int test_iterations = 1000;
-  int64_t prev_ts = StoreValueUtils::current_timestamp();
-
-  for (int i = 0; i < test_iterations; ++i) {
-    int64_t curr_ts = StoreValueUtils::current_timestamp();
-    EXPECT_GE(curr_ts, prev_ts);  // Timestamps should be monotonically increasing
-    prev_ts = curr_ts;
-  }
-}
-
-TEST(StoreValueUtils, DataIntegrity) {
-  // Test data storage integrity
-  constexpr int dim = 8;
-  std::vector<float> src_data(dim, 3.14f);
-
-  size_t block_size = StoreValueUtils::calculate_block_size(dim);
-  size_t alignment = StoreValueUtils::calculate_block_alignment();
-  FixedBlockPool pool(block_size, alignment, 1024);
-
-  // Allocate and write data
-  float* block = StoreValueUtils::allocate(block_size, alignment, &pool);
-  float* data_ptr = StoreValueUtils::data_ptr(block);
-  std::copy(src_data.begin(), src_data.end(), data_ptr);
-
-  // Verify data consistency
-  for (int i = 0; i < dim; ++i) {
-    EXPECT_FLOAT_EQ(data_ptr[i], src_data[i]);
-  }
-
-  StoreValueUtils::deallocate(block, block_size, alignment, &pool);
-}
-}  // namespace kv_mem
\ No newline at end of file

From 9e842adfd46735b36adb958b8f61e5ded89382cc Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Tue, 27 May 2025 15:14:22 +0800
Subject: [PATCH 06/12] feature evict add fmt log

---
 .../dram_kv_embedding_cache/feature_evict.h   | 125 +++++++++-------
 .../fixed_block_pool.h                        |  26 ++--
 .../dram_kv_embedding_cache/CMakeLists.txt    |  10 +-
 .../feature_evict_test.cpp                    | 135 ++++++++++------
 .../fixed_block_pool_test.cpp                 |  58 +++-----
 .../sharded_map_test.cpp                      |  90 +++++------
 6 files changed, 229 insertions(+), 215 deletions(-)

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
index c531d60966..a7c42b291e 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h
@@ -1,6 +1,3 @@
-//
-// Created by root on 25-5-26.
-//
 #pragma once
 
 #include <atomic>
 #include <chrono>
 #include <cstddef>
 #include <memory>
 #include <mutex>
 #include <vector>
 
+#include <fmt/core.h>
+#include <fmt/format.h>
 #include <folly/Unit.h>
 #include <folly/executors/CPUThreadPoolExecutor.h>
 #include <folly/futures/Future.h>
 
 #include "SynchronizedShardedMap.h"
 
 namespace kv_mem {
 
+template <typename weight_type>
 class FeatureEvictBase {
  public:
-  FeatureEvictBase(folly::CPUThreadPoolExecutor* executor,
-                   SynchronizedShardedMap<int64_t, float*>& kv_store)
+  FeatureEvictBase(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap<int64_t, weight_type*>& kv_store)
       : executor_(executor),
         kv_store_(kv_store),
         evict_flag_(false),
         evict_interrupt_(false),
         num_shards_(kv_store.getNumShards()) {
     init_shard_status();
-    // evict_flag_ indicates whether a task is in progress
-    // evict_interrupt_ indicates whether the task has been interrupted
   }
 
   virtual ~FeatureEvictBase() {
-    // On destruction, wait for running tasks to finish
-    wait_completion();  // wait for all asynchronous tasks to complete
+    wait_completion();  // Wait for all asynchronous tasks to complete.
   };
 
-  // Trigger asynchronous eviction
-  // If a task is already running, return immediately to prevent repeated triggers
-  // Otherwise, initialize the task state
+  // Trigger asynchronous eviction.
+  // If there is an ongoing task, return directly to prevent multiple triggers.
+  // If there is no ongoing task, initialize the task state.
   void trigger_evict() {
     std::lock_guard<std::mutex> lock(mutex_);
     if (evict_flag_.exchange(true)) return;
+    fmt::print("Starting new eviction process...\n");
     prepare_evict();
   }
 
-  // Resume task execution; returns true if a task is in progress, false otherwise
+  // Resume task execution. Returns true if there is an ongoing task, false otherwise.
   bool resume() {
     std::lock_guard<std::mutex> lock(mutex_);
     if (!evict_flag_.load()) return false;
@@ -57,8 +54,8 @@
     return true;
   };
 
-  // Pause the eviction process; returns true if a task is in progress, false otherwise
-  // While paused, check whether the eviction has completed
+  // Pause the eviction process. Returns true if there is an ongoing task, false otherwise.
+  // During the pause phase, check whether the eviction is complete.
   bool pause() {
     std::lock_guard<std::mutex> lock(mutex_);
     if (!evict_flag_.load()) return false;
@@ -68,7 +65,7 @@
     return true;
   }
 
-  // Check whether eviction is in progress
+  // Check whether eviction is ongoing.
   bool is_evicting() {
     std::lock_guard<std::mutex> lock(mutex_);
     check_and_reset_evict_flag();
@@ -87,13 +84,12 @@
     }
   }
 
-  // Initialize per-shard state
+  // Initialize shard state.
   void prepare_evict() {
     for (int shard_id = 0; shard_id < num_shards_; ++shard_id) {
       auto rlmap = kv_store_.by(shard_id).rlock();
       auto* mempool = kv_store_.pool_by(shard_id);
-      block_nums_snapshot_[shard_id] =
-          mempool->get_chunks().size() * mempool->get_blocks_per_chunk();
+      block_nums_snapshot_[shard_id] = mempool->get_chunks().size() * mempool->get_blocks_per_chunk();
       block_cursors_[shard_id] = 0;
       shards_finished_[shard_id]->store(false);
     }
@@ -101,40 +97,60 @@
 
   void submit_shard_task(int shard_id) {
     if (shards_finished_[shard_id]->load()) return;
-    futures_.emplace_back(folly::via(executor_).thenValue(
-        [this, shard_id](auto&&) { process_shard(shard_id); }));
+    futures_.emplace_back(folly::via(executor_).thenValue([this, shard_id](auto&&) { process_shard(shard_id); }));
   }
 
   void process_shard(int shard_id) {
+    auto start_time = std::chrono::high_resolution_clock::now();
+    size_t evicted_count = 0;
+    size_t processed_count = 0;
+
     auto wlock = kv_store_.by(shard_id).wlock();
     auto* pool = kv_store_.pool_by(shard_id);
-    while (!evict_interrupt_.load() &&
-           block_cursors_[shard_id] < block_nums_snapshot_[shard_id]) {
-      auto* block = pool->get_block<float>(block_cursors_[shard_id]++);
+
+    while (!evict_interrupt_.load() && block_cursors_[shard_id] < block_nums_snapshot_[shard_id]) {
+      auto* block = pool->template get_block<weight_type>(block_cursors_[shard_id]++);
+      processed_count++;
       if (block && evict_block(block)) {
         int64_t key = FixedBlockPool::get_key(block);
         auto it = wlock->find(key);
         if (it != wlock->end() && block == it->second) {
           wlock->erase(key);
-          pool->deallocate_t(block);
+          pool->template deallocate_t<weight_type>(block);
+          evicted_count++;
         }
       }
     }
 
-    // Check whether the loop ended normally
+    // Check whether the loop ends normally.
     if (block_cursors_[shard_id] >= block_nums_snapshot_[shard_id]) {
       shards_finished_[shard_id]->store(true);
    }
+
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+
+    fmt::print(
+        "Shard {} completed: \n"
+        "  - Time taken: {}ms\n"
+        "  - Total blocks processed: {}\n"
+        "  - Blocks evicted: {}\n"
+        "  - Eviction rate: {:.2f}%\n",
+        shard_id,
+        duration.count(),
+        processed_count,
+        evicted_count,
+        (evicted_count * 100.0f) / processed_count);
   }
 
-  virtual bool evict_block(float* block) = 0;
+  virtual bool evict_block(weight_type* block) = 0;
 
   void wait_completion() {
     folly::collectAll(futures_).wait();
     futures_.clear();
   }
 
-  // Check and reset
+  // Check and reset the eviction flag.
   void check_and_reset_evict_flag() {
     bool all_finished = true;
     for (int i = 0; i < num_shards_; ++i) {
@@ -143,32 +159,30 @@
     if (all_finished) evict_flag_.store(false);
   }
 
-  folly::CPUThreadPoolExecutor* executor_;  // thread pool
-  SynchronizedShardedMap<int64_t, float*>& kv_store_;  // shard map
-  std::vector<std::size_t> block_cursors_;  // index of processed blocks
-  std::vector<std::size_t> block_nums_snapshot_;  // total number of blocks recorded when eviction is triggered
-  std::vector<std::unique_ptr<std::atomic<bool>>>
-      shards_finished_;  // per-shard completion flags
-  std::atomic<bool> evict_flag_;  // whether an eviction task is in progress
-  std::atomic<bool> evict_interrupt_;  // whether the eviction task is paused
-  std::vector<folly::Future<folly::Unit>> futures_;  // per-shard task records
-  std::mutex mutex_;  // interface lock keeping the public interface thread-safe
-  int num_shards_;  // number of concurrent tasks
+  folly::CPUThreadPoolExecutor* executor_;  // Thread pool.
+  SynchronizedShardedMap<int64_t, weight_type*>& kv_store_;  // Sharded map.
+  std::vector<std::size_t> block_cursors_;  // Index of processed blocks.
+  std::vector<std::size_t> block_nums_snapshot_;  // Snapshot of total blocks at eviction trigger.
+  std::vector<std::unique_ptr<std::atomic<bool>>> shards_finished_;  // Flags indicating whether shards are finished.
+  std::atomic<bool> evict_flag_;  // Indicates whether an eviction task is ongoing.
+  std::atomic<bool> evict_interrupt_;  // Indicates whether the eviction task is paused.
+  std::vector<folly::Future<folly::Unit>> futures_;  // Records of shard tasks.
+  std::mutex mutex_;  // Interface lock to ensure thread safety for public methods.
+  int num_shards_;  // Number of concurrent tasks.
 };
 
-class CounterBasedEvict : public FeatureEvictBase {
+template <typename weight_type>
+class CounterBasedEvict : public FeatureEvictBase<weight_type> {
  public:
   CounterBasedEvict(folly::CPUThreadPoolExecutor* executor,
-                    SynchronizedShardedMap<int64_t, float*>& kv_store,
+                    SynchronizedShardedMap<int64_t, weight_type*>& kv_store,
                     float decay_rate,
-                    int threshold)
-      : FeatureEvictBase(executor, kv_store),
-        decay_rate_(decay_rate),
-        threshold_(threshold) {}
+                    uint32_t threshold)
+      : FeatureEvictBase<weight_type>(executor, kv_store), decay_rate_(decay_rate), threshold_(threshold) {}
 
  protected:
-  bool evict_block(float* block) override {
-    // Apply decay and check the threshold
+  bool evict_block(weight_type* block) override {
+    // Apply decay and check the threshold.
     auto current_count = FixedBlockPool::get_count(block);
     current_count *= decay_rate_;
     FixedBlockPool::set_count(block, current_count);
     return current_count < threshold_;
   }
 
  private:
-  float decay_rate_;
-  uint32_t threshold_;
+  float decay_rate_;    // Decay rate for the block count.
+  uint32_t threshold_;  // Threshold for eviction.
 };
 
-class TimeBasedEvict : public FeatureEvictBase {
+template <typename weight_type>
+class TimeBasedEvict : public FeatureEvictBase<weight_type> {
  public:
   TimeBasedEvict(folly::CPUThreadPoolExecutor* executor,
-                 SynchronizedShardedMap<int64_t, float*>& kv_store,
+                 SynchronizedShardedMap<int64_t, weight_type*>& kv_store,
                  uint32_t ttl)
-      : FeatureEvictBase(executor, kv_store), ttl_(ttl) {}
+      : FeatureEvictBase<weight_type>(executor, kv_store), ttl_(ttl) {}
 
  protected:
-  bool evict_block(float* block) override {
+  bool evict_block(weight_type* block) override {
     auto current_time = FixedBlockPool::current_timestamp();
     return current_time - FixedBlockPool::get_timestamp(block) > ttl_;
   }
 
  private:
-  uint32_t ttl_;
+  uint32_t ttl_;  // Time-to-live for eviction.
 };
-}  // namespace kv_mem
+}  // namespace kv_mem
\ No newline at end of file
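With the class template in place, the element type is spelled out at the call site. A hypothetical wiring of both policies for float blocks; the function name, thresholds, and TTL here are illustrative, not part of the patch:

    #include <folly/executors/CPUThreadPoolExecutor.h>
    #include "SynchronizedShardedMap.h"
    #include "feature_evict.h"

    void make_evictors(folly::CPUThreadPoolExecutor* ex,
                       kv_mem::SynchronizedShardedMap<int64_t, float*>& store) {
      // Counter policy: halve each counter per pass, evict once it falls below 1.
      kv_mem::CounterBasedEvict<float> by_count(ex, store, /*decay_rate=*/0.5f, /*threshold=*/1);
      // TTL policy: evict blocks untouched for more than a day (timestamps are in seconds).
      kv_mem::TimeBasedEvict<float> by_age(ex, store, /*ttl=*/24 * 3600);
      by_count.trigger_evict();
      by_count.resume();
    }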
-  // Metadata structure (publicly accessible)
   struct alignas(8) MetaHeader {  // 16 bytes
     int64_t key;          // feature key, 8 bytes
     uint32_t timestamp;   // 4 bytes, in seconds; uint32 covers a range of over 120 years
     uint32_t count : 31;  // only 31 bits are used; max value is 2147483647
     bool used : 1;        // marks whether this block is in use, for memory-pool traversal
     // Can be extended with other fields: uint32_t click, etc.
@@ -53,12 +53,9 @@
   static uint32_t get_timestamp(const void* block) { return reinterpret_cast<const MetaHeader*>(block)->timestamp; }
   static void update_timestamp(void* block) { reinterpret_cast<MetaHeader*>(block)->timestamp = current_timestamp(); }
   static uint32_t current_timestamp() {
-    // std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    // facebook::WallClockUtil::NowInUsecFast();
     return std::time(nullptr);
   }
 
-  // Type-dependent
   // Calculate storage size
   template <typename scalar_t>
   static size_t calculate_block_size(size_t dimension) {
@@ -83,17 +79,6 @@
     return reinterpret_cast<const scalar_t*>(reinterpret_cast<const char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
-  template <typename scalar_t>
-  scalar_t* get_block(size_t index) {
-    char* current_chunk = static_cast<char*>(chunks_[index / blocks_per_chunk_].ptr);
-    char* block = current_chunk + block_size_ * (index % blocks_per_chunk_);
-    if (FixedBlockPool::get_used(block)) {
-      return reinterpret_cast<scalar_t*>(block);
-    } else {
-      return nullptr;
-    }
-  };
-
   explicit FixedBlockPool(std::size_t block_size,  // Size of each memory block
                           std::size_t block_alignment,  // Memory block alignment requirement
                           std::size_t blocks_per_chunk = 8192,  // Number of blocks per chunk
@@ -149,6 +134,17 @@
     this->deallocate(block, block_size_, block_alignment_);
   }
 
+  template <typename scalar_t>
+  scalar_t* get_block(size_t index) {
+    char* current_chunk = static_cast<char*>(chunks_[index / blocks_per_chunk_].ptr);
+    char* block = current_chunk + block_size_ * (index % blocks_per_chunk_);
+    if (FixedBlockPool::get_used(block)) {
+      return reinterpret_cast<scalar_t*>(block);
+    } else {
+      return nullptr;
+    }
+  };
+
   [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; }
   [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; }
   [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; }
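The relocated get_block() is what lets a shard sweep walk the pool with a single flat cursor: the index splits into a chunk number and an offset inside that chunk, and blocks whose `used` bit is clear come back as nullptr. The same arithmetic in isolation, for illustration only:

    #include <cstddef>

    struct BlockAddr { std::size_t chunk; std::size_t offset; };

    // A flat cursor maps to (cursor / blocks_per_chunk, cursor % blocks_per_chunk).
    BlockAddr locate(std::size_t cursor, std::size_t blocks_per_chunk) {
      return {cursor / blocks_per_chunk, cursor % blocks_per_chunk};
    }
    // e.g. with blocks_per_chunk = 8192, cursor 8195 lands in chunk 1 at offset 3.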
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
index 9bf610f50d..d7566c00d9 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt
@@ -8,14 +8,14 @@ include_directories(
 add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp)
 target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17)
 target_compile_options(fixed_block_pool_test PUBLIC "-O3")
-target_link_libraries(fixed_block_pool_test gtest gtest_main)
+target_link_libraries(fixed_block_pool_test gtest gtest_main Folly::folly)
 
 add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp)
 target_compile_features(sharded_map_test PUBLIC cxx_std_17)
 target_compile_options(sharded_map_test PUBLIC "-O3")
 target_link_libraries(sharded_map_test gtest gtest_main Folly::folly)
 
-add_executable(evict_test ${CMAKE_CURRENT_SOURCE_DIR}/evict_test.cpp)
-target_compile_features(evict_test PUBLIC cxx_std_17)
-target_compile_options(evict_test PUBLIC "-O3")
-target_link_libraries(evict_test gtest gtest_main Folly::folly)
\ No newline at end of file
+add_executable(feature_evict_test ${CMAKE_CURRENT_SOURCE_DIR}/feature_evict_test.cpp)
+target_compile_features(feature_evict_test PUBLIC cxx_std_17)
+target_compile_options(feature_evict_test PUBLIC "-O3")
+target_link_libraries(feature_evict_test gtest gtest_main Folly::folly)
\ No newline at end of file
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
index 464ed6294f..72dce3093d 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -1,63 +1,51 @@
-//
-// Created by arron on 2025/5/22.
-//
 #include "fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h"
 
 #include <gtest/gtest.h>
 #include <chrono>
 #include <iostream>
+#include <fmt/format.h>
 #include <thread>
 #include <vector>
 
 #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h"
 
 namespace kv_mem {
-class FeatureEvictTest : public ::testing::Test {
- protected:
-  static constexpr int NUM_SHARDS = 4;
-  static constexpr int DIMENSION = 128;
-  size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size<float>(DIMENSION);
-  size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment<float>();
+static constexpr int DIMENSION = 128;
+size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size<float>(DIMENSION);
+size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment<float>();
 
-  void SetUp() override {
-    executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(4);
-    kv_store_ = std::make_unique<SynchronizedShardedMap<int64_t, float*>>(
-        NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT);
+TEST(FeatureEvictTest, BasicEviction) {
+  static constexpr int NUM_SHARDS = 8;
+  auto executor_ = std::make_unique<folly::CPUThreadPoolExecutor>(4);
+  auto kv_store_ = std::make_unique<SynchronizedShardedMap<int64_t, float*>>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT);
 
-    // Insert test data
-
-    for (int i = 0; i < 1000; ++i) {
-      int shard_id = i % NUM_SHARDS;
-      auto wlock = kv_store_->by(shard_id).wlock();
-      auto* pool = kv_store_->pool_by(shard_id);
-      float* block = pool->allocate_t<float>();
-      FixedBlockPool::set_key(block, i);
-      FixedBlockPool::set_count(block, 1);  // initial score
-      FixedBlockPool::set_used(block, true);
-      wlock->insert({i, block});
+  // Insert test data
+  for (int i = 0; i < 1000; ++i) {
+    int shard_id = i % NUM_SHARDS;
+    auto wlock = kv_store_->by(shard_id).wlock();
+    auto* pool = kv_store_->pool_by(shard_id);
+    auto* block = pool->allocate_t<float>();
+    FixedBlockPool::set_key(block, i);
+    FixedBlockPool::set_count(block, 1);  // Initial score
+    FixedBlockPool::set_used(block, true);
+    wlock->insert({i, block});
   }
 
-    for (int i = 1000; i < 2000; ++i) {
-      int shard_id = i % NUM_SHARDS;
-      auto wlock = kv_store_->by(shard_id).wlock();
-      auto* pool = kv_store_->pool_by(shard_id);
-      float* block = pool->allocate_t<float>();
-      FixedBlockPool::set_key(block, i);
-      FixedBlockPool::set_count(block, 2);  // initial score
-      FixedBlockPool::set_used(block, true);
-      wlock->insert({i, block});
-    }
+  for (int i = 1000; i < 2000; ++i) {
+    int shard_id = i % NUM_SHARDS;
+    auto wlock = kv_store_->by(shard_id).wlock();
+    auto* pool = kv_store_->pool_by(shard_id);
+    auto* block = pool->allocate_t<float>();
+    FixedBlockPool::set_key(block, i);
+    FixedBlockPool::set_count(block, 2);  // Initial score
+    FixedBlockPool::set_used(block, true);
+    wlock->insert({i, block});
   }
 
-  std::unique_ptr<folly::CPUThreadPoolExecutor> executor_;
-  std::unique_ptr<SynchronizedShardedMap<int64_t, float*>> kv_store_;
-};
+  CounterBasedEvict<float> evictor(executor_.get(), *kv_store_.get(), 0.5f, 1);
 
-TEST_F(FeatureEvictTest, BasicEviction) {
-  CounterBasedEvict evictor(executor_.get(), *kv_store_.get(), 0.5f, 1);
-
-  // Initial check
+  // Initial validation
   size_t total_blocks = 0;
   for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
     auto rlock = kv_store_->by(shard_id).rlock();
@@ -65,22 +53,22 @@
   }
   ASSERT_EQ(total_blocks, 2000);
 
-  // Run eviction
+  // Perform eviction
   evictor.trigger_evict();
 
-  // Check the eviction process
+  // Validate eviction process
   while (evictor.is_evicting()) {
     evictor.resume();
     std::this_thread::sleep_for(std::chrono::microseconds(5));
    evictor.pause();
   }
 
-  // Check results
+  // Validate results
   size_t remaining = 0;
   for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
     auto rlock = kv_store_->by(shard_id).rlock();
     remaining += rlock->size();
-    // Check the score decay
+    // Validate score decay
     for (const auto& [key, block] : *rlock) {
       ASSERT_EQ(FixedBlockPool::get_count(block), 1);
     }
@@ -88,4 +76,59 @@
   std::cout << "remaining: " << remaining << std::endl;
   ASSERT_EQ(remaining, 1000);
 }
+
+TEST(FeatureEvictTest, PerformanceTest) {
+  static constexpr int NUM_SHARDS = 1;
+  // Test configurations
+  const std::vector<size_t> test_sizes = {100'000, 500'000, 1'000'000, 5'000'000, 10'000'000};
+
+  fmt::print("\nPerformance Test Results:\n");
+  fmt::print("{:<15} {:<15} {:<15}\n", "Size", "Time(ms)", "Evict rate");
+  fmt::print("{:-<45}\n", "");  // separator line
+
+  for (const auto& size : test_sizes) {
+    // Create executor and store for each test size
+    auto executor = std::make_unique<folly::CPUThreadPoolExecutor>(8);
+    auto kv_store =
+        std::make_unique<SynchronizedShardedMap<int64_t, float*>>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT, 1000);
+
+    // Insert test data with different initial scores
+    for (int i = 0; i < size; ++i) {
+      int shard_id = i % NUM_SHARDS;
+      auto wlock = kv_store->by(shard_id).wlock();
+      auto* pool = kv_store->pool_by(shard_id);
+      auto* block = pool->allocate_t<float>();
+      FixedBlockPool::set_key(block, i);
+      FixedBlockPool::set_count(block, (i % 2) ? 1 : 2);  // Alternate between scores
+      FixedBlockPool::set_used(block, true);
+      wlock->insert({i, block});
+    }
+
+    // Measure eviction time
+    std::vector<double> execution_times;
+    CounterBasedEvict<float> evictor(executor.get(), *kv_store.get(), 0.5f, 1);
+
+    auto start_time = std::chrono::high_resolution_clock::now();
+
+    // Perform eviction
+    evictor.trigger_evict();
+    evictor.resume();
+    while (evictor.is_evicting()) {
+      std::this_thread::sleep_for(std::chrono::microseconds(1));
+    }
+
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+
+    std::size_t current_size = 0;
+    for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) {
+      auto wlock = kv_store->by(shard_id).wlock();
+      current_size += wlock->size();
+    }
+    double eviction_rate = static_cast<double>(size - current_size) / static_cast<double>(size);
+
+    // Print results
+    fmt::print("{:<15d} {:<15d} {:<15.2f}\n", size, duration, eviction_rate);
+  }
+}
+}  // namespace kv_mem
\ No newline at end of file
between memory pool and " - "native vector allocation for 10 million " - "times ======" - << std::endl; + fmt::print( + "====== Testing performance difference between memory pool and " + "native vector allocation for 10 million times ======\n"); // Vector sizes to test (in number of float elements) std::vector vector_sizes = {4, 8, 16, 32, 64, 128, 256}; @@ -73,33 +72,24 @@ void benchmark_memory_allocators() { const size_t repeat_count = 10'000'000; for (const auto& size : vector_sizes) { - std::cout << "Vector size: " << size << " floats (" - << (size * sizeof(float)) << " bytes)" << std::endl; - + fmt::print("Vector size: {} floats ({} bytes)\n", size, size * sizeof(float)); // Testing standard vector double std_time = test_std_vector(size, repeat_count); - std::cout << " Standard vector: " << std::fixed << std::setprecision(2) - << std_time << " ms" << std::endl; + fmt::print(" Standard vector: {:.2f} ms\n", std_time); // Testing memory pool double pool_time = test_pool_vector(size, repeat_count); - std::cout << " Memory pool: " << std::fixed << std::setprecision(2) - << pool_time << " ms" << std::endl; + fmt::print(" Memory pool: {:.2f} ms\n", pool_time); // Calculate speed improvement double speedup = std_time / pool_time; - std::cout << " Speed improvement: " << std::fixed << std::setprecision(2) - << speedup << "x" << std::endl; - - std::cout << std::endl; - std::cout << "============================" << std::endl; + fmt::print(" Speed improvement: {:.2f}x\n\n", speedup); + fmt::print("============================\n"); } } // Basic functionality test: Integer keys -TEST(FixedBlockPoolTest, benchmark_memory_allocators) { - benchmark_memory_allocators(); -} +TEST(FixedBlockPoolTest, benchmark_memory_allocators) { benchmark_memory_allocators(); } // Test constructor normal case TEST(FixedBlockPoolTest, ConstructorNormal) { @@ -208,14 +198,10 @@ TEST(FixedBlockPoolTest, ErrorHandling) { kv_mem::FixedBlockPool pool(block_size, alignment); // Try to allocate memory with incorrect size - EXPECT_THROW( - { [[maybe_unused]] void* p = pool.allocate(block_size * 2, alignment); }, - std::bad_alloc); + EXPECT_THROW({ [[maybe_unused]] void* p = pool.allocate(block_size * 2, alignment); }, std::bad_alloc); // Try to allocate memory with incorrect alignment - EXPECT_THROW( - { [[maybe_unused]] void* p = pool.allocate(block_size, alignment * 2); }, - std::bad_alloc); + EXPECT_THROW({ [[maybe_unused]] void* p = pool.allocate(block_size, alignment * 2); }, std::bad_alloc); } // Test memory reuse after deallocation @@ -250,8 +236,7 @@ TEST(FixedBlockPoolTest, CustomUpstreamResource) { class CountingResource : public std::pmr::memory_resource { public: - CountingResource(int& alloc_count, int& dealloc_count) - : alloc_count_(alloc_count), dealloc_count_(dealloc_count) {} + CountingResource(int& alloc_count, int& dealloc_count) : alloc_count_(alloc_count), dealloc_count_(dealloc_count) {} protected: void* do_allocate(size_t bytes, size_t alignment) override { @@ -264,10 +249,7 @@ TEST(FixedBlockPoolTest, CustomUpstreamResource) { std::pmr::new_delete_resource()->deallocate(p, bytes, alignment); } - bool do_is_equal( - const std::pmr::memory_resource& other) const noexcept override { - return this == &other; - } + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { return this == &other; } private: int& alloc_count_; @@ -316,8 +298,7 @@ TEST(FixedBlockPool, BasicFunctionality) { // Test data pointer offset float* data = FixedBlockPool::data_ptr(block); - 
ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block), - sizeof(FixedBlockPool::MetaHeader)); + ASSERT_EQ(reinterpret_cast(data) - reinterpret_cast(block), sizeof(FixedBlockPool::MetaHeader)); // Test timestamp update FixedBlockPool::update_timestamp(block); @@ -340,8 +321,7 @@ TEST(FixedBlockPool, MultiDimensionTest) { EXPECT_EQ(alignment % alignof(float), 0); // Verify block size calculation - const size_t expected_size = - sizeof(FixedBlockPool::MetaHeader) + dim * sizeof(float); + const size_t expected_size = sizeof(FixedBlockPool::MetaHeader) + dim * sizeof(float); EXPECT_EQ(block_size, expected_size); } } diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index a2f0dcfb1a..5e4b59e206 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -2,24 +2,23 @@ #include #include +#include +#include #include #include "fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h" #include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" namespace kv_mem { -std::vector generateFixedEmbedding(int dimension) { - return std::vector(dimension, 1.0); -} +std::vector generateFixedEmbedding(int dimension) { return std::vector(dimension, 1.0); } void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) { const size_t numShards = 1; - SynchronizedShardedMap embeddingMap( - numShards, - dimension * sizeof(float), // block_size - alignof(float), // block_alignment - 8192); // blocks_per_chunk + SynchronizedShardedMap embeddingMap(numShards, + dimension * sizeof(float), // block_size + alignof(float), // block_alignment + 8192); // blocks_per_chunk double insertTime, lookupTime; { std::vector fixedEmbedding = generateFixedEmbedding(dimension); @@ -35,9 +34,7 @@ void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) { wlock->insert_or_assign(i, arr); } auto endInsert = std::chrono::high_resolution_clock::now(); - insertTime = - std::chrono::duration(endInsert - startInsert) - .count(); + insertTime = std::chrono::duration(endInsert - startInsert).count(); } std::vector lookEmbedding(dimension); @@ -53,31 +50,25 @@ void memPoolEmbedding(int dimension, size_t numInserts, size_t numLookups) { } } auto endLookup = std::chrono::high_resolution_clock::now(); - lookupTime = - std::chrono::duration(endLookup - startLookup) - .count(); + lookupTime = std::chrono::duration(endLookup - startLookup).count(); } - std::cout << std::left << std::setw(20) << dimension; - std::cout << std::fixed << std::setprecision(2); - std::cout << std::setw(20) << insertTime; - std::cout << std::setw(20) << lookupTime; - std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups); - std::cout << std::endl; + fmt::print("{:<20}{:<20.2f}{:<20.2f}{:<20.2f}\n", + dimension, + insertTime, + lookupTime, + 100.0 * static_cast(hitCount) / static_cast(numLookups)); } -void memPoolEmbeddingWithTime(int dimension, - size_t numInserts, - size_t numLookups) { +void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookups) { const size_t numShards = 1; size_t block_size = FixedBlockPool::calculate_block_size(dimension); size_t block_alignment = FixedBlockPool::calculate_block_alignment(); - SynchronizedShardedMap embeddingMap( - numShards, - block_size, // block_size - block_alignment, // block_alignment - 8192); // blocks_per_chunk + SynchronizedShardedMap embeddingMap(numShards, + 
block_size, // block_size
+                                              block_alignment, // block_alignment
+                                              8192); // blocks_per_chunk
   double insertTime, lookupTime;
   {
     std::vector<float> fixedEmbedding = generateFixedEmbedding(dimension);
@@ -93,9 +84,7 @@ void memPoolEmbeddingWithTime(int dimension,
       wlock->insert_or_assign(i, block);
     }
     auto endInsert = std::chrono::high_resolution_clock::now();
-    insertTime =
-        std::chrono::duration<double, std::milli>(endInsert - startInsert)
-            .count();
+    insertTime = std::chrono::duration<double, std::milli>(endInsert - startInsert).count();
   }
 
   std::vector<float> lookEmbedding(dimension);
@@ -114,17 +103,15 @@ void memPoolEmbeddingWithTime(int dimension,
       }
     }
     auto endLookup = std::chrono::high_resolution_clock::now();
-    lookupTime =
-        std::chrono::duration<double, std::milli>(endLookup - startLookup)
-            .count();
+    lookupTime = std::chrono::duration<double, std::milli>(endLookup - startLookup).count();
   }
 
-  std::cout << std::left << std::setw(20) << dimension;
-  std::cout << std::fixed << std::setprecision(2);
-  std::cout << std::setw(20) << insertTime;
-  std::cout << std::setw(20) << lookupTime;
-  std::cout << std::setw(20) << (100.0 * (double)hitCount / (double)numLookups);
-  std::cout << std::endl;
+  // print the results with fmt
+  fmt::print("{:<20}{:<20.2f}{:<20.2f}{:<20.2f}\n",
+             dimension,
+             insertTime,
+             lookupTime,
+             100.0 * static_cast<double>(hitCount) / static_cast<double>(numLookups));
 }
 
 int benchmark() {
@@ -132,27 +119,20 @@ int benchmark() {
   const size_t numInserts = 1'000'000; // 1 million insert
   const size_t numLookups = 1'000'000; // 1 million find
 
-  std::cout
-      << "======================= mempool ===================================="
-      << std::endl;
-  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
-            << "insert time (ms)" << std::setw(20) << "find time (ms)"
-            << std::setw(20) << "hit rate (%)" << std::endl;
+  fmt::print("======================= mempool ====================================\n");
+  fmt::print("{:<20}{:<20}{:<20}{:<20}\n", "dim", "insert time (ms)", "find time (ms)", "hit rate (%)");
   for (int dim : dimensions) {
     memPoolEmbedding(dim, numInserts, numLookups);
   }
-  std::cout << std::endl << std ::endl;
-
-  std::cout << "======================= mempool with time "
-               "===================================="
-            << std::endl;
-  std::cout << std::left << std::setw(20) << "dim" << std::setw(20)
-            << "insert time (ms)" << std::setw(20) << "find time (ms)"
-            << std::setw(20) << "hit rate (%)" << std::endl;
+  fmt::print("\n\n");
+  std::fflush(stdout);
+
+  fmt::print("======================= mempool with time ====================================\n");
+  fmt::print("{:<20}{:<20}{:<20}{:<20}\n", "dim", "insert time (ms)", "find time (ms)", "hit rate (%)");
   for (int dim : dimensions) {
     memPoolEmbeddingWithTime(dim, numInserts, numLookups);
   }
-  std::cout << std::endl << std ::endl;
+  fmt::print("\n\n");
   return 0;
 }
 TEST(SynchronizedShardedMap, benchmark) { benchmark(); }

From 99c14d090799bd5020c05a465b8f7a68d5a1d2e8 Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Tue, 27 May 2025 19:34:34 +0800
Subject: [PATCH 07/12] QuantUtilsTest add static_cast

---
 test/QuantUtilsTest.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/QuantUtilsTest.cc b/test/QuantUtilsTest.cc
index fdd9af4ebd..6ea7dd12aa 100644
--- a/test/QuantUtilsTest.cc
+++ b/test/QuantUtilsTest.cc
@@ -560,7 +560,7 @@ class EmbeddingQuantizeFixedNumberTest : public testing::TestWithParam {
       1, 1, 1, 1, // All the same. Range: 0, min: 1
       -64, -2.75, 61.625, 191, // Range: 255, min: -64. Picking 61.625 because it differs under FP16 (will become 61.5).
};
-  assert(float_test_input.size() == row * col);
+  assert(float_test_input.size() == static_cast<size_t>(row * col));
 
   float16_test_input.resize(float_test_input.size());
   std::transform(

From ffc0333d44dee192eb27cdb966ad8266cfb9055a Mon Sep 17 00:00:00 2001
From: WP
Date: Tue, 27 May 2025 20:24:00 +0800
Subject: [PATCH 08/12] FeatureEvict supplement

---
 .../dram_kv_embedding_cache.h | 56 +++-
 .../dram_kv_embedding_cache_wrapper.h | 30 +-
 .../dram_kv_embedding_cache/feature_evict.h | 152 +++++++++-
 .../fixed_block_pool.h | 10 +
 .../feature_evict_test.cpp | 259 +++++++++++++++++-
 5 files changed, 480 insertions(+), 27 deletions(-)

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
index 4507e2060f..5ceae13127 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
@@ -15,7 +15,8 @@
 #include "SynchronizedShardedMap.h"
 
 #include "deeplearning/fbgemm/fbgemm_gpu/src/ssd_split_embeddings_cache/initializer.h"
-#include "store_value_utils.h"
+#include "fixed_block_pool.h"
+#include "feature_evict.h"
 
 #include
 #include
@@ -46,6 +47,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
   /// @param max_D the maximum dimension of the embedding tensor
   /// @param uniform_init_lower the lower bound of the uniform distribution
   /// @param uniform_init_upper the upper bound of the uniform distribution
+  /// @param feature_evict_config configuration for feature eviction
   /// @param num_shards number of shards for the kvstore. This is to improve
   /// parallelization. Each key value pair will be sharded into one shard.
   /// @param num_threads num of threads that kvstore needs to be run upon for
@@ -59,6 +61,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
       int64_t max_D,
       double uniform_init_lower,
       double uniform_init_upper,
+      FeatureEvictConfig feature_evict_config,
       int64_t num_shards = 8,
       int64_t num_threads = 32,
       int64_t row_storage_bitwidth = 32,
@@ -68,10 +71,11 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
             max_D,
             0), // l2_cache_size_gb =0 to disable l2 cache
         max_D_(max_D),
+        feature_evict_config_(feature_evict_config),
         num_shards_(num_shards),
         weight_ttl_in_hours_(weight_ttl_in_hours),
-        block_size_(StoreValueUtils::calculate_block_size(max_D)),
-        block_alignment_(StoreValueUtils::calculate_block_alignment()),
+        block_size_(FixedBlockPool::calculate_block_size(max_D)),
+        block_alignment_(FixedBlockPool::calculate_block_alignment()),
         kv_store_(SynchronizedShardedMap<int64_t, weight_type*>(
             num_shards_,
             block_size_,
@@ -86,6 +90,9 @@
         uniform_init_lower,
         uniform_init_upper,
         row_storage_bitwidth);
+    if (feature_evict_config_.trigger_mode != EvictTriggerMode::DISABLED) {
+      feature_evict_ = create_feature_evict(feature_evict_config_, executor_.get(), kv_store_, max_D);
+    }
   }
 
   void initialize_initializers(
@@ -205,12 +212,13 @@
           block = it->second;
         } else {
           // Key doesn't exist, allocate new block and insert.
- block = StoreValueUtils::allocate( - block_size_, block_alignment_, pool); + block = pool->allocate_t(); wlmap->insert({id, block}); } - StoreValueUtils::update_timestamp(block); - auto* data_ptr = StoreValueUtils::data_ptr(block); + if (feature_evict_) { + feature_evict_->update_feature_statistics(block); + } + auto* data_ptr = FixedBlockPool::data_ptr(block); std::copy(weights[id_index] .template data_ptr(), weights[id_index] @@ -295,12 +303,11 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { continue; } // use mempool - const auto* data_ptr = StoreValueUtils::data_ptr(cached_iter->second); - StoreValueUtils::update_timestamp(cached_iter->second); + const auto* data_ptr = FixedBlockPool::data_ptr(cached_iter->second); std::copy( data_ptr, data_ptr + max_D_, - &(weights_data_ptr[index * max_D_])); // dst_start + &(weights_data_ptr[id_index * max_D_])); // dst_start } } }); @@ -322,6 +329,32 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { void compact() override {} + void trigger_feature_evict() { + if (feature_evict_) { + feature_evict_->trigger_evict(); + } + } + + void feature_evict_resume() { + if (feature_evict_) { + feature_evict_->resume(); + } + } + + void feature_evict_pause() { + if (feature_evict_) { + feature_evict_->pause(); + } + } + + void maybe_evict_by_step() { + if (feature_evict_config_.trigger_mode == EvictTriggerMode::ITERATION && + feature_evict_config_.trigger_step_interval > 0 && + ++current_iter_ % feature_evict_config_.trigger_step_interval == 0) { + trigger_feature_evict(); + } + } + private: void fill_from_row_storage( int shard_id, @@ -390,6 +423,9 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { std::atomic_bool is_eviction_ongoing_ = false; std::vector> initializers_; int64_t elem_size_; + FeatureEvictConfig feature_evict_config_; + std::unique_ptr> feature_evict_; + int current_iter_ = 0; }; // class DramKVEmbeddingCache } // namespace kv_mem diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h index 0b915e50ba..9dc1be091b 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h @@ -26,15 +26,34 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { int64_t max_D, double uniform_init_lower, double uniform_init_upper, + int evict_trigger_mode, + int evict_trigger_strategy, + int64_t trigger_step_interval, + uint32_t ttl, + uint32_t count_threshold, + float count_decay_rate, + double l2_weight_threshold, int64_t num_shards = 8, int64_t num_threads = 32, int64_t row_storage_bitwidth = 32, int64_t weight_ttl_in_hours = 2) { + + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.trigger_step_interval = trigger_step_interval; + feature_evict_config.ttl = ttl; + feature_evict_config.count_threshold = count_threshold; + feature_evict_config.count_decay_rate = count_decay_rate; + feature_evict_config.l2_weight_threshold = l2_weight_threshold; + if (row_storage_bitwidth == 16) { impl_ = std::make_shared>( max_D, uniform_init_lower, uniform_init_upper, + feature_evict_config, num_shards, num_threads, row_storage_bitwidth, @@ -44,6 +63,7 @@ class DramKVEmbeddingCacheWrapper : public 
torch::jit::CustomClassHolder { max_D, uniform_init_lower, uniform_init_upper, + feature_evict_config, num_shards, num_threads, row_storage_bitwidth, @@ -67,7 +87,11 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { } void set(at::Tensor indices, at::Tensor weights, at::Tensor count) { - return impl_->set(indices, weights, count); + impl_->feature_evict_pause(); + impl_->set(indices, weights, count); + // when use ITERATION EvictTriggerMode, trigger evict by step + impl_->maybe_evict_by_step(); + impl_->feature_evict_resume(); } void flush() { @@ -86,7 +110,9 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { at::Tensor weights, at::Tensor count, int64_t sleep_ms) { - return impl_->get(indices, weights, count, sleep_ms); + impl_->feature_evict_pause(); + impl_->get(indices, weights, count, sleep_ms); + impl_->feature_evict_resume(); } void wait_util_filling_work_done() { diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h index a7c42b291e..8a384b3f55 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h @@ -17,10 +17,28 @@ namespace kv_mem { +enum class EvictTriggerMode { + DISABLED, // Do not use feature evict + ITERATION, // Trigger based on iteration steps + MANUAL // Manually triggered by upstream +}; + +enum class EvictTriggerStrategy { BY_TIMESTAMP, BY_COUNTER, BY_TIMESTAMP_AND_COUNTER, BY_L2WEIGHT }; + +struct FeatureEvictConfig { + EvictTriggerStrategy trigger_strategy; + EvictTriggerMode trigger_mode; + int64_t trigger_step_interval; + uint32_t ttl; + uint32_t count_threshold; + float count_decay_rate; + double l2_weight_threshold; +}; + template -class FeatureEvictBase { +class FeatureEvict { public: - FeatureEvictBase(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store) + FeatureEvict(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store) : executor_(executor), kv_store_(kv_store), evict_flag_(false), @@ -29,7 +47,7 @@ class FeatureEvictBase { init_shard_status(); } - virtual ~FeatureEvictBase() { + virtual ~FeatureEvict() { wait_completion(); // Wait for all asynchronous tasks to complete. 
}; @@ -39,7 +57,6 @@ class FeatureEvictBase { void trigger_evict() { std::lock_guard lock(mutex_); if (evict_flag_.exchange(true)) return; - fmt::print("Starting new eviction process...\n"); prepare_evict(); } @@ -72,6 +89,8 @@ class FeatureEvictBase { return evict_flag_.load(); } + virtual void update_feature_statistics(weight_type* block) = 0; + protected: void init_shard_status() { block_cursors_.resize(num_shards_); @@ -172,13 +191,15 @@ class FeatureEvictBase { }; template -class CounterBasedEvict : public FeatureEvictBase { +class CounterBasedEvict : public FeatureEvict { public: CounterBasedEvict(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store, float decay_rate, uint32_t threshold) - : FeatureEvictBase(executor, kv_store), decay_rate_(decay_rate), threshold_(threshold) {} + : FeatureEvict(executor, kv_store), decay_rate_(decay_rate), threshold_(threshold) {} + + void update_feature_statistics(weight_type* block) override { FixedBlockPool::update_count(block); } protected: bool evict_block(weight_type* block) override { @@ -195,12 +216,14 @@ class CounterBasedEvict : public FeatureEvictBase { }; template -class TimeBasedEvict : public FeatureEvictBase { +class TimeBasedEvict : public FeatureEvict { public: TimeBasedEvict(folly::CPUThreadPoolExecutor* executor, SynchronizedShardedMap& kv_store, uint32_t ttl) - : FeatureEvictBase(executor, kv_store), ttl_(ttl) {} + : FeatureEvict(executor, kv_store), ttl_(ttl) {} + + void update_feature_statistics(weight_type* block) override { FixedBlockPool::update_timestamp(block); } protected: bool evict_block(weight_type* block) override { @@ -211,4 +234,115 @@ class TimeBasedEvict : public FeatureEvictBase { private: uint32_t ttl_; // Time-to-live for eviction. }; -} // namespace kv_mem \ No newline at end of file + +template +class TimeCounterBasedEvict : public FeatureEvict { + public: + TimeCounterBasedEvict(folly::CPUThreadPoolExecutor* executor, + SynchronizedShardedMap& kv_store, + uint32_t ttl, + float decay_rate, + uint32_t threshold) + : FeatureEvict(executor, kv_store), ttl_(ttl), decay_rate_(decay_rate), threshold_(threshold) {} + + void update_feature_statistics(weight_type* block) override { + FixedBlockPool::update_timestamp(block); + FixedBlockPool::update_count(block); + } + + protected: + bool evict_block(weight_type* block) override { + // Apply decay and check the count threshold and ttl. + auto current_time = FixedBlockPool::current_timestamp(); + auto current_count = FixedBlockPool::get_count(block); + current_count *= decay_rate_; + FixedBlockPool::set_count(block, current_count); + return (current_time - FixedBlockPool::get_timestamp(block) > ttl_) && (current_count < threshold_); + } + + private: + uint32_t ttl_; // Time-to-live for eviction. + float decay_rate_; // Decay rate for the block count. + uint32_t threshold_; // Count threshold for eviction. +}; + +template +class L2WeightBasedEvict : public FeatureEvict { + public: + L2WeightBasedEvict(folly::CPUThreadPoolExecutor* executor, + SynchronizedShardedMap& kv_store, + double threshold, + size_t dimension) + : FeatureEvict(executor, kv_store), threshold_(threshold), dimension_(dimension) {} + + void update_feature_statistics([[maybe_unused]] weight_type* block) override {} + + protected: + bool evict_block(weight_type* block) override { + auto l2weight = FixedBlockPool::get_l2weight(block, dimension_); + return l2weight < threshold_; + } + + private: + double threshold_; // L2 weight threshold for eviction. 
+  size_t dimension_; // Embedding dimension
+};
+
+template <typename weight_type>
+std::unique_ptr<FeatureEvict<weight_type>> create_feature_evict(
+    const FeatureEvictConfig& config,
+    folly::CPUThreadPoolExecutor* executor,
+    SynchronizedShardedMap<int64_t, weight_type*>& kv_store,
+    size_t dimension) {
+  if (executor == nullptr) {
+    throw std::invalid_argument("executor cannot be null");
+  }
+
+  switch (config.trigger_strategy) {
+    case EvictTriggerStrategy::BY_TIMESTAMP: {
+      if (config.ttl <= 0) {
+        throw std::invalid_argument("ttl must be positive");
+      }
+      return std::make_unique<TimeBasedEvict<weight_type>>(executor, kv_store, config.ttl);
+    }
+
+    case EvictTriggerStrategy::BY_COUNTER: {
+      if (config.count_decay_rate <= 0 || config.count_decay_rate > 1) {
+        throw std::invalid_argument("count_decay_rate must be in range (0,1]");
+      }
+      if (config.count_threshold <= 0) {
+        throw std::invalid_argument("count_threshold must be positive");
+      }
+      return std::make_unique<CounterBasedEvict<weight_type>>(
+          executor, kv_store, config.count_decay_rate, config.count_threshold);
+    }
+
+    case EvictTriggerStrategy::BY_TIMESTAMP_AND_COUNTER: {
+      if (config.ttl <= 0) {
+        throw std::invalid_argument("ttl must be positive");
+      }
+      if (config.count_decay_rate <= 0 || config.count_decay_rate > 1) {
+        throw std::invalid_argument("count_decay_rate must be in range (0,1]");
+      }
+      if (config.count_threshold <= 0) {
+        throw std::invalid_argument("count_threshold must be positive");
+      }
+      return std::make_unique<TimeCounterBasedEvict<weight_type>>(
+          executor, kv_store, config.ttl, config.count_decay_rate, config.count_threshold);
+    }
+
+    case EvictTriggerStrategy::BY_L2WEIGHT: {
+      if (config.l2_weight_threshold <= 0) {
+        throw std::invalid_argument("l2_weight_threshold must be positive");
+      }
+      // TODO: optimizer parameters should not be included in dimension
+      return std::make_unique<L2WeightBasedEvict<weight_type>>(
+          executor, kv_store, config.l2_weight_threshold, dimension);
+    }
+
+    default:
+      throw std::runtime_error("Unknown evict trigger strategy");
+  }
+}
+
+} // namespace kv_mem
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index f3f238d674..54f7c402a0 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -5,6 +5,8 @@
 #include
 #include
 #include
+#include <cmath>
+#include <numeric>
 
 #include
 
@@ -79,6 +81,14 @@ class FixedBlockPool : public std::pmr::memory_resource {
     return reinterpret_cast<scalar_t*>(reinterpret_cast<char*>(block) + sizeof(FixedBlockPool::MetaHeader));
   }
 
+  template <typename scalar_t>
+  static scalar_t get_l2weight(scalar_t* block, size_t dimension) {
+    scalar_t* data = FixedBlockPool::data_ptr(block);
+    return std::sqrt(
+        std::accumulate(data, data + dimension, scalar_t(0),
+                        [](scalar_t sum, scalar_t val) { return sum + val * val; }));
+  }
+
   explicit FixedBlockPool(std::size_t block_size, // Size of each memory block
                           std::size_t block_alignment, // Memory block alignment requirement
                           std::size_t blocks_per_chunk = 8192, // Number of blocks per chunk
diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
index 72dce3093d..48a39d8b45 100644
--- a/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
+++ b/fbgemm_gpu/test/dram_kv_embedding_cache/feature_evict_test.cpp
@@ -15,7 +15,7 @@ static constexpr int DIMENSION = 128;
 size_t BLOCK_SIZE = FixedBlockPool::calculate_block_size(DIMENSION);
 size_t BLOCK_ALIGNMENT = FixedBlockPool::calculate_block_alignment();
 
-TEST(FeatureEvictTest, BasicEviction) {
+TEST(FeatureEvictTest, CounterBasedEviction) {
   static constexpr int
NUM_SHARDS = 8; auto executor_ = std::make_unique(4); auto kv_store_ = std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); @@ -43,7 +43,21 @@ TEST(FeatureEvictTest, BasicEviction) { wlock->insert({i, block}); } - CounterBasedEvict evictor(executor_.get(), *kv_store_.get(), 0.5f, 1); + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 1; + uint32_t count_threshold = 1; + float count_decay_rate = 0.5; + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.count_threshold = count_threshold; + feature_evict_config.count_decay_rate = count_decay_rate; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), 4); + } // Initial validation size_t total_blocks = 0; @@ -54,13 +68,13 @@ TEST(FeatureEvictTest, BasicEviction) { ASSERT_EQ(total_blocks, 2000); // Perform eviction - evictor.trigger_evict(); + feature_evict->trigger_evict(); // Validate eviction process - while (evictor.is_evicting()) { - evictor.resume(); + while (feature_evict->is_evicting()) { + feature_evict->resume(); std::this_thread::sleep_for(std::chrono::microseconds(5)); - evictor.pause(); + feature_evict->pause(); } // Validate results @@ -77,6 +91,239 @@ TEST(FeatureEvictTest, BasicEviction) { ASSERT_EQ(remaining, 1000); } +TEST(FeatureEvictTest, TimeBasedEviction) { + static constexpr int NUM_SHARDS = 8; + auto executor_ = std::make_unique(4); + auto kv_store_ = std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); + + // Insert test data + for (int i = 0; i < 1000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + std::this_thread::sleep_for(std::chrono::seconds(5)); + + for (int i = 1000; i < 2000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 0; + uint32_t ttl = 4; + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.ttl = ttl; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), 4); + } + + // Initial validation + size_t total_blocks = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + total_blocks += rlock->size(); + } + ASSERT_EQ(total_blocks, 2000); + + // Perform eviction + feature_evict->trigger_evict(); + + // Validate eviction process + while (feature_evict->is_evicting()) { + feature_evict->resume(); + 
std::this_thread::sleep_for(std::chrono::microseconds(5)); + feature_evict->pause(); + } + + // Validate results + size_t remaining = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + remaining += rlock->size(); + } + std::cout << "remaining: " << remaining << std::endl; + ASSERT_EQ(remaining, 1000); +} + +TEST(FeatureEvictTest, TimeCounterBasedEviction) { + static constexpr int NUM_SHARDS = 8; + auto executor_ = std::make_unique(4); + auto kv_store_ = std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); + + // Insert test data + for (int i = 0; i < 500; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_count(block, 1); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + std::this_thread::sleep_for(std::chrono::seconds(5)); + for (int i = 500; i < 1000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_count(block, 1); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + for (int i = 1000; i < 2000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + FixedBlockPool::set_key(block, i); + FixedBlockPool::update_timestamp(block); // Initial score + FixedBlockPool::set_count(block, 2); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 2; + uint32_t ttl = 4; + uint32_t count_threshold = 1; + float count_decay_rate = 0.5; + + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.ttl = ttl; + feature_evict_config.count_threshold = count_threshold; + feature_evict_config.count_decay_rate = count_decay_rate; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), 4); + } + + // Initial validation + size_t total_blocks = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + total_blocks += rlock->size(); + } + ASSERT_EQ(total_blocks, 2000); + + // Perform eviction + feature_evict->trigger_evict(); + + // Validate eviction process + while (feature_evict->is_evicting()) { + feature_evict->resume(); + std::this_thread::sleep_for(std::chrono::microseconds(5)); + feature_evict->pause(); + } + + // Validate results + size_t remaining = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + remaining += rlock->size(); + } + std::cout << "remaining: " << remaining << std::endl; + ASSERT_EQ(remaining, 1500); +} + +TEST(FeatureEvictTest, L2WeightBasedEviction) { + static constexpr int NUM_SHARDS = 8; + auto executor_ = std::make_unique(4); + auto kv_store_ = 
std::make_unique>(NUM_SHARDS, BLOCK_SIZE, BLOCK_ALIGNMENT); + int dim = 4; + std::vector weight1(dim, 1.0); + // Insert test data + for (int i = 0; i < 1000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + FixedBlockPool::set_key(block, i); + std::copy(weight1.begin(), weight1.end(), data_ptr); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + std::vector weight2(dim, 2.0); + for (int i = 1000; i < 2000; ++i) { + int shard_id = i % NUM_SHARDS; + auto wlock = kv_store_->by(shard_id).wlock(); + auto* pool = kv_store_->pool_by(shard_id); + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + FixedBlockPool::set_key(block, i); + std::copy(weight2.begin(), weight2.end(), data_ptr); + FixedBlockPool::set_used(block, true); + wlock->insert({i, block}); + } + + std::unique_ptr> feature_evict; + int evict_trigger_mode = 2; + int evict_trigger_strategy = 3; + double l2_weight_threshold = 3.0; + // feature evict config + FeatureEvictConfig feature_evict_config; + feature_evict_config.trigger_mode = static_cast(evict_trigger_mode); + feature_evict_config.trigger_strategy = static_cast(evict_trigger_strategy); + feature_evict_config.l2_weight_threshold = l2_weight_threshold; + + if (feature_evict_config.trigger_mode != EvictTriggerMode::DISABLED) { + feature_evict = create_feature_evict(feature_evict_config, executor_.get(),*kv_store_.get(), dim); + } + + // Initial validation + size_t total_blocks = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + total_blocks += rlock->size(); + } + ASSERT_EQ(total_blocks, 2000); + + // Perform eviction + feature_evict->trigger_evict(); + + // Validate eviction process + while (feature_evict->is_evicting()) { + feature_evict->resume(); + std::this_thread::sleep_for(std::chrono::microseconds(5)); + feature_evict->pause(); + } + + // Validate results + size_t remaining = 0; + for (int shard_id = 0; shard_id < NUM_SHARDS; ++shard_id) { + auto rlock = kv_store_->by(shard_id).rlock(); + remaining += rlock->size(); + } + std::cout << "remaining: " << remaining << std::endl; + ASSERT_EQ(remaining, 1000); +} + TEST(FeatureEvictTest, PerformanceTest) { static constexpr int NUM_SHARDS = 1; // Test configurations From e094e47076ca26cf9798800c06d408711d159fef Mon Sep 17 00:00:00 2001 From: WP Date: Wed, 28 May 2025 11:27:21 +0800 Subject: [PATCH 09/12] add Memory statistics --- .../SynchronizedShardedMap.h | 11 ++++++ .../dram_kv_embedding_cache.h | 4 +++ .../dram_kv_embedding_cache_wrapper.h | 4 +++ .../sharded_map_test.cpp | 36 +++++++++++++++++++ 4 files changed, 55 insertions(+) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index 3cd4c61c6f..e773a068ec 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -53,6 +53,17 @@ class SynchronizedShardedMap { auto getNumShards() { return shards_.size(); } + auto getUsedMemSize() { + size_t used_mem_size = 0; + size_t block_size = mempools_[0]->get_block_size(); + for (size_t i = 0; i < shards_.size(); ++i) { + auto rlmap = shards_[i].rlock(); + // only calculate the sizes of K, V and block that are used + used_mem_size += rlmap->size() * 
(sizeof(K) + sizeof(V) + block_size); + } + return used_mem_size; + } + private: std::vector, M>> shards_; std::vector> mempools_; diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 5ceae13127..3749b3a81b 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -355,6 +355,10 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { } } + size_t get_map_used_memsize() { + return kv_store_.getUsedMemSize(); + } + private: void fill_from_row_storage( int shard_id, diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h index 9dc1be091b..fe6a345a0c 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h @@ -123,6 +123,10 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder { return impl_->get_keys_in_range(start, end); } + size_t get_map_used_memsize() { + return impl_->get_map_used_memsize(); + } + private: // friend class EmbeddingRocksDBWrapper; friend class ssd::KVTensorWrapper; diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index 5e4b59e206..d84acbb1d8 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -114,6 +114,36 @@ void memPoolEmbeddingWithTime(int dimension, size_t numInserts, size_t numLookup 100.0 * static_cast(hitCount) / static_cast(numLookups)); } +void memPoolEmbeddingMemSize(int dimension, size_t numInserts) { + const size_t numShards = 4; + size_t block_size = FixedBlockPool::calculate_block_size(dimension); + size_t block_alignment = FixedBlockPool::calculate_block_alignment(); + + SynchronizedShardedMap embeddingMap(numShards, + block_size, // block_size + block_alignment, // block_alignment + 8192); // blocks_per_chunk + { + std::vector fixedEmbedding = generateFixedEmbedding(dimension); + + auto wlock = embeddingMap.by(0).wlock(); + auto* pool = embeddingMap.pool_by(0); + + for (size_t i = 0; i < numInserts; i++) { + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + std::copy(fixedEmbedding.begin(), fixedEmbedding.end(), data_ptr); + wlock->insert_or_assign(i, block); + } + } + size_t totalMemory = embeddingMap.getUsedMemSize(); + fmt::print("{:<20}{:<20}{:<20.2f}\n", + dimension, + numInserts, + static_cast(totalMemory) / (1024 * 1024)); // MB + +} + int benchmark() { std::vector dimensions = {4, 8, 16, 32, 64}; const size_t numInserts = 1'000'000; // 1 million insert @@ -133,6 +163,12 @@ int benchmark() { memPoolEmbeddingWithTime(dim, numInserts, numLookups); } fmt::print("\n\n"); + + fmt::print("======================= memory usage statistics ====================================\n"); + fmt::print("{:<20}{:<20}{:<20}\n","dim", "numInserts", "total memory (MB)"); + for (int dim : dimensions) { + memPoolEmbeddingMemSize(dim, numInserts); + } return 0; } TEST(SynchronizedShardedMap, benchmark) { benchmark(); } From 33d7bb982e8741227cd496c6c54f217f68fa0412 Mon Sep 17 00:00:00 2001 From: WP Date: Wed, 28 May 2025 11:50:53 +0800 Subject: [PATCH 10/12] concern block align --- 
.../src/dram_kv_embedding_cache/SynchronizedShardedMap.h | 2 +- fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index e773a068ec..2583ee03d3 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -55,7 +55,7 @@ class SynchronizedShardedMap { auto getUsedMemSize() { size_t used_mem_size = 0; - size_t block_size = mempools_[0]->get_block_size(); + size_t block_size = mempools_[0]->get_aligned_block_size(); for (size_t i = 0; i < shards_.size(); ++i) { auto rlmap = shards_[i].rlock(); // only calculate the sizes of K, V and block that are used diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h index 54f7c402a0..ff1adaaf82 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h @@ -159,6 +159,9 @@ class FixedBlockPool : public std::pmr::memory_resource { [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; } [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; } [[nodiscard]] std::size_t get_blocks_per_chunk() const noexcept { return blocks_per_chunk_; } + [[nodiscard]] std::size_t get_aligned_block_size() const noexcept { + return (block_size_ + block_alignment_ - 1) / block_alignment_ * block_alignment_; + } protected: // Core allocation function From 7e69add99d0c20a361690f593efe9e9a7ac27cf8 Mon Sep 17 00:00:00 2001 From: WP Date: Wed, 28 May 2025 11:59:57 +0800 Subject: [PATCH 11/12] add const --- fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h | 2 +- .../src/dram_kv_embedding_cache/dram_kv_embedding_cache.h | 2 +- .../dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h index 2583ee03d3..1948b42c8d 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h @@ -53,7 +53,7 @@ class SynchronizedShardedMap { auto getNumShards() { return shards_.size(); } - auto getUsedMemSize() { + auto getUsedMemSize() const { size_t used_mem_size = 0; size_t block_size = mempools_[0]->get_aligned_block_size(); for (size_t i = 0; i < shards_.size(); ++i) { diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h index 3749b3a81b..32225fe059 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h +++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h @@ -355,7 +355,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB { } } - size_t get_map_used_memsize() { + size_t get_map_used_memsize() const { return kv_store_.getUsedMemSize(); } diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h index fe6a345a0c..2543091d6e 100644 --- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h +++ 
b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h
@@ -123,7 +123,7 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder {
     return impl_->get_keys_in_range(start, end);
   }
 
-  size_t get_map_used_memsize() {
+  size_t get_map_used_memsize() const {
     return impl_->get_map_used_memsize();
   }
 
From 97639674e76e83284c905baac779a545206a5e96 Mon Sep 17 00:00:00 2001
From: houzhenggang
Date: Wed, 28 May 2025 16:34:10 +0800
Subject: [PATCH 12/12] hashtable save and load

---
 .../SynchronizedShardedMap.h | 48 +++++-
 .../fixed_block_pool.h | 108 +++++++++++-
 .../dram_kv_embedding_cache/CMakeLists.txt | 27 +--
 .../fixed_block_pool_saver_test.cpp | 157 ++++++++++++++++++
 .../sharded_map_test.cpp | 53 ++++++
 5 files changed, 378 insertions(+), 15 deletions(-)
 create mode 100644 fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp

diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h
index 1948b42c8d..0dda6e6fcb 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/SynchronizedShardedMap.h
@@ -51,7 +51,7 @@ class SynchronizedShardedMap {
     return mempools_.at(index % shards_.size()).get();
   }
 
-  auto getNumShards() { return shards_.size(); }
+  auto getNumShards() const { return shards_.size(); }
 
   auto getUsedMemSize() const {
     size_t used_mem_size = 0;
@@ -64,6 +64,52 @@ class SynchronizedShardedMap {
     return used_mem_size;
   }
 
+  void save(const std::string& filename) const {
+    std::ofstream out(filename, std::ios::binary);
+    if (!out) {
+      throw std::runtime_error("Failed to open file for writing");
+    }
+
+    const std::size_t num_shards = getNumShards();
+    out.write(reinterpret_cast<const char*>(&num_shards), sizeof(num_shards));
+    out.close();
+
+    // save every mempool to its own file, holding the shard write lock while serializing
+    for (std::size_t shard_id = 0; shard_id < getNumShards(); ++shard_id) {
+      std::string pool_filename = filename + ".pool." + std::to_string(shard_id);
+      auto wlock = shards_[shard_id].wlock();
+      mempools_[shard_id]->serialize(pool_filename);
+    }
+  }
+
+  void load(const std::string& filename) {
+    std::ifstream in(filename, std::ios::binary);
+    if (!in) {
+      throw std::runtime_error("Failed to open file for reading");
+    }
+
+    size_t num_shards;
+    in.read(reinterpret_cast<char*>(&num_shards), sizeof(num_shards));
+    in.close();
+
+    if (num_shards != getNumShards()) {
+      throw std::runtime_error("Shard count mismatch between file and map");
+    }
+
+    for (std::size_t shard_id = 0; shard_id < getNumShards(); ++shard_id) {
+      std::string pool_filename = filename + ".pool." + std::to_string(shard_id);
+      auto wlock = shards_[shard_id].wlock();
+      // first deserialize the mempool
+      mempools_[shard_id]->deserialize(pool_filename);
+      // then rebuild the map from the mempool's in-use blocks
+      wlock->clear();
+      mempools_[shard_id]->for_each_block([&wlock](void* block) {
+        auto key = FixedBlockPool::get_key(block);
+        wlock->emplace(key, reinterpret_cast<V>(block));
+      });
+    }
+  }
+
 private:
  std::vector<folly::Synchronized<folly::F14FastMap<K, V>, M>> shards_;
  std::vector<std::unique_ptr<FixedBlockPool>> mempools_;
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
index ff1adaaf82..f8acbffb09 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h
@@ -2,13 +2,15 @@
 #include
 #include
+#include
 #include
+#include
 #include
 #include
-#include
-#include
 #include
+#include
+#include
 
 namespace kv_mem {
 static constexpr uint32_t kMaxInt31Counter = 2147483647;
@@ -155,6 +157,108 @@ class FixedBlockPool : public std::pmr::memory_resource {
   }
 };
 
+  template <typename Func>
+  void for_each_block(Func&& func) const {
+    for (const auto& chunk : chunks_) {
+      char* current = static_cast<char*>(chunk.ptr);
+      for (size_t i = 0; i < blocks_per_chunk_; ++i) {
+        if (FixedBlockPool::get_used(current)) {
+          func(current);
+        }
+        current += block_size_;
+      }
+    }
+  }
+
+  void serialize(const std::string& filename) const {
+    auto start = std::chrono::high_resolution_clock::now();
+
+    std::ofstream out(filename, std::ios::binary);
+    if (!out) {
+      throw std::runtime_error("Failed to open file for writing");
+    }
+    // Write metadata
+    out.write(reinterpret_cast<const char*>(&block_size_), sizeof(block_size_));
+    out.write(reinterpret_cast<const char*>(&block_alignment_), sizeof(block_alignment_));
+    out.write(reinterpret_cast<const char*>(&blocks_per_chunk_), sizeof(blocks_per_chunk_));
+    const size_t num_chunks = chunks_.size();
+    out.write(reinterpret_cast<const char*>(&num_chunks), sizeof(num_chunks));
+
+    // Write data for each chunk
+    for (const auto& chunk : chunks_) {
+      assert(chunk.size == block_size_ * blocks_per_chunk_);
+      out.write(static_cast<const char*>(chunk.ptr), static_cast<std::streamsize>(chunk.size));
+    }
+    out.flush();
+    out.close();
+    double data_size_mb = static_cast<double>((block_size_ * chunks_.size() * blocks_per_chunk_)) / (1024.0 * 1024.0);
+
+    auto end = std::chrono::high_resolution_clock::now();
+    auto duration = std::chrono::duration<double>(end - start).count();
+
+    fmt::print("Serialized {}: size={:.3f}MB, time={}s, throughput={:.3f}MB/s\n",
+               filename,
+               data_size_mb,
+               duration,
+               (data_size_mb / duration));
+  }
+
+  void deserialize(const std::string& filename) {
+    auto start = std::chrono::high_resolution_clock::now();
+
+    std::ifstream in(filename, std::ios::binary);
+    if (!in) {
+      throw std::runtime_error("Failed to open file for reading");
+    }
+
+    // Read metadata
+    std::size_t block_size, block_alignment, blocks_per_chunk, num_chunks;
+    in.read(reinterpret_cast<char*>(&block_size), sizeof(block_size));
+    in.read(reinterpret_cast<char*>(&block_alignment), sizeof(block_alignment));
+    in.read(reinterpret_cast<char*>(&blocks_per_chunk), sizeof(blocks_per_chunk));
+    in.read(reinterpret_cast<char*>(&num_chunks), sizeof(num_chunks));
+
+    // Validate parameters
+    if (block_size != block_size_) {
+      throw std::invalid_argument("Invalid block_size in file");
+    }
+    if (block_alignment != block_alignment_) {
+      throw std::invalid_argument("Invalid block_alignment in file");
+    }
+    if (blocks_per_chunk != blocks_per_chunk_) {
+      throw std::invalid_argument("Invalid blocks_per_chunk in file");
+    }
+
+    // Read data for each chunk and rebuild memory structure
+    const std::size_t chunk_size
= block_size_ * blocks_per_chunk_; + for (size_t i = 0; i < num_chunks; ++i) { + void* chunk_ptr = upstream_->allocate(chunk_size, block_alignment_); + in.read(static_cast(chunk_ptr), static_cast(chunk_size)); + // Add chunk to memory pool + chunks_.push_back({chunk_ptr, chunk_size, block_alignment}); + // Rebuild free_list_ + char* current = static_cast(chunk_ptr); + for (size_t j = 0; j < blocks_per_chunk; ++j) { + void* block = current + j * block_size; + if (!get_used(block)) { + do_deallocate(block, block_size_, block_alignment_); + } + } + } + in.close(); + + double data_size_mb = static_cast((block_size_ * chunks_.size() * blocks_per_chunk_)) / (1024.0 * 1024.0); + + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration(end - start).count(); + + fmt::print("Deserialized {}: size={:.3f}MB, time={}s, throughput={:.3f}MB/s\n", + filename, + data_size_mb, + duration, + (data_size_mb / duration)); + } + [[nodiscard]] const auto& get_chunks() const noexcept { return chunks_; } [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; } [[nodiscard]] std::size_t get_block_alignment() const noexcept { return block_alignment_; } diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt index d7566c00d9..9ab483eab3 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/CMakeLists.txt @@ -5,17 +5,20 @@ include_directories( ${FBGEMM_SOURCE_DIR} ) -add_executable(fixed_block_pool_test ${CMAKE_CURRENT_SOURCE_DIR}/fixed_block_pool_test.cpp) -target_compile_features(fixed_block_pool_test PUBLIC cxx_std_17) -target_compile_options(fixed_block_pool_test PUBLIC "-O3") -target_link_libraries(fixed_block_pool_test gtest gtest_main Folly::folly) +set(COMMON_COMPILE_FEATURES cxx_std_17) +set(COMMON_COMPILE_OPTIONS "-O3") +set(COMMON_LINK_LIBRARIES gtest gtest_main Folly::folly) -add_executable(sharded_map_test ${CMAKE_CURRENT_SOURCE_DIR}/sharded_map_test.cpp) -target_compile_features(sharded_map_test PUBLIC cxx_std_17) -target_compile_options(sharded_map_test PUBLIC "-O3") -target_link_libraries(sharded_map_test gtest gtest_main Folly::folly) +set(TEST_TARGETS + fixed_block_pool_test + fixed_block_pool_saver_test + sharded_map_test + feature_evict_test +) -add_executable(feature_evict_test ${CMAKE_CURRENT_SOURCE_DIR}/feature_evict_test.cpp) -target_compile_features(feature_evict_test PUBLIC cxx_std_17) -target_compile_options(feature_evict_test PUBLIC "-O3") -target_link_libraries(feature_evict_test gtest gtest_main Folly::folly) \ No newline at end of file +foreach (target ${TEST_TARGETS}) + add_executable(${target} ${CMAKE_CURRENT_SOURCE_DIR}/${target}.cpp) + target_compile_features(${target} PUBLIC ${COMMON_COMPILE_FEATURES}) + target_compile_options(${target} PUBLIC ${COMMON_COMPILE_OPTIONS}) + target_link_libraries(${target} ${COMMON_LINK_LIBRARIES}) +endforeach () \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp new file mode 100644 index 0000000000..44ef79bc0a --- /dev/null +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/fixed_block_pool_saver_test.cpp @@ -0,0 +1,157 @@ +#include +#include +#include +#include + +#include + +#include "fbgemm_gpu/src/dram_kv_embedding_cache/fixed_block_pool.h" + +namespace kv_mem { +void removeFileIfExists(const std::string& filename) { + if 
(std::filesystem::exists(filename)) { + std::filesystem::remove(filename); + } +} +class FixedBlockPoolTest : public ::testing::Test { + protected: + static constexpr size_t kDimension = 128; // embedding dimension + using scalar_t = float; // data type + + void SetUp() override { + block_size_ = kv_mem::FixedBlockPool::calculate_block_size(kDimension); + block_alignment_ = kv_mem::FixedBlockPool::calculate_block_alignment(); + pool_ = std::make_unique(block_size_, block_alignment_); + } + + // Generate random data + void generateRandomData(std::size_t num_blocks) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution key_dist(1, UINT64_MAX); + std::uniform_real_distribution val_dist(-1.0, 1.0); + + for (size_t i = 0; i < num_blocks; ++i) { + auto* block = pool_->allocate_t(); + uint64_t key = key_dist(gen); + + // Set metadata + kv_mem::FixedBlockPool::set_key(block, key); + kv_mem::FixedBlockPool::set_count(block, i % 100); + kv_mem::FixedBlockPool::update_timestamp(block); + + // Set embedding data + auto* data = kv_mem::FixedBlockPool::data_ptr(block); + for (size_t j = 0; j < kDimension; ++j) { + data[j] = val_dist(gen); + } + + // Record for verification + original_data_[key] = std::vector(data, data + kDimension); + } + } + + // Verify data correctness + bool verifyData() { + size_t verified_count = 0; + + // Traverse all chunks to verify data + for (const auto& chunk : pool_->get_chunks()) { + char* current = static_cast(chunk.ptr); + size_t blocks_in_chunk = chunk.size / block_size_; + + for (size_t i = 0; i < blocks_in_chunk; ++i) { + void* block = current + i * block_size_; + if (kv_mem::FixedBlockPool::get_used(block)) { + uint64_t key = kv_mem::FixedBlockPool::get_key(block); + auto* data = kv_mem::FixedBlockPool::data_ptr(reinterpret_cast(block)); + + // Find and compare original data + auto it = original_data_.find(key); + if (it == original_data_.end()) { + return false; + } + + if (!std::equal(data, data + kDimension, it->second.begin())) { + return false; + } + + verified_count++; + } + } + } + + return verified_count == original_data_.size(); + } + + // Performance test helper function + template + double measureTime(Func&& func) { + auto start = std::chrono::high_resolution_clock::now(); + func(); + auto end = std::chrono::high_resolution_clock::now(); + return std::chrono::duration(end - start).count(); + } + + std::unique_ptr pool_; + size_t block_size_{}; + size_t block_alignment_{}; + std::unordered_map> original_data_; +}; + +// Correctness test +TEST_F(FixedBlockPoolTest, SerializationCorrectness) { + // 1. Generate random data + generateRandomData(1000); + + // 2. Serialize + const std::string filename = "test_pool.bin"; + pool_->serialize(filename); + + // 3. Create a new memory pool and deserialize + auto new_pool = std::make_unique(block_size_, block_alignment_); + new_pool->deserialize(filename); + + // 4. Verify data + pool_ = std::move(new_pool); + EXPECT_TRUE(verifyData()); +} + +// Edge case test +TEST_F(FixedBlockPoolTest, SerializationEdgeCases) { + // 1. Empty pool serialization test + const std::string empty_filename = "empty_pool.bin"; + pool_->serialize(empty_filename); + + auto new_pool = std::make_unique(block_size_, block_alignment_); + EXPECT_NO_THROW(new_pool->deserialize(empty_filename)); + + // 2. File not found test + EXPECT_THROW(pool_->deserialize("nonexistent_file.bin"), std::runtime_error); + + // 3. 
Parameter mismatch test + generateRandomData(1000); + const std::string filename = "test_pool.bin"; + pool_->serialize(filename); + + auto wrong_pool = std::make_unique(block_size_ * 2, // Incorrect block size + block_alignment_); + EXPECT_THROW(wrong_pool->deserialize(filename), std::invalid_argument); +} + +// Performance test +TEST_F(FixedBlockPoolTest, SerializationPerformance) { + const std::size_t num_blocks = 20'000'000; + generateRandomData(num_blocks); + const std::string filename = "test_pool.bin"; + removeFileIfExists(filename); + + pool_->serialize(filename); + + auto new_pool = std::make_unique(block_size_, block_alignment_); + new_pool->deserialize(filename); + + std::remove(filename.c_str()); +} + +} // namespace kv_mem \ No newline at end of file diff --git a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp index d84acbb1d8..4445d1d4a2 100644 --- a/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp +++ b/fbgemm_gpu/test/dram_kv_embedding_cache/sharded_map_test.cpp @@ -171,6 +171,59 @@ int benchmark() { } return 0; } + +void save_and_restore() { + const int numShards = 4; + const std::size_t dimension = 32; + const std::size_t block_size = FixedBlockPool::calculate_block_size(dimension); + const std::size_t block_alignment = FixedBlockPool::calculate_block_alignment(); + const int numItems = 1'000'000; + const std::string filename = "test_map.bin"; + + SynchronizedShardedMap original_map(numShards, block_size, block_alignment); + + std::vector test_embedding = generateFixedEmbedding(dimension); + for (int i = 0; i < numItems; ++i) { + int shard_id = i % numShards; + auto wlock = original_map.by(shard_id).wlock(); + auto* pool = original_map.pool_by(shard_id); + + auto* block = pool->allocate_t(); + auto* data_ptr = FixedBlockPool::data_ptr(block); + std::copy(test_embedding.begin(), test_embedding.end(), data_ptr); + + FixedBlockPool::set_key(block, i); + wlock->insert({i, block}); + } + + original_map.save(filename); + + SynchronizedShardedMap restored_map(numShards, block_size, block_alignment); + restored_map.load(filename); + + for (int64_t i = 0; i < numItems; ++i) { + int shard_id = i % numShards; + auto rlock = restored_map.by(shard_id).rlock(); + + auto it = rlock->find(i); + ASSERT_NE(it, rlock->end()) << "Key " << i << " not found after load"; + + float* block = it->second; + ASSERT_EQ(FixedBlockPool::get_key(block), i); + + const float* data_ptr = FixedBlockPool::data_ptr(block); + for (std::size_t j = 0; j < dimension; ++j) { + ASSERT_FLOAT_EQ(data_ptr[j], test_embedding[j]) << "Data mismatch at position " << j << " for key " << i; + } + } + + std::remove(filename.c_str()); + for (int i = 0; i < numShards; ++i) { + std::remove((filename + ".pool." + std::to_string(i)).c_str()); + } +}; + +TEST(SynchronizedShardedMap, save_and_restore) { save_and_restore(); } TEST(SynchronizedShardedMap, benchmark) { benchmark(); } } // namespace kv_mem \ No newline at end of file