From bcccfe9b7a0584c72accf366dd4e1ec54946beaa Mon Sep 17 00:00:00 2001 From: vegetableysm Date: Tue, 18 Nov 2025 13:10:58 +0800 Subject: [PATCH 1/2] llm patch Signed-off-by: vegetableysm --- CMakeLists.txt | 39 +- cmake/FindAIO.cmake | 34 + cmake/FindGFlags.cmake | 8 +- cmake/FindGTest.cmake | 40 + cmake/FindGlog.cmake | 8 +- cmake/FindLibUnwind.cmake | 68 +- modules/basic/stream/fixed_blob_stream.cc | 161 ++ modules/basic/stream/fixed_blob_stream.h | 155 ++ modules/vllm-kv-cache/CMakeLists.txt | 72 + modules/vllm-kv-cache/README.md | 115 + modules/vllm-kv-cache/ds/vllm_block.cc | 403 +++ modules/vllm-kv-cache/ds/vllm_block.h | 148 + modules/vllm-kv-cache/ds/vllm_layer.cc | 376 +++ modules/vllm-kv-cache/ds/vllm_layer.h | 103 + modules/vllm-kv-cache/src/env.cc | 112 + modules/vllm-kv-cache/src/env.h | 60 + modules/vllm-kv-cache/src/io/aio_adaptor.cc | 827 ++++++ modules/vllm-kv-cache/src/io/aio_adaptor.h | 234 ++ .../vllm-kv-cache/src/io/aio_operations.cc | 59 + modules/vllm-kv-cache/src/io/aio_operations.h | 67 + .../vllm-kv-cache/src/io/error_injection.cc | 149 + .../vllm-kv-cache/src/io/error_injection.h | 93 + modules/vllm-kv-cache/src/io/io_adaptor.h | 128 + .../src/io/mock_aio_operations.cc | 523 ++++ .../src/io/mock_aio_operations.h | 134 + .../vllm-kv-cache/src/io/mock_io_adapter.cc | 221 ++ .../vllm-kv-cache/src/io/mock_io_adapter.h | 85 + .../vllm-kv-cache/src/io/posix_io_adaptor.cc | 114 + .../vllm-kv-cache/src/io/posix_io_adaptor.h | 58 + .../src/storage/vllm_kv_storage.cc | 2401 +++++++++++++++++ .../src/storage/vllm_kv_storage.h | 398 +++ .../vllm-kv-cache/src/vllm_kv_cache_util.cc | 154 ++ .../vllm-kv-cache/src/vllm_kv_cache_util.h | 73 + .../tests/vllm_storage_local_test.cc | 433 +++ python/client.cc | 135 + python/vineyard/core/client.py | 128 + .../core/tests/fixed_stream_receiver.py | 105 + .../core/tests/fixed_stream_sender.py | 82 + python/vineyard/io/fixed_blob.py | 214 ++ src/client/client.cc | 937 ++++++- src/client/client.h | 130 +- 
src/client/client_base.cc | 95 +- src/client/client_base.h | 25 +- src/client/ds/blob.cc | 64 + src/client/ds/blob.h | 73 + src/client/ds/object_meta.h | 2 + src/client/ds/stream.h | 3 + src/client/rpc_client.cc | 1 + src/common/memory/payload.cc | 11 + src/common/memory/payload.h | 7 +- src/common/util/env.cc | 15 + src/common/util/env.h | 9 + src/common/util/get_tid.h | 32 + src/common/util/json.h | 5 + src/common/util/monitor.h | 201 ++ src/common/util/protocols.cc | 1497 +++++++++- src/common/util/protocols.h | 517 +++- src/common/util/sidecar.cc | 153 ++ src/common/util/sidecar.h | 95 + src/common/util/trace.h | 69 + src/common/util/uuid.h | 19 +- src/server/async/rpc_server.cc | 15 +- src/server/async/rpc_server.h | 3 + src/server/async/socket_server.cc | 1699 +++++++++++- src/server/async/socket_server.h | 109 +- src/server/memory/malloc.cc | 43 +- src/server/memory/memory.cc | 80 + src/server/memory/memory.h | 40 +- src/server/memory/stream_store.cc | 856 +++++- src/server/memory/stream_store.h | 280 +- src/server/memory/usage.h | 19 +- src/server/server/vineyard_runner.cc | 9 +- src/server/server/vineyard_server.cc | 648 ++++- src/server/server/vineyard_server.h | 101 +- src/server/services/meta_service.cc | 14 +- src/server/services/meta_service.h | 30 +- src/server/util/etcd_member.cc | 6 +- src/server/util/remote.cc | 128 +- src/server/util/remote.h | 53 +- src/server/util/remote_pool.cc | 96 + src/server/util/remote_pool.h | 59 + src/server/util/spill_file.cc | 7 +- src/server/util/utils.h | 54 + thirdparty/etcd-cpp-apiv3 | 2 +- thirdparty/thread-pool/thread_pool.h | 117 + 85 files changed, 16947 insertions(+), 198 deletions(-) create mode 100644 cmake/FindAIO.cmake create mode 100644 cmake/FindGTest.cmake create mode 100644 modules/basic/stream/fixed_blob_stream.cc create mode 100644 modules/basic/stream/fixed_blob_stream.h create mode 100644 modules/vllm-kv-cache/CMakeLists.txt create mode 100644 modules/vllm-kv-cache/README.md create mode 100644 
modules/vllm-kv-cache/ds/vllm_block.cc create mode 100644 modules/vllm-kv-cache/ds/vllm_block.h create mode 100644 modules/vllm-kv-cache/ds/vllm_layer.cc create mode 100644 modules/vllm-kv-cache/ds/vllm_layer.h create mode 100644 modules/vllm-kv-cache/src/env.cc create mode 100644 modules/vllm-kv-cache/src/env.h create mode 100644 modules/vllm-kv-cache/src/io/aio_adaptor.cc create mode 100644 modules/vllm-kv-cache/src/io/aio_adaptor.h create mode 100644 modules/vllm-kv-cache/src/io/aio_operations.cc create mode 100644 modules/vllm-kv-cache/src/io/aio_operations.h create mode 100644 modules/vllm-kv-cache/src/io/error_injection.cc create mode 100644 modules/vllm-kv-cache/src/io/error_injection.h create mode 100644 modules/vllm-kv-cache/src/io/io_adaptor.h create mode 100644 modules/vllm-kv-cache/src/io/mock_aio_operations.cc create mode 100644 modules/vllm-kv-cache/src/io/mock_aio_operations.h create mode 100644 modules/vllm-kv-cache/src/io/mock_io_adapter.cc create mode 100644 modules/vllm-kv-cache/src/io/mock_io_adapter.h create mode 100644 modules/vllm-kv-cache/src/io/posix_io_adaptor.cc create mode 100644 modules/vllm-kv-cache/src/io/posix_io_adaptor.h create mode 100644 modules/vllm-kv-cache/src/storage/vllm_kv_storage.cc create mode 100644 modules/vllm-kv-cache/src/storage/vllm_kv_storage.h create mode 100644 modules/vllm-kv-cache/src/vllm_kv_cache_util.cc create mode 100644 modules/vllm-kv-cache/src/vllm_kv_cache_util.h create mode 100644 modules/vllm-kv-cache/tests/vllm_storage_local_test.cc create mode 100644 python/vineyard/core/tests/fixed_stream_receiver.py create mode 100644 python/vineyard/core/tests/fixed_stream_sender.py create mode 100644 python/vineyard/io/fixed_blob.py create mode 100644 src/common/util/get_tid.h create mode 100644 src/common/util/monitor.h create mode 100644 src/common/util/sidecar.cc create mode 100644 src/common/util/sidecar.h create mode 100644 src/common/util/trace.h create mode 100644 src/server/util/remote_pool.cc create 
mode 100644 src/server/util/remote_pool.h create mode 100644 src/server/util/utils.h create mode 100644 thirdparty/thread-pool/thread_pool.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 875b7c294..8e9515566 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,15 +32,18 @@ project(vineyard LANGUAGES C CXX VERSION ${VINEYARD_VERSION}) option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(USE_STATIC_BOOST_LIBS "Build with static-linked boost libraries" OFF) +option(LINK_STATIC_LIBC "Link with static libstdc++ if the BUILD_SHARED_LIBS is OFF and the compiler is gcc" ON) option(USE_EXTERNAL_ETCD_LIBS "Build with external etcd-cpp-apiv3 library rather than the submodule one" OFF) option(USE_EXTERNAL_REDIS_LIBS "Build with external redis-plus-plus library rather than the submodule one" ON) option(USE_EXTERNAL_HIREDIS_LIBS "Build with external hiredis library rather than the submodule one" ON) +option(USE_EXTERNAL_METASERVICE_LIBS "Use external MetaService libraries" OFF) option(VINEYARD_USE_ASAN "Using address sanitizer to check memory accessing" OFF) option(VINEYARD_USE_LTO "Using IPO/LTO support for link-time optimization" OFF) option(USE_LIBUNWIND "Using libunwind to retrieve the stack backtrace when exception occurs" ON) option(USE_INCLUDE_WHAT_YOU_USE "Simply the intra-module dependencies with iwyu" OFF) option(USE_JSON_DIAGNOSTICS "Using json diagnostics to check the validity of metadata" OFF) option(USE_CUDA "Enabling GPU (CUDA) support" OFF) +option(ENABLE_VINEYARD_MONITOR "Enable vineyard monitor" OFF) option(BUILD_VINEYARD_SERVER "Build vineyard's server" ON) option(BUILD_VINEYARD_SERVER_REDIS "Enable redis as the metadata backend" OFF) @@ -70,6 +73,9 @@ option(BUILD_VINEYARD_BENCHMARKS "Generate make targets for vineyard benchmarks" option(BUILD_VINEYARD_BENCHMARKS_ALL "Include make targets for vineyard benchmarks to ALL" OFF) option(BUILD_VINEYARD_COVERAGE "Build vineyard with coverage information, requires build with Debug" OFF) 
option(BUILD_VINEYARD_PROFILING "Build vineyard with profiling information" OFF) +option(BUILD_ENV_RUNC "Build vineyard in runc env" OFF) +option(BUILD_FABRIC_RDMA "Build vineyard fabric with RDMA support" OFF) +option(BUILD_VLLM_CACHE "Build vineyard with vllm cache support" OFF) include(CheckCXXCompilerFlag) include(CheckLibraryExists) @@ -493,6 +499,7 @@ endmacro() macro(find_cuda) # find cuda runtime library set(CUDA_USE_STATIC_CUDA_RUNTIME ON) + enable_language(CUDA) find_package(CUDA REQUIRED) endmacro(find_cuda) @@ -675,7 +682,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") endif() set(FABRIC_MAKEFILE "${CMAKE_SOURCE_DIR}/thirdparty/libfabric/Makefile") - if(IBVERBS_LIB AND RDMACM_LIB) + if(BUILD_FABRIC_RDMA AND IBVERBS_LIB AND RDMACM_LIB) set(RDMA_LIBS ${RDMACM_LIB} ${IBVERBS_LIB} ${RT_LIB}) add_custom_command( @@ -696,6 +703,8 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") --disable-perf --disable-efa --disable-mrail + --disable-uffd-monitor + --with-cuda=no --enable-verbs > /dev/null WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/thirdparty/libfabric ) @@ -719,6 +728,8 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") --disable-perf --disable-efa --disable-mrail + --with-cuda=no + --disable-uffd-monitor --disable-verbs > /dev/null WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/thirdparty/libfabric ) @@ -751,11 +762,23 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/libfabric.a) endif() +if (BUILD_VINEYARD_TESTS) + include("cmake/FindGTest.cmake") +endif() + # boost is only required by some components if(BUILD_VINEYARD_SERVER OR BUILD_VINEYARD_IO OR BUILD_VINEYARD_GRAPH) find_boost() endif() +if(BUILD_VINEYARD_LLM_CACHE) + find_gflags() +endif() + +if (ENABLE_VINEYARD_MONITOR) + add_definitions(-DENABLE_VINEYARD_MONITOR) +endif() + # build vineyardd if(BUILD_VINEYARD_SERVER) find_gflags() @@ -844,8 +867,13 @@ if(BUILD_VINEYARD_SERVER) install_vineyard_target(vineyardd) if(NOT BUILD_SHARED_LIBS) if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND NOT 
CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") - target_compile_options(vineyardd PRIVATE -static-libgcc -static-libstdc++ -Os) - target_add_link_options(vineyardd PRIVATE OPTIONS -static-libgcc -static-libstdc++ -Os) + if (LINK_STATIC_LIBC) + target_compile_options(vineyardd PRIVATE -static-libgcc -static-libstdc++ -Os) + target_add_link_options(vineyardd PRIVATE OPTIONS -static-libgcc -static-libstdc++ -Os) + else() + target_compile_options(vineyardd PRIVATE -Os) + target_add_link_options(vineyardd PRIVATE OPTIONS -Os) + endif() endif() target_link_libraries(vineyardd PRIVATE ${GRPC_GRPC++_LIBRARY} ${GRPC_LIBRARY} ${GPR_LIBRARY}) endif() @@ -1072,6 +1100,11 @@ if(BUILD_VINEYARD_LLM_CACHE) list(APPEND VINEYARD_INSTALL_LIBS vineyard_llm_cache) endif() +if(BUILD_VLLM_CACHE) + include("cmake/FindAIO.cmake") + add_subdirectory(modules/vllm-kv-cache) +endif() + if(BUILD_VINEYARD_TESTS) add_subdirectory(test) endif() diff --git a/cmake/FindAIO.cmake b/cmake/FindAIO.cmake new file mode 100644 index 000000000..3bd6d3133 --- /dev/null +++ b/cmake/FindAIO.cmake @@ -0,0 +1,34 @@ +# FindAIO.cmake + +# Try to find the AIO library +# Define the following cached variables: +# AIO_FOUND - Was AIO found? 
+# AIO_INCLUDE_DIRS - Where to find the AIO includes +# AIO_LIBRARIES - The libraries needed to use AIO + +set(AIO_HOME $ENV{AIO_HOME}) + +find_path (AIO_INCLUDE_DIRS + NAMES libaio.h + PATHS ${AIO_HOME}/include /usr/local/include /usr/include +) + +find_library (AIO_LIBRARIES + NAMES aio + PATHS ${AIO_HOME}/lib /usr/local/lib /usr/lib/x86_64-linux-gnu +) + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args(AIO DEFAULT_MSG +AIO_INCLUDE_DIRS AIO_LIBRARIES) + +if (AIO_FOUND) + add_library(AIO::aio SHARED IMPORTED) + set_target_properties(AIO::aio PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${AIO_INCLUDE_DIRS}" + IMPORTED_LOCATION "${AIO_LIBRARIES}" + INTERFACE_COMPILE_DEFINITIONS "CMAKE_INCLUDE" + ) +endif() + +mark_as_advanced(AIO_INCLUDE_DIRS AIO_LIBRARIES) diff --git a/cmake/FindGFlags.cmake b/cmake/FindGFlags.cmake index d9a46567c..e77b36e03 100644 --- a/cmake/FindGFlags.cmake +++ b/cmake/FindGFlags.cmake @@ -18,7 +18,7 @@ include(FindPackageHandleStandardArgs) -set(GFLAGS_ROOT_DIR "" CACHE PATH "Folder contains Gflags") +set(GFLAGS_ROOT_DIR $ENV{GFLAGS_ROOT_DIR} CACHE PATH "Folder contains Gflags") # We are testing only a couple of files in the include directories if(WIN32) @@ -26,7 +26,7 @@ if(WIN32) PATHS ${GFLAGS_ROOT_DIR}/src/windows) else() find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h - PATHS ${GFLAGS_ROOT_DIR}) + HINTS ${GFLAGS_ROOT_DIR}/include) endif() if(MSVC) @@ -42,7 +42,9 @@ if(MSVC) set(GFLAGS_LIBRARY optimized ${GFLAGS_LIBRARY_RELEASE} debug ${GFLAGS_LIBRARY_DEBUG}) else() - find_library(GFLAGS_LIBRARY gflags) + find_library(GFLAGS_LIBRARY gflags + HINTS ${GFLAGS_ROOT_DIR} + PATH_SUFFIXES lib) endif() find_package_handle_standard_args(GFlags DEFAULT_MSG GFLAGS_INCLUDE_DIR GFLAGS_LIBRARY) diff --git a/cmake/FindGTest.cmake b/cmake/FindGTest.cmake new file mode 100644 index 000000000..7999f6a92 --- /dev/null +++ b/cmake/FindGTest.cmake @@ -0,0 +1,40 @@ +# FindGTest.cmake + +# Try to find the GTest library +# Define the 
following cached variables: +# GTest_FOUND - Was GTest found? +# GTest_INCLUDE_DIRS - Where to find the GTest includes +# GTest_LIBRARIES - The libraries needed to use GTest + +set(GTEST_HOME $ENV{GTEST_HOME}) + +find_path (GTEST_INCLUDE_DIRS + NAMES gtest/gtest.h + PATHS ${GTEST_HOME}/include /usr/local/include /usr/include +) + +find_library (GTEST_LIBRARIES + NAMES gtest gtest_main + PATHS ${GTEST_HOME}/lib /usr/local/lib /usr/lib +) + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args(GTEST DEFAULT_MSG +GTEST_INCLUDE_DIRS GTEST_LIBRARIES) + +message("GTest include dirs: ${GTEST_INCLUDE_DIRS} GTest libraries: ${GTEST_LIBRARIES}") + +if (GTEST_FOUND) + add_library(GTEST::gtest SHARED IMPORTED) + set_target_properties(GTEST::gtest PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${GTEST_INCLUDE_DIRS}" + IMPORTED_LOCATION "${GTEST_LIBRARIES}" + INTERFACE_COMPILE_DEFINITIONS "CMAKE_INCLUDE" + ) +else() + message(WARNING "GTest not found.") + set(GTEST_INCLUDE_DIRS "") + set(GTEST_LIBRARIES "") +endif() + +mark_as_advanced(GTEST_INCLUDE_DIRS GTEST_LIBRARIES) diff --git a/cmake/FindGlog.cmake b/cmake/FindGlog.cmake index b0f8d5f5d..f402be357 100644 --- a/cmake/FindGlog.cmake +++ b/cmake/FindGlog.cmake @@ -18,14 +18,14 @@ include(FindPackageHandleStandardArgs) -set(GLOG_ROOT_DIR "" CACHE PATH "Folder contains Google glog") +set(GLOG_ROOT_DIR $ENV{GLOG_ROOT_DIR} CACHE PATH "Folder contains Google glog") if(WIN32) find_path(GLOG_INCLUDE_DIR glog/logging.h PATHS ${GLOG_ROOT_DIR}/src/windows) else() find_path(GLOG_INCLUDE_DIR glog/logging.h - PATHS ${GLOG_ROOT_DIR}) + HINTS ${GLOG_ROOT_DIR}/include) endif() if(MSVC) @@ -40,10 +40,12 @@ if(MSVC) set(GLOG_LIBRARY optimized ${GLOG_LIBRARY_RELEASE} debug ${GLOG_LIBRARY_DEBUG}) else() find_library(GLOG_LIBRARY glog - PATHS ${GLOG_ROOT_DIR} + HINTS ${GLOG_ROOT_DIR} PATH_SUFFIXES lib lib64) endif() +message("GLOG_ROOT_DIR: ${GLOG_ROOT_DIR}, GLOG_INCLUDE_DIR: ${GLOG_INCLUDE_DIR}, GLOG_LIBRARY: ${GLOG_LIBRARY}") 
+ find_package_handle_standard_args(Glog DEFAULT_MSG GLOG_INCLUDE_DIR GLOG_LIBRARY) if(GLOG_FOUND) diff --git a/cmake/FindLibUnwind.cmake b/cmake/FindLibUnwind.cmake index a35489c3f..ab34cd058 100644 --- a/cmake/FindLibUnwind.cmake +++ b/cmake/FindLibUnwind.cmake @@ -14,19 +14,32 @@ # LIBUNWIND_LIBRARIES - The libraries needed to use libunwind # LIBUNWIND_INCLUDE_DIR - Location of unwind.h and libunwind.h -FIND_PATH(LIBUNWIND_INCLUDE_DIR libunwind.h) -if(NOT LIBUNWIND_INCLUDE_DIR) - message(STATUS "failed to find libunwind.h") -elseif(NOT EXISTS "${LIBUNWIND_INCLUDE_DIR}/unwind.h") - message(STATUS "libunwind.h was found, but unwind.h was not found in that directory.") - SET(LIBUNWIND_INCLUDE_DIR "") -endif() +set(UNWIND_HOME $ENV{UNWIND_HOME}) + +if (UNWIND_HOME) + find_path (LIBUNWIND_INCLUDE_DIR + NAMES libunwind.h + PATHS ${UNWIND_HOME}/include + ) + find_library (LIBUNWIND_LIBRARIES + NAMES unwind + PATHS ${UNWIND_HOME}/lib + ) +else() + FIND_PATH(LIBUNWIND_INCLUDE_DIR libunwind.h) + if(NOT LIBUNWIND_INCLUDE_DIR) + message(STATUS "failed to find libunwind.h") + elseif(NOT EXISTS "${LIBUNWIND_INCLUDE_DIR}/unwind.h") + message(STATUS "libunwind.h was found, but unwind.h was not found in that directory.") + SET(LIBUNWIND_INCLUDE_DIR "") + endif() -FIND_LIBRARY(LIBUNWIND_GENERIC_LIBRARY "unwind") -if(NOT LIBUNWIND_GENERIC_LIBRARY) - MESSAGE(STATUS "failed to find unwind generic library") + FIND_LIBRARY(LIBUNWIND_GENERIC_LIBRARY "unwind") + if(NOT LIBUNWIND_GENERIC_LIBRARY) + MESSAGE(STATUS "failed to find unwind generic library") + endif() + SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_GENERIC_LIBRARY}) endif() -SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_GENERIC_LIBRARY}) # For some reason, we have to link to two libunwind shared object files: # one arch-specific and one not. 
@@ -41,15 +54,32 @@ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$") endif() if(LIBUNWIND_ARCH) - FIND_LIBRARY(LIBUNWIND_SPECIFIC_LIBRARY "unwind-${LIBUNWIND_ARCH}") - if(NOT LIBUNWIND_SPECIFIC_LIBRARY) - MESSAGE(STATUS "failed to find unwind-${LIBUNWIND_ARCH}") - endif() - if(LIBUNWIND_SPECIFIC_LIBRARY) - SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_LIBRARIES} ${LIBUNWIND_SPECIFIC_LIBRARY}) + if (UNWIND_HOME) + find_library (LIBUNWIND_SPECIFIC_LIBRARY + NAMES unwind-${LIBUNWIND_ARCH} + PATHS ${UNWIND_HOME}/lib + ) + if (NOT LIBUNWIND_SPECIFIC_LIBRARY) + message(STATUS "failed to find unwind-${LIBUNWIND_ARCH}") + endif() + if (LIBUNWIND_SPECIFIC_LIBRARY) + SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_LIBRARIES} ${LIBUNWIND_SPECIFIC_LIBRARY}) + else() + if(APPLE) + SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_LIBRARIES}) + endif() + endif() else() - if(APPLE) - SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_LIBRARIES}) + FIND_LIBRARY(LIBUNWIND_SPECIFIC_LIBRARY "unwind-${LIBUNWIND_ARCH}") + if(NOT LIBUNWIND_SPECIFIC_LIBRARY) + MESSAGE(STATUS "failed to find unwind-${LIBUNWIND_ARCH}") + endif() + if(LIBUNWIND_SPECIFIC_LIBRARY) + SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_LIBRARIES} ${LIBUNWIND_SPECIFIC_LIBRARY}) + else() + if(APPLE) + SET(LIBUNWIND_LIBRARIES ${LIBUNWIND_LIBRARIES}) + endif() endif() endif() endif(LIBUNWIND_ARCH) diff --git a/modules/basic/stream/fixed_blob_stream.cc b/modules/basic/stream/fixed_blob_stream.cc new file mode 100644 index 000000000..11051508f --- /dev/null +++ b/modules/basic/stream/fixed_blob_stream.cc @@ -0,0 +1,161 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +#include +#include +#include + +#include "arrow/builder.h" +#include "arrow/status.h" + +#include "basic/stream/fixed_blob_stream.h" +#include "client/client.h" +#include "client/ds/blob.h" +#include "client/ds/i_object.h" +#include "client/ds/stream.h" +#include "common/util/uuid.h" + +namespace vineyard { + +Status FixedBlobStream::Open(Client* client, StreamOpenMode mode, bool wait, + uint64_t timeout) { + client_ = client; + Status status = Status::OK(); + if (is_remote_) { + status = client_->VineyardOpenRemoteFixedStream( + stream_name_, this->id_, recv_mem_fd_, buffer_nums_, buffer_size_, + rpc_endpoint_, mode, wait, timeout); + } else { + status = client_->OpenFixedStream(this->id_, mode, recv_mem_fd_); + } + if (status.ok()) { + recv_flag_mem_ = + mmap(0, STREAM_PAGE_SIZE, PROT_READ, MAP_SHARED, recv_mem_fd_, 0); + if (recv_flag_mem_ == MAP_FAILED) { + status = Status::IOError("Failed to mmap recv_flag_mem."); + Close(); + } + } + return status; +} + +Status FixedBlobStream::ActivateStreamWithBuffer(std::vector& buffers) { + if (!is_remote_) { + return Status::Invalid("Not a remote stream."); + } + return client_->VineyardActivateRemoteFixedStream(this->id_, buffers); +} + +Status FixedBlobStream::ActivateStreamWithBlob( + std::vector& blob_list) { + if (!is_remote_) { + return Status::Invalid("Not a remote stream."); + } + return client_->VineyardActivateRemoteFixedStream(this->id_, blob_list); +} + +Status FixedBlobStream::ActivateStreamWithOffset( + std::vector& offset_list) { + if (!is_remote_) { + return Status::Invalid("Not a remote 
stream."); + } + return client_->VineyardActivateRemoteFixedStreamWithOffset(this->id_, + offset_list); +} + +Status FixedBlobStream::Push(uint64_t offset) { + if (is_remote_) { + return Status::Invalid("Cannot push to a remote stream."); + } + return client_->PushNextStreamChunkByOffset(this->id_, offset); +} + +Status FixedBlobStream::CheckBlockReceived(int index, bool& finished) { + unsigned char error_code = reinterpret_cast( + recv_flag_mem_)[STREAM_PAGE_SIZE - sizeof(unsigned char)]; + std::string error_msg(reinterpret_cast(recv_flag_mem_) + + STREAM_PAGE_SIZE - STREAM_ERROR_LENGTH - + sizeof(unsigned char), + STREAM_ERROR_LENGTH); + if (error_code != 0) { + std::cerr << "Error code: " << static_cast(error_code) + << ", error message: " << error_msg << std::endl; + Status status = + Status(StatusCode(error_code), "Check block received failed."); + return status; + } + + if (index == -1) { + for (int i = 0; i < buffer_nums_; ++i) { + finished = true; + if (reinterpret_cast(recv_flag_mem_)[i] == 0) { + finished = false; + break; + } + } + return Status::OK(); + } else if (index >= 0 && index < buffer_nums_) { + finished = (reinterpret_cast(recv_flag_mem_)[index] == 1); + return Status::OK(); + } else { + return Status::Invalid("Index out of range."); + } +} + +Status FixedBlobStream::Abort(bool& success) { + if (is_remote_) { + return client_->VineyardAbortRemoteStream(this->id_, success); + } else { + return client_->AbortStream(this->id_, success); + } +} + +Status FixedBlobStream::Close() { + Status status = Status::OK(); + if (is_remote_) { + status = client_->VineyardCloseRemoteFixedStream(this->id_); + } else { + status = client_->CloseStream(this->id_); + } + if (status.ok() && recv_flag_mem_ != nullptr) { + munmap(recv_flag_mem_, STREAM_PAGE_SIZE); + recv_flag_mem_ = nullptr; + } + client_ = nullptr; + return status; +} + +Status FixedBlobStream::Delete(Client* client, FixedBlobStream stream) { + RETURN_ON_ERROR(client->DelData(stream.id_)); + return 
client->DeleteStream(stream.id_); +} + +Status FixedBlobStream::PrintRecvInfo() { + std::cout << "--------------------------" << std::endl; + std::cout << " buffer_nums_: " << buffer_nums_ << std::endl; + std::cout << "recv_mem_: " << recv_flag_mem_ << std::endl; + for (int i = 0; i < buffer_nums_; ++i) { + std::cout << "Recv flag " << i << ": " + << static_cast((reinterpret_cast(recv_flag_mem_))[i]) + << " " << std::endl; + } + std::cout << std::endl; + std::cout << "--------------------------" << std::endl; + return Status::OK(); +} + +} // namespace vineyard diff --git a/modules/basic/stream/fixed_blob_stream.h b/modules/basic/stream/fixed_blob_stream.h new file mode 100644 index 000000000..0aa4381ff --- /dev/null +++ b/modules/basic/stream/fixed_blob_stream.h @@ -0,0 +1,155 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef MODULES_BASIC_STREAM_FIXED_BLOB_STREAM_H_ +#define MODULES_BASIC_STREAM_FIXED_BLOB_STREAM_H_ + +#include +#include +#include + +#include "arrow/builder.h" +#include "arrow/status.h" + +#include "basic/ds/arrow_utils.h" +#include "client/client.h" +#include "client/ds/blob.h" +#include "client/ds/i_object.h" +#include "client/ds/stream.h" +#include "common/util/uuid.h" + +namespace vineyard { + +class FixedBlobStream : public BareRegistered, + public Stream> { + public: + explicit FixedBlobStream(ObjectMeta meta) { Construct(meta); } + + FixedBlobStream() {} + + static std::unique_ptr Create() __attribute__((used)) { + return std::static_pointer_cast( + std::unique_ptr{new FixedBlobStream()}); + } + + Status Open(Client* client, StreamOpenMode mode, bool wait = false, + uint64_t timeout = 0); + + Status ActivateStreamWithBuffer(std::vector& buffers); + + Status ActivateStreamWithBlob(std::vector& blob_list); + + Status ActivateStreamWithOffset(std::vector& offset_list); + + Status Push(uint64_t offset); + + Status CheckBlockReceived(int index, bool& finished); + + Status Abort(bool& success); + + Status Close(); + + static Status Delete(Client* client, FixedBlobStream stream); + + Status PrintRecvInfo(); + + void Construct(const ObjectMeta& meta) override { + std::string __type_name = type_name(); + VINEYARD_ASSERT(meta.GetTypeName() == __type_name, + "Expect typename '" + __type_name + "', but got '" + + meta.GetTypeName() + "'"); + Object::Construct(meta); + this->meta_.GetKeyValue("nums", this->buffer_nums_); + this->meta_.GetKeyValue("size", this->buffer_size_); + this->meta_.GetKeyValue("is_remote", this->is_remote_); + this->meta_.GetKeyValue("rpc_endpoint", this->rpc_endpoint_); + this->meta_.GetKeyValue("stream_name", this->stream_name_); + } + + ObjectID GetId() const { return this->id_; } + + protected: + int buffer_nums_ = 0; + size_t buffer_size_ = 0; + std::string stream_name_ = ""; + bool is_remote_ = false; + std::string rpc_endpoint_ = 
""; + void* recv_flag_mem_ = nullptr; + int recv_mem_fd_ = -1; + Client* client_ = nullptr; +}; + +class FixedStreamBuilder { + public: + explicit FixedStreamBuilder(Client& client) : client_(client) { + meta_.SetTypeName(type_name()); + meta_.SetNBytes(0); + } + + template + void AddKeyValue(const std::string& key, const Value& value) { + meta_.AddKeyValue(key, value); + } + + Status Finish(std::string stream_name, int nums = 0, size_t size = 0) { + ObjectID id = InvalidObjectID(); + RETURN_ON_ERROR(client_.CreateFixedStream(id, stream_name, nums, size)); + meta_.SetId(id); + return Status::OK(); + } + + static Status Make(Client& client, std::shared_ptr& stream, + ObjectID remote_id, int nums, size_t size, bool is_remote, + std::string rpc_endpoint) { + FixedStreamBuilder builder(client); + builder.AddKeyValue("nums", nums); + builder.AddKeyValue("size", size); + builder.AddKeyValue("stream_name", ""); + builder.AddKeyValue("is_remote", is_remote); + builder.AddKeyValue("rpc_endpoint", rpc_endpoint); + builder.AddKeyValue("remote_id", remote_id); + RETURN_ON_ERROR(builder.Finish("", nums, size)); + stream = std::make_shared(builder.meta_); + return Status::OK(); + } + + static Status Make(Client& client, std::shared_ptr& stream, + std::string stream_name, int nums, size_t size, + bool is_remote = false, std::string rpc_endpoint = "") { + FixedStreamBuilder builder(client); + builder.AddKeyValue("nums", nums); + builder.AddKeyValue("size", size); + builder.AddKeyValue("stream_name", stream_name); + builder.AddKeyValue("is_remote", is_remote); + builder.AddKeyValue("rpc_endpoint", rpc_endpoint); + builder.AddKeyValue("remote_id", InvalidObjectID()); + RETURN_ON_ERROR(builder.Finish(stream_name, nums, size)); + stream = std::make_shared(builder.meta_); + return Status::OK(); + } + + private: + Client& client_; + ObjectMeta meta_; +}; + +template <> +struct stream_type> { + using type = FixedBlobStream; +}; + +} // namespace vineyard + +#endif // 
MODULES_BASIC_STREAM_FIXED_BLOB_STREAM_H_ diff --git a/modules/vllm-kv-cache/CMakeLists.txt b/modules/vllm-kv-cache/CMakeLists.txt new file mode 100644 index 000000000..93c404b96 --- /dev/null +++ b/modules/vllm-kv-cache/CMakeLists.txt @@ -0,0 +1,72 @@ +file(GLOB VINEYARD_VLLM_KV_CACHE_SRCS "${CMAKE_CURRENT_SOURCE_DIR}" + "ds/*.cc" + "src/*.cc" + "src/storage/*.cc" + "src/io/*.cc" + "*.cc" +) + +add_library(vineyard_vllm_kv_cache ${VINEYARD_VLLM_KV_CACHE_SRCS}) +target_link_libraries(vineyard_vllm_kv_cache PRIVATE ${GLOG_LIBRARIES} ${AIO_LIBRARIES}) +target_link_libraries(vineyard_vllm_kv_cache PUBLIC vineyard_client ) + +install_export_vineyard_target(vineyard_vllm_kv_cache) +install_vineyard_headers("${CMAKE_CURRENT_SOURCE_DIR}") + +add_custom_target(vineyard_vllm_kv_cache_tests) +add_dependencies(vineyard_tests vineyard_vllm_kv_cache_tests) + +if(BUILD_VINEYARD_TESTS) + enable_testing() + # Define common test sources + set(VLLM_STORAGE_TEST_COMMON_SOURCES + tests/vllm_storage_test_common.cc) + + file(GLOB TEST_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/tests" "${CMAKE_CURRENT_SOURCE_DIR}/tests/*.cc") + foreach(f ${TEST_FILES}) + if (${f} STREQUAL "vllm_kv_storage_unittest.cc") + # Skip the vllm_kv_storage_unittest.cc. It need to be fixed. 
+ continue() + endif() + string(REGEX MATCH "^(.*)\\.[^.]*$" dummy ${f}) + set(T_NAME ${CMAKE_MATCH_1}) + message(STATUS "Found unit_test - " ${T_NAME}) + if(BUILD_VINEYARD_TESTS_ALL) + if(${T_NAME} STREQUAL "vllm_storage_test_common") + # Skip the common file as it's not a test executable + continue() + endif() + add_executable(${T_NAME} tests/${T_NAME}.cc) + else() + if(${T_NAME} STREQUAL "vllm_storage_test_common") + # Skip the common file as it's not a test executable + continue() + endif() + add_executable(${T_NAME} EXCLUDE_FROM_ALL tests/${T_NAME}.cc) + endif() + + # Add common sources to test executables + if(${T_NAME} MATCHES "vllm_storage_disk_test") + target_sources(${T_NAME} PRIVATE ${VLLM_STORAGE_TEST_COMMON_SOURCES}) + endif() + + # Add common sources to performance test executables + if(${T_NAME} MATCHES "vllm_storage_disk_perftest") + target_sources(${T_NAME} PRIVATE ${VLLM_STORAGE_TEST_COMMON_SOURCES}) + endif() + target_include_directories(${T_NAME} PRIVATE + ${GTEST_INCLUDE_DIRS} + ${GLOG_INCLUDE_DIRS}) + + target_link_libraries(${T_NAME} PRIVATE + libzstd_static + vineyard_vllm_kv_cache + ${GLOG_LIBRARIES} + ${GTEST_LIBRARIES}) + if(${LIBUNWIND_FOUND}) + target_link_libraries(${T_NAME} PRIVATE ${LIBUNWIND_LIBRARIES}) + endif() + add_test(${T_NAME}, ${T_NAME}) + add_dependencies(vineyard_vllm_kv_cache_tests ${T_NAME}) + endforeach() +endif() diff --git a/modules/vllm-kv-cache/README.md b/modules/vllm-kv-cache/README.md new file mode 100644 index 000000000..b30b6e481 --- /dev/null +++ b/modules/vllm-kv-cache/README.md @@ -0,0 +1,115 @@ +# Vineyard LLM KV Cache + +## Background + +Large Language Models (LLMs) are popular for their ability to generate content and solve complex tasks. However, LLM inference can be costly due to extensive GPU use and slow service engine speeds, particularly in multiple conversations. With rising demand, optimizing LLM inference throughput in multi-turn dialogues and cutting costs is crucial. 
+ +Specifically, the inference of LLM contains two phases: **Prefill** and **Decode**. The **Prefill** is to calculate the KV Cache of input tokens and the **Decode** is to generate the output tokens based on the calculated KV Cache. In multi-turn dialogues, the current input token will be superimposed with the previous output and input into the model as the new input for inference. The KV Cache of the previous input tokens can be reused in the **Prefill** phase, which can reduce the First Token Time (FTT) and improve the overall throughput. + +However, the GPU memory is limited, and the KV Cache size grows linearly with the input sequence length. So we want to offload the KV Cache to the host memory or disk storage to save the GPU memory usage. We can gain benefits from the large memory capacity of host memory and disk storage to store more KV Cache data if the load cost is lower than the cost of recomputing the KV Cache. + +## Design + +### User defined blob with VLLMBlock object + +We define a new Vineyard object named `VLLMBlock`, which represents a block of KV cache data in the vineyard blob. Each `VLLMBlock` contains multiple buffers, and each buffer corresponds to a specific layer and key/value in the LLM model. + +To keep the extensibility of the kv storage, we put the memory management logic in the user side. Users can use the new vineyard API `GetVineyardMmapFd` to get the mmap fd of the vineyard memory, and then use the fd to mmap the memory region in the user side. Taking vllm as an example, users can implement an allocator using the same allocation strategy as vllm with the vineyard memory region, which can keep the same memory layout as vllm. Then users can implement some swap kernels to swap the GPU memory to the vineyard memory region.
Then, the user can use the vineyard vllm kv storage api to "register" this user allocated memory region as the vineyard blob, and create the `VLLMBlock` object to manage the offsets and sizes of each buffer in the vineyard blob. To support user defined blob, we design a new blob type named `UserBlob`, which just wraps the user allocated memory region without managing the memory allocation and deallocation. The vineyard server only manages the offsets and sizes of each buffer in the vineyard server. + +Because the allocator and swap kernels are implemented in the user side, the user can customize the allocation and swap strategy according to their own requirements. It also makes the vineyard vllm kv storage more flexible and adaptable to different llm engines. + +We now proceed to introduce the core fields of the VLLMBlock object: + +- `offsets_`: This field is a vector of offsets for each buffer in the block. Each offset indicates the distance between the mmap memory region base in the user side and the start of the buffer. The offsets are calculated based on the memory layout of the LLM model, which is typically organized by layer, key/value, and buffer. Or the user can customize the memory layout according to their own requirements. + +- `sizes_`: This field is a vector of sizes for each buffer in the block. Each size indicates the size of the corresponding buffer in bytes. The sizes are determined by the model configuration, such as the hidden size and data type. In most cases, the sizes of all buffers in a block are the same. + +- `shape_`: This field represents the shape of the block, which is typically a 3D tensor with dimensions [layer_num, kv_num, buffer_num]. The shape indicates how many layers, keys/values, and contiguous buffers are contained in the block. It depends on the model configuration and the block size. In vllm, the buffers in a block are usually discrete, so we design the shape_ to represent the logical shape of the block. 
Shape such as [52, 2, 1] represents a block containing 52 * 2 * 1 = 104 buffers. Each buffer is contiguous but the buffers are discrete in memory. Of course, user can implement a special swap kernel to make sure the kv cache swap to host memory in a contiguous way to make the block is a contiguous memory region, but it will increase the complexity of the swap kernel and hard to layerwise transfer because the information of layer is lost. + +- `layer_index_`: This field indicates the layer index of the block in the shape_. This field is used to support flexible layout such as [kv_num, layer_num, buffer_num], which can be configured by the user according to their own requirements. Layer index is used to translate the blocks to layers for layerwise transfer( transfer between vineyardd is WIP). + +As mentioned before, the vineyard vllm kv storage does not manage the memory allocation and swap logic. Instead, it relies on the user to implement these functionalities using the vineyard memory region. The vineyard vllm kv storage only provides the interface to register the user allocated memory region as the vineyard blob, and create the VLLMBlock object to manage the offsets and sizes of each buffer in the vineyard blob. This design is used to transfer between vineyardd in the future. + +Because the transfer of vineyard vllm kv storage between vineyardd and prefill-decode disaggregation inference is still work in progress, now vineyard only supports the vineyard vllm kv storage as the local memory kv cache storage in a single machine. + +### Disk storage as multi level cache + +To further improve the capacity of the KV cache, we implement a disk storage as the multi-level cache. The vineyard llm kv cache supports all posix compliant filesystem as the backend storage, including local disk, NFS, HDFS, and other distributed filesystem. + +Disk storage use multi-level directory structure to store the kv tensor files. 
The directory depends on the block hash value, which is calculated based on the token id. It can reduce the number of files in a single directory and improve the performance of file operations. And if the disk storage is based on a distributed filesystem, it can smoothly support global kv cache in multiple machines. + +Disk storage also uses the AIO-based asynchronous read and write to improve the performance of disk I/O. The AIO operations are implemented using the Linux native AIO library `libaio`. So this module needs to install the `libaio-dev` package before building. In informal tests, aio was able to use the whole disk bandwidth with a small amount of concurrency. + +### Layerwise transfer support (WIP) + +To support the transfer of kv cache between vineyardd, we design the vineyard vllm kv storage to support layerwise transfer. The layerwise transfer means that the vineyard vllm kv storage can transfer the kv cache data layer by layer. When a request arrives at the llm engine, the engine processes the request layer by layer instead of block by block. So the vineyard vllm kv storage can transfer the kv cache data of the required layer to the llm engine, which can overlap the transfer time and the swap time (swapping data from host memory to GPU memory). + +We design an object named VLLMLayers which represents a set of layers belonging to a set of VLLMBlocks. When the engine needs to fetch some remote kv cache blocks, it can request the vineyard vllm kv storage to transfer the required layers of the blocks. The vineyard vllm kv storage will find the corresponding VLLMBlocks and create a VLLMLayers object to represent the required layers. Then the engine can use the VLLMLayers object to check if the required layers are available in the local vineyardd or need to wait for the completion of the transfer. + +The layerwise transfer implementation on the user side is completed. But the transfer between vineyardd is still work in progress. 
We will support the transfer of vineyard vllm kv storage between vineyardd in the future. + + +### Streaming support (WIP) + +In the prefill-decode disaggregation inference, the kv cache is generated in the prefill phase on one machine, and then transferred to the decode phase on another machine. This scenario requires the vineyard vllm kv storage to support streaming mode, which means that the vineyard vllm kv storage can transfer the kv cache data in a streaming way. It also requires transferring layer by layer to overlap the transfer time and swap time. Stream mode is completed, but the transfer between vineyardd is still work in progress. We will support the streaming mode to transfer data between vineyardd in the future. + +### IPC optimization + +Because a block needs to manage a lot of small buffers, to avoid the overhead of IPC communication such as creating a lot of user defined blobs, we use mmap to share the IPC msg when using the API associated with the vllm kv storage. The mmap fd is created when the vineyard client connects to the vineyard server. The vineyard server will reserve a large memory region for the mmap fd, and the vineyard client can use the mmap fd to mmap the memory region in the user side. Then the vineyard vllm kv storage can use the mmap memory region to store the IPC msg when using the API associated with the vllm kv storage. The implementation details are encapsulated in the vllm kv storage module. + + +## Usage + +We provide [C++](https://github.com/v6d-io/v6d/modules/vllm-kv-cache/src/storage/vllm_kv_storage.h) as the core API for vineyard vllm kv cache. If users implement their own block allocator and kernel swap function, they can easily use the vineyard vllm kv cache C++ API to create the vineyard blobs and VLLMBlock object to manage the kv cache data in vineyard. + +We also provide a test case to show how to use the vineyard vllm kv cache C++ API. 
The test case is located in the [C++](https://github.com/v6d-io/v6d/modules/vllm-kv-cache/tests/vllm_storage_local_test.cc) directory. The test case shows how to create some vineyard blobs with user allocated memory region, create VLLMBlock objects to manage the offsets and sizes of each buffer in the vineyard blob, and perform basic operations such as put, get, and delete on the vineyard vllm kv cache. + +### C++ API + +1. First, you need to install the required dependencies. + +```bash +$ cd v6d && git submodule update --init --recursive +``` + +2. Then, you can build the vineyard server and vineyard llm kv cache library. + +```bash +$ mkdir build && cd build +$ cmake .. -DBUILD_VLLM_CACHE=ON +$ make -j +$ make vineyard_tests -j +``` + +After the build, you can check the `vineyardd` and `vllm_storage_local_test` in the `build/bin` directory. + +```bash +$ ls build/bin +vineyardd +$ ls build/bin/vllm_storage_local_test +vllm_storage_local_test +``` + +3. Run the vineyard vllm kv cache test. + +- Open a terminal to start the vineyard server. + +```bash +$ ./build/bin/vineyardd --socket=/tmp/vineyard1.sock --meta=local --reserve_memory=true -size=4G -2M_alignment=true +``` + +Then open another terminal to run the vineyard llm kv cache test. + +```bash +$ ./build/bin/vllm_storage_local_test +``` + +### Performance + +Work in progress. + +### Future work + +- Support the transfer of vineyard vllm kv storage between vineyardd and prefill-decode disaggregation inference. +- Support the RDMA based vineyard vllm kv storage for high performance multi-node llm inference. +- Add the benchmark for vineyard vllm kv storage. \ No newline at end of file diff --git a/modules/vllm-kv-cache/ds/vllm_block.cc b/modules/vllm-kv-cache/ds/vllm_block.cc new file mode 100644 index 000000000..bad9361a3 --- /dev/null +++ b/modules/vllm-kv-cache/ds/vllm_block.cc @@ -0,0 +1,403 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +#include "client/client.h" +#include "client/ds/blob.h" +#include "common/memory/memcpy.h" +#include "common/util/logging.h" +#include "vllm-kv-cache/ds/vllm_block.h" +#include "vllm-kv-cache/src/vllm_kv_cache_util.h" + +namespace vineyard { + +void VLLMBlock::Construct(const ObjectMeta& meta) { + Object::Construct(meta); + if (meta_.GetTypeName() != type_name()) { + return; + } + meta_.GetKeyValue("shape", this->shape_); + uint64_t nums = meta_.GetKeyValue("nums"); + meta_.GetKeyValue("offsets", this->offsets_); + meta_.GetKeyValue("sizes", this->sizes_); + meta_.GetKeyValue("layer_index", this->layer_index_); + std::string ids_str_encode = meta_.GetKeyValue("blob_ids"); + std::string ids_str = base64_decode(ids_str_encode); + if (ids_str.size() != sizeof(ObjectID) * nums) { + LOG(WARNING) << "Invalid blob ids size: " << ids_str.size() + << ", expected: " << sizeof(ObjectID) * nums + << ", which means meta has been corrupted."; + return; + } + blobs_.resize(nums); + memory::concurrent_memcpy(blobs_.data(), ids_str.data(), ids_str.size()); +} + +Status VLLMBlock::FromBuilder(VLLMBlockBuilder& builder) { + this->shape_ = builder.GetShape(); + this->offsets_ = builder.GetOffsets(); + this->sizes_ = builder.GetSizes(); + this->layer_index_ = builder.GetLayerIndex(); + const std::vector blob_ids = builder.GetBlobIDs(); + this->blobs_.resize(blob_ids.size()); + memory::concurrent_memcpy(this->blobs_.data(), blob_ids.data(), + 
sizeof(ObjectID) * blob_ids.size()); + return Status::OK(); +} + +void VLLMBlock::Dump() { + std::cout << "VLLMBlock dump:" << std::endl; + std::cout << "id:" << ObjectIDToString(id_) << std::endl; + std::cout << "shape:" + << "["; + for (size_t i = 0; i < shape_.size(); ++i) { + std::cout << shape_[i]; + if (i != shape_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; + std::cout << "offsets:" + << "["; + for (size_t i = 0; i < offsets_.size(); ++i) { + std::cout << offsets_[i]; + if (i != offsets_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; + std::cout << "sizes:" + << "["; + for (size_t i = 0; i < sizes_.size(); ++i) { + std::cout << sizes_[i]; + if (i != sizes_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; + std::cout << "blobs:" + << "["; + for (size_t i = 0; i < blobs_.size(); ++i) { + std::cout << ObjectIDToString(blobs_[i]); + if (i != blobs_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; +} + +// TBD: batch create +Status VLLMBlockBuilder::Make(Client& client, std::vector offsets, + std::vector sizes, + std::vector shape, int layer_index, + std::shared_ptr& builder, + std::string& req_flag) { + builder = std::make_shared(); + builder->shape_ = shape; + builder->offsets_ = offsets; + builder->sizes_ = sizes; + builder->layer_index_ = layer_index; + + RETURN_ON_ERROR( + client.CreateUserBlobs(offsets, sizes, builder->blobs_, req_flag)); + builder->blob_ids_.resize(builder->blobs_.size()); + for (size_t i = 0; i < builder->blobs_.size(); ++i) { + builder->blob_ids_[i] = builder->blobs_[i]->id(); + } + return Status::OK(); +} + +Status VLLMBlockBuilder::Make( + Client& client, std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, + std::vector>& builder_vec, + std::string& req_flag) { + size_t num = offsets_vec.size(); + builder_vec.resize(num); + std::vector offsets; + std::vector sizes; + std::vector> 
blob_writers; + uint64_t start = 0, end = 0; + for (size_t i = 0; i < num; ++i) { + builder_vec[i] = std::make_shared(); + builder_vec[i]->shape_ = shape; + builder_vec[i]->offsets_ = offsets_vec[i]; + builder_vec[i]->sizes_ = sizes_vec[i]; + builder_vec[i]->layer_index_ = layer_index; + + for (size_t j = 0; j < offsets_vec[i].size(); ++j) { + offsets.push_back(offsets_vec[i][j]); + sizes.push_back(sizes_vec[i][j]); + } + } + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR( + client.CreateUserBlobs(offsets, sizes, blob_writers, req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". Create " << blob_writers.size() + << " user blobs cost: " << (end - start) << " us"; + + for (size_t i = 0; i < num; ++i) { + builder_vec[i]->blobs_.resize(blob_writers.size() / num); + builder_vec[i]->blob_ids_.resize(blob_writers.size() / num); + for (size_t j = 0; j < blob_writers.size() / num; ++j) { + builder_vec[i]->blobs_[j] = + std::move(blob_writers[i * (blob_writers.size() / num) + j]); + builder_vec[i]->blob_ids_[j] = builder_vec[i]->blobs_[j]->id(); + } + } + + return Status::OK(); +} + +Status VLLMBlockBuilder::Build(Client& client) { return Status::OK(); } + +Status VLLMBlockBuilder::_Seal(Client& client, + std::shared_ptr& object) { + RETURN_ON_ERROR(Build(client)); + std::shared_ptr block = std::make_shared(); + std::vector blob_ids; + blob_ids.reserve(blobs_.size()); + for (size_t i = 0; i < blobs_.size(); ++i) { + std::shared_ptr blob; + RETURN_ON_ERROR(blobs_[i]->Seal(client, blob)); + blob_ids.push_back(blob->id()); + } + + ObjectMeta meta; + ConstructVLLMBlockMeta(blob_ids, blobs_.size(), shape_, offsets_, sizes_, + layer_index_, meta); + meta.SetId(InvalidObjectID()); + block->Object::Construct(meta); + block->Construct(meta); + + 
RETURN_ON_ERROR(client.CreateMetaData(block->meta_, block->id_)); + Status status = client.Persist(block->id_); + if (!status.ok()) { + LOG(ERROR) << "Failed to persist block: " << status.ToString(); + client.DelData(block->id_); + return status; + } + object = std::dynamic_pointer_cast(block); + + return Status::OK(); +} + +Status VLLMBlockBuilder::BatchSeal( + Client& client, std::vector>& builders, + std::vector>& objects, + std::vector& ids, std::string& req_flag) { + std::vector> blob_ids; + std::vector metas; + + uint64_t start = 0, end = 0; + metas.resize(builders.size()); + std::vector> results; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (size_t i = 0; i < builders.size(); ++i) { + results.emplace_back(KVCacheHelper::GetConstructThreadPool()->enqueue( + [&](size_t index) { + ConstructVLLMBlockMeta( + builders[index]->blob_ids_, builders[index]->blobs_.size(), + builders[index]->shape_, builders[index]->offsets_, + builders[index]->sizes_, builders[index]->layer_index_, + metas[index]); + return Status::OK(); + }, + i)); + } + for (auto& result : results) { + uint64_t index = 0; + if (!result.get().ok()) { + LOG(WARNING) << "Failed to construct VLLMBlock meta, block index: " + << index << ", request: " << req_flag; + } + index++; + } + + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Batch construct VLLMBlock builder meta use:" << (end - start) + << " us"; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR(client.CreateHugeMetaData(metas, ids, req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". 
CreateHugeMetaData use:" << (end - start) + << " us"; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + objects.reserve(builders.size()); + for (auto& builder : builders) { + std::shared_ptr block = std::make_shared(); + block->FromBuilder(*builder); + objects.push_back(block); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Create VLLMBlock meta use:" << (end - start) << " us"; + + return Status::OK(); +} + +void VLLMBlockBuilder::Dump() { + std::cout << "VLLMBlockBuilder dump:" << std::endl; + std::cout << "shape:" + << "["; + for (size_t i = 0; i < shape_.size(); ++i) { + std::cout << shape_[i]; + if (i != shape_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; + std::cout << "offsets:" + << "["; + for (size_t i = 0; i < offsets_.size(); ++i) { + std::cout << offsets_[i]; + if (i != offsets_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; + std::cout << "sizes:" + << "["; + for (size_t i = 0; i < sizes_.size(); ++i) { + std::cout << sizes_[i]; + if (i != sizes_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; + std::cout << "blobs:" + << "["; + for (size_t i = 0; i < blob_ids_.size(); ++i) { + std::cout << ObjectIDToString(blob_ids_[i]); + if (i != blob_ids_.size() - 1) { + std::cout << ","; + } + } + std::cout << "]" << std::endl; +} + +void ConstructVLLMBlockMeta(std::vector& blob_ids, size_t nums, + std::vector& shape, + std::vector& offsets, + std::vector& sizes, int layer_index, + ObjectMeta& meta) { + std::string ids_str = std::string(blob_ids.size() * sizeof(ObjectID), 0); + memcpy(ids_str.data(), blob_ids.data(), blob_ids.size() * sizeof(ObjectID)); + std::string ids_str_encode = base64_encode(ids_str); + meta.AddKeyValue("nums", nums); + meta.AddKeyValue("shape", shape); + 
meta.AddKeyValue("offsets", offsets); + meta.AddKeyValue("sizes", sizes); + meta.AddKeyValue("layer_index", layer_index); + meta.AddKeyValue("blob_ids", ids_str_encode); + meta.SetTypeName(type_name()); +} + +Status ConstructVLLMBlockFileMeta(ObjectMeta& meta, json& file_meta_json) { + std::string type; + std::vector shape; + std::vector sizes; + int layer_index = -1; + uint64_t nums = 0; + type = meta.GetTypeName(); + nums = meta.GetKeyValue("nums"); + meta.GetKeyValue("shape", shape); + meta.GetKeyValue("sizes", sizes); + meta.GetKeyValue("layer_index", layer_index); + meta.GetKeyValue("nums", nums); + + file_meta_json["type"] = type; + file_meta_json["shape"] = shape; + file_meta_json["sizes"] = sizes; + file_meta_json["layer_index"] = layer_index; + file_meta_json["nums"] = nums; + + return Status::OK(); +} + +Status ConstructVLLMBlockFileMeta(std::vector& offsets, + std::vector& sizes, + std::vector& shape, int layer_index, + json& file_meta_json) { + std::string type; + type = type_name(); + uint64_t nums = offsets.size(); + + file_meta_json["type"] = type; + file_meta_json["shape"] = shape; + file_meta_json["sizes"] = sizes; + file_meta_json["layer_index"] = layer_index; + file_meta_json["nums"] = nums; + + return Status::OK(); +} + +Status ParseVLLMBlockFileJson(json& block_meta_json, size_t& nums, + std::vector& sizes, + std::vector& shape, int& layer_index) { + if (unlikely(!block_meta_json.contains("type") || + !block_meta_json.contains("shape") || + !block_meta_json.contains("sizes") || + !block_meta_json.contains("layer_index") || + !block_meta_json.contains("nums"))) { + return Status::Invalid( + "Invalid VLLMBlock metadata, missing required fields."); + } + + if (block_meta_json["type"] != type_name()) { + return Status::Invalid("Invalid VLLMBlock metadata, type mismatch."); + } + + try { + nums = block_meta_json["nums"].get(); + shape = block_meta_json["shape"].get>(); + sizes = block_meta_json["sizes"].get>(); + layer_index = 
block_meta_json["layer_index"].get(); + } catch (const std::exception& e) { + return Status::Invalid("Failed to parse VLLMBlock metadata: " + + std::string(e.what())); + } + + return Status::OK(); +} + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/ds/vllm_block.h b/modules/vllm-kv-cache/ds/vllm_block.h new file mode 100644 index 000000000..60939ca64 --- /dev/null +++ b/modules/vllm-kv-cache/ds/vllm_block.h @@ -0,0 +1,148 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef MODULES_VLLM_KV_CACHE_DS_VLLM_BLOCK_H_ +#define MODULES_VLLM_KV_CACHE_DS_VLLM_BLOCK_H_ + +#include +#include +#include +#include +#include +#include + +#include "client/client.h" +#include "client/ds/blob.h" +#include "client/ds/remote_blob.h" + +namespace vineyard { + +class VLLMBlockBuilder; + +class VLLMBlock : public vineyard::Registered { + public: + VLLMBlock() = default; + + static std::unique_ptr Create() __attribute__((used)) { + return std::unique_ptr(new VLLMBlock()); + } + + void Construct(const ObjectMeta& meta) override; + + void Dump(); + + std::vector GetBlobs() { return blobs_; } + + std::vector GetOffsets() { return offsets_; } + + std::vector GetSizes() { return sizes_; } + + std::vector GetShape() { return shape_; } + + int GetLayerIndex() { return layer_index_; } + + Status FromBuilder(VLLMBlockBuilder& builder); + + private: + // FIXME: assume the shape is layer * kv * block_id + std::vector blobs_; + std::vector offsets_; + std::vector sizes_; + std::vector shape_; + int layer_index_ = -1; + + friend class VLLMBlockBuilder; +}; + +class VLLMBlockBuilder : public vineyard::ObjectBuilder { + public: + VLLMBlockBuilder() {} + + ~VLLMBlockBuilder() {} + + static Status Make(Client& client, std::vector offsets, + std::vector sizes, std::vector shape, + int layer_index, + std::shared_ptr& builder, + std::string& req_flag); + + static Status Make(Client& client, + std::vector>& offsets, + std::vector>& sizes, + std::vector& shape, int layer_index, + std::vector>& builder, + std::string& req_flag); + + static Status BatchSeal( + Client& client, std::vector>& builders, + std::vector>& objects, + std::vector& ids, std::string& req_flag); + + Status Build(Client& client) override; + + Status _Seal(Client& client, std::shared_ptr& object) override; + + const std::vector& GetBlobIDs() { return blob_ids_; } + + std::vector>& GetBlobs() { return blobs_; } + + std::vector GetOffsets() { return offsets_; } + + std::vector GetSizes() { return 
sizes_; } + + std::vector GetShape() { return shape_; } + + int GetLayerIndex() { return layer_index_; } + + void Dump(); + + private: + std::vector> blobs_; + std::vector blob_ids_; + std::vector offsets_; + std::vector sizes_; + // FIXME: assume the shape is layer * kv * block_id + std::vector shape_; + int layer_index_ = -1; + + friend class VLLMLayer; +}; + +void ConstructVLLMBlockMeta(std::vector& blob_ids, size_t nums, + std::vector& shape, + std::vector& offsets, + std::vector& sizes, int layer_index, + ObjectMeta& meta); + +Status ConstructVLLMBlockFileMeta(ObjectMeta& meta, json& file_meta_json); + +Status ConstructVLLMBlockFileMeta(std::vector& offsets, + std::vector& sizes, + std::vector& shape, int layer_index, + json& file_meta_json); + +Status ParseVLLMBlockFileJson(json& block_meta_json, size_t& nums, + std::vector& sizes, + std::vector& shape, int& layer_index); + +bool CheckVLLMBlockEqual(size_t nums_1, size_t nums_2, + std::vector& sizes_1, + std::vector& sizes_2, + std::vector& shape_1, + std::vector& shape_2, int layer_index_1, + int layer_index_2); +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_DS_VLLM_BLOCK_H_ diff --git a/modules/vllm-kv-cache/ds/vllm_layer.cc b/modules/vllm-kv-cache/ds/vllm_layer.cc new file mode 100644 index 000000000..e9289a565 --- /dev/null +++ b/modules/vllm-kv-cache/ds/vllm_layer.cc @@ -0,0 +1,376 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "client/client.h" +#include "common/util/logging.h" +#include "common/util/sidecar.h" +#include "vllm-kv-cache/ds/vllm_layer.h" +#include "vllm-kv-cache/src/storage/vllm_kv_storage.h" +#include "vllm-kv-cache/src/vllm_kv_cache_util.h" + +namespace vineyard { +Status VLLMLayers::FromBlocks(Client& client, std::vector& block_hash, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape_vec, int layer_index, + std::vector& metas, + std::string rpc_endpoint, + std::shared_ptr& layers, + std::string& req_flag) { + VLOG(2) << "Creating VLLMLayer from blocks, block_hash size: " + << block_hash.size() << ", offsets_vec size: " << offsets_vec.size() + << ", sizes_vec size: " << sizes_vec.size() + << ", request: " << req_flag; + uint64_t start = 0, end = 0; + if (block_hash.size() == 0) { + return Status::OK(); + } + RETURN_ON_ASSERT(offsets_vec.size() == block_hash.size(), + "offsets_vec size not match"); + RETURN_ON_ASSERT(sizes_vec.size() == block_hash.size(), + "sizes_vec size not match"); + RETURN_ON_ASSERT(metas.size() == block_hash.size(), "meta size not match"); + + // fetch meta data + std::vector> remote_blobs; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (uint64_t i = 0; i < metas.size(); i++) { + int remote_layer_index = metas[i].GetKeyValue("layer_index"); + RETURN_ON_ASSERT(remote_layer_index == layer_index, + "layer index not match"); + std::vector remote_shape; + metas[i].GetKeyValue("shape", remote_shape); + RETURN_ON_ASSERT(remote_shape.size() == shape_vec.size(), + "remote shape size not match"); + RETURN_ON_ASSERT( + std::equal(remote_shape.begin(), remote_shape.end(), shape_vec.begin()), + "remote shape not match"); + + std::vector remote_size_vec; + metas[i].GetKeyValue("sizes", remote_size_vec); + RETURN_ON_ASSERT(remote_size_vec.size() == sizes_vec[i].size(), + 
"remote size of size vector not match"); + RETURN_ON_ASSERT(std::equal(remote_size_vec.begin(), remote_size_vec.end(), + sizes_vec[i].begin()), + "remote blob size not match"); + + std::vector remote_offsets_vec; + metas[i].GetKeyValue("offsets", remote_offsets_vec); + RETURN_ON_ASSERT(remote_offsets_vec.size() == offsets_vec[i].size(), + "remote offsets size not match"); + + std::vector remote_blob; + uint64_t nums = metas[i].GetKeyValue("nums"); + RETURN_ON_ASSERT(nums == offsets_vec[i].size(), + "remote blob nums not match"); + + std::string ids_str_encoder = metas[i].GetKeyValue("blob_ids"); + std::string ids_str = base64_decode(ids_str_encoder); + if (ids_str.size() != sizeof(ObjectID) * nums) { + return Status::Invalid( + "Invalid blob ids size: " + std::to_string(ids_str.size()) + + ", expected: " + std::to_string(sizeof(ObjectID) * nums) + + ", which means meta has been corrupted." + " Request: " + req_flag); + } + std::vector ids; + ids.resize(nums); + memcpy(ids.data(), ids_str.data(), ids_str.size()); + for (uint64_t j = 0; j < nums; j++) { + ObjectID blob_id = ids[j]; + remote_blob.push_back(blob_id); + VLOG(100) << ObjectIDToString(remote_blob.back()); + } + remote_blobs.push_back(remote_blob); + } + + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Fetch remote blob ids cost: " << (end - start) << " us"; + + // create layer object + start = end; + Status ret = FromBlocksInternal(client, shape_vec, layer_index, offsets_vec, + remote_blobs, sizes_vec, rpc_endpoint, + block_hash, layers, req_flag); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Create VLLMLayers from blocks cost: " << (end - start) << " us"; + return ret; +} + +VLLMLayers::~VLLMLayers() { + if (recv_flag_mem_ != nullptr) { + munmap(recv_flag_mem_, GET_BLOB_RECV_MEM_SIZE); + recv_flag_mem_ = nullptr; + } + if (fd_ != -1) { + close(fd_); + fd_ = -1; + } + return; +} + +Status VLLMLayers::FromBlocksInternal( + Client& client, std::vector shape, int layer_index, + std::vector>& local_offset, + std::vector>& remote_blobs, + std::vector>& sizes_vec, std::string rpc_endpoint, + std::vector& block_hash, std::shared_ptr& layers, + std::string& req_flag) { + VLOG(2) << "Creating VLLMLayer from blocks, local_offset size: " + << local_offset.size() + << ", remote_blobs size: " << remote_blobs.size() + << "hash num:" << block_hash.size() << ", request: " << req_flag; + uint64_t start = 0, end = 0; + if (local_offset.size() == 0 || remote_blobs.size() == 0) { + return Status::OK(); + } + RETURN_ON_ASSERT( + local_offset.size() == remote_blobs.size(), + "local and remote blobs size not match, request:" + req_flag); + + std::vector> local_layer_offsets; + std::vector> remote_layer_blobs; + std::vector> layer_sizes; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + KVCacheHelper::ShuffleBlockToLayer(local_layer_offsets, remote_layer_blobs, + layer_sizes, local_offset, remote_blobs, + sizes_vec, shape, layer_index); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Shuffle blocks to layer cost: " << (end - start) << " us"; + + RETURN_ON_ERROR(Make(local_layer_offsets, remote_layer_blobs, layer_sizes, + shape, layer_index, rpc_endpoint, layers, req_flag)); + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR(layers->Transfer(client)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Start transfer VLLMLayers cost: " << (end - start) << " us."; + + layers->block_nums_ = block_hash.size(); + return Status::OK(); +} + +Status VLLMLayers::IsReceived(int index, bool& received) { + if (recv_flag_mem_ == nullptr) { + return Status::IOError( + "recv_flag_mem_ is not initialized. Please identify if the transfer is " + "beginning. Request id: " + + req_flag_); + } + + unsigned char error_code = reinterpret_cast( + recv_flag_mem_)[GET_BLOB_RECV_MEM_SIZE - sizeof(unsigned char)]; + std::string error_msg(reinterpret_cast(recv_flag_mem_) + + GET_BLOB_RECV_MEM_SIZE - ERROR_MSG_LENGTH - + sizeof(unsigned char), + ERROR_MSG_LENGTH); + if (error_code != 0) { + std::cerr << "Error code: " << static_cast(error_code) + << ", error message: " << error_msg + << ", request id: " << req_flag_ << std::endl; + Status status = + Status(StatusCode(error_code), + "Check block received failed. 
Request id: " + req_flag_ + + ", error message: " + error_msg); + is_transferring_ = false; + return status; + } + if (index == -1) { + for (int i = 0; i < layer_nums_; ++i) { + received = true; + if (reinterpret_cast(recv_flag_mem_)[i] == 0) { + received = false; + break; + } + } + return Status::OK(); + } else if (index >= 0 && index < layer_nums_) { + received = (reinterpret_cast(recv_flag_mem_))[index] == 1; + if (received && index == layer_nums_ - 1) { + is_finished_ = true; + is_transferring_ = false; + } + return Status::OK(); + } else { + return Status::Invalid("Index out of range. Request id: " + req_flag_); + } + return Status::OK(); +} + +Status VLLMLayers::PutBlocks(std::vector& statuses) { + if (!is_finished_) { + return Status::Invalid("VLLMLayers transfer is not finished."); + } + if (has_put_) { + return Status::Invalid("Blocks have already been put."); + } + + // RETURN_ON_ERROR(VLLMKVStorage::PutBlockKVCache(block_hash_, + // block_builders_, statuses, + // req_flag_)); + // has_put_ = true; + // need_to_delete_builder_.clear(); + return Status::OK(); +} + +Status VLLMLayers::PutAndCleanWithFilter(std::vector& filter, + std::vector& statuses) { + if (!is_finished_) { + return Status::Invalid("VLLMLayers transfer is not finished."); + } + if (has_put_) { + return Status::Invalid("Blocks have already been put."); + } + + // std::set filter_set; + // for (size_t i = 0; i < filter.size(); ++i) { + // if (filter[i] > block_builders_.size()) { + // return Status::Invalid("Filter index out of range."); + // } + // if (filter_set.find(filter[i]) != filter_set.end()) { + // return Status::Invalid("Filter index duplicated."); + // } + // auto it = need_to_delete_builder_.find(block_builders_[filter[i]]); + // if (it == need_to_delete_builder_.end()) { + // return Status::Invalid("Block builder not found."); + // } + // filter_set.insert(filter[i]); + // } + + // std::vector> need_to_put_builders; + // std::vector need_to_put_block_hash; + // for (size_t i = 
0; i < filter.size(); ++i) { + // need_to_put_builders.push_back(block_builders_[filter[i]]); + // need_to_put_block_hash.push_back(block_hash_[filter[i]]); + // } + + // RETURN_ON_ERROR(VLLMKVStorage::PutBlockKVCache(need_to_put_block_hash, + // need_to_put_builders, + // statuses, req_flag_)); + + // has_put_ = true; + // for (size_t i = 0; i < need_to_put_builders.size(); ++i) { + // auto it = need_to_delete_builder_.find(need_to_put_builders[i]); + // if (it != need_to_delete_builder_.end()) { + // need_to_delete_builder_.erase(it); + // } + // } + // if (!DeleteNotSavedBlocks().ok()) { + // LOG(ERROR) << "Failed to delete not saved blocks."; + // } + + return Status::OK(); +} + +Status VLLMLayers::DeleteNotSavedBlocks() { + if (is_transferring_) { + LOG(WARNING) << "Transfer is still in progress, cannot delete blocks."; + return Status::Invalid("Transfer is still in progress."); + } + + // std::vector> + // not_sealed_block_builder_vec( + // need_to_delete_builder_.begin(), need_to_delete_builder_.end()); + // return VLLMKVStorage::DeleteBlockBuilders(not_sealed_block_builder_vec, + // req_flag_); + return Status::OK(); +} + +Status VLLMLayers::Make(std::vector> local_offsets, + std::vector> remote_blobs, + std::vector> sizes_vec, + std::vector shape, int layer_index, + std::string rpc_endpoint, + std::shared_ptr& layers, + std::string& req_flag) { + layers = std::make_shared(); + layers->local_offset_ = std::move(local_offsets); + layers->remote_id_ = std::move(remote_blobs); + layers->rpc_endpoint_ = rpc_endpoint; + layers->layer_nums_ = layers->local_offset_.size(); + layers->req_flag_ = req_flag; + layers->sizes_ = std::move(sizes_vec); + return Status::OK(); +} + +Status VLLMLayers::Transfer(Client& client) { + if (is_transferring_) { + return Status::OK(); + } + + RETURN_ON_ERROR(client.VineyardGetRemoteBlobsWithOffset( + local_offset_, remote_id_, sizes_, rpc_endpoint_, fd_, req_flag_)); + recv_flag_mem_ = mmap(nullptr, GET_BLOB_RECV_MEM_SIZE, PROT_READ 
| PROT_WRITE, + MAP_SHARED, fd_, 0); + if (recv_flag_mem_ == MAP_FAILED) { + return Status::IOError("Failed to mmap recv_flag_mem. Request id: " + + req_flag_); + } + + is_transferring_ = true; + VLOG(2) << "Transfer is beginning, address of recv_flag_mem_: " + << static_cast(recv_flag_mem_) + << ", size: " << GET_BLOB_RECV_MEM_SIZE; + return Status::OK(); +} + +void VLLMLayers::Dump() { + std::cout << "VLLMLayer dump:" << std::endl; + std::cout << "blob map:" << std::endl; + for (size_t i = 0; i < local_offset_.size(); i++) { + std::cout << "layer " << i << ": " << std::endl; + for (size_t j = 0; j < local_offset_[i].size(); j++) { + std::cout << "local offsets: " << local_offset_[i][j] << ", " + << "remote: " << ObjectIDToString(remote_id_[i][j]) + << std::endl; + } + std::cout << std::endl; + } + std::cout << "rpc endpoint: " << rpc_endpoint_ << std::endl; +} + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/ds/vllm_layer.h b/modules/vllm-kv-cache/ds/vllm_layer.h new file mode 100644 index 000000000..6097cf5bc --- /dev/null +++ b/modules/vllm-kv-cache/ds/vllm_layer.h @@ -0,0 +1,103 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef MODULES_VLLM_KV_CACHE_DS_VLLM_LAYER_H_ +#define MODULES_VLLM_KV_CACHE_DS_VLLM_LAYER_H_ + +#include +#include +#include +#include +#include +#include + +#include "client/client.h" +#include "client/ds/object_meta.h" +#include "common/util/status.h" +#include "vllm-kv-cache/ds/vllm_block.h" + +namespace vineyard { + +class VLLMLayers { + public: + VLLMLayers() = default; + + ~VLLMLayers(); + + static Status FromBlocks(Client& client, std::vector& block_hash, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape_vec, int layer_index, + std::vector& metas, + std::string rpc_endpoint, + std::shared_ptr& layers, + std::string& req_flag); + + Status IsReceived(int index, bool& received); + + int GetLayerNum() const { return layer_nums_; } + + size_t GetBlockNum() const { return block_nums_; } + + Status PutBlocks(std::vector& statuses); + + // Put blocks with filter, and delete not saved blocks. + // If status is not OK, the block will not be deleted. + Status PutAndCleanWithFilter(std::vector& filter, + std::vector& statuses); + + Status DeleteNotSavedBlocks(); + + void Dump(); + + const std::vector& GetBlockHashes() const { return block_hash_; } + + private: + static Status FromBlocksInternal( + Client& client, std::vector shape, int layer_index, + std::vector>& local_offset, + std::vector>& remote_blobs, + std::vector>& sizes_vec, std::string rpc_endpoint, + std::vector& block_hash, std::shared_ptr& layers, + std::string& req_flag); + + static Status Make(std::vector> local_offsets, + std::vector> remote_blobs, + std::vector> sizes_vec, + std::vector shape, int layer_index, + std::string rpc_endpoint, + std::shared_ptr& layer, std::string& req_flag); + + Status Transfer(Client& client); + + std::vector block_hash_; + uint64_t block_nums_ = 0; + + bool is_transferring_ = false; + bool is_finished_ = false; + bool has_put_ = false; + std::vector> local_offset_; + std::vector> remote_id_; + std::vector> sizes_; + int layer_nums_ = 0; 
+ std::string rpc_endpoint_; + int fd_ = -1; + void* recv_flag_mem_ = nullptr; + std::string req_flag_ = ""; +}; + +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_DS_VLLM_LAYER_H_ diff --git a/modules/vllm-kv-cache/src/env.cc b/modules/vllm-kv-cache/src/env.cc new file mode 100644 index 000000000..53d8eb0d3 --- /dev/null +++ b/modules/vllm-kv-cache/src/env.cc @@ -0,0 +1,112 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +#include "vllm-kv-cache/src/env.h" + +namespace vineyard { + +std::string VLLMKVCacheEnv::GetKVStorageConcurrency() { + static std::string kv_storage_concurrency = + read_env("VLLM_KV_STORAGE_CONCURRENCY", + std::to_string(std::thread::hardware_concurrency())); + return kv_storage_concurrency; +} + +std::string VLLMKVCacheEnv::GetVLLMBlockPrefix() { + static std::string vllm_block_prefix = + read_env("VLLM_BLOCK_PREFIX", "block.hash.key."); + return vllm_block_prefix; +} + +std::string VLLMKVCacheEnv::GetVineyardVLLMKVCacheIOTimeoutMilliseconds() { + static std::string io_timeout_milliseconds = + read_env("VINEYARD_VLLM_KV_CACHE_IO_TIMEOUT_MILLISECONDS", "5000"); + return io_timeout_milliseconds; +} + +std::string VLLMKVCacheEnv::GetDirectIOAlign() { + static std::string direct_io_align = read_env("DIRECT_IO_ALIGN", "4096"); + return direct_io_align; +} + +std::string VLLMKVCacheEnv::AIORetryWaitMicroseconds() { + static std::string aio_retry_wait_microseconds = + 
read_env("VINEYARD_AIO_RETRY_WAIT_MICROSECONDS", "1000"); + return aio_retry_wait_microseconds; +} + +std::string VLLMKVCacheEnv::AIOGCWaitTimeMicroseconds() { + static std::string aio_gc_wait_time_microseconds = + read_env("VINEYARD_AIO_GC_WAIT_TIME_MICROSECONDS", "10000"); + return aio_gc_wait_time_microseconds; +} + +std::string VLLMKVCacheEnv::GetVineyardAIOSubmitConcurrency() { + static std::string aio_submit_concurrency = + read_env("VINEYARD_AIO_SUBMIT_CONCURRENCY", "4"); + return aio_submit_concurrency; +} + +std::string VLLMKVCacheEnv::VineyardEnableVLLMKVCacheMemCopy() { + std::string enable_mem_copy = + read_env("VINEYARD_ENABLE_VLLM_KV_CACHE_MEM_COPY", "1"); + return enable_mem_copy; +} + +std::string VLLMKVCacheEnv::VineyardEnableVLLMKVCacheDirectIO() { + std::string enable_direct_io = + read_env("VINEYARD_ENABLE_VLLM_KV_CACHE_DIRECT_IO", "1"); + return enable_direct_io; +} + +std::string VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskPath() { + static std::string disk_path = + read_env("VINEYARD_VLLM_KV_CACHE_DISK_PATH", ""); + return disk_path; +} + +std::string VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskType() { + static std::string disk_type = + read_env("VINEYARD_VLLM_KV_CACHE_DISK_TYPE", ""); + return disk_type; +} + +std::string VLLMKVCacheEnv::LocalVineyardVLLMKVCache() { + static std::string local_vineyard_vllm_cache = + read_env("LOCAL_VINEYARD_VLLM_KVCACHE", "1"); + return local_vineyard_vllm_cache; +} + +std::string VLLMKVCacheEnv::GetVineyardVLLMBlockMetaMagicSize() { + static std::string meta_magic_size = + read_env("VINEYARD_VLLM_BLOCK_META_MAGIC_SIZE", "4096"); + return meta_magic_size; +} + +std::string VLLMKVCacheEnv::GetVineyardVLLMMaxBlockNum() { + static std::string max_block_num = + read_env("VINEYARD_VLLM_MAX_BLOCK_SIZE", "8192"); + return max_block_num; +} + +std::string VLLMKVCacheEnv::GetAIOPullResultInterval() { + static std::string aio_pull_result_interval = + read_env("VINEYARD_AIO_PULL_RESULT_INTERVAL", "10"); + return 
aio_pull_result_interval; +} + +}; // namespace vineyard diff --git a/modules/vllm-kv-cache/src/env.h b/modules/vllm-kv-cache/src/env.h new file mode 100644 index 000000000..d2d4e5cf1 --- /dev/null +++ b/modules/vllm-kv-cache/src/env.h @@ -0,0 +1,60 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef MODULES_VLLM_KV_CACHE_SRC_ENV_H_ +#define MODULES_VLLM_KV_CACHE_SRC_ENV_H_ + +#include + +#include "common/util/env.h" + +namespace vineyard { + +class VLLMKVCacheEnv : public VineyardEnv { + public: + static std::string GetKVStorageConcurrency(); + + static std::string GetVLLMBlockPrefix(); + + static std::string GetVineyardVLLMKVCacheIOTimeoutMilliseconds(); + + static std::string GetDirectIOAlign(); + + static std::string AIORetryWaitMicroseconds(); + + static std::string AIOGCWaitTimeMicroseconds(); + + static std::string GetVineyardAIOSubmitConcurrency(); + + static std::string VineyardEnableVLLMKVCacheMemCopy(); + + static std::string VineyardEnableVLLMKVCacheDirectIO(); + + static std::string GetVineyardVLLMKVCacheDiskPath(); + + static std::string GetVineyardVLLMKVCacheDiskType(); + + static std::string LocalVineyardVLLMKVCache(); + + static std::string GetVineyardVLLMBlockMetaMagicSize(); + + static std::string GetVineyardVLLMMaxBlockNum(); + + static std::string GetAIOPullResultInterval(); +}; + +}; // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_SRC_ENV_H_ diff --git 
a/modules/vllm-kv-cache/src/io/aio_adaptor.cc b/modules/vllm-kv-cache/src/io/aio_adaptor.cc new file mode 100644 index 000000000..db2ebc722 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/aio_adaptor.cc @@ -0,0 +1,827 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Refer to: PAI-LLM/vllm/blob/develop/csrc/v6d/load_blocks_helper.cpp + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common/util/logging.h" +#include "vllm-kv-cache/src/env.h" +#include "vllm-kv-cache/src/io/aio_adaptor.h" +#include "vllm-kv-cache/src/vllm_kv_cache_util.h" + +#include "thread-pool/thread_pool.h" + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +AIOContext::AIOContext(uint64_t concurrency) + : AIOContext(concurrency, std::make_shared()) {} + +AIOContext::AIOContext(uint64_t concurrency, + std::shared_ptr aio_ops) + : concurrency_(concurrency), + io_queue_(concurrency), + queue_mutex_(concurrency), + aio_ops_(std::move(aio_ops)) { + // init context + LOG(INFO) << "Initializing AIO context with concurrency: " << concurrency_; + thread_pool_ = std::make_shared(concurrency_ + 2); + for (size_t i = 0; i < concurrency_; ++i) { + queue_cv_.push_back(std::make_shared()); + } + aio_ctx_ = nullptr; + int ret = aio_ops_->io_setup(max_events_, &aio_ctx_); + if (ret < 0) { + throw std::runtime_error("Failed to initialize io_setup: error code:" + + std::to_string(ret)); + } + 
io_timeout_milliseconds_ = + std::stoll(VLLMKVCacheEnv::GetVineyardVLLMKVCacheIOTimeoutMilliseconds()); + LOG(INFO) << "AIOContext initialized with max_events: " << max_events_ + << ", io_timeout_milliseconds_: " << io_timeout_milliseconds_; + LOG(INFO) << "Running polling thread..."; + + // run polling thread + thread_pool_->enqueue_noreturn([this]() { this->PullIORequestThread(); }); + + LOG(INFO) << "Running timeout tracker thread..."; + thread_pool_->enqueue_noreturn([this]() { this->TimeTrackerThread(); }); + + LOG(INFO) << "Running " << concurrency_ << " submit threads..."; + for (size_t i = 0; i < concurrency_; ++i) { + thread_pool_->enqueue_noreturn( + [this, i]() { this->PushIORequestThread(i); }); + } + LOG(INFO) << "Init done"; +} + +void AIOContext::PullIORequestThread() { + struct io_event* events = new io_event[max_events_]; + struct timespec timeout = {0, 1000000}; // 1ms timeout + while (!stop_.load()) { + usleep(stoi(VLLMKVCacheEnv::GetAIOPullResultInterval())); + // pulling events + int num_events = + aio_ops_->io_getevents(aio_ctx_, 0, max_events_, events, &timeout); + if (num_events < 0) { + // handle error (e.g., EINTR interrupt) + LOG(INFO) << "io_getevents error: " << strerror(-num_events); + continue; + } + + if (num_events == 0) { + continue; // no events, continue polling + } + + submitted_requests_.fetch_sub(num_events); + + // process each completed event + for (int i = 0; i < num_events; i++) { + auto* user_data = static_cast(events[i].data); + + std::lock_guard lock(submitted_io_requests_mutex_); + try { + bool promise_set = user_data->promise_setted.exchange(true); + if (!promise_set) { + if (events[i].res >= 0 && events[i].res2 == 0) { // success + size_t bytes_transferred = events[i].res; + if (bytes_transferred < user_data->expect_size) { + user_data->promise.set_value(Status::EndOfFile()); + } else if (bytes_transferred > user_data->expect_size) { + user_data->promise.set_value( + Status::Invalid("return size larger than expect 
size")); + } else { + user_data->promise.set_value(Status::OK()); + } + } else { + LOG(WARNING) << "Promise already set for AIO operation, skipping."; + } + } + } catch (const std::future_error& e) { + LOG(WARNING) << "Failed to set promise value: " << e.what() + << " maybe set by timeout tracker."; + } + submitted_io_requests_.erase(user_data); + ReleaseAIOUserData(user_data); + } + } + delete events; + LOG(INFO) << "Polling thread stopped."; +} + +void AIOContext::TimeTrackerThread() { + // timeout checker. + while (!stop_.load()) { + usleep(std::stoi( + VLLMKVCacheEnv::AIOGCWaitTimeMicroseconds())); // sleep for 10ms + std::lock_guard lock(submitted_io_requests_mutex_); + for (auto it = submitted_io_requests_.begin(); + it != submitted_io_requests_.end();) { + auto user_data = *it; + if (user_data->timestamp + io_timeout_milliseconds_ < + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count()) { + // timeout, set promise to error + try { + bool promise_set = user_data->promise_setted.exchange(true); + if (!promise_set) { + user_data->promise.set_value( + Status::IOError("AIO operation timed out after " + + std::to_string(io_timeout_milliseconds_) + + " ms." 
+ "IO context enqueue time: " + + KVCacheHelper::MicrosecondToTimestamp( + user_data->enqueue_timestamp) + + ", dequeue time: " + + KVCacheHelper::MicrosecondToTimestamp( + user_data->dequeue_timestamp) + + ", submit time: " + + KVCacheHelper::MicrosecondToTimestamp( + user_data->submit_timestamp))); + } else { + LOG(WARNING) << "Promise already set for AIO operation, skipping."; + } + } catch (const std::future_error& e) { + LOG(WARNING) << "Failed to set timeout promise value: " << e.what(); + } + submitted_io_requests_.erase(it++); + } else { + ++it; + } + } + } + LOG(INFO) << "Timeout tracker thread stopped."; +} + +void AIOContext::PushIORequestThread(uint64_t i) { + while (!stop_.load()) { + std::vector user_data_vec; + PullRequest(user_data_vec, i); + if (user_data_vec.empty()) { + continue; + } + + int64_t submit_requests_num = user_data_vec.size(); + std::shared_ptr iocbs( + new struct iocb*[submit_requests_num]); + for (int64_t i = 0; i < submit_requests_num; ++i) { + iocbs[i] = &user_data_vec[i]->cb; + user_data_vec[i]->timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + } + + VLOG(2) << "Submitting " << submit_requests_num << " AIO requests..."; + int64_t left_requests_num = submit_requests_num; + int retry_count = 0; + { + std::lock_guard lock(submitted_io_requests_mutex_); + for (int64_t i = 0; i < submit_requests_num; ++i) { + user_data_vec[i]->submit_timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + submitted_io_requests_.insert(user_data_vec[i]); + } + } + do { + int submitted = aio_ops_->io_submit( + aio_ctx_, left_requests_num, + iocbs.get() + (submit_requests_num - left_requests_num)); + if (submitted < 0) { + LOG(ERROR) << "io_submit failed: " << strerror(-submitted) + << ", current submitted requests: " + << GetProcessingIORequest() << ", retrying..."; + usleep(std::stoi(VLLMKVCacheEnv::AIORetryWaitMicroseconds())); + 
retry_count++; + if (retry_count >= retry_times_) { + LOG(ERROR) << "Failed to submit AIO requests after " << retry_count + << " retries."; + // set failed promises + std::lock_guard lock(submitted_io_requests_mutex_); + for (int64_t i = 0; i < left_requests_num; ++i) { + auto user_data = static_cast( + iocbs[submit_requests_num - left_requests_num + i]->data); + try { + bool promise_set = user_data->promise_setted.exchange(true); + if (!promise_set) { + user_data->promise.set_value(Status::IOError( + "Failed to submit AIO request after retries")); + } else { + LOG(WARNING) + << "Promise already set for AIO operation, skipping."; + } + } catch (const std::future_error& e) { + LOG(WARNING) << "Failed to set promise value: " << e.what() + << " maybe set by timeout tracker."; + } + submitted_io_requests_.erase(user_data); + ReleaseAIOUserData(user_data); + } + break; + } + } else { + submitted_requests_.fetch_add(submitted); + left_requests_num -= submitted; + } + } while (left_requests_num > 0); + } + LOG(INFO) << "Submit thread stopped."; +} + +AIOContext::~AIOContext() { + stop_.store(true); + for (size_t i = 0; i < concurrency_; ++i) { + queue_cv_[i]->notify_all(); + } + thread_pool_.reset(); // Reset the thread pool to stop all threads + aio_ops_->io_destroy(aio_ctx_); + LOG(INFO) << "AIOContext destroyed."; +} + +AIOUserData* AIOContext::CreateAIOUserData(size_t expect_size, + std::promise promise) { + auto user_data = new AIOUserData(); + user_data->expect_size = expect_size; + user_data->promise = std::move(promise); + memset(&user_data->cb, 0, sizeof(user_data->cb)); + return user_data; +} + +void AIOContext::ReleaseAIOUserData(AIOUserData* user_data) { + if (user_data) { + delete user_data; + } +} + +IOAdaptorFactory GetAIOAdaptorFactory() { + LOG(INFO) << "Using AIO adaptor"; + return [](const std::string& path) + -> std::shared_ptr { + return std::make_shared(path); + }; +} + +std::future AIOContext::SubmitRead(int fd, void* data, size_t size, + size_t 
offset) { + std::shared_ptr> status_promise = + std::make_shared>(); + auto status_future = status_promise->get_future(); + if (fd < 0) { + status_promise->set_value(Status::IOError("Invalid file descriptor")); + return status_future; + } + + int64_t queue_index = read_counter_.fetch_add(1) % concurrency_; + auto user_data = CreateAIOUserData(size, std::move(*status_promise)); + aio_ops_->io_prep_pread(&user_data->cb, fd, data, size, offset); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data->timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + std::vector user_data_vec; + user_data_vec.push_back(user_data); + + PushRequest(user_data_vec, queue_index); + return status_future; +} + +std::future AIOContext::SubmitRead(int fd, std::shared_ptr data, + size_t size, size_t offset) { + std::shared_ptr> status_promise = + std::make_shared>(); + auto status_future = status_promise->get_future(); + if (fd < 0) { + status_promise->set_value(Status::IOError("Invalid file descriptor")); + return status_future; + } + + int64_t queue_index = read_counter_.fetch_add(1) % concurrency_; + auto user_data = CreateAIOUserData(size, std::move(*status_promise)); + aio_ops_->io_prep_pread(&user_data->cb, fd, data.get(), size, offset); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data->data_ptr = data; // Store the shared pointer + user_data->timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + std::vector user_data_vec; + user_data_vec.push_back(user_data); + + PushRequest(user_data_vec, queue_index); + return status_future; +} + +std::future AIOContext::SubmitWrite(int fd, void* data, size_t size, + size_t offset) { + std::shared_ptr> status_promise = + std::make_shared>(); + auto status_future = status_promise->get_future(); + if (fd < 0) { + 
status_promise->set_value(Status::IOError("Invalid file descriptor")); + return status_future; + } + + int64_t queue_index = write_counter_.fetch_add(1) % concurrency_; + auto user_data = CreateAIOUserData(size, std::move(*status_promise)); + aio_ops_->io_prep_pwrite(&user_data->cb, fd, data, size, offset); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data->timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + std::vector user_data_vec; + user_data_vec.push_back(user_data); + + PushRequest(user_data_vec, queue_index); + return status_future; +} + +std::future AIOContext::SubmitWrite(int fd, std::shared_ptr data, + size_t size, size_t offset) { + std::shared_ptr> status_promise = + std::make_shared>(); + auto status_future = status_promise->get_future(); + if (fd < 0) { + status_promise->set_value(Status::IOError("Invalid file descriptor")); + return status_future; + } + + int64_t queue_index = write_counter_.fetch_add(1) % concurrency_; + auto user_data = CreateAIOUserData(size, std::move(*status_promise)); + aio_ops_->io_prep_pwrite(&user_data->cb, fd, data.get(), size, offset); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data->data_ptr = data; // Store the shared pointer + user_data->timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + std::vector user_data_vec; + user_data_vec.push_back(user_data); + + PushRequest(user_data_vec, queue_index); + return status_future; +} + +Status AIOContext::BatchSubmitRead(int fd, std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (fd < 0) { + LOG(ERROR) << "Invalid file descriptor: " << fd; + return Status::IOError("Invalid file descriptor"); + } + + uint64_t queue_index = read_counter_.fetch_add(1) % concurrency_; + + std::vector>> status_promise_vec; + for (size_t i = 0; i < 
data_vec.size(); ++i) { + status_promise_vec.push_back(std::make_shared>()); + results.push_back(status_promise_vec.back()->get_future()); + } + + std::vector user_data_vec; + for (size_t i = 0; i < data_vec.size(); ++i) { + auto user_data = + this->CreateAIOUserData(size_vec[i], std::move(*status_promise_vec[i])); + aio_ops_->io_prep_pread(&user_data->cb, fd, data_vec[i], size_vec[i], + offset_vec[i]); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data_vec.push_back(user_data); + } + + PushRequest(user_data_vec, queue_index); + return Status::OK(); +} + +Status AIOContext::BatchSubmitRead(int fd, + std::vector>& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (fd < 0) { + LOG(ERROR) << "Invalid file descriptor: " << fd; + return Status::IOError("Invalid file descriptor"); + } + + uint64_t queue_index = read_counter_.fetch_add(1) % concurrency_; + + std::vector>> status_promise_vec; + for (size_t i = 0; i < data_vec.size(); ++i) { + status_promise_vec.push_back(std::make_shared>()); + results.push_back(status_promise_vec.back()->get_future()); + } + + std::vector user_data_vec; + for (size_t i = 0; i < data_vec.size(); ++i) { + auto user_data = + this->CreateAIOUserData(size_vec[i], std::move(*status_promise_vec[i])); + aio_ops_->io_prep_pread(&user_data->cb, fd, data_vec[i].get(), size_vec[i], + offset_vec[i]); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data->data_ptr = data_vec[i]; // Store the shared pointer + user_data_vec.push_back(user_data); + } + + PushRequest(user_data_vec, queue_index); + return Status::OK(); +} + +Status AIOContext::BatchSubmitWrite(int fd, std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (fd < 0) { + LOG(ERROR) << "Invalid file descriptor: " << fd; + return Status::Invalid("Invalid file descriptor"); + } + + uint64_t queue_index = 
write_counter_.fetch_add(1) % concurrency_; + + std::vector>> status_promise_vec; + for (size_t i = 0; i < data_vec.size(); ++i) { + status_promise_vec.push_back(std::make_shared>()); + results.push_back(status_promise_vec.back()->get_future()); + } + + std::vector user_data_vec; + for (size_t i = 0; i < data_vec.size(); ++i) { + auto user_data = + CreateAIOUserData(size_vec[i], std::move(*status_promise_vec[i])); + aio_ops_->io_prep_pwrite(&user_data->cb, fd, data_vec[i], size_vec[i], + offset_vec[i]); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data_vec.push_back(user_data); + } + + PushRequest(user_data_vec, queue_index); + return Status::OK(); +} + +Status AIOContext::BatchSubmitWrite( + int fd, std::vector>& data_vec, + std::vector& size_vec, std::vector& offset_vec, + std::vector>& results) { + if (fd < 0) { + LOG(ERROR) << "Invalid file descriptor: " << fd; + return Status::Invalid("Invalid file descriptor"); + } + + uint64_t queue_index = write_counter_.fetch_add(1) % concurrency_; + + std::vector>> status_promise_vec; + for (size_t i = 0; i < data_vec.size(); ++i) { + status_promise_vec.push_back(std::make_shared>()); + results.push_back(status_promise_vec.back()->get_future()); + } + + std::vector user_data_vec; + for (size_t i = 0; i < data_vec.size(); ++i) { + auto user_data = + CreateAIOUserData(size_vec[i], std::move(*status_promise_vec[i])); + aio_ops_->io_prep_pwrite(&user_data->cb, fd, data_vec[i].get(), size_vec[i], + offset_vec[i]); + user_data->cb.data = user_data; // Set the user data pointer for completion + user_data->data_ptr = data_vec[i]; // Store the shared pointer + user_data_vec.push_back(user_data); + } + + PushRequest(user_data_vec, queue_index); + return Status::OK(); +} + +void AIOContext::PullRequest(std::vector& user_data, + uint64_t queue_index) { + std::unique_lock ulock(queue_mutex_[queue_index]); + if (io_queue_[queue_index].empty()) { + // Block on the condition variable + // until a new request is pushed into this queue (or stop is requested) + 
this->queue_cv_[queue_index]->wait(ulock, [this, queue_index] { + return !this->io_queue_[queue_index].empty() || this->stop_.load(); + }); + if (this->stop_.load()) { + return; + } + } + + while (!this->io_queue_[queue_index].empty() && + user_data.size() < static_cast(max_push_events_)) { + user_data.push_back(this->io_queue_[queue_index].front()); + this->io_queue_[queue_index].pop_front(); + user_data.back()->dequeue_timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + } + + static uint64_t start = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + if (!user_data.empty()) { + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + if (end - start > 100) { + LOG(INFO) << "Pulled " << user_data.size() + << " requests from queue index " << queue_index + << " pendding requests: " + << this->io_queue_[queue_index].size(); + start = end; + } + } +} + +void AIOContext::PushRequest(std::vector& user_data, + uint64_t queue_index) { + { + std::unique_lock lock(queue_mutex_[queue_index]); + for (auto data : user_data) { + data->enqueue_timestamp = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + this->io_queue_[queue_index].push_back(data); + } + } + // 唤醒等待的线程 + this->queue_cv_[queue_index]->notify_one(); +} + +Status AIOAdaptor::Open(std::string mode, bool direct_io) { + int flags = 0; + if (mode == "r") { + flags = O_RDONLY; + } else if (mode == "w") { + flags = O_WRONLY | O_CREAT | O_TRUNC; + } else { + return Status::Invalid("Invalid mode: " + mode); + } + + if (direct_io) { + flags |= O_DIRECT; + } + direct_io_ = direct_io; + + fd_ = open(location_.c_str(), flags, 0666); + if (fd_ == -1) { + return Status::IOError("Failed to open file: " + location_ + + " error:" + std::string(strerror(errno))); + } + return Status::OK(); +} + +Status AIOAdaptor::Read(void* 
data, size_t size) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + if (direct_io_ && size % std::stoi(VLLMKVCacheEnv::GetDirectIOAlign()) != 0) { + return Status::IOError( + "Direct IO requires size to be a multiple of 512 bytes, but got: " + + std::to_string(size)); + } + + auto future_status = + aio_context_->SubmitRead(this->fd_, data, size, this->read_pos); + if (future_status.valid()) { + Status status = future_status.get(); + if (status.ok()) { + read_pos += size; // Update read position + } + return status; + } else { + return Status::IOError("Failed to submit read operation"); + } +} + +Status AIOAdaptor::Write(void* data, size_t size) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + auto future_status = + aio_context_->SubmitWrite(this->fd_, data, size, this->write_pos); + if (future_status.valid()) { + Status status = future_status.get(); + if (status.ok()) { + write_pos += size; // Update write position + } + return status; + } else { + return Status::IOError("Failed to submit write operation"); + } +} + +Status AIOAdaptor::Close() { + if (fd_ == -1) { + return Status::OK(); // No error, just return OK + } + + if (direct_io_ && write_pos) { + int align_size = std::stoi(VLLMKVCacheEnv::GetDirectIOAlign()); + if (write_pos % align_size != 0) { + LOG(WARNING) << "Direct IO requires write position to be a multiple of " + << align_size << ", padding with zeros."; + // Pad with zeros to align the write position + size_t padding_size = align_size - (write_pos % align_size); + char* padding_data = new char[padding_size]; + memset(padding_data, 0, padding_size); + Status status = Write(padding_data, padding_size); + delete[] padding_data; + if (!status.ok()) { + LOG(ERROR) << "Failed to write padding data for direct IO: " + << status.ToString(); + return status; + } + } + } + + if (fsync(fd_) < 0) 
{ + return Status::IOError("Failed to sync file: " + location_ + + " error:" + std::string(strerror(errno))); + } + + if (close(fd_) < 0) { + return Status::IOError("Failed to close file: " + location_ + + " error:" + std::string(strerror(errno))); + } + fd_ = -1; + return Status::OK(); +} + +Status AIOAdaptor::FileTruncate(size_t size) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + if (ftruncate(fd_, size) < 0) { + return Status::IOError("Failed to truncate file: " + location_ + + " to size: " + std::to_string(size) + + " error:" + std::string(strerror(errno))); + } + return Status::OK(); +} + +std::future AIOAdaptor::AsyncRead(void* data, size_t size, + size_t offset) { + if (fd_ == -1) { + return std::async(std::launch::async, [this]() { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + }); + } + + return aio_context_->SubmitRead(this->fd_, data, size, offset); +} + +std::future AIOAdaptor::AsyncRead(std::shared_ptr data, + size_t size, size_t offset) { + if (fd_ == -1) { + return std::async(std::launch::async, [this]() { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + }); + } + + return aio_context_->SubmitRead(this->fd_, data, size, offset); +} + +std::future AIOAdaptor::AsyncWrite(void* data, size_t size, + size_t offset) { + if (fd_ == -1) { + return std::async(std::launch::async, [this]() { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + }); + } + + return aio_context_->SubmitWrite(this->fd_, data, size, offset); +} + +std::future AIOAdaptor::AsyncWrite(std::shared_ptr data, + size_t size, size_t offset) { + if (fd_ == -1) { + return std::async(std::launch::async, [this]() { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + }); + } + + return 
aio_context_->SubmitWrite(this->fd_, data, size, offset); +} + +Status AIOAdaptor::BatchAsyncRead(std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + return aio_context_->BatchSubmitRead(this->fd_, data_vec, size_vec, + offset_vec, results); +} + +Status AIOAdaptor::BatchAsyncRead(std::vector>& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + return aio_context_->BatchSubmitRead(this->fd_, data_vec, size_vec, + offset_vec, results); +} + +Status AIOAdaptor::BatchAsyncWrite(std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (fd_ == -1) { + LOG(ERROR) << "File not opened: " << location_ + << ", error: " << std::string(strerror(errno)); + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + return aio_context_->BatchSubmitWrite(this->fd_, data_vec, size_vec, + offset_vec, results); +} + +Status AIOAdaptor::BatchAsyncWrite(std::vector>& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (fd_ == -1) { + LOG(ERROR) << "File not opened: " << location_ + << ", error: " << std::string(strerror(errno)); + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + return aio_context_->BatchSubmitWrite(this->fd_, data_vec, size_vec, + offset_vec, results); +} + +Status AIOAdaptor::GetFileSize(size_t& size) { + std::filesystem::path file_path(location_); + if (!std::filesystem::exists(file_path)) { + return Status::IOError("File does not exist: " + location_); + } + + std::error_code ec; + size = 
/** Copyright 2020-2023 Alibaba Group Holding Limited.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Refer to: PAI-LLM/vllm/blob/develop/csrc/v6d/load_blocks_helper.hpp

#ifndef MODULES_VLLM_KV_CACHE_SRC_IO_AIO_ADAPTOR_H_
#define MODULES_VLLM_KV_CACHE_SRC_IO_AIO_ADAPTOR_H_

#include <libaio.h>

#include <atomic>
#include <condition_variable>
#include <deque>
#include <future>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <vector>

#include "common/util/callback.h"
#include "common/util/status.h"
#include "vllm-kv-cache/src/env.h"
#include "vllm-kv-cache/src/io/aio_operations.h"
#include "vllm-kv-cache/src/io/io_adaptor.h"

#include "thread-pool/thread_pool.h"

namespace vineyard {

namespace vllm_kv_cache {

namespace io {

// Per-request bookkeeping threaded through the kernel AIO control block
// (`cb.data` points back at this struct). All scalar members are
// zero-initialized so requests that fail before being timestamped never
// expose indeterminate values.
struct AIOUserData {
  struct iocb cb {};                        // kernel AIO control block
  size_t expect_size = 0;                   // bytes expected on completion
  std::promise<Status> promise;             // fulfilled on IO completion
  std::atomic<bool> promise_setted{false};  // guards double set_value [sic]
  int64_t timestamp = 0;
  // Keeps a caller-provided buffer alive until the kernel consumed it.
  std::shared_ptr<void> data_ptr;
  // NOTE(review): timestamps are assumed to be milliseconds -- confirm.
  uint64_t enqueue_timestamp = 0;  // when pushed onto a submit queue
  uint64_t dequeue_timestamp = 0;  // when pulled by a submit thread
  uint64_t submit_timestamp = 0;   // when handed to io_submit
};

// Process-wide libaio submission/completion engine. Requests are spread
// round-robin over `concurrency_` queues, each drained by its own submit
// thread.
class AIOContext {
 public:
  // Returns the process-wide singleton. `aio_ops` is only honoured by the
  // very first call; later calls reuse the existing instance.
  static std::shared_ptr<AIOContext> GetSingleInstance(
      std::shared_ptr<IAIOOperations> aio_ops) {
    static std::shared_ptr<AIOContext> instance(new AIOContext(
        std::stoi(VLLMKVCacheEnv::GetVineyardAIOSubmitConcurrency()),
        aio_ops));
    return instance;
  }

  std::future<Status> SubmitRead(int fd, void* data, size_t size,
                                 size_t offset);

  std::future<Status> SubmitRead(int fd, std::shared_ptr<void> data,
                                 size_t size, size_t offset);

  std::future<Status> SubmitWrite(int fd, void* data, size_t size,
                                  size_t offset);

  std::future<Status> SubmitWrite(int fd, std::shared_ptr<void> data,
                                  size_t size, size_t offset);

  Status BatchSubmitRead(int fd, std::vector<void*>& data_vec,
                         std::vector<size_t>& size_vec,
                         std::vector<size_t>& offset_vec,
                         std::vector<std::future<Status>>& results);

  Status BatchSubmitRead(int fd, std::vector<std::shared_ptr<void>>& data_vec,
                         std::vector<size_t>& size_vec,
                         std::vector<size_t>& offset_vec,
                         std::vector<std::future<Status>>& results);

  Status BatchSubmitWrite(int fd, std::vector<void*>& data_vec,
                          std::vector<size_t>& size_vec,
                          std::vector<size_t>& offset_vec,
                          std::vector<std::future<Status>>& results);

  Status BatchSubmitWrite(int fd, std::vector<std::shared_ptr<void>>& data_vec,
                          std::vector<size_t>& size_vec,
                          std::vector<size_t>& offset_vec,
                          std::vector<std::future<Status>>& results);

  // Number of IO requests currently submitted to the kernel (clamped to 0;
  // the counter may transiently dip negative under concurrent updates).
  size_t GetProcessingIORequest() {
    int64_t ret = submitted_requests_.load();
    return ret < 0 ? 0 : static_cast<size_t>(ret);
  }

  ~AIOContext();

 private:
  explicit AIOContext(uint64_t concurrency);

  explicit AIOContext(uint64_t concurrency,
                      std::shared_ptr<IAIOOperations> aio_ops);

  AIOUserData* CreateAIOUserData(size_t expect_size,
                                 std::promise<Status> promise);

  void ReleaseAIOUserData(AIOUserData* user_data);

  // Drain queued requests (blocking) from queue `queue_index`.
  void PullRequest(std::vector<AIOUserData*>& user_data,
                   uint64_t queue_index = 0);

  // Enqueue requests onto queue `queue_index` and wake a puller.
  void PushRequest(std::vector<AIOUserData*>& user_data,
                   uint64_t queue_index = 0);

  void PullIORequestThread();

  void TimeTrackerThread();

  void PushIORequestThread(uint64_t queue_index);

  uint64_t concurrency_;
  io_context_t aio_ctx_;
  std::atomic<bool> stop_{false};

  int64_t max_events_ = 32768;      // io_setup queue depth
  int64_t max_push_events_ = 128;   // max requests drained per pull
  std::atomic<uint64_t> write_counter_{0};
  std::atomic<uint64_t> read_counter_{0};
  int64_t io_timeout_milliseconds_ = 0;  // Fix: was left uninitialized
  std::set<AIOUserData*> submitted_io_requests_;
  std::mutex submitted_io_requests_mutex_;

  int retry_times_ = 3;
  std::vector<std::deque<AIOUserData*>> io_queue_;
  std::vector<std::mutex> queue_mutex_;

  size_t max_pendding_requests_ = 4096;  // [sic] name kept for compatibility

  // NOTE(review): pool type comes from thread-pool/thread_pool.h -- confirm.
  std::shared_ptr<ThreadPool> thread_pool_;

  std::shared_ptr<IAIOOperations> aio_ops_;
  std::atomic<int64_t> submitted_requests_{0};
  std::vector<std::shared_ptr<std::condition_variable>> queue_cv_;
};

IOAdaptorFactory GetAIOAdaptorFactory();

// File IO adaptor backed by the shared AIOContext. The synchronous
// Read/Write use internal sequential cursors and are not thread-safe; the
// Async*/Batch* entry points are positional.
class AIOAdaptor : public IIOAdaptor {
 public:
  explicit AIOAdaptor(const std::string& location,
                      std::shared_ptr<IAIOOperations> aio_ops =
                          std::make_shared<RealAIOOperations>())
      : location_(location),
        fd_(-1),
        aio_context_(AIOContext::GetSingleInstance(aio_ops)) {}

  Status Open(std::string mode, bool direct_io = false) override;

  // Not thread-safe: advances the shared sequential read cursor.
  Status Read(void* data, size_t size) override;

  // Not thread-safe: advances the shared sequential write cursor.
  Status Write(void* data, size_t size) override;

  Status Close() override;

  Status GetFileSize(size_t& size) override;

  Status FileTruncate(size_t size) override;

  std::future<Status> AsyncRead(void* data, size_t size,
                                size_t offset) override;

  std::future<Status> AsyncRead(std::shared_ptr<void> data, size_t size,
                                size_t offset) override;

  std::future<Status> AsyncWrite(void* data, size_t size,
                                 size_t offset) override;

  std::future<Status> AsyncWrite(std::shared_ptr<void> data, size_t size,
                                 size_t offset) override;

  Status BatchAsyncRead(std::vector<void*>& data_vec,
                        std::vector<size_t>& size_vec,
                        std::vector<size_t>& offset_vec,
                        std::vector<std::future<Status>>& results) override;

  Status BatchAsyncWrite(std::vector<void*>& data_vec,
                         std::vector<size_t>& size_vec,
                         std::vector<size_t>& offset_vec,
                         std::vector<std::future<Status>>& results) override;

  Status BatchAsyncRead(std::vector<std::shared_ptr<void>>& data_vec,
                        std::vector<size_t>& size_vec,
                        std::vector<size_t>& offset_vec,
                        std::vector<std::future<Status>>& results) override;

  Status BatchAsyncWrite(std::vector<std::shared_ptr<void>>& data_vec,
                         std::vector<size_t>& size_vec,
                         std::vector<size_t>& offset_vec,
                         std::vector<std::future<Status>>& results) override;

  // Best-effort close; any error status from Close() is discarded here.
  ~AIOAdaptor() {
    if (fd_ != -1) {
      Close();
    }
  }

 private:
  std::string location_;          // file path
  int fd_ = -1;                   // -1 means "not opened"
  bool direct_io_ = false;        // opened with O_DIRECT
  uint64_t read_pos = 0;          // sequential read cursor
  uint64_t write_pos = 0;         // sequential write cursor
  std::shared_ptr<AIOContext> aio_context_;
};

}  // namespace io

}  // namespace vllm_kv_cache

}  // namespace vineyard

#endif  // MODULES_VLLM_KV_CACHE_SRC_IO_AIO_ADAPTOR_H_
#include "vllm-kv-cache/src/io/aio_operations.h"

#include <libaio.h>

#include "common/util/logging.h"

namespace vineyard {

namespace vllm_kv_cache {

namespace io {

// Thin pass-through wrappers over the libaio C API. They exist only so the
// IAIOOperations interface can be mocked in unit tests; each call forwards
// verbatim to the global-namespace libaio function of the same name.

int RealAIOOperations::io_setup(int maxevents, io_context_t* ctx_idp) {
  return ::io_setup(maxevents, ctx_idp);
}

int RealAIOOperations::io_submit(io_context_t ctx_id, int64_t nr,
                                 struct iocb* ios[]) {
  return ::io_submit(ctx_id, nr, ios);
}

int RealAIOOperations::io_getevents(io_context_t ctx_id, int64_t min_nr,
                                    int64_t nr, struct io_event* events,
                                    struct timespec* timeout) {
  return ::io_getevents(ctx_id, min_nr, nr, events, timeout);
}

int RealAIOOperations::io_destroy(io_context_t ctx_id) {
  return ::io_destroy(ctx_id);
}

void RealAIOOperations::io_prep_pread(struct iocb* iocb, int fd, void* buf,
                                      size_t count, int64_t offset) {
  ::io_prep_pread(iocb, fd, buf, count, offset);
}

void RealAIOOperations::io_prep_pwrite(struct iocb* iocb, int fd, void* buf,
                                       size_t count, int64_t offset) {
  ::io_prep_pwrite(iocb, fd, buf, count, offset);
}

}  // namespace io

}  // namespace vllm_kv_cache

}  // namespace vineyard
#ifndef MODULES_VLLM_KV_CACHE_SRC_IO_AIO_OPERATIONS_H_
#define MODULES_VLLM_KV_CACHE_SRC_IO_AIO_OPERATIONS_H_

#include <libaio.h>

namespace vineyard {

namespace vllm_kv_cache {

namespace io {

// Seam over the libaio C API so the AIO engine can be exercised against a
// mock implementation in tests. Signatures mirror the libaio functions of
// the same names.
class IAIOOperations {
 public:
  virtual ~IAIOOperations() = default;

  virtual int io_setup(int maxevents, io_context_t* ctx_idp) = 0;
  virtual int io_submit(io_context_t ctx_id, int64_t nr,
                        struct iocb* ios[]) = 0;
  virtual int io_getevents(io_context_t ctx_id, int64_t min_nr, int64_t nr,
                           struct io_event* events,
                           struct timespec* timeout) = 0;
  virtual int io_destroy(io_context_t ctx_id) = 0;

  virtual void io_prep_pread(struct iocb* iocb, int fd, void* buf, size_t count,
                             int64_t offset) = 0;
  virtual void io_prep_pwrite(struct iocb* iocb, int fd, void* buf,
                              size_t count, int64_t offset) = 0;
};

// Production implementation: forwards every call to the real libaio
// functions (see aio_operations.cc).
class RealAIOOperations : public IAIOOperations {
 public:
  ~RealAIOOperations() override = default;

  int io_setup(int maxevents, io_context_t* ctx_idp) override;
  int io_submit(io_context_t ctx_id, int64_t nr, struct iocb* ios[]) override;
  int io_getevents(io_context_t ctx_id, int64_t min_nr, int64_t nr,
                   struct io_event* events, struct timespec* timeout) override;
  int io_destroy(io_context_t ctx_id) override;

  void io_prep_pread(struct iocb* iocb, int fd, void* buf, size_t count,
                     int64_t offset) override;
  void io_prep_pwrite(struct iocb* iocb, int fd, void* buf, size_t count,
                      int64_t offset) override;
};

}  // namespace io

}  // namespace vllm_kv_cache

}  // namespace vineyard

#endif  // MODULES_VLLM_KV_CACHE_SRC_IO_AIO_OPERATIONS_H_
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "vllm-kv-cache/src/io/error_injection.h" +#include +#include + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +// Global error injection flags for MockAIOOperations +bool global_mock_aio_operation_io_setup_error = false; +bool global_mock_aio_operation_io_submit_timeout = false; +bool global_mock_aio_operation_io_submit_error = false; +bool global_mock_aio_operation_io_submit_part_processed = false; +int64_t global_mock_aio_operation_io_submit_max_processed = 5; +bool global_mock_aio_operation_io_getevents_timeout = false; +bool global_mock_aio_operation_io_getevents_error = false; +bool global_mock_aio_operation_io_getevents_no_events = false; +uint64_t global_mock_aio_operation_io_submit_timeout_ms = 1000; +uint64_t global_mock_aio_operation_io_getevents_timeout_ms = 1000; +int global_mock_aio_operation_io_setup_error_code = -EAGAIN; +int global_mock_aio_operation_io_submit_error_code = -EAGAIN; +int global_mock_aio_operation_io_getevents_error_code = -EIO; + +bool global_mock_aio_operation_io_read_error = false; +bool global_mock_aio_operation_io_write_error = false; +bool global_mock_aio_operation_io_read_timeout = false; +bool global_mock_aio_operation_io_write_timeout = false; +uint64_t global_mock_aio_operation_io_timeout_ms = 1000; + +// Global error injection flags for MockIOAdaptor +bool global_mock_io_read_error = false; +bool global_mock_io_write_error = false; +bool 
global_mock_io_read_timeout = false; +bool global_mock_io_write_timeout = false; +bool global_mock_io_batch_read_error = false; +bool global_mock_io_batch_write_error = false; +bool global_mock_io_batch_read_timeout = false; +bool global_mock_io_batch_write_timeout = false; +uint64_t global_mock_io_timeout_ms = 1000; + +// Functions to set global error injection flags for MockAIOOperations +void SetGlobalMockAIOOperationSetupError(bool error) { + global_mock_aio_operation_io_setup_error = error; +} + +void SetGlobalMockAIOOperationSubmitTimeout(bool timeout, uint64_t timeout_ms) { + global_mock_aio_operation_io_submit_timeout = timeout; + global_mock_aio_operation_io_submit_timeout_ms = timeout_ms; +} + +void SetGlobalMockAIOOperationSubmitError(bool error) { + global_mock_aio_operation_io_submit_error = error; +} + +void SetGlobalMockAIOOperationSubmitMaxProcessedPerCall( + bool is_part_processed, uint64_t max_processed) { + global_mock_aio_operation_io_submit_part_processed = is_part_processed; + global_mock_aio_operation_io_submit_max_processed = max_processed; +} + +void SetGlobalMockAIOOperationGetEventsTimeout(bool timeout, + uint64_t timeout_ms) { + global_mock_aio_operation_io_getevents_timeout = timeout; + global_mock_aio_operation_io_getevents_timeout_ms = timeout_ms; +} + +void SetGlobalMockAIOOperationGetEventsError(bool error) { + global_mock_aio_operation_io_getevents_error = error; +} + +void SetGlobalMockAIOOperationGetEventsNoEvents(bool no_events) { + global_mock_aio_operation_io_getevents_no_events = no_events; +} + +void SetGlobalMockAIOOperationReadError(bool error) { + global_mock_aio_operation_io_read_error = error; +} + +void SetGlobalMockAIOOperationWriteError(bool error) { + global_mock_aio_operation_io_write_error = error; +} + +void SetGlobalMockAIOOperationReadTimeout(bool timeout, uint64_t timeout_ms) { + global_mock_aio_operation_io_read_timeout = timeout; + global_mock_aio_operation_io_timeout_ms = timeout_ms; +} + +void 
SetGlobalMockAIOOperationWriteTimeout(bool timeout, uint64_t timeout_ms) { + global_mock_aio_operation_io_write_timeout = timeout; + global_mock_aio_operation_io_timeout_ms = timeout_ms; +} + +// Functions to set global error injection flags for MockIOAdaptor +void SetGlobalMockIOReadError(bool error) { global_mock_io_read_error = error; } + +void SetGlobalMockIOWriteError(bool error) { + global_mock_io_write_error = error; +} + +void SetGlobalMockIOReadTimeout(bool timeout, uint64_t timeout_ms) { + global_mock_io_read_timeout = timeout; + global_mock_io_timeout_ms = timeout_ms; +} + +void SetGlobalMockIOWriteTimeout(bool timeout, uint64_t timeout_ms) { + global_mock_io_write_timeout = timeout; + global_mock_io_timeout_ms = timeout_ms; +} + +void SetGlobalMockIOBatchReadError(bool error) { + global_mock_io_batch_read_error = error; +} + +void SetGlobalMockIOBatchWriteError(bool error) { + global_mock_io_batch_write_error = error; +} + +void SetGlobalMockIOBatchReadTimeout(bool timeout, uint64_t timeout_ms) { + global_mock_io_batch_read_timeout = timeout; + global_mock_io_timeout_ms = timeout_ms; +} + +void SetGlobalMockIOBatchWriteTimeout(bool timeout, uint64_t timeout_ms) { + global_mock_io_batch_write_timeout = timeout; + global_mock_io_timeout_ms = timeout_ms; +} + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/src/io/error_injection.h b/modules/vllm-kv-cache/src/io/error_injection.h new file mode 100644 index 000000000..a8a80e281 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/error_injection.h @@ -0,0 +1,93 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
#ifndef MODULES_VLLM_KV_CACHE_SRC_IO_ERROR_INJECTION_H_
#define MODULES_VLLM_KV_CACHE_SRC_IO_ERROR_INJECTION_H_

#include <cstdint>

namespace vineyard {

namespace vllm_kv_cache {

namespace io {

// Test-only error-injection knobs. Definitions (and default values) live in
// error_injection.cc; set them via the Set* helpers below.

// Flags consumed by MockAIOOperations.
extern bool global_mock_aio_operation_io_setup_error;
extern bool global_mock_aio_operation_io_submit_timeout;
extern bool global_mock_aio_operation_io_submit_error;
extern bool global_mock_aio_operation_io_submit_part_processed;
extern int64_t global_mock_aio_operation_io_submit_max_processed;
extern bool global_mock_aio_operation_io_getevents_timeout;
extern bool global_mock_aio_operation_io_getevents_error;
extern bool global_mock_aio_operation_io_getevents_no_events;
extern uint64_t global_mock_aio_operation_io_submit_timeout_ms;
extern uint64_t global_mock_aio_operation_io_getevents_timeout_ms;
extern int global_mock_aio_operation_io_setup_error_code;
extern int global_mock_aio_operation_io_submit_error_code;
extern int global_mock_aio_operation_io_getevents_error_code;

extern bool global_mock_aio_operation_io_read_error;
extern bool global_mock_aio_operation_io_write_error;
extern bool global_mock_aio_operation_io_read_timeout;
extern bool global_mock_aio_operation_io_write_timeout;
extern uint64_t global_mock_aio_operation_io_timeout_ms;

// Flags consumed by MockIOAdaptor.
extern bool global_mock_io_read_error;
extern bool global_mock_io_write_error;
extern bool global_mock_io_read_timeout;
extern bool global_mock_io_write_timeout;
extern bool global_mock_io_batch_read_error;
extern bool global_mock_io_batch_write_error;
extern bool global_mock_io_batch_read_timeout;
extern bool global_mock_io_batch_write_timeout;
extern uint64_t global_mock_io_timeout_ms;

// Setters for the MockAIOOperations flags; timeout variants also take the
// injected delay in milliseconds.
void SetGlobalMockAIOOperationSetupError(bool error);
void SetGlobalMockAIOOperationSubmitTimeout(bool timeout,
                                            uint64_t timeout_ms = 1000);
void SetGlobalMockAIOOperationSubmitError(bool error);
void SetGlobalMockAIOOperationSubmitMaxProcessedPerCall(
    bool is_part_processed, uint64_t max_processed = 3);
void SetGlobalMockAIOOperationGetEventsTimeout(bool timeout,
                                               uint64_t timeout_ms = 1000);
void SetGlobalMockAIOOperationGetEventsError(bool error);
void SetGlobalMockAIOOperationGetEventsNoEvents(bool no_events);
void SetGlobalMockAIOOperationReadError(bool error);
void SetGlobalMockAIOOperationWriteError(bool error);
void SetGlobalMockAIOOperationReadTimeout(bool timeout,
                                          uint64_t timeout_ms = 1000);
void SetGlobalMockAIOOperationWriteTimeout(bool timeout,
                                           uint64_t timeout_ms = 1000);

// Setters for the MockIOAdaptor flags; all timeout variants share the single
// global_mock_io_timeout_ms knob.
void SetGlobalMockIOReadError(bool error);
void SetGlobalMockIOWriteError(bool error);
void SetGlobalMockIOReadTimeout(bool timeout, uint64_t timeout_ms = 1000);
void SetGlobalMockIOWriteTimeout(bool timeout, uint64_t timeout_ms = 1000);
void SetGlobalMockIOBatchReadError(bool error);
void SetGlobalMockIOBatchWriteError(bool error);
void SetGlobalMockIOBatchReadTimeout(bool timeout, uint64_t timeout_ms = 1000);
void SetGlobalMockIOBatchWriteTimeout(bool timeout, uint64_t timeout_ms = 1000);

}  // namespace io

}  // namespace vllm_kv_cache

}  // namespace vineyard

#endif  // MODULES_VLLM_KV_CACHE_SRC_IO_ERROR_INJECTION_H_
b/modules/vllm-kv-cache/src/io/io_adaptor.h @@ -0,0 +1,128 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef MODULES_VLLM_KV_CACHE_SRC_IO_IO_ADAPTOR_H_ +#define MODULES_VLLM_KV_CACHE_SRC_IO_IO_ADAPTOR_H_ + +#include +#include +#include +#include + +#include "common/util/status.h" + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +class IIOAdaptor { + public: + virtual Status Open(std::string mode, bool direct_io = false) = 0; + + virtual Status Read(void* data, size_t size) = 0; + + virtual Status Write(void* data, size_t size) = 0; + + virtual Status Close() = 0; + + virtual Status GetFileSize(size_t& size) = 0; + + virtual Status FileTruncate(size_t size) = 0; + + virtual std::future AsyncRead(void* data, size_t size, + size_t offset) { + return std::async(std::launch::async, + [this, data, size]() { return this->Read(data, size); }); + } + + virtual std::future AsyncWrite(void* data, size_t size, + size_t offset) { + return std::async(std::launch::async, + [this, data, size]() { return this->Write(data, size); }); + } + + virtual Status BatchAsyncRead(std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + for (size_t i = 0; i < data_vec.size(); ++i) { + results.push_back( + std::async(std::launch::async, [this, &data_vec, &size_vec, i]() { + return this->Read(data_vec[i], size_vec[i]); + })); + } + return Status::OK(); + } + + virtual Status 
BatchAsyncWrite(std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + for (size_t i = 0; i < data_vec.size(); ++i) { + results.push_back( + std::async(std::launch::async, [this, &data_vec, &size_vec, i]() { + return this->Write(data_vec[i], size_vec[i]); + })); + } + return Status::OK(); + } + + virtual std::future AsyncRead(std::shared_ptr data, size_t size, + size_t offset) { + return std::async(std::launch::async, []() { + return Status::NotImplemented( + "AsyncRead with shared_ptr is not implemented in IIOAdaptor"); + }); + } + + virtual std::future AsyncWrite(std::shared_ptr data, + size_t size, size_t offset) { + return std::async(std::launch::async, []() { + return Status::NotImplemented( + "AsyncWrite with shared_ptr is not implemented in IIOAdaptor"); + }); + } + + virtual Status BatchAsyncRead(std::vector>& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + return Status::NotImplemented( + "BatchAsyncRead with shared_ptr is not implemented in " + "IIOAdaptor"); + } + + virtual Status BatchAsyncWrite(std::vector>& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + return Status::NotImplemented( + "BatchAsyncWrite with shared_ptr is not implemented in " + "IIOAdaptor"); + } +}; + +using IOAdaptorFactory = + std::function(const std::string& path)>; + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_SRC_IO_IO_ADAPTOR_H_ diff --git a/modules/vllm-kv-cache/src/io/mock_aio_operations.cc b/modules/vllm-kv-cache/src/io/mock_aio_operations.cc new file mode 100644 index 000000000..d69520170 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/mock_aio_operations.cc @@ -0,0 +1,523 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "vllm-kv-cache/src/io/mock_aio_operations.h" + +#include +#include +#include +#include +#include +#include + +#include "common/util/logging.h" +#include "vllm-kv-cache/src/io/error_injection.h" + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +MockAIOOperations::MockAIOOperations() : stop_worker_(false) { + // Start worker thread + worker_thread_ = std::thread(&MockAIOOperations::ProcessIORequests, this); +} + +MockAIOOperations::~MockAIOOperations() { + // Stop worker thread + { + std::lock_guard lock(pending_ios_mutex_); + stop_worker_ = true; + pending_ios_cv_.notify_all(); + } + + if (worker_thread_.joinable()) { + worker_thread_.join(); + } + + // Destroy all contexts + std::lock_guard lock(contexts_mutex_); + for (auto& pair : contexts_) { + pair.second->destroyed = true; + pair.second->cv.notify_all(); + } + contexts_.clear(); +} + +int MockAIOOperations::io_setup(int maxevents, io_context_t* ctx_idp) { + LOG(INFO) << "mock io_setup"; + if (global_mock_aio_operation_io_setup_error) { + LOG(ERROR) << "in mock aio operations: io_setup failed due to global " + "injection, error code: " + << global_mock_aio_operation_io_setup_error_code; + return global_mock_aio_operation_io_setup_error_code; + } + + // Create a new context + auto context = std::make_shared(maxevents); + + // Generate a unique context ID (using pointer address) + io_context_t ctx_id = reinterpret_cast(new char); + + // Store context + { + std::lock_guard lock(contexts_mutex_); + contexts_[ctx_id] = context; + } + + *ctx_idp = ctx_id; + return 0; +} + +int 
MockAIOOperations::io_submit(io_context_t ctx_id, int64_t nr, + struct iocb* ios[]) { + if (global_mock_aio_operation_io_submit_timeout) { + LOG(INFO) << "io_submit timeout injected for " + << global_mock_aio_operation_io_submit_timeout_ms << " ms"; + std::this_thread::sleep_for(std::chrono::milliseconds( + global_mock_aio_operation_io_submit_timeout_ms)); + } + + if (global_mock_aio_operation_io_submit_error) { + LOG(ERROR) << "in mock aio operations: io_submit failed due to global " + "injection, error code: " + << global_mock_aio_operation_io_submit_error_code; + return global_mock_aio_operation_io_submit_error_code; + } + + // Find context + std::shared_ptr context; + { + std::lock_guard lock(contexts_mutex_); + auto it = contexts_.find(ctx_id); + if (it == contexts_.end()) { + LOG(ERROR) << "in mock aio operations: io_submit failed: context not " + "found for ctx_id=" + << ctx_id; + return -EINVAL; + } + context = it->second; + } + + if (context->destroyed) { + LOG(ERROR) << "in mock aio operations: io_submit failed: context is " + "destroyed for ctx_id=" + << ctx_id; + return -EBADF; + } + + // Determine how many requests to process + int64_t processed = + (global_mock_aio_operation_io_submit_part_processed == false) + ? 
nr + : std::min(nr, global_mock_aio_operation_io_submit_max_processed); + + // Add I/O requests to pending queue + { + std::lock_guard lock(pending_ios_mutex_); + for (int64_t i = 0; i < processed; i++) { + pending_ios_.push({ctx_id, ios[i]}); + } + pending_ios_cv_.notify_all(); + } + + if (global_mock_aio_operation_io_submit_part_processed) { + LOG(INFO) << "io_submit part processed injected for max processed: " + << global_mock_aio_operation_io_submit_max_processed + << " requests, real processed: " << processed << " requests"; + } + + return processed; +} + +int MockAIOOperations::io_getevents(io_context_t ctx_id, int64_t min_nr, + int64_t nr, struct io_event* events, + struct timespec* timeout) { + if (global_mock_aio_operation_io_getevents_no_events) { + LOG(INFO) << "io_getevents returning 0 events due to global injection"; + return 0; + } + + // Find context + std::shared_ptr context; + { + std::lock_guard lock(contexts_mutex_); + auto it = contexts_.find(ctx_id); + if (it == contexts_.end()) { + LOG(ERROR) << "in mock aio operations: io_getevents failed: context not " + "found for ctx_id=" + << ctx_id; + return -EINVAL; + } + context = it->second; + } + + if (context->destroyed) { + LOG(ERROR) << "in mock aio operations: io_getevents failed: context is " + "destroyed for ctx_id=" + << ctx_id; + return -EBADF; + } + + // Convert timespec to chrono duration + std::chrono::steady_clock::time_point deadline; + bool has_timeout = false; + if (timeout) { + deadline = std::chrono::steady_clock::now() + + std::chrono::seconds(timeout->tv_sec) + + std::chrono::nanoseconds(timeout->tv_nsec); + has_timeout = true; + } + + int64_t num_events = 0; + std::unique_lock lock(context->mutex); + + // Wait for events if needed + if (min_nr > 0 && + context->completed_events.size() < static_cast(min_nr)) { + if (has_timeout) { + context->cv.wait_until(lock, deadline, [context, min_nr]() { + return context->completed_events.size() >= + static_cast(min_nr) || + 
context->destroyed; + }); + } else { + context->cv.wait(lock, [context, min_nr]() { + return context->completed_events.size() >= + static_cast(min_nr) || + context->destroyed; + }); + } + } + + if (context->destroyed) { + return -EBADF; + } + + // Check for timeout + if (has_timeout && std::chrono::steady_clock::now() > deadline) { + if (global_mock_aio_operation_io_getevents_timeout) { + // LOG(ERROR) << "in mock aio operations: io_getevents timed out, error + // code: " << -EAGAIN; + return -EAGAIN; + } + } + + // Collect events + while (num_events < nr && !context->completed_events.empty()) { + events[num_events++] = context->completed_events.front(); + context->completed_events.pop(); + } + + if (global_mock_aio_operation_io_getevents_timeout && num_events > 0) { + // if the first event is a short read(meta), we don't set timeout + if (num_events == 1 && events[0].res <= 4096) { + // Do nothing + } else { + LOG(INFO) << "io_getevents timeout injected for " + << global_mock_aio_operation_io_getevents_timeout_ms << " ms"; + std::this_thread::sleep_for(std::chrono::milliseconds( + global_mock_aio_operation_io_getevents_timeout_ms)); + } + } + + if (global_mock_aio_operation_io_getevents_error && num_events > 0) { + // if the first event is a short read(meta), we don't set timeout + if (num_events == 1 && events[0].res <= 4096) { + // Do nothing + } else { + LOG(ERROR) << "in mock aio operations: io_getevents failed due to global " + "injection, error code: " + << global_mock_aio_operation_io_getevents_error_code; + return global_mock_aio_operation_io_getevents_error_code; + } + } + + return num_events; +} + +int MockAIOOperations::io_destroy(io_context_t ctx_id) { + std::shared_ptr context; + { + std::lock_guard lock(contexts_mutex_); + auto it = contexts_.find(ctx_id); + if (it == contexts_.end()) { + LOG(ERROR) << "in mock aio operations: io_destroy failed: context not " + "found for ctx_id=" + << ctx_id; + return -EINVAL; + } + context = it->second; + 
contexts_.erase(it); + } + + // Mark context as destroyed and notify any waiting threads + context->destroyed = true; + context->cv.notify_all(); + + // Clean up context ID + delete[] reinterpret_cast(ctx_id); + + return 0; +} + +// Mock functions for io_prep_pread and io_prep_pwrite +void MockAIOOperations::io_prep_pread(struct iocb* iocb, int fd, void* buf, + size_t count, int64_t offset) { + std::memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_PREAD; + iocb->aio_reqprio = 0; + // Use the union member 'c' which has buf, nbytes, and offset + iocb->u.c.buf = buf; + iocb->u.c.nbytes = count; + iocb->u.c.offset = offset; +} + +void MockAIOOperations::io_prep_pwrite(struct iocb* iocb, int fd, void* buf, + size_t count, int64_t offset) { + std::memset(iocb, 0, sizeof(*iocb)); + iocb->aio_fildes = fd; + iocb->aio_lio_opcode = IO_CMD_PWRITE; + iocb->aio_reqprio = 0; + // Use the union member 'c' which has buf, nbytes, and offset + iocb->u.c.buf = buf; + iocb->u.c.nbytes = count; + iocb->u.c.offset = offset; +} + +void MockAIOOperations::ProcessIORequests() { + while (!stop_worker_) { + std::vector io_requests; + + // Wait for pending I/O requests + { + std::unique_lock lock(pending_ios_mutex_); + pending_ios_cv_.wait( + lock, [this]() { return !pending_ios_.empty() || stop_worker_; }); + + if (stop_worker_ && pending_ios_.empty()) { + break; + } + + // Dequeue all pending I/O requests + while (!pending_ios_.empty()) { + io_requests.push_back(pending_ios_.front()); + pending_ios_.pop(); + } + } + + // Process all I/O requests concurrently + std::vector workers; + for (const auto& io_request : io_requests) { + workers.emplace_back([this, io_request]() { + SimulateIOOperation(io_request.ctx_id, io_request.iocb_ptr); + }); + } + + // Wait for all workers to complete + for (auto& worker : workers) { + if (worker.joinable()) { + worker.join(); + } + } + } +} + +void MockAIOOperations::SimulateIOOperation(io_context_t ctx_id, + struct 
iocb* iocb) { + // For simplicity, we'll just complete the operation immediately + // In a more realistic implementation, you might want to simulate actual I/O + // delays + ssize_t result = 0; + + switch (iocb->aio_lio_opcode) { + case IO_CMD_PREAD: { + // Simulate reading from a file + result = ReadFromFile(iocb->aio_fildes, iocb->u.c.buf, + static_cast(iocb->u.c.nbytes), + static_cast(iocb->u.c.offset)); + break; + } + case IO_CMD_PWRITE: { + // Simulate writing to a file + result = WriteToFile(iocb->aio_fildes, iocb->u.c.buf, + static_cast(iocb->u.c.nbytes), + static_cast(iocb->u.c.offset)); + break; + } + default: + LOG(ERROR) + << "in mock aio operations: SimulateIOOperation failed: unknown opcode=" + << iocb->aio_lio_opcode; + result = -EINVAL; + break; + } + + CompleteIOOperation(ctx_id, iocb, result); +} + +void MockAIOOperations::CompleteIOOperation(io_context_t ctx_id, + struct iocb* iocb, ssize_t result) { + // Find context + std::shared_ptr context; + { + std::lock_guard lock(contexts_mutex_); + auto it = contexts_.find(ctx_id); + if (it == contexts_.end()) { + LOG(ERROR) << "in mock aio operations: CompleteIOOperation failed: " + "context not found for ctx_id=" + << ctx_id; + return; + } + context = it->second; + } + + if (context->destroyed) { + LOG(ERROR) << "in mock aio operations: CompleteIOOperation failed: context " + "is destroyed for ctx_id=" + << ctx_id; + return; + } + + // Create io_event + struct io_event event; + std::memset(&event, 0, sizeof(event)); + event.data = reinterpret_cast(iocb); + event.obj = iocb; + event.res = result; + event.res2 = 0; + + // Add event to completed events queue + { + std::lock_guard lock(context->mutex); + context->completed_events.push(event); + } + + // Notify waiting threads + context->cv.notify_all(); +} + +ssize_t MockAIOOperations::ReadFromFile(int fd, void* buf, size_t count, + off_t offset) { + LOG(INFO) << "in mock aio operations: ReadFromFile called for fd=" << fd + << ", count=" << count << ", 
offset=" << offset; + + // Inject read error if enabled + if (global_mock_aio_operation_io_read_error) { + LOG(ERROR) << "in mock aio operations: ReadFromFile failed due to global " + "injection"; + return -EIO; + } + + // Find file data + std::shared_ptr file_data; + { + std::lock_guard lock(files_mutex_); + auto it = files_.find(fd); + if (it == files_.end()) { + // File not found, return error + LOG(ERROR) << "in mock aio operations: ReadFromFile failed: file not " + "found for fd=" + << fd; + return -EBADF; + } + file_data = it->second; + } + + size_t to_read = 0; + { + // Lock file data for reading + std::lock_guard lock(file_data->mutex); + + // Check if offset is beyond file size + if (static_cast(offset) >= file_data->data.size()) { + return 0; // EOF + } + + // Calculate number of bytes to read + size_t available = file_data->data.size() - static_cast(offset); + to_read = std::min(count, available); + + // Copy data to buffer + std::memcpy(buf, file_data->data.data() + offset, to_read); + } + + // Inject read timeout if enabled + if (global_mock_aio_operation_io_read_timeout && count > 4096) { + LOG(INFO) << "ReadFromFile timeout injected for " + << global_mock_aio_operation_io_timeout_ms << " ms"; + std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_aio_operation_io_timeout_ms)); + } + + return static_cast(to_read); +} + +ssize_t MockAIOOperations::WriteToFile(int fd, const void* buf, size_t count, + off_t offset) { + // LOG(INFO) << "in mock aio operations: WriteToFile called for fd=" << fd + // << ", count=" << count << ", offset=" << offset; + + // Inject write error if enabled + if (global_mock_aio_operation_io_write_error) { + LOG(ERROR) + << "in mock aio operations: WriteToFile failed due to global injection"; + return -EIO; + } + + // Inject write timeout if enabled + if (global_mock_aio_operation_io_write_timeout) { + LOG(INFO) << "WriteToFile timeout injected for " + << global_mock_aio_operation_io_timeout_ms << " ms"; + 
std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_aio_operation_io_timeout_ms)); + } + + // Find or create file data + std::shared_ptr file_data; + { + std::lock_guard lock(files_mutex_); + auto it = files_.find(fd); + if (it == files_.end()) { + // Create new file data + file_data = std::make_shared(); + files_[fd] = file_data; + } else { + file_data = it->second; + } + } + + // Lock file data for writing + std::lock_guard lock(file_data->mutex); + + // Ensure file data is large enough + size_t required_size = static_cast(offset) + count; + if (file_data->data.size() < required_size) { + file_data->data.resize(required_size); + } + + // Copy data from buffer + std::memcpy(file_data->data.data() + offset, buf, count); + + return static_cast(count); +} + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/src/io/mock_aio_operations.h b/modules/vllm-kv-cache/src/io/mock_aio_operations.h new file mode 100644 index 000000000..652407d52 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/mock_aio_operations.h @@ -0,0 +1,134 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef MODULES_VLLM_KV_CACHE_SRC_IO_MOCK_AIO_OPERATIONS_H_ +#define MODULES_VLLM_KV_CACHE_SRC_IO_MOCK_AIO_OPERATIONS_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vllm-kv-cache/src/io/aio_operations.h" +#include "vllm-kv-cache/src/io/error_injection.h" + +// Define IO_CMD constants if not already defined +#ifndef IO_CMD_PREAD +#define IO_CMD_PREAD 0 +#endif + +#ifndef IO_CMD_PWRITE +#define IO_CMD_PWRITE 1 +#endif + +#ifndef IO_CMD_PREADV +#define IO_CMD_PREADV 7 +#endif + +#ifndef IO_CMD_PWRITEV +#define IO_CMD_PWRITEV 8 +#endif + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +// In-memory implementation of AIO operations +class MockAIOOperations : public vineyard::vllm_kv_cache::io::IAIOOperations { + public: + MockAIOOperations(); + ~MockAIOOperations(); + + int io_setup(int maxevents, io_context_t* ctx_idp) override; + int io_submit(io_context_t ctx_id, int64_t nr, struct iocb* ios[]) override; + int io_getevents(io_context_t ctx_id, int64_t min_nr, int64_t nr, + struct io_event* events, struct timespec* timeout) override; + int io_destroy(io_context_t ctx_id) override; + + void io_prep_pread(struct iocb* iocb, int fd, void* buf, size_t count, + int64_t offset); + void io_prep_pwrite(struct iocb* iocb, int fd, void* buf, size_t count, + int64_t offset); + + private: + // Internal structure to represent an AIO context + struct AIOContext { + std::mutex mutex; + std::condition_variable cv; + std::queue completed_events; + std::atomic destroyed; + int max_events; + + explicit AIOContext(int maxevents) + : destroyed(false), max_events(maxevents) {} + }; + + // Map of file descriptors to file data + struct FileData { + std::vector data; + std::mutex mutex; + }; + + std::unordered_map> files_; + std::mutex files_mutex_; + + // Map of context IDs to context data + std::unordered_map> contexts_; + std::mutex contexts_mutex_; + + // Worker thread for 
processing I/O requests + std::thread worker_thread_; + std::atomic stop_worker_; + + // Queue for pending I/O requests + struct PendingIO { + io_context_t ctx_id; + struct iocb* iocb_ptr; + }; + + std::queue pending_ios_; + std::mutex pending_ios_mutex_; + std::condition_variable pending_ios_cv_; + + // Worker function to process I/O requests + void ProcessIORequests(); + + // Helper function to simulate I/O operation + void SimulateIOOperation(io_context_t ctx_id, struct iocb* iocb); + + // Helper function to complete an I/O operation + void CompleteIOOperation(io_context_t ctx_id, struct iocb* iocb, + ssize_t result); + + // Helper functions for file operations + ssize_t ReadFromFile(int fd, void* buf, size_t count, off_t offset); + ssize_t WriteToFile(int fd, const void* buf, size_t count, off_t offset); +}; + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_SRC_IO_MOCK_AIO_OPERATIONS_H_ diff --git a/modules/vllm-kv-cache/src/io/mock_io_adapter.cc b/modules/vllm-kv-cache/src/io/mock_io_adapter.cc new file mode 100644 index 000000000..115a0caf7 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/mock_io_adapter.cc @@ -0,0 +1,221 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include "vllm-kv-cache/src/io/mock_io_adapter.h" +#include "vllm-kv-cache/src/io/mock_aio_operations.h" + +#include +#include +#include +#include +#include +#include + +#include "common/util/logging.h" + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +IOAdaptorFactory GetMockIOAdaptorFactory() { + LOG(INFO) << "Using mock IO adaptor"; + return [](const std::string& path) + -> std::shared_ptr { + auto adaptor = std::make_shared(path); + return adaptor; + }; +} + +// MockIOAdaptor using MockAIOOperations +MockIOAdaptor::MockIOAdaptor(const std::string& location) + : location_(location), + aio_adaptor_(std::make_shared( + location, + std::make_shared())) { +} + +Status MockIOAdaptor::Open(std::string mode, bool direct_io) { + return aio_adaptor_->Open(mode, direct_io); +} + +Status MockIOAdaptor::Read(void* data, size_t size) { + // Check if we need to inject a read error + if (global_mock_io_read_error) { + return Status::IOError("Injected read error"); + } + + // Check if we need to simulate a read timeout + if (global_mock_io_read_timeout) { + std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_io_timeout_ms)); + } + + return aio_adaptor_->Read(data, size); +} + +Status MockIOAdaptor::Write(void* data, size_t size) { + // Check if we need to inject a write error + if (global_mock_io_write_error) { + return Status::IOError("Injected write error"); + } + + // Check if we need to simulate a write timeout + if (global_mock_io_write_timeout) { + std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_io_timeout_ms)); + } + + // need to write data to disk(cpfs) to serve for get file size + WriteDataAtOffset(location_, data, size, 0); + + return aio_adaptor_->Write(data, size); +} + +Status MockIOAdaptor::Close() { return aio_adaptor_->Close(); } + +Status MockIOAdaptor::GetFileSize(size_t& size) { + return aio_adaptor_->GetFileSize(size); +} + +Status MockIOAdaptor::FileTruncate(size_t size) { return 
Status::OK(); } + +std::future MockIOAdaptor::AsyncWrite(void* data, size_t size, + size_t offset) { + if (global_mock_io_write_error) { + std::promise promise; + promise.set_value(Status::IOError("Injected async write error")); + return promise.get_future(); + } + + if (global_mock_io_write_timeout) { + std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_io_timeout_ms)); + } + + // need to write data to disk(cpfs) to serve for get file size + WriteDataAtOffset(location_, data, size, offset); + + return aio_adaptor_->AsyncWrite(data, size, offset); +} + +std::future MockIOAdaptor::AsyncRead(void* data, size_t size, + size_t offset) { + if (global_mock_io_read_error) { + std::promise promise; + promise.set_value(Status::IOError("Injected async read error")); + return promise.get_future(); + } + + if (global_mock_io_read_timeout) { + std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_io_timeout_ms)); + } + + return aio_adaptor_->AsyncRead(data, size, offset); +} + +Status MockIOAdaptor::BatchAsyncRead( + std::vector& data_vec, std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (global_mock_io_batch_read_error) { + for (size_t i = 0; i < data_vec.size(); ++i) { + std::promise promise; + promise.set_value( + Status::IOError("Injected batch async read error for operation " + + std::to_string(i))); + results.push_back(promise.get_future()); + } + return Status::OK(); + } + + if (global_mock_io_batch_read_timeout) { + std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_io_timeout_ms)); + } + + return aio_adaptor_->BatchAsyncRead(data_vec, size_vec, offset_vec, results); +} + +Status MockIOAdaptor::BatchAsyncWrite( + std::vector& data_vec, std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) { + if (global_mock_io_batch_write_error) { + for (size_t i = 0; i < data_vec.size(); ++i) { + std::promise promise; + promise.set_value( + Status::IOError("Injected batch async 
write error for operation " + + std::to_string(i))); + results.push_back(promise.get_future()); + } + return Status::OK(); + } + + if (global_mock_io_batch_write_timeout) { + std::this_thread::sleep_for( + std::chrono::milliseconds(global_mock_io_timeout_ms)); + } + + // need to write data to disk(cpfs) to serve for get file size + for (size_t i = 0; i < data_vec.size(); ++i) { + WriteDataAtOffset(location_, data_vec[i], size_vec[i], offset_vec[i]); + } + + return aio_adaptor_->BatchAsyncWrite(data_vec, size_vec, offset_vec, results); +} + +Status MockIOAdaptor::WriteDataAtOffset(const std::string& location, void* data, + size_t size, size_t offset) { + int fd = open(location.c_str(), O_CREAT | O_RDWR, 0666); + if (fd == -1) { + LOG(ERROR) << "Failed to open file: " << location << ", errno: " << errno; + return Status::IOError("Failed to open file: " + location + + ", errno: " + std::to_string(errno)); + } + + if (lseek(fd, offset, SEEK_SET) == -1) { + close(fd); + return Status::IOError("Failed to seek to offset: " + + std::to_string(offset)); + } + + ssize_t written = write(fd, data, size); + if (written == -1) { + close(fd); + return Status::IOError("Failed to write data"); + } + + if (static_cast(written) != size) { + close(fd); + return Status::IOError("Incomplete write: expected " + + std::to_string(size) + " bytes, wrote " + + std::to_string(written) + " bytes"); + } + + if (close(fd) == -1) { + return Status::IOError("Failed to close file"); + } + + return Status::OK(); +} + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/src/io/mock_io_adapter.h b/modules/vllm-kv-cache/src/io/mock_io_adapter.h new file mode 100644 index 000000000..b71e1d254 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/mock_io_adapter.h @@ -0,0 +1,85 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef MODULES_VLLM_KV_CACHE_SRC_IO_MOCK_IO_ADAPTER_H_ +#define MODULES_VLLM_KV_CACHE_SRC_IO_MOCK_IO_ADAPTER_H_ + +#include +#include +#include + +#include "common/util/callback.h" +#include "common/util/env.h" +#include "common/util/status.h" +#include "vllm-kv-cache/src/io/aio_adaptor.h" +#include "vllm-kv-cache/src/io/error_injection.h" +#include "vllm-kv-cache/src/io/io_adaptor.h" + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +// Factory function for creating MockIOAdaptor instances +IOAdaptorFactory GetMockIOAdaptorFactory(); + +class MockIOAdaptor : public io::IIOAdaptor { + public: + explicit MockIOAdaptor(const std::string& location); + + Status Open(std::string mode, bool direct_io = false) override; + + Status Read(void* data, size_t size) override; + + Status Write(void* data, size_t size) override; + + Status Close() override; + + Status GetFileSize(size_t& size) override; + + Status FileTruncate(size_t size) override; + + std::future AsyncWrite(void* data, size_t size, + size_t offset) override; + + std::future AsyncRead(void* data, size_t size, + size_t offset) override; + + Status BatchAsyncRead(std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) override; + + Status BatchAsyncWrite(std::vector& data_vec, + std::vector& size_vec, + std::vector& offset_vec, + std::vector>& results) override; + + Status WriteDataAtOffset(const std::string& 
location, void* data, size_t size, + size_t offset); + + private: + std::string location_; + std::shared_ptr aio_adaptor_; +}; + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_SRC_IO_MOCK_IO_ADAPTER_H_ diff --git a/modules/vllm-kv-cache/src/io/posix_io_adaptor.cc b/modules/vllm-kv-cache/src/io/posix_io_adaptor.cc new file mode 100644 index 000000000..2afcccca0 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/posix_io_adaptor.cc @@ -0,0 +1,114 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include + +#include "common/util/status.h" +#include "vllm-kv-cache/src/io/posix_io_adaptor.h" + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +Status PosixIOAdaptor::Open(std::string mode, bool direct_io) { + int flags = 0; + if (mode == "r") { + flags = O_RDONLY; + } else if (mode == "w") { + flags = O_WRONLY | O_CREAT | O_TRUNC; + } else { + return Status::Invalid("Invalid mode: " + mode); + } + fd_ = open(location_.c_str(), flags, 0666); + if (fd_ == -1) { + return Status::IOError("Failed to open file: " + location_ + + " error:" + std::string(strerror(errno))); + } + return Status::OK(); +} + +Status PosixIOAdaptor::Read(void* data, size_t size) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + ssize_t bytes_read = read(fd_, data, size); + if (bytes_read == -1) { + return Status::IOError("Failed to read file: " + location_ + + " error:" + std::string(strerror(errno))); + } + if (static_cast(bytes_read) < size) { + return Status::EndOfFile(); + } + return Status::OK(); +} + +Status PosixIOAdaptor::Write(void* data, size_t size) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_); + } + ssize_t bytes_written = write(fd_, data, size); + if (bytes_written == -1) { + return Status::IOError("Failed to write file: " + location_ + + " error:" + std::string(strerror(errno))); + } + if (static_cast(bytes_written) < size) { + return Status::IOError("Partial write to file: " + location_ + + " error:" + std::string(strerror(errno))); + } + return Status::OK(); +} + +Status PosixIOAdaptor::Close() { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + if (close(fd_) < 0) { + return Status::IOError("Failed to close file: " + location_ + + " error:" + std::string(strerror(errno))); + } + fd_ = -1; + return Status::OK(); +} + +Status 
PosixIOAdaptor::FileTruncate(size_t size) { + if (fd_ == -1) { + return Status::IOError("File not opened: " + location_ + + " error:" + std::string(strerror(errno))); + } + + if (ftruncate(fd_, size) < 0) { + return Status::IOError("Failed to truncate file: " + location_ + + " to size: " + std::to_string(size) + + " error:" + std::string(strerror(errno))); + } + return Status::OK(); +} + +Status PosixIOAdaptor::GetFileSize(size_t& size) { + return Status::NotImplemented( + "GetFileSize is not implemented in PosixIOAdaptor"); +} + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/src/io/posix_io_adaptor.h b/modules/vllm-kv-cache/src/io/posix_io_adaptor.h new file mode 100644 index 000000000..ea629ea64 --- /dev/null +++ b/modules/vllm-kv-cache/src/io/posix_io_adaptor.h @@ -0,0 +1,58 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef MODULES_VLLM_KV_CACHE_SRC_IO_POSIX_IO_ADAPTOR_H_ +#define MODULES_VLLM_KV_CACHE_SRC_IO_POSIX_IO_ADAPTOR_H_ + +#include + +#include "common/util/status.h" +#include "vllm-kv-cache/src/io/io_adaptor.h" + +namespace vineyard { + +namespace vllm_kv_cache { + +namespace io { + +class PosixIOAdaptor : public IIOAdaptor { + public: + explicit PosixIOAdaptor(const std::string& location, bool direct_io = false) + : location_(location), fd_(-1) {} + + Status Open(std::string mode, bool direct_io = false) override; + + Status Read(void* data, size_t size) override; + + Status Write(void* data, size_t size) override; + + Status GetFileSize(size_t& size) override; + + Status Close() override; + + Status FileTruncate(size_t size) override; + + private: + std::string location_; + int fd_ = -1; +}; + +} // namespace io + +} // namespace vllm_kv_cache + +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_SRC_IO_POSIX_IO_ADAPTOR_H_ diff --git a/modules/vllm-kv-cache/src/storage/vllm_kv_storage.cc b/modules/vllm-kv-cache/src/storage/vllm_kv_storage.cc new file mode 100644 index 000000000..ad20c1532 --- /dev/null +++ b/modules/vllm-kv-cache/src/storage/vllm_kv_storage.cc @@ -0,0 +1,2401 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/memory/memcpy.h" +#include "common/util/get_tid.h" +#include "common/util/logging.h" +#include "common/util/monitor.h" +#include "vllm-kv-cache/ds/vllm_block.h" +#include "vllm-kv-cache/src/env.h" +#include "vllm-kv-cache/src/io/aio_adaptor.h" +#include "vllm-kv-cache/src/io/mock_io_adapter.h" +#include "vllm-kv-cache/src/storage/vllm_kv_storage.h" +#include "vllm-kv-cache/src/vllm_kv_cache_util.h" + +namespace vineyard { + +extern std::atomic VLLMKVStorage::req_count_; +extern uint64_t VLLMKVStorage::storage_base_pointer_; +extern uint64_t VLLMKVStorage::threads_; + +extern std::vector> VLLMKVStorage::vineyard_clients_; +extern std::vector> VLLMKVStorage::req_thread_vec_; +extern std::shared_ptr VLLMKVStorage::io_thread_pool_; +extern std::shared_ptr VLLMKVStorage::copy_thread_pool_; +extern std::shared_ptr VLLMKVStorage::fast_opt_thread_pool_; +extern std::shared_ptr VLLMKVStorage::block_opt_thread_pool_; +extern bool VLLMKVStorage::use_copy_; +extern bool VLLMKVStorage::direct_io_; +extern monitor::Monitor VLLMKVStorage::load_from_disk_io_monitor_; +extern monitor::Monitor VLLMKVStorage::load_memory_copy_monitor_; +extern monitor::Monitor VLLMKVStorage::load_from_disk_monitor_; +extern monitor::Monitor VLLMKVStorage::save_to_disk_io_monitor_; +extern monitor::Monitor VLLMKVStorage::save_to_disk_monitor_; +extern monitor::Monitor VLLMKVStorage::save_memory_copy_monitor_; +extern vllm_kv_cache::io::IOAdaptorFactory VLLMKVStorage::io_adaptor_factory_; + +Status VLLMKVStorage::InitStorage(uint64_t base_pointer, std::string ipc_socket, + std::string io_type, bool enable_mem_copy, + bool direct_io) { + storage_base_pointer_ = base_pointer; + threads_ = std::stoull(VLLMKVCacheEnv::GetKVStorageConcurrency()) / 2; + + RETURN_ON_ERROR(KVCacheHelper::Init(threads_)); + size_t vllm_max_block_num = + 
std::stoull(VLLMKVCacheEnv::GetVineyardVLLMMaxBlockNum()); + size_t vllm_block_meta_magic_size = + std::stoull(VLLMKVCacheEnv::GetVineyardVLLMBlockMetaMagicSize()); + for (size_t i = 0; i < threads_; i++) { + req_thread_vec_.push_back(std::make_shared(1)); + vineyard_clients_.emplace_back(std::make_shared()); + RETURN_ON_ERROR(vineyard_clients_.back()->Connect(ipc_socket)); + RETURN_ON_ERROR(vineyard_clients_.back()->RequireExtraRequestMemory( + vllm_block_meta_magic_size * vllm_max_block_num)); + } + LOG(INFO) << "VLLMKVStorage::InitStorage: vineyard_clients_ size = " + << vineyard_clients_.size(); + io_thread_pool_ = std::make_shared(threads_); + copy_thread_pool_ = std::make_shared(threads_); + fast_opt_thread_pool_ = std::make_shared(threads_); + block_opt_thread_pool_ = std::make_shared(threads_); + LOG(INFO) << "VLLMKVStorage::InitStorage: req_thread_pool_ threads = " + << threads_ << ", io_thread_pool_ threads = " << threads_ + << ", copy_thread_pool_ threads = " << threads_ + << ", fast_opt_thread_pool_ threads = " << threads_ + << ", block_opt_thread_pool_ threads = " << threads_; + + InitMonitor(); + use_copy_ = enable_mem_copy; + direct_io_ = direct_io; + LOG(INFO) << "VLLMKVStorage::InitStorage: use_copy_ = " << use_copy_; + LOG(INFO) << "VLLMKVStorage::InitStorage: direct_io_ = " << direct_io_; + + // Initialize io_adaptor_factory_ to use AIOAdaptor by default + if (io_type == "aio") { + io_adaptor_factory_ = vineyard::vllm_kv_cache::io::GetAIOAdaptorFactory(); + } else if (io_type == "mock") { + io_adaptor_factory_ = + vineyard::vllm_kv_cache::io::GetMockIOAdaptorFactory(); + } else { + return Status::Invalid("Invalid io_type: " + io_type); + } + return Status::OK(); +} + +Status VLLMKVStorage::GetBlockLocation( + std::vector& block_hash, + std::vector>& locations, std::string req_flag) { + if (block_hash.size() == 0) { + return Status::OK(); + } + size_t index = req_count_.fetch_add(1) % threads_; + VLOG(KVCacheHelper::GetTraceLogLevel()) + << 
"Request: " << req_flag << " arrived at GetBlockLocation, " + << "assigned to thread: " << index; + return req_thread_vec_[index] + ->enqueue([&]() { + uint64_t start = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + Status status = GetBlockLocation(*vineyard_clients_[index], block_hash, + locations, req_flag); + VINEYARD_ASSERT(vineyard_clients_[index]->Connected(), + "vineyard client is not connected"); + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request : " << req_flag + << ". GetBlockLocation cost:" << (end - start) << " us."; + return status; + }) + .get(); +} + +Status VLLMKVStorage::GetBlockLocation( + Client& client, std::vector& block_hash, + std::vector>& locations, std::string& req_flag) { + VLOG(2) << "GetBlockLocation for request: " << req_flag; + std::vector block_names; + for (auto hash : block_hash) { + block_names.push_back(KVCacheHelper::BuildBlockName(hash)); + } + return client.GetObjectLocation(block_names, locations, req_flag); +} + +Status VLLMKVStorage::GetBlockKVCacheLayerwise( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::string rpc_endpoint, + std::shared_ptr& layers, std::string req_flag) { + RETURN_ON_ASSERT( + block_hash_vec.size() == offsets_vec.size(), + "block_hash and offsets size not match for request: " + req_flag); + RETURN_ON_ASSERT( + block_hash_vec.size() == sizes_vec.size(), + "block_hash and sizes size not match for request: " + req_flag); + RETURN_ON_ASSERT(layer_index >= 0, + "layer_index must be >= 0 for request: " + req_flag); + RETURN_ON_ASSERT(layer_index < static_cast(shape.size()), + "layer_index must be < shape.size()"); + if (block_hash_vec.size() == 0) { + return Status::OK(); + } + + size_t index = req_count_.fetch_add(1) % threads_; + 
VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << " arrived at GetBlockKVCacheLayerwise, " + << "assigned to thread: " << index; + return req_thread_vec_[index] + ->enqueue([&]() { + uint64_t start = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + Status status = GetBlockKVCacheLayerwise( + *vineyard_clients_[index], block_hash_vec, offsets_vec, sizes_vec, + shape, layer_index, rpc_endpoint, layers, req_flag); + VINEYARD_ASSERT(vineyard_clients_[index]->Connected(), + "vineyard client is not connected"); + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request : " << req_flag + << ". GetBlockKVCacheLayerwise cost:" << (end - start) << " us."; + return status; + }) + .get(); +} + +Status VLLMKVStorage::GetBlockKVCacheLayerwise( + Client& client, std::vector& block_hash, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::string rpc_endpoint, + std::shared_ptr& layers, std::string req_flag) { + // from name get meta + // 1. send request to remote to get meta + // 2. shuffle buffer to layer wise + // 3. 
get remote buffer layer by layer + VLOG(2) << "GetBlockKVCacheLayerwise: " + << "block_hash size: " << block_hash.size() + << ", offsets_vec size: " << offsets_vec.size() + << ", sizes_vec size: " << sizes_vec.size() + << ", shape size: " << shape.size() + << ", layer_index: " << layer_index << ", request id: " << req_flag; + uint64_t start = 0, end = 0; + RETURN_ON_ASSERT( + block_hash.size() == offsets_vec.size(), + "block_hash and offsets size not match for request: " + req_flag); + RETURN_ON_ASSERT( + block_hash.size() == sizes_vec.size(), + "block_hash and sizes size not match for request: " + req_flag); + RETURN_ON_ASSERT(layer_index >= 0, + "layer_index must be >= 0 for request: " + req_flag); + RETURN_ON_ASSERT( + layer_index < static_cast(shape.size()), + "layer_index must be < shape.size() for request: " + req_flag); + std::vector block_names; + for (auto hash : block_hash) { + block_names.push_back(KVCacheHelper::BuildBlockName(hash)); + } + std::vector meta_vec; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR(client.VineyardGetMetasByNames(block_names, rpc_endpoint, + meta_vec, req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". Get remote meta cost: " << (end - start) + << " us"; + RETURN_ON_ASSERT( + meta_vec.size() == block_hash.size(), + "meta_vec and block_hash size not match for request: " + req_flag); + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + Status status = VLLMLayers::FromBlocks( + client, block_hash, offsets_vec, sizes_vec, shape, layer_index, meta_vec, + rpc_endpoint, layers, req_flag); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Construct layers from blocks cost:" << (end - start) << " us"; + return status; +} + +Status VLLMKVStorage::PutBlockKVCache( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, std::string req_flag) { + RETURN_ON_ASSERT( + block_hash_vec.size() == offsets_vec.size(), + "block_hash and offsets size not match for request: " + req_flag); + RETURN_ON_ASSERT( + block_hash_vec.size() == sizes_vec.size(), + "block_hash and sizes size not match for request: " + req_flag); + RETURN_ON_ASSERT(layer_index >= 0, + "layer_index must be >= 0 for request: " + req_flag); + RETURN_ON_ASSERT( + layer_index < static_cast(shape.size()), + "layer_index must be < shape.size() for request: " + req_flag); + if (block_hash_vec.size() == 0) { + return Status::OK(); + } + + size_t index = req_count_.fetch_add(1) % threads_; + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << " arrived at PutBlockKVCache, " + << "assigned to thread: " << index; + return req_thread_vec_[index] + ->enqueue([&]() { + uint64_t start = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + Status status = PutBlockKVCache(*vineyard_clients_[index], + block_hash_vec, offsets_vec, sizes_vec, + shape, layer_index, statuses, req_flag); + VINEYARD_ASSERT(vineyard_clients_[index]->Connected(), + "vineyard client is not connected"); + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request : " << req_flag + << ". 
PutBlockKVCache cost:" << (end - start) << " us."; + return status; + }) + .get(); +} + +Status VLLMKVStorage::PutBlockKVCache( + Client& client, std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, std::string& req_flag) { + std::vector> blocks; + return PutBlockKVCache(client, block_hash_vec, offsets_vec, sizes_vec, shape, + layer_index, blocks, statuses, req_flag); +} + +Status VLLMKVStorage::PutBlockKVCache( + Client& client, std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape_vec, int layer_index, + std::vector>& blocks, + std::vector& statuses, std::string& req_flag) { + VLOG(2) << "PutBlockKVCache: " + << "block_hash_vec size: " << block_hash_vec.size() + << ", offsets_vec size: " << offsets_vec.size() + << ", sizes_vec size: " << sizes_vec.size() + << ", shape_vec size: " << shape_vec.size() + << ", layer_index: " << layer_index << ", request id: " << req_flag; + uint64_t start = 0, end = 0; + + std::vector> block_builders_to_delete; + std::vector> blocks_to_delete; + statuses.resize(block_hash_vec.size(), Status::OK()); + std::vector> block_builders; + std::vector block_names; + std::vector ids; + for (auto hash : block_hash_vec) { + block_names.push_back(KVCacheHelper::BuildBlockName(hash)); + } + Status status = + VLLMBlockBuilder::Make(client, offsets_vec, sizes_vec, shape_vec, + layer_index, block_builders, req_flag); + if (!status.ok()) { + statuses.assign(block_hash_vec.size(), status); + return Status::OK(); + } + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + status = VLLMBlockBuilder::BatchSeal(client, block_builders, blocks, ids, + req_flag); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << " BatchSeal cost:" << (end 
- start) + << " us"; + if (!status.ok()) { + LOG(ERROR) << "Failed to seal blocks: " << status.ToString() + << ", request id: " << req_flag; + for (auto& builder : block_builders) { + block_builders_to_delete.push_back(builder); + } + statuses.assign(block_hash_vec.size(), status); + } else { + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + Status status = client.PutNames(ids, block_names, req_flag, false); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << " PutNames cost:" << (end - start) + << " us"; + if (!status.ok()) { + LOG(ERROR) << "Failed to put block names: " << status.ToString() + << ", request id: " << req_flag; + for (auto& block : blocks) { + blocks_to_delete.push_back(block); + } + statuses.assign(block_hash_vec.size(), status); + } + } + + /* + * Here we can discard the result of DeleteBlocks with no fatal effect because + * this error occurs when put block name to meta service. If it is failed, + * other node can not get the block by name, so it is safe to notify user + * that the block is not put successfully. It just cause a memory leak + * if the block is not deleted. + */ + if (!DeleteBlocks(client, blocks_to_delete, req_flag).ok()) { + LOG(ERROR) << "Failed to delete blocks, may cause memory leak. Request id: " + << req_flag; + } + + if (!DeleteBlockBuilders(client, block_builders_to_delete, req_flag).ok()) { + LOG(ERROR) << "Failed to delete block builders, may cause memory leak. 
" + "Request id: " + << req_flag; + } + + return Status::OK(); +} + +Status VLLMKVStorage::PutBlockKVCache( + std::vector& block_hash_vec, + std::vector>& block_builders, + std::vector& statuses, std::string& req_flag) { + size_t index = req_count_.fetch_add(1) % threads_; + return req_thread_vec_[index] + ->enqueue([&]() { + Status status = + PutBlockKVCache(*vineyard_clients_[index], block_hash_vec, + block_builders, statuses, req_flag); + VINEYARD_ASSERT(vineyard_clients_[index]->Connected(), + "vineyard client is not connected"); + return status; + }) + .get(); +} + +Status VLLMKVStorage::PutBlockKVCache( + Client& client, std::vector& block_hash_vec, + std::vector>& block_builders, + std::vector& statuses, std::string& req_flag) { + std::vector> blocks_to_delete; + std::vector> block_builders_to_delete; + RETURN_ON_ASSERT( + block_hash_vec.size() == block_builders.size(), + "block_hash and block_builders size not match for request: " + req_flag); + statuses.resize(block_hash_vec.size()); + for (size_t i = 0; i < block_hash_vec.size(); ++i) { + std::shared_ptr block; + Status status = block_builders[i]->Seal(client, block); + if (!status.ok()) { + LOG(ERROR) << "Failed to seal block: " << status.ToString() + << ", request id: " << req_flag; + block_builders_to_delete.push_back(block_builders[i]); + statuses[i] = status; + continue; + } + status = client.PutName( + block->id(), KVCacheHelper::BuildBlockName(block_hash_vec[i]), false); + if (!status.ok()) { + LOG(ERROR) << "Failed to put block name: " << status.ToString() + << ", request id: " << req_flag; + blocks_to_delete.push_back(std::dynamic_pointer_cast(block)); + statuses[i] = status; + continue; + } + statuses[i] = status; + } + if (!DeleteBlockBuilders(client, block_builders_to_delete, req_flag).ok()) { + LOG(ERROR) << "Failed to delete block builders, may cause memory leak. 
" + "Request id: " + << req_flag; + } + + if (!DeleteBlocks(client, blocks_to_delete, req_flag).ok()) { + LOG(ERROR) << "Failed to delete blocks, may cause memory leak. Request id: " + << req_flag; + } + return Status::OK(); +} + +Status VLLMKVStorage::DeleteBlocks(std::vector block_hash_vec, + std::string req_flag) { + if (block_hash_vec.size() == 0) { + return Status::OK(); + } + size_t index = req_count_.fetch_add(1) % threads_; + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << " arrived at DeleteBlocks, " + << "assigned to thread: " << index; + return req_thread_vec_[index] + ->enqueue([&]() { + uint64_t start = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + Status status = block_opt_thread_pool_ + ->enqueue([&]() { + return DeleteBlocks(*vineyard_clients_[index], + block_hash_vec, req_flag); + }) + .get(); + + VINEYARD_ASSERT(vineyard_clients_[index]->Connected(), + "vineyard client is not connected"); + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request : " << req_flag + << ". 
DeleteBlocks cost:" << (end - start) << " us."; + return status; + }) + .get(); +} + +Status VLLMKVStorage::DeleteBlocks(Client& client, + std::vector block_hash_vec, + std::string& req_flag) { + uint64_t start = 0, end = 0; + std::vector block_name_vec; + size_t block_num = block_hash_vec.size(); + + block_name_vec.reserve(block_num); + for (auto block_hash : block_hash_vec) { + std::string block_name = KVCacheHelper::BuildBlockName(block_hash); + block_name_vec.push_back(block_name); + } + + std::vector id_vec; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VINEYARD_CHECK_OK(client.GetNames(block_name_vec, id_vec, req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Get block ids from names cost: " << (end - start) << " us"; + + std::vector valid_id_vec; + for (auto id : id_vec) { + if (id != InvalidObjectID()) { + valid_id_vec.push_back(id); + } + } + if (valid_id_vec.size() == 0) { + return Status::OK(); + } + + std::vector meta_vec; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VINEYARD_CHECK_OK( + client.GetHugeMetaData(valid_id_vec, meta_vec, req_flag, false, true)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". Get block metas cost: " << (end - start) + << " us"; + + // Drop name will triggler rpc, so we discard the error result + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + Status status = client.DropNames(block_name_vec, req_flag); + if (!status.ok()) { + LOG(WARNING) << "Failed to drop block names: " << status.ToString() + << ", may cause inconsistency. 
Request id: " << req_flag; + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". Drop block names cost: " << (end - start) + << " us"; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VINEYARD_CHECK_OK( + client.DelHugeData(valid_id_vec, false, false, true, req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Delete block objects cost: " << (end - start) << " us"; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VINEYARD_CHECK_OK(CleanBlockBlobs(client, meta_vec, req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Clean block blobs cost: " << (end - start) << " us"; + + return Status::OK(); +} + +Status VLLMKVStorage::DeleteBlocks( + Client& client, std::vector>& blocks, + std::string& req_flag) { + std::vector meta_vec; + std::vector id_vec; + meta_vec.reserve(blocks.size()); + id_vec.reserve(blocks.size()); + + for (auto block : blocks) { + id_vec.push_back(block->id()); + meta_vec.push_back(block->meta()); + } + + VINEYARD_CHECK_OK(client.DelData(id_vec)); + VINEYARD_CHECK_OK(CleanBlockBlobs(client, meta_vec, req_flag)); + return Status::OK(); +} + +Status VLLMKVStorage::DeleteBlockBuilders( + std::vector>& block_builders, + std::string& req_flag) { + size_t index = req_count_.fetch_add(1) % threads_; + return req_thread_vec_[index] + ->enqueue([&]() { + Status status = DeleteBlockBuilders(*vineyard_clients_[index], + block_builders, req_flag); + VINEYARD_ASSERT(vineyard_clients_[index]->Connected(), + "vineyard client is not connected"); + return status; + }) + .get(); +} + +Status VLLMKVStorage::DeleteBlockBuilders( + Client& client, + std::vector>& block_builders, + std::string& req_flag) { + Status status = Status::OK(); + for (auto block_builder : block_builders) { + Status status_ = CleanBlockBuilderBlobs(client, block_builder, req_flag); + if (!status_.ok()) { + LOG(WARNING) << "Failed to clean block builder" + << ", may cause memory leak. Error: " << status_.ToString() + << ", request id: " << req_flag; + status += status_; + } + } + return status; +} + +/** + * FIXME: If the user 1 create a udc, and user 2 create the same udc, it + * can occur that the user 2 rename file failed and then the udc is deleted + * by ttl. Then the udc of user 2 will lose some blocks but the system think + * the udc is valid. + * + * FIXME: refresh block ttl after save to disk. 
+ */ +Status VLLMKVStorage::SaveToDisk( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, uint64_t ttl, bool wait, + std::string req_flag) { + RETURN_ON_ASSERT(block_hash_vec.size() == offsets_vec.size(), + "block_hash and offsets size not match"); + RETURN_ON_ASSERT(block_hash_vec.size() == sizes_vec.size(), + "block_hash and sizes size not match"); + RETURN_ON_ASSERT(layer_index >= 0, "layer_index must be >= 0"); + RETURN_ON_ASSERT(layer_index < static_cast(shape.size()), + "layer_index must be < shape.size()"); + if (block_hash_vec.size() == 0) { + return Status::OK(); + } + + size_t index = req_count_.fetch_add(1) % threads_; + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << " arrived at SaveToDisk, " + << "assigned to thread: " << index; + return req_thread_vec_[index] + ->enqueue([&]() { + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + statuses.resize(block_hash_vec.size(), Status::OK()); + std::vector filtered_hash_vec; + std::vector filtered_hash_index; + std::vector exist_block_vec; + std::vector> filtered_offsets_vec; + std::vector> filtered_sizes_vec; + if (!FilterFiles(block_hash_vec, exist_block_vec, filtered_hash_vec, + filtered_hash_index) + .ok()) { + filtered_hash_vec = block_hash_vec; + filtered_hash_index.resize(block_hash_vec.size()); + for (size_t i = 0; i < block_hash_vec.size(); ++i) { + filtered_hash_index[i] = i; + } + filtered_offsets_vec = offsets_vec; + filtered_sizes_vec = sizes_vec; + } + for (auto index : filtered_hash_index) { + filtered_offsets_vec.push_back(offsets_vec[index]); + filtered_sizes_vec.push_back(sizes_vec[index]); + } + RETURN_ON_ERROR(UpdateTTL(*vineyard_clients_[index], exist_block_vec, + ttl, req_flag)); + + Status status = Status::OK(); + if (use_copy_) { + status = 
SaveToDiskWithCopy(*vineyard_clients_[index], + filtered_hash_vec, filtered_offsets_vec, + filtered_sizes_vec, shape, layer_index, + statuses, ttl, wait, req_flag); + } else { + status = SaveToDiskWithoutCopy( + *vineyard_clients_[index], filtered_hash_vec, + filtered_offsets_vec, filtered_sizes_vec, shape, layer_index, + statuses, ttl, req_flag); + } + VINEYARD_ASSERT(vineyard_clients_[index]->Connected(), + "vineyard client is not connected"); + + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". SaveToDisk cost:" << (end - start) + << " us."; + return status; + }) + .get(); +} + +Status VLLMKVStorage::FilterFiles(std::vector& block_hash_vec, + std::vector& exist_block_vec, + std::vector& filtered_hash_vec, + std::vector& filtered_hash_index) { + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::vector> exist_statuses; + exist_block_vec.reserve(block_hash_vec.size()); + exist_statuses.reserve(block_hash_vec.size()); + for (size_t i = 0; i < block_hash_vec.size(); ++i) { + exist_statuses.push_back(fast_opt_thread_pool_->enqueue( + [&](size_t i) { + std::string prefix_dir, file_name; + Hash2PrefixDirAndSuffixFile(block_hash_vec[i], prefix_dir, file_name); + std::string file_path = + VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskPath() + '/' + + file_name; + if (std::filesystem::exists(file_path)) { + VLOG(100) << "Block file exists: " << file_path + << ", skip write to disk."; + return Status::OK(); + } + return Status::ObjectNotExists(); + }, + i)); + } + + for (size_t i = 0; i < exist_statuses.size(); ++i) { + Status status = exist_statuses[i].get(); + if (status.IsObjectNotExists()) { + filtered_hash_vec.push_back(block_hash_vec[i]); + filtered_hash_index.push_back(i); + } else if (!status.ok()) { + // unexpected error + return status; + } else { + 
exist_block_vec.push_back(block_hash_vec[i]); + } + } + + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "FilterFiles cost:" << (end - start) << " us"; + return Status::OK(); +} + +Status VLLMKVStorage::UpdateTTL(Client& client, + std::vector& block_hash_vec, + uint64_t ttl, std::string& req_flag) { + uint64_t start = 0, end = 0; + if (block_hash_vec.empty()) { + return Status::OK(); + } + if (VLLMKVCacheEnv::LocalVineyardVLLMKVCache() == "1") { + LOG(INFO) << "LocalVineyardVLLMKVCache is set, skip UpdateTTL."; + return Status::OK(); + } + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::string block_location; + std::vector disk_type; + std::vector block_name; + RETURN_ON_ERROR(GetIOTag(block_location)); + for (size_t i = 0; i < block_hash_vec.size(); ++i) { + disk_type.push_back(block_location); + block_name.push_back(KVCacheHelper::BuildBlockName(block_hash_vec[i])); + } + client.PutObjectLocation(block_name, disk_type, ttl, req_flag); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". 
UpdateTTL cost: " << (end - start) + << " us"; + return Status::OK(); +} + +Status VLLMKVStorage::SaveToDiskWithCopy( + Client& client, std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& status_vec, uint64_t ttl, bool wait, + std::string& req_flag) { + uint64_t start = 0, end = 0; + MONITOR_AUTO(save_to_disk_monitor_); + + size_t hash_num = block_hash_vec.size(); + if (hash_num == 0) { + LOG(INFO) << "No new blocks to save to disk for request: " << req_flag; + return Status::OK(); + } + status_vec.resize(hash_num, Status::OK()); + + std::vector tmp_file_name_vec; + std::vector file_name_vec; + std::vector> + tmp_io_adaptor_vec; + std::vector> auto_delete_vec; + tmp_io_adaptor_vec.resize(hash_num); + tmp_file_name_vec.resize(hash_num); + file_name_vec.resize(hash_num); + + std::string pidstr = std::to_string(getpid()); + Status status = Status::OK(); + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (size_t i = 0; i < hash_num; ++i) { + std::string prefix_dir; + Hash2PrefixDirAndSuffixFile(block_hash_vec[i], prefix_dir, + file_name_vec[i]); + status = CreateDirectoriesIfNotExists( + VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskPath() + '/' + prefix_dir); + if (!status.ok()) { + LOG(ERROR) << "Failed to create directory: " << prefix_dir + << ", request id: " << req_flag + << ", error: " << status.ToString(); + status_vec[i] = status; + continue; + } + + std::string tmp_file_name = file_name_vec[i] + "_" + pidstr + "_" + + std::to_string(gettid()) + "_tmp"; + // We do not to check if the file is exists. 
+ std::shared_ptr io_adaptor; + status = GetIOAdaptor(io_adaptor, tmp_file_name); + if (!status.ok()) { + LOG(ERROR) << "Failed to get IOAdaptor for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString() + << ", request id: " << req_flag; + status_vec[i] = status; + continue; + } + + status = io_adaptor->Open("w", direct_io_); + if (!status.ok()) { + LOG(ERROR) << "Failed to open IOAdaptor for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString() + << ", request id: " << req_flag; + status_vec[i] = status; + continue; + } + + tmp_io_adaptor_vec[i] = io_adaptor; + tmp_file_name_vec[i] = tmp_file_name; + auto_delete_vec.push_back(std::shared_ptr( + new std::string(tmp_file_name), [&](std::string* ptr) { + if (!std::filesystem::remove(GetIOPathPrefix() + "/" + *ptr) && + std::filesystem::exists(GetIOPathPrefix() + "/" + *ptr)) { + LOG(WARNING) << "Failed to remove temporary file: " << *ptr + << ", may cause resource leak."; + } + delete ptr; + })); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Create tmp file io adaptors cost: " << (end - start) << " us"; + + start = end; + MONITOR_START(save_memory_copy_monitor_) + std::vector> data_ptr_vec; + std::vector file_size_vec; + std::vector> copy_statuses; + copy_statuses.resize(hash_num); + data_ptr_vec.resize(hash_num); + file_size_vec.resize(hash_num); + auto copy_func = [&](size_t i) -> Status { + std::shared_ptr data_ptr; + size_t file_size = 0; + json meta_json; + ConstructVLLMBlockFileMeta(offsets_vec[i], sizes_vec[i], shape, layer_index, + meta_json); + std::string meta_str = meta_json.dump(); + Status status = CopyBlockToMemoryInternal( + meta_str, offsets_vec[i], sizes_vec[i], data_ptr, file_size); + if (status.ok()) { + data_ptr_vec[i] = (data_ptr); + file_size_vec[i] = (file_size); + if (!tmp_io_adaptor_vec[i]->FileTruncate(file_size_vec[i]).ok()) { + LOG(WARNING) << "Failed to truncate file for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", may cause performance issue."; + } + } + return status; + }; + + for (size_t i = 0; i < hash_num; ++i) { + if (status_vec[i].ok()) { + copy_statuses[i] = copy_thread_pool_->enqueue(copy_func, i); + } + } + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + + if (!copy_statuses[i].valid()) { + LOG(ERROR) << "Future is not valid for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]); + tmp_io_adaptor_vec[i]->Close(); + status_vec[i] = Status::IOError("Future is not valid"); + continue; + } + + Status status = copy_statuses[i].get(); + if (!status.ok()) { + LOG(ERROR) << "Failed to copy block to memory: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString(); + tmp_io_adaptor_vec[i]->Close(); + status_vec[i] = status; + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Copy blocks to memory cost: " << (end - start) << " us"; + + MONITOR_END(save_memory_copy_monitor_); + + MONITOR_START(save_to_disk_io_monitor_); + + if (wait) { + RETURN_ON_ERROR(SaveToDiskSubmitIO(client, tmp_io_adaptor_vec, + file_size_vec, data_ptr_vec, + block_hash_vec, status_vec, req_flag)); + + RETURN_ON_ERROR(SaveToDiskMoveFile( + client, block_hash_vec, tmp_io_adaptor_vec, file_name_vec, + tmp_file_name_vec, status_vec, ttl, req_flag)); + } else { + block_opt_thread_pool_->enqueue( + [&client, ttl, req_flag, auto_delete_vec_ = std::move(auto_delete_vec), + tmp_io_adaptor_vec_ = std::move(tmp_io_adaptor_vec), + file_name_vec_ = std::move(file_name_vec), + tmp_file_name_vec_ = std::move(tmp_file_name_vec), + data_ptr_vec_ = std::move(data_ptr_vec), + file_size_vec_ = std::move(file_size_vec), + block_hash_vec_ = std::move(block_hash_vec), + status_vec_ = status_vec]() { + std::vector> + tmp_io_adaptor_vec__ = tmp_io_adaptor_vec_; + std::vector file_name_vec__ = file_name_vec_; + std::vector tmp_file_name_vec__ = tmp_file_name_vec_; + std::vector> data_ptr_vec__ = data_ptr_vec_; + std::vector file_size_vec__ = file_size_vec_; + std::vector block_hash_vec__ = block_hash_vec_; + std::vector status_vec__ = status_vec_; + + SaveToDiskSubmitIO(client, tmp_io_adaptor_vec__, file_size_vec__, + data_ptr_vec__, block_hash_vec__, status_vec__, + req_flag); + SaveToDiskMoveFile(client, block_hash_vec__, tmp_io_adaptor_vec__, + file_name_vec__, tmp_file_name_vec__, status_vec__, + ttl, req_flag); + return Status::OK(); + }); + } + return Status::OK(); +} + +Status VLLMKVStorage::SaveToDiskSubmitIO( + Client& client, + std::vector>& io_adaptor_vec, + std::vector& file_size_vec, + std::vector>& data_ptr_vec, + std::vector& block_hash_vec, std::vector& status_vec, + std::string req_flag) { + uint64_t start = 0, end = 0; + size_t hash_num = block_hash_vec.size(); + std::vector> submit_status_vecs; + submit_status_vecs.resize(hash_num); + for (size_t i = 0; i < hash_num; 
++i) { + if (!status_vec[i].ok()) { + continue; + } + submit_status_vecs[i] = + io_adaptor_vec[i]->AsyncWrite(data_ptr_vec[i], file_size_vec[i], 0); + } + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + if (!submit_status_vecs[i].valid()) { + LOG(ERROR) << "Future is not valid for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", request id: " << req_flag; + io_adaptor_vec[i]->Close(); + status_vec[i] = + Status::IOError("Future is not valid. Request id: " + req_flag); + continue; + } + Status status = submit_status_vecs[i].get(); + if (!status.ok()) { + LOG(ERROR) << "Failed to write block to disk: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString() + << ", request id: " << req_flag; + io_adaptor_vec[i]->Close(); + status_vec[i] = status; + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Write blocks to disk cost: " << (end - start) << " us."; + + return Status::OK(); +} + +Status VLLMKVStorage::SaveToDiskMoveFile( + Client& client, std::vector& block_hash_vec, + std::vector>& io_adaptor_vec, + std::vector& file_name_vec, + std::vector& tmp_file_name_vec, + std::vector& status_vec, uint64_t ttl, std::string req_flag) { + uint64_t start = 0, end = 0; + uint64_t hash_num = block_hash_vec.size(); + + std::string disk_location; + Status status = GetIOTag(disk_location); + if (!status.ok()) { + LOG(ERROR) << "Failed to get disk tag for blocks, error: " + << status.ToString() << ", request id: " << req_flag; + return status; + } + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::vector> move_status_vec; + move_status_vec.resize(hash_num); + auto move_func = [&](size_t i) -> Status { + RETURN_ON_ERROR(io_adaptor_vec[i]->Close()); + std::shared_ptr io_adaptor; + GetIOAdaptor(io_adaptor, file_name_vec[i]); + RETURN_ON_ERROR(io_adaptor->Open("w", direct_io_)); + RETURN_ON_ERROR(io_adaptor->Close()); + + try { + std::filesystem::rename( + (GetIOPathPrefix() + "/" + tmp_file_name_vec[i]).c_str(), + (GetIOPathPrefix() + "/" + file_name_vec[i]).c_str()); + } catch (const std::filesystem::filesystem_error& e) { + LOG(ERROR) << "Failed to rename file: " + << GetIOPathPrefix() + "/" + tmp_file_name_vec[i] << " to " + << GetIOPathPrefix() + "/" + file_name_vec[i] + << ", error: " << e.what(); + return Status::IOError("Failed to rename file: " + GetIOPathPrefix() + + "/" + tmp_file_name_vec[i] + " to " + + GetIOPathPrefix() + "/" + file_name_vec[i] + + ", error: " + e.what()); + } + return Status::OK(); + }; + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + move_status_vec[i] = io_thread_pool_->enqueue(move_func, i); + } + + std::vector written_block_name_vec; + std::vector block_location_vec; + written_block_name_vec.resize(hash_num); + 
block_location_vec.resize(hash_num); + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + Status status = move_status_vec[i].get(); + if (!status.ok()) { + LOG(ERROR) << "Failed to move file for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString() + << ", request id: " << req_flag; + status_vec[i] = status; + continue; + } + written_block_name_vec[i] = + KVCacheHelper::BuildBlockName(block_hash_vec[i]); + block_location_vec[i] = disk_location; + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Move blocks to final path cost: " << (end - start) << " us"; + MONITOR_END(save_to_disk_io_monitor_); + + start = end; + if (VLLMKVCacheEnv::LocalVineyardVLLMKVCache() == "1") { + LOG(INFO) << "Test mode will skip putting object location."; + return Status::OK(); + } + + status = client.PutObjectLocation(written_block_name_vec, block_location_vec, + ttl, req_flag); + if (!status.ok()) { + LOG(WARNING) << "Failed to put object location for blocks, error: " + << status.ToString() << ", request id: " << req_flag; + for (auto path : file_name_vec) { + std::filesystem::remove(GetIOPathPrefix() + "/" + path); + } + return status; + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Put object location cost: " << (end - start) << " us"; + + return Status::OK(); +} + +Status VLLMKVStorage::SaveToDiskWithoutCopy( + Client& client, std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& status_vec, uint64_t ttl, + std::string& req_flag) { + uint64_t start = 0, end = 0; + MONITOR_AUTO(save_to_disk_monitor_); + size_t hash_num = block_hash_vec.size(); + if (hash_num == 0) { + return Status::OK(); + } + status_vec.resize(hash_num, Status::OK()); + + std::vector tmp_file_name_vec; + std::vector file_name_vec; + std::vector> + tmp_io_adaptor_vec; + std::vector> auto_delete_vec; + tmp_io_adaptor_vec.resize(hash_num); + tmp_file_name_vec.resize(hash_num); + file_name_vec.resize(hash_num); + + std::string disk_location; + Status status = GetIOTag(disk_location); + if (!status.ok()) { + LOG(ERROR) << "Failed to get disk tag for blocks, error: " + << status.ToString() << ", request id: " << req_flag; + return status; + } + + std::string pidstr = std::to_string(getpid()); + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (size_t i = 0; i < hash_num; ++i) { + std::string prefix_dir; + Hash2PrefixDirAndSuffixFile(block_hash_vec[i], prefix_dir, + file_name_vec[i]); + status = CreateDirectoriesIfNotExists( + VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskPath() + '/' + prefix_dir); + if (!status.ok()) { + LOG(ERROR) << "Failed to create directory: " << prefix_dir; + status_vec[i] = status; + continue; + } + + std::string tmp_file_name = file_name_vec[i] + "_" + pidstr + "_" + + std::to_string(gettid()) + "_tmp"; + // We do not to check if the file is exists. 
+ std::shared_ptr io_adaptor; + status = GetIOAdaptor(io_adaptor, tmp_file_name); + if (!status.ok()) { + LOG(ERROR) << "Failed to get IOAdaptor for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString(); + status_vec[i] = status; + continue; + } + + status = io_adaptor->Open("w", direct_io_); + if (!status.ok()) { + LOG(ERROR) << "Failed to open IOAdaptor for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString(); + status_vec[i] = status; + continue; + } + + tmp_io_adaptor_vec[i] = io_adaptor; + tmp_file_name_vec[i] = tmp_file_name; + auto_delete_vec.push_back(std::shared_ptr( + new std::string(tmp_file_name), [&](std::string* ptr) { + if (!std::filesystem::remove(GetIOPathPrefix() + "/" + *ptr) && + std::filesystem::exists(GetIOPathPrefix() + "/" + *ptr)) { + LOG(WARNING) << "Failed to remove temporary file: " << *ptr + << ", may cause resource leak."; + } + delete ptr; + })); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Create tmp file io adaptors cost: " << (end - start) << " us"; + + start = end; + std::vector> ret_status_future_vec; + std::vector>> write_status_future_vec; + write_status_future_vec.resize(hash_num); + ret_status_future_vec.resize(hash_num); + auto write_func = [&](size_t i) -> Status { + json meta_json; + return WriteBlockToDisk(tmp_io_adaptor_vec[i], offsets_vec[i], sizes_vec[i], + shape, layer_index, write_status_future_vec[i]); + }; + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + ret_status_future_vec[i] = io_thread_pool_->enqueue(write_func, i); + } + + // check write results + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + if (!ret_status_future_vec[i].valid()) { + LOG(ERROR) << "Future is not valid for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]); + tmp_io_adaptor_vec[i]->Close(); + status_vec[i] = Status::IOError("Future is not valid"); + continue; + } + Status status = ret_status_future_vec[i].get(); + if (!status.ok()) { + LOG(ERROR) << "Failed to write block to disk: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString(); + tmp_io_adaptor_vec[i]->Close(); + status_vec[i] = status; + continue; + } + for (auto& write_status : write_status_future_vec[i]) { + if (!write_status.valid()) { + LOG(ERROR) << "Future is not valid for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]); + tmp_io_adaptor_vec[i]->Close(); + status_vec[i] = Status::IOError("Future is not valid"); + break; + } + Status s = write_status.get(); + if (!s.ok()) { + LOG(ERROR) << "Failed to write block to disk: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << s.ToString(); + tmp_io_adaptor_vec[i]->Close(); + status_vec[i] = s; + break; + } + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << 
req_flag + << ". Write blocks to disk cost: " << (end - start) << " us."; + + start = end; + // move tmp file to final location + std::vector> move_statuses; + move_statuses.resize(hash_num); + auto move_func = [&](size_t i) -> Status { + RETURN_ON_ERROR(tmp_io_adaptor_vec[i]->Close()); + std::shared_ptr io_adaptor_tmp; + GetIOAdaptor(io_adaptor_tmp, file_name_vec[i]); + RETURN_ON_ERROR(io_adaptor_tmp->Open("w", direct_io_)); + RETURN_ON_ERROR(io_adaptor_tmp->Close()); + + try { + std::filesystem::rename( + (GetIOPathPrefix() + "/" + tmp_file_name_vec[i]).c_str(), + (GetIOPathPrefix() + "/" + file_name_vec[i]).c_str()); + } catch (const std::filesystem::filesystem_error& e) { + LOG(ERROR) << "Failed to rename file: " + << GetIOPathPrefix() + "/" + tmp_file_name_vec[i] << " to " + << GetIOPathPrefix() + "/" + file_name_vec[i] + << ", error: " << e.what(); + return Status::IOError("Failed to rename file: " + GetIOPathPrefix() + + "/" + tmp_file_name_vec[i] + " to " + + GetIOPathPrefix() + "/" + file_name_vec[i] + + ", error: " + e.what()); + } + return Status::OK(); + }; + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + move_statuses[i] = io_thread_pool_->enqueue(move_func, i); + } + + std::vector written_block_name_vec; + std::vector block_location_vec; + written_block_name_vec.resize(hash_num); + block_location_vec.resize(hash_num); + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + Status status = move_statuses[i].get(); + if (!status.ok()) { + LOG(ERROR) << "Failed to move file for block: " + << KVCacheHelper::BuildBlockName(block_hash_vec[i]) + << ", error: " << status.ToString(); + status_vec[i] = status; + continue; + } + written_block_name_vec[i] = + KVCacheHelper::BuildBlockName(block_hash_vec[i]); + block_location_vec[i] = disk_location; + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + 
VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Move blocks to final path cost: " << (end - start) << " us."; + + start = end; + if (VLLMKVCacheEnv::LocalVineyardVLLMKVCache() == "1") { + LOG(INFO) << "Test mode will skip put object location for blocks."; + return Status::OK(); + } + status = client.PutObjectLocation(written_block_name_vec, block_location_vec, + ttl, req_flag); + if (!status.ok()) { + LOG(WARNING) << "Failed to put object location for blocks, error: " + << status.ToString(); + for (auto path : file_name_vec) { + std::filesystem::remove(GetIOPathPrefix() + "/" + path); + } + return status; + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Put object location cost: " << (end - start) << " us"; + + return Status::OK(); +} + +Status VLLMKVStorage::LoadFromDisk( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, std::string req_flag) { + RETURN_ON_ASSERT(block_hash_vec.size() == offsets_vec.size(), + "block_hash_vec.size() and offsets_vec.size() must be equal " + "for request: " + + req_flag); + RETURN_ON_ASSERT( + block_hash_vec.size() == sizes_vec.size(), + "block_hash_vec.size() and sizes_vec.size() must be equal for request: " + + req_flag); + RETURN_ON_ASSERT(layer_index >= 0, + "layer_index must be >= 0 for request: " + req_flag); + RETURN_ON_ASSERT( + layer_index < static_cast(shape.size()), + "layer_index must be < shape.size() for request: " + req_flag); + if (block_hash_vec.empty()) { + return Status::OK(); + } + + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << " arrived at LoadFromDisk."; + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + Status status = Status::OK(); + if 
(use_copy_) { + status = LoadFromDiskWithCopy(block_hash_vec, offsets_vec, sizes_vec, shape, + layer_index, statuses, req_flag); + } else { + status = LoadFromDiskWithoutCopy(block_hash_vec, offsets_vec, sizes_vec, + shape, layer_index, statuses, req_flag); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". LoadFromDisk cost: " << (end - start) + << " us."; + return status; +} + +Status VLLMKVStorage::LoadFromDiskWithCopy( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& status_vec, std::string& req_flag) { + uint64_t start = 0, end = 0; + MONITOR_AUTO(load_from_disk_monitor_); + + size_t hash_num = block_hash_vec.size(); + if (hash_num == 0) { + return Status::OK(); + } + status_vec.resize(hash_num, Status::OK()); + + std::vector> io_adaptor_vec; + std::vector> data_ptr_vec; + std::vector file_size_vec; + std::vector file_name_vec; + io_adaptor_vec.resize(hash_num); + data_ptr_vec.resize(hash_num); + file_size_vec.resize(hash_num); + file_name_vec.resize(hash_num); + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (size_t i = 0; i < hash_num; ++i) { + std::string prefix_dir; + Hash2PrefixDirAndSuffixFile(block_hash_vec[i], prefix_dir, + file_name_vec[i]); + std::shared_ptr io_adaptor; + Status status = GetIOAdaptor(io_adaptor, file_name_vec[i]); + if (!status.ok()) { + LOG(ERROR) << "Failed to get IOAdaptor for file: " << file_name_vec[i] + << ", error: " << status.ToString(); + status_vec[i] = status; + continue; + } + status = io_adaptor->Open("r", direct_io_); + if (!status.ok()) { + LOG(ERROR) << "Failed to open IOAdaptor for file: " << file_name_vec[i] + << ", error: " << status.ToString(); + status_vec[i] = status; + if (!io_adaptor->Close().ok()) { + LOG(ERROR) << "Failed to 
close IOAdaptor for file: " << file_name_vec[i] + << ", may cause resource leak."; + } + continue; + } + size_t file_size = 0; + status = io_adaptor->GetFileSize(file_size); + if (!status.ok()) { + LOG(ERROR) << "Failed to get file size for file: " << file_name_vec[i] + << ", error: " << status.ToString(); + status_vec[i] = status; + if (!io_adaptor->Close().ok()) { + LOG(ERROR) << "Failed to close IOAdaptor for file: " << file_name_vec[i] + << ", may cause resource leak."; + } + continue; + } + if (file_size == 0) { + LOG(ERROR) << "File to load: " << file_name_vec[i] << " is empty!"; + status_vec[i] = + Status::IOError("File to load: " + file_name_vec[i] + " is empty!"); + if (!io_adaptor->Close().ok()) { + LOG(ERROR) << "Failed to close IOAdaptor for file: " << file_name_vec[i] + << ", may cause resource leak."; + } + continue; + } + // align to align + size_t align = std::stoull(VLLMKVCacheEnv::GetDirectIOAlign()); + size_t mmap_size = ((file_size + align - 1) / align) * align; + void* data_ptr = mmap(nullptr, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (data_ptr == MAP_FAILED) { + LOG(ERROR) << "Failed to mmap memory for file: " << file_name_vec[i] + << ", size: " << mmap_size << ", error: " << strerror(errno); + status_vec[i] = Status::Invalid( + "Failed to mmap memory for file: " + file_name_vec[i] + ", size: " + + std::to_string(mmap_size) + ", error: " + strerror(errno)); + } + std::shared_ptr data_ptr_shared( + reinterpret_cast(data_ptr), [mmap_size](char* p) { + if (munmap(p, mmap_size) != 0) { + LOG(ERROR) << "Failed to munmap memory, may cause memory leak."; + } + }); + data_ptr_vec[i] = data_ptr_shared; + + io_adaptor_vec[i] = io_adaptor; + file_size_vec[i] = file_size; + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Create read file io adaptors cost: " << (end - start) << " us"; + + start = end; + std::vector> submit_status_vec; + submit_status_vec.resize(hash_num); + MONITOR_START(load_from_disk_io_monitor_); + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + submit_status_vec[i] = + io_adaptor_vec[i]->AsyncRead(data_ptr_vec[i], file_size_vec[i], 0); + } + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + if (!submit_status_vec[i].valid()) { + // It means that the read operation not completed successfully. + status_vec[i] = Status::Invalid( + "Invalid future for reading block from disk, maybe the IOAdaptor " + "is not valid or the read operation is not started. Request id: " + + req_flag); + LOG(ERROR) << "Invalid future for reading block from disk." + << " Request id: " << req_flag; + io_adaptor_vec[i]->Close(); + continue; + } + Status status = submit_status_vec[i].get(); + if (!status.ok()) { + // It means that the read operation not completed successfully. + LOG(ERROR) << "Failed to read block from disk, error: " + << status.ToString(); + status_vec[i] = status; + io_adaptor_vec[i]->Close(); + continue; + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". 
Read blocks from disk cost: " << (end - start) << " us."; + + start = end; + MONITOR_START(load_memory_copy_monitor_); + std::vector> memcpy_status_future_vec; + memcpy_status_future_vec.resize(hash_num); + auto memory_copy_func = [&](size_t i) -> Status { + return ReadBlockFromMemory(data_ptr_vec[i], file_size_vec[i], + offsets_vec[i], sizes_vec[i], shape, + layer_index); + }; + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + memcpy_status_future_vec[i] = + copy_thread_pool_->enqueue(memory_copy_func, i); + } + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + if (!memcpy_status_future_vec[i].valid()) { + // It means that the read operation not completed successfully. + LOG(ERROR) << "Invalid future for reading block from disk." + << " Request id: " << req_flag; + status_vec[i] = Status::Invalid( + "Invalid future for reading block from disk, maybe the data_ptr " + "is not valid or the read operation is not started. Request id: " + + req_flag); + io_adaptor_vec[i]->Close(); + continue; + } + Status status = memcpy_status_future_vec[i].get(); + if (!status.ok()) { + // It means that the read operation not completed successfully. + LOG(ERROR) << "Failed to read block from disk, error: " + << status.ToString(); + io_adaptor_vec[i]->Close(); + status_vec[i] = status; + } + } + + MONITOR_END(load_memory_copy_monitor_); + MONITOR_END(load_from_disk_io_monitor_); + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + if (!io_adaptor_vec[i]->Close().ok()) { + // Data is already loaded, so we can ignore the error. + // But we should log the error to report resource leak. + LOG(WARNING) << "Failed to close IOAdaptor, may cause resource leak." 
+ << " Request id: " << req_flag; + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Copy blocks to memory cost: " << (end - start) << " us"; + + return Status::OK(); +} + +Status VLLMKVStorage::LoadFromDiskWithoutCopy( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& status_vec, std::string& req_flag) { + uint64_t start = 0, end = 0; + MONITOR_AUTO(load_from_disk_monitor_); + + size_t hash_num = block_hash_vec.size(); + if (hash_num == 0) { + return Status::OK(); + } + std::vector file_name_vec; + std::vector> io_adaptor_vec; + file_name_vec.resize(hash_num); + io_adaptor_vec.resize(hash_num); + status_vec.resize(hash_num, Status::OK()); + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (size_t i = 0; i < block_hash_vec.size(); ++i) { + std::string prefix_dir; + Hash2PrefixDirAndSuffixFile(block_hash_vec[i], prefix_dir, + file_name_vec[i]); + std::shared_ptr io_adaptor; + Status status = GetIOAdaptor(io_adaptor, file_name_vec[i]); + if (!status.ok()) { + LOG(ERROR) << "Failed to get IOAdaptor for file: " << file_name_vec[i] + << ", error: " << status.ToString() + << ", request id: " << req_flag; + status_vec[i] = status; + continue; + } + status = io_adaptor->Open("r", direct_io_); + if (!status.ok()) { + LOG(ERROR) << "Failed to open IOAdaptor for file: " << file_name_vec[i] + << ", error: " << status.ToString() + << ", request id: " << req_flag; + status_vec[i] = status; + if (!io_adaptor->Close().ok()) { + LOG(ERROR) << "Failed to close IOAdaptor for file: " << file_name_vec[i] + << ", may cause resource leak." 
+ << ", request id: " << req_flag; + } + continue; + } + + io_adaptor_vec[i] = io_adaptor; + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Create read file io adaptors cost: " << (end - start) << " us"; + + start = end; + std::vector> ret_status_future_vec; + std::vector>> read_status_future_vec; + ret_status_future_vec.resize(hash_num); + read_status_future_vec.resize(hash_num); + MONITOR_START(load_from_disk_io_monitor_); + auto read_func = [&](size_t i) -> Status { + return ReadBlockFromDisk(io_adaptor_vec[i], offsets_vec[i], sizes_vec[i], + shape, layer_index, read_status_future_vec[i]); + }; + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + ret_status_future_vec[i] = io_thread_pool_->enqueue(read_func, i); + } + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + if (!ret_status_future_vec[i].valid()) { + // It means that the read operation not completed successfully. + status_vec[i] = Status::Invalid( + "Invalid future for reading block from disk, maybe the IOAdaptor " + "is not valid or the read operation is not started."); + LOG(ERROR) << "Invalid future for reading block from disk." + << " Request id: " << req_flag; + io_adaptor_vec[i]->Close(); + continue; + } + Status status = ret_status_future_vec[i].get(); + if (!status.ok()) { + // It means that the read operation not completed successfully. 
+ LOG(ERROR) << "Failed to read block from disk, error: " + << status.ToString() << ", request id: " << req_flag; + status_vec[i] = status; + io_adaptor_vec[i]->Close(); + continue; + } + for (auto& read_status : read_status_future_vec[i]) { + if (!read_status.valid()) { + LOG(ERROR) << "Invalid future for reading block from disk."; + status_vec[i] = Status::Invalid( + "Invalid future for reading block from disk, maybe the read " + "operation is not started. Request id:" + + req_flag); + break; + } + Status s = read_status.get(); + if (!s.ok()) { + LOG(ERROR) << "Failed to read block from disk, error: " << s.ToString() + << ", request id: " << req_flag; + status_vec[i] = s; + break; + } + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag + << ". Read blocks from disk cost: " << (end - start) << " us."; + + MONITOR_START(load_memory_copy_monitor_); + MONITOR_END(load_memory_copy_monitor_); + MONITOR_END(load_from_disk_io_monitor_); + + for (size_t i = 0; i < hash_num; ++i) { + if (!status_vec[i].ok()) { + continue; + } + if (!io_adaptor_vec[i]->Close().ok()) { + // Data is already loaded, so we can ignore the error. + // But we should log the error to report resource leak. + LOG(WARNING) << "Failed to close IOAdaptor, may cause resource leak." 
+ << " Request id: " << req_flag; + } + } + + return Status::OK(); +} + +Status VLLMKVStorage::CleanBlockBlobs(Client& client, + std::vector block_meta_vec, + std::string& req_flag) { + uint64_t start = 0, end = 0; + std::vector blob_id_vec; + uint64_t total_blob = 0; + std::vector offsets; + std::vector nums_vec; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (const auto& block_meta : block_meta_vec) { + VINEYARD_ASSERT(block_meta.GetTypeName() == type_name(), + "Invalid block meta type: " + block_meta.GetTypeName() + + " for request: " + req_flag); + uint64_t nums = block_meta.GetKeyValue("nums"); + offsets.push_back(total_blob); + nums_vec.push_back(nums); + total_blob += nums; + } + blob_id_vec.resize(total_blob, InvalidObjectID()); + + std::vector> decode_statuses; + for (size_t i = 0; i < block_meta_vec.size(); ++i) { + decode_statuses.emplace_back(io_thread_pool_->enqueue( + [&](size_t index) { + const auto& block_meta = block_meta_vec[index]; + std::string ids_str_encoder = + block_meta.GetKeyValue("blob_ids"); + std::string ids_str = base64_decode(ids_str_encoder); + if (ids_str.size() != sizeof(ObjectID) * nums_vec[index]) { + LOG(WARNING) << "Invalid blob ids size: " << ids_str.size() + << ", expected: " << sizeof(ObjectID) * nums_vec[index] + << ", which means meta has been corrupted." + << ", request id: " << req_flag; + } else { + memcpy(blob_id_vec.data() + offsets[index], ids_str.data(), + sizeof(ObjectID) * nums_vec[index]); + } + return Status::OK(); + }, + i)); + } + for (auto& status : decode_statuses) { + if (!status.get().ok()) { + LOG(WARNING) << "Failed to decode blob ids for cleaning block blobs." + << ", request id: " << req_flag; + } + } + + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". 
Decode blob ids cost: " << (end - start) + << " us, total blobs: " << total_blob; + start = end; + VINEYARD_CHECK_OK(client.DeleteUserBlobs(blob_id_vec, req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(KVCacheHelper::GetTraceLogLevel()) + << "Request: " << req_flag << ". Delete blobs cost: " << (end - start) + << " us, total blobs: " << total_blob; + return Status::OK(); +} + +Status VLLMKVStorage::CleanBlockBuilderBlobs( + Client& client, std::shared_ptr block_builder, + std::string& req_flag) { + std::vector>& blobs = + block_builder->GetBlobs(); + std::vector blob_ids; + for (auto& blob : blobs) { + blob_ids.push_back(blob->id()); + } + return client.DeleteUserBlobs(blob_ids, req_flag); +} + +Status VLLMKVStorage::GetIOAdaptor( + std::shared_ptr& io_adaptor, + std::string file_name) { + std::string path_prefix = GetIOPathPrefix(); + if (path_prefix.empty()) { + return Status::Invalid("VINEYARD_VLLM_KV_CACHE_DISK_PATH is not set"); + } + + io_adaptor = io_adaptor_factory_(path_prefix + "/" + file_name); + return Status::OK(); +} + +std::string VLLMKVStorage::GetIOPathPrefix() { + static std::string path_prefix = + VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskPath(); + return path_prefix; +} + +Status VLLMKVStorage::GetIOTag(std::string& tag) { + std::string disk_type = VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskType(); + if (disk_type == "cpfs") { + tag = "cpfs"; + } else { + return Status::Invalid("Invalid VINEYARD_VLLM_KV_CACHE_DISK_TYPE: " + + disk_type); + } + return Status::OK(); +} + +Status VLLMKVStorage::WriteBlockToDisk( + Client& client, std::shared_ptr io_adaptor, + ObjectMeta& meta, std::vector>& statuses) { + json file_meta_json; + RETURN_ON_ERROR(ConstructVLLMBlockFileMeta(meta, file_meta_json)); + std::string meta_str = file_meta_json.dump(); + size_t meta_size = meta_str.size(); + size_t meta_magic_size = + std::stoull(VLLMKVCacheEnv::GetVineyardVLLMBlockMetaMagicSize()); + + // 
write meta + void* meta_ptr = mmap(nullptr, meta_magic_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (meta_ptr == MAP_FAILED) { + return Status::IOError("Failed to mmap block meta, error: " + + std::string(strerror(errno))); + } + std::shared_ptr meta_ptr_shared( + reinterpret_cast(meta_ptr), + [meta_magic_size](char* p) { munmap(p, meta_magic_size); }); + memcpy(meta_ptr_shared.get(), &meta_size, sizeof(size_t)); + memcpy(meta_ptr_shared.get() + sizeof(size_t), meta_str.c_str(), meta_size); + RETURN_ON_ERROR( + io_adaptor->AsyncWrite(meta_ptr_shared, meta_magic_size, 0).get()); + + // write data + uint64_t nums = meta.GetKeyValue("nums"); + std::string ids_str_encoder = meta.GetKeyValue("blob_ids"); + std::string ids_str = base64_decode(ids_str_encoder); + if (ids_str.size() != sizeof(ObjectID) * nums) { + return Status::Invalid( + "Invalid blob ids size: " + std::to_string(ids_str.size()) + + ", expected: " + std::to_string(sizeof(ObjectID) * nums) + + ", which means meta has been corrupted."); + } + std::vector blob_ids; + blob_ids.resize(nums); + memcpy(blob_ids.data(), ids_str.data(), sizeof(ObjectID) * nums); + + std::vector> blobs; + std::vector write_ptr_vec; + std::vector write_size_vec; + std::vector offset_vec; + + RETURN_ON_ERROR(client.GetUserBlobs(blob_ids, blobs)); + uint64_t offset = meta_magic_size; + for (size_t i = 0; i < blobs.size(); ++i) { + std::shared_ptr blob = blobs[i]; + write_ptr_vec.push_back( + reinterpret_cast(blob->offset() + storage_base_pointer_)); + write_size_vec.push_back(blob->size()); + offset_vec.push_back(offset); // offset is not used in this case + offset += blob->size(); + } + + if (!io_adaptor->FileTruncate(offset).ok()) { + LOG(WARNING) << "Failed to truncate file to size: " << offset + << ", may cause performance issue."; + } + + RETURN_ON_ERROR(io_adaptor->BatchAsyncWrite(write_ptr_vec, write_size_vec, + offset_vec, statuses)); + + return Status::OK(); +} + +Status 
VLLMKVStorage::WriteBlockToDisk( + std::shared_ptr io_adaptor, + std::vector& offset_vec, std::vector& sizes_vec, + std::vector& shape, int layer_index, + std::vector>& statuses) { + json file_meta_json; + RETURN_ON_ERROR(ConstructVLLMBlockFileMeta(offset_vec, sizes_vec, shape, + layer_index, file_meta_json)); + std::string meta_str = file_meta_json.dump(); + size_t meta_size = meta_str.size(); + size_t meta_magic_size = + std::stoull(VLLMKVCacheEnv::GetVineyardVLLMBlockMetaMagicSize()); + + // write meta + void* meta_ptr = mmap(nullptr, meta_magic_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (meta_ptr == MAP_FAILED) { + return Status::IOError("Failed to mmap block meta, error: " + + std::string(strerror(errno))); + } + std::shared_ptr meta_ptr_shared( + reinterpret_cast(meta_ptr), + [meta_magic_size](char* p) { munmap(p, meta_magic_size); }); + memcpy(meta_ptr_shared.get(), &meta_size, sizeof(size_t)); + memcpy(meta_ptr_shared.get() + sizeof(size_t), meta_str.c_str(), meta_size); + RETURN_ON_ERROR( + io_adaptor->AsyncWrite(meta_ptr_shared, meta_magic_size, 0).get()); + + // write data + std::vector write_ptr_vec; + std::vector write_size_vec; + std::vector file_offset_vec; + + uint64_t offset = meta_magic_size; + for (size_t i = 0; i < offset_vec.size(); ++i) { + write_ptr_vec.push_back( + reinterpret_cast(offset_vec[i] + storage_base_pointer_)); + write_size_vec.push_back(sizes_vec[i]); + file_offset_vec.push_back(offset); // offset is not used in this case + offset += sizes_vec[i]; + } + + if (!io_adaptor->FileTruncate(offset).ok()) { + LOG(WARNING) << "Failed to truncate file to size: " << offset + << ", may cause performance issue."; + } + + RETURN_ON_ERROR(io_adaptor->BatchAsyncWrite(write_ptr_vec, write_size_vec, + file_offset_vec, statuses)); + + return Status::OK(); +} + +Status VLLMKVStorage::CopyBlockToMemory(Client& client, ObjectMeta& meta, + std::shared_ptr& data_ptr, + size_t& file_size) { + std::vector offsets_vec; + 
std::vector sizes_vec; + json file_meta_json; + RETURN_ON_ERROR(ConstructVLLMBlockFileMeta(meta, file_meta_json)); + std::string meta_str = file_meta_json.dump(); + + uint64_t nums = meta.GetKeyValue("nums"); + std::string ids_str_encoder = meta.GetKeyValue("blob_ids"); + std::string ids_str = base64_decode(ids_str_encoder); + if (ids_str.size() != sizeof(ObjectID) * nums) { + return Status::Invalid( + "Invalid blob ids size: " + std::to_string(ids_str.size()) + + ", expected: " + std::to_string(sizeof(ObjectID) * nums) + + ", which means meta has been corrupted."); + } + std::vector blob_ids; + blob_ids.resize(nums); + memcpy(blob_ids.data(), ids_str.data(), sizeof(ObjectID) * nums); + + std::vector> blobs; + RETURN_ON_ERROR(client.GetUserBlobs(blob_ids, blobs)); + for (size_t i = 0; i < blobs.size(); ++i) { + std::shared_ptr blob = blobs[i]; + offsets_vec.push_back(blob->offset()); + sizes_vec.push_back(blob->size()); + } + + return CopyBlockToMemoryInternal(meta_str, offsets_vec, sizes_vec, data_ptr, + file_size); +} + +Status VLLMKVStorage::CopyBlockToMemoryInternal( + std::string& meta_str, std::vector& offsets_vec, + std::vector& size_vec, std::shared_ptr& data_ptr, + size_t& file_size) { + size_t meta_size = meta_str.size(); + size_t meta_magic_size = + std::stoull(VLLMKVCacheEnv::GetVineyardVLLMBlockMetaMagicSize()); + file_size = meta_magic_size; + + for (size_t i = 0; i < size_vec.size(); ++i) { + file_size += size_vec[i]; + } + + size_t align = std::stoull(VLLMKVCacheEnv::GetDirectIOAlign()); + if (file_size % align != 0) { + file_size += (align - (file_size % align)); + } + void* data = mmap(nullptr, file_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (data == MAP_FAILED) { + return Status::IOError("Failed to mmap block data, error: " + + std::string(strerror(errno))); + } + + data_ptr = std::shared_ptr( + reinterpret_cast(data), [file_size](char* p) { + if (munmap(p, file_size) != 0) { + LOG(ERROR) << "Failed to munmap 
memory, may cause memory leak."; + } + }); + + if (data_ptr == nullptr) { + return Status::IOError("Failed to allocate memory for block data."); + } + + size_t offset = 0; + memcpy(data_ptr.get() + offset, &meta_size, sizeof(size_t)); + offset += sizeof(size_t); + memory::concurrent_memcpy(data_ptr.get() + offset, meta_str.c_str(), + meta_size); + offset += meta_magic_size - sizeof(size_t); + + for (size_t i = 0; i < offsets_vec.size(); ++i) { + if (offset + size_vec[i] > file_size) { + return Status::IOError( + "Write size exceeds file size, invalid block file."); + } + memory::concurrent_memcpy( + data_ptr.get() + offset, + reinterpret_cast(offsets_vec[i] + storage_base_pointer_), + size_vec[i]); + offset += size_vec[i]; + } + return Status::OK(); +} + +Status VLLMKVStorage::ReadBlockFromDisk( + std::shared_ptr io_adaptor, + std::vector& offsets_vec, std::vector& sizes_vec, + std::vector& shape, int layer_index, + std::vector>& statuses) { + if (sizes_vec.size() == 0) { + return Status::Invalid("Sizes vector is empty, invalid block file."); + } + size_t file_size = 0; + RETURN_ON_ERROR(io_adaptor->GetFileSize(file_size)); + if (file_size == 0) { + return Status::Invalid("File size is zero, invalid block file."); + } + + uint64_t meta_magic_size = + std::stoull(VLLMKVCacheEnv::GetVineyardVLLMBlockMetaMagicSize()); + void* meta_data = mmap(nullptr, meta_magic_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (meta_data == MAP_FAILED) { + return Status::IOError("Failed to mmap block meta, error: " + + std::string(strerror(errno))); + } + std::shared_ptr meta_ptr( + reinterpret_cast(meta_data), + [meta_magic_size](char* p) { munmap(p, meta_magic_size); }); + RETURN_ON_ERROR(io_adaptor->AsyncRead(meta_ptr, meta_magic_size, 0).get()); + size_t meta_size = reinterpret_cast(meta_ptr.get())[0]; + + std::string meta_str; + try { + meta_str = std::string(meta_ptr.get() + sizeof(meta_size), meta_size); + } catch (const std::exception& e) { + return 
Status::IOError("Failed to allocate memory for meta string."); + } + + json file_meta_json; + try { + file_meta_json = json::parse(meta_str); + } catch (const json::parse_error& e) { + LOG(ERROR) << "Failed to parse meta json: " << e.what() + << ", meta_str: " << meta_str; + return Status::Invalid("Failed to parse meta json: " + + std::string(e.what()) + ", meta_str: " + meta_str); + } + VLOG(100) << "file_meta_json: " << file_meta_json; + + std::vector file_sizes_vec; + std::vector file_shape; + size_t blob_nums = 0; + int file_layer_index = 0; + RETURN_ON_ERROR(ParseVLLMBlockFileJson( + file_meta_json, blob_nums, file_sizes_vec, file_shape, file_layer_index)); + if (file_sizes_vec.size() == 0) { + return Status::Invalid("Blob sizes vector is empty, invalid block file."); + } + + if (!CheckVLLMBlockEqual(blob_nums, offsets_vec.size(), file_sizes_vec, + sizes_vec, file_shape, shape, file_layer_index, + layer_index)) { + std::string error_msg = + "Block file meta does not match the expected " + "block structure. 
" + "Expected blob nums: " + + std::to_string(offsets_vec.size()) + + ", file blob nums: " + std::to_string(blob_nums) + + ", expected layer index: " + std::to_string(layer_index) + + ", file layer index: " + std::to_string(file_layer_index); + error_msg += ", expected sizes: ["; + for (const auto& size : sizes_vec) { + error_msg += std::to_string(size) + ", "; + } + error_msg += "], file sizes: ["; + for (const auto& size : file_sizes_vec) { + error_msg += std::to_string(size) + ", "; + } + error_msg += "]"; + error_msg += ", expected shape: ["; + for (const auto& dim : shape) { + error_msg += std::to_string(dim) + ", "; + } + error_msg += "], file shape: ["; + for (const auto& dim : file_shape) { + error_msg += std::to_string(dim) + ", "; + } + error_msg += "]"; + return Status::Invalid(error_msg); + } + std::vector read_ptr_vec; + std::vector file_offset_vec; + size_t offset = meta_magic_size; + for (size_t i = 0; i < blob_nums; i++) { + read_ptr_vec.push_back( + reinterpret_cast(offsets_vec[i] + storage_base_pointer_)); + file_offset_vec.push_back(offset); + offset += sizes_vec[i]; + } + // std::vector> read_futures; + Status status = io_adaptor->BatchAsyncRead(read_ptr_vec, sizes_vec, + file_offset_vec, statuses); + if (!status.ok()) { + LOG(ERROR) << "Failed to read block from disk, error: " + << status.ToString(); + return status; + } + + return Status::OK(); +} + +Status VLLMKVStorage::ReadBlockFromMemory(std::shared_ptr data_ptr, + size_t file_size, + std::vector& offsets_vec, + std::vector& sizes_vec, + std::vector& shape, + int layer_index) { + if (sizes_vec.size() == 0) { + return Status::Invalid("Sizes vector is empty, invalid block file."); + } + if (file_size == 0) { + return Status::Invalid("File size is zero, invalid block file."); + } + + size_t offset = 0; + size_t meta_size = 0; + size_t meta_magic_size = + std::stoull(VLLMKVCacheEnv::GetVineyardVLLMBlockMetaMagicSize()); + memcpy(&meta_size, data_ptr.get(), sizeof(size_t)); + offset += 
sizeof(size_t); + if (meta_size == 0) { + return Status::Invalid("Meta size is zero, invalid block file."); + } + + std::shared_ptr meta_buffer; + try { + meta_buffer = std::shared_ptr(new char[meta_size]); + if (meta_buffer == nullptr) { + return Status::Invalid("Failed to allocate memory for meta buffer."); + } + } catch (std::bad_alloc& e) { + return Status::Invalid("Failed to allocate memory for meta buffer."); + } + + memory::concurrent_memcpy(meta_buffer.get(), data_ptr.get() + offset, + meta_size); + offset += meta_magic_size - sizeof(size_t); + std::string meta_str(meta_buffer.get(), meta_size); + json file_meta_json; + try { + file_meta_json = json::parse(meta_str); + } catch (const json::parse_error& e) { + LOG(ERROR) << "Failed to parse meta json: " << e.what() + << ", meta_str: " << meta_str; + return Status::Invalid("Failed to parse meta json: " + + std::string(e.what()) + ", meta_str: " + meta_str); + } + VLOG(100) << "file_meta_json: " << file_meta_json; + + std::vector file_sizes_vec; + std::vector file_shape; + size_t blob_nums = 0; + int file_layer_index = 0; + RETURN_ON_ERROR(ParseVLLMBlockFileJson( + file_meta_json, blob_nums, file_sizes_vec, file_shape, file_layer_index)); + if (file_sizes_vec.size() == 0) { + return Status::Invalid("Blob sizes vector is empty, invalid block file."); + } + + if (!CheckVLLMBlockEqual(blob_nums, offsets_vec.size(), file_sizes_vec, + sizes_vec, file_shape, shape, file_layer_index, + layer_index)) { + std::string error_msg = + "Block file meta does not match the expected " + "block structure. 
" + "Expected blob nums: " + + std::to_string(offsets_vec.size()) + + ", file blob nums: " + std::to_string(blob_nums) + + ", expected layer index: " + std::to_string(layer_index) + + ", file layer index: " + std::to_string(file_layer_index); + error_msg += ", expected sizes: ["; + for (const auto& size : sizes_vec) { + error_msg += std::to_string(size) + ", "; + } + error_msg += "], file sizes: ["; + for (const auto& size : file_sizes_vec) { + error_msg += std::to_string(size) + ", "; + } + error_msg += "]"; + error_msg += ", expected shape: ["; + for (const auto& dim : shape) { + error_msg += std::to_string(dim) + ", "; + } + error_msg += "], file shape: ["; + for (const auto& dim : file_shape) { + error_msg += std::to_string(dim) + ", "; + } + error_msg += "]"; + return Status::Invalid(error_msg); + } + + for (size_t i = 0; i < blob_nums; i++) { + if (offset + sizes_vec[i] > file_size) { + return Status::Invalid( + "Read size exceeds file size, invalid block file."); + } + memory::concurrent_memcpy( + reinterpret_cast(offsets_vec[i] + storage_base_pointer_), + reinterpret_cast(data_ptr.get() + offset), sizes_vec[i]); + offset += sizes_vec[i]; + } + + return Status::OK(); +} + +void VLLMKVStorage::Hash2PrefixDirAndSuffixFile(const uint64_t hash_num, + std::string& prefix_dir, + std::string& file_path) { + std::stringstream ss; + for (int i = 3; i >= 1; --i) { + uint16_t part = (hash_num >> (i * 16)) & 0xFFFF; + ss << std::hex << std::setfill('0') << std::setw(4) << part << "/"; + } + prefix_dir = ss.str(); + ss << std::hex << std::setfill('0') << std::setw(4) << (hash_num & 0xFFFF); + file_path = ss.str(); +} + +size_t VLLMKVStorage::GetIOProcessingRequestNums() { + static std::shared_ptr aio_ops = + std::make_shared(); + static std::shared_ptr aio_context = + vllm_kv_cache::io::AIOContext::GetSingleInstance(aio_ops); + return aio_context->GetProcessingIORequest(); +} + +Status VLLMKVStorage::CreateDirectoriesIfNotExists(const std::string& path) { + try { + 
std::filesystem::create_directories(path); + } catch (const std::filesystem::filesystem_error& e) { + return Status::IOError("Failed to create directories for path: " + path + + ", error: " + e.what()); + } + return Status::OK(); +} + +bool CheckVLLMBlockEqual(size_t nums_1, size_t nums_2, + std::vector& sizes_1, + std::vector& sizes_2, + std::vector& shape_1, + std::vector& shape_2, int layer_index_1, + int layer_index_2) { + if (nums_1 != nums_2 || layer_index_1 != layer_index_2) { + return false; + } + + if (sizes_1.size() != sizes_2.size() || shape_1.size() != shape_2.size()) { + return false; + } + + for (size_t i = 0; i < sizes_1.size(); ++i) { + if (sizes_1[i] != sizes_2[i]) { + return false; + } + } + + for (size_t i = 0; i < shape_1.size(); ++i) { + if (shape_1[i] != shape_2[i]) { + return false; + } + } + + return true; +} + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/src/storage/vllm_kv_storage.h b/modules/vllm-kv-cache/src/storage/vllm_kv_storage.h new file mode 100644 index 000000000..e2068c115 --- /dev/null +++ b/modules/vllm-kv-cache/src/storage/vllm_kv_storage.h @@ -0,0 +1,398 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef MODULES_VLLM_KV_CACHE_SRC_STORAGE_VLLM_KV_STORAGE_H_ +#define MODULES_VLLM_KV_CACHE_SRC_STORAGE_VLLM_KV_STORAGE_H_ + +#include +#include +#include +#include +#include +#include + +#include "client/client.h" +#include "common/util/monitor.h" +#include "common/util/status.h" +#include "vllm-kv-cache/ds/vllm_block.h" +#include "vllm-kv-cache/ds/vllm_layer.h" +#include "vllm-kv-cache/src/env.h" +#include "vllm-kv-cache/src/io/io_adaptor.h" + +#include "thread-pool/thread_pool.h" + +namespace vineyard { + +class VLLMKVStorage { + public: + VLLMKVStorage() = default; + + ~VLLMKVStorage() = default; + + static Status InitStorage( + uint64_t base_pointer, std::string ipc_socket, + std::string io_type = "aio", + bool enable_mem_copy = + VLLMKVCacheEnv::VineyardEnableVLLMKVCacheMemCopy() == "1", + bool direct_io = VLLMKVCacheEnv::VineyardEnableVLLMKVCacheDirectIO() == + "1"); + + static Status SetStorageBasePointer(uint64_t base_pointer) { + storage_base_pointer_ = base_pointer; + return Status::OK(); + } + + /** + * @brief Get the block location from vineyard. + * + * @param client The vineyard client. + * @param block_hash The vector of block hashes. + * @param locations The map to store the locations of the blocks. + * The key is the block name and the value is a set of locations (e.g., IPs). + * + * @return Status indicating success or failure of the operation. + */ + static Status GetBlockLocation(std::vector& block_hash, + std::vector>& locations, + std::string req_flag = ""); + + /** + * @brief Get the block kv cache object from vineyard with layerwise transfer. + * + * @param client The vineyard client. + * @param block_hash_vec The vector of block hashes. + * @param offsets_vec The vector of offsets for each block buffer. + * @param sizes_vec The vector of sizes for each block buffer. + * @param shape The shape of the block. + * @param layer_index The index of the layer in the shape. 
+ * @param block_builders The vector of shared pointers to VLLMBlockBuilder + * objects (Created by vineyard). + * @param layers The shared pointer to VLLMLayers object. + * @param rpc_endpoint The RPC endpoint to fetch the blocks. + * @param block_nums The number of blocks fetched. + * @return Status indicating success or failure of the operation. + */ + static Status GetBlockKVCacheLayerwise( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::string rpc_endpoint, + std::shared_ptr& layers, std::string req_flag = ""); + + /** + * @brief Put the block kv cache object to vineyard. + * + * @param client The vineyard client. + * @param block_hash_vec The vector of block hashes. + * @param offsets_vec The vector of offsets for each block buffer. + * @param sizes_vec The vector of sizes for each block buffer. + * @param shape The shape of the block. + * @param layer_index The index of the layer in the shape. + * @param statuses A map to store the status of each block put operation. + * + * @return Status indicating success or failure of the operation. + */ + static Status PutBlockKVCache(std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape, int layer_index, + std::vector& statuses, + std::string req_flag = ""); + + /** + * @brief Delete blocks from vineyard. + * + * @param client The vineyard client. + * @param block_hash_vec The vector of block hashes to + * delete. + * + * @return Status indicating success or failure of the operation. + */ + static Status DeleteBlocks(std::vector block_hash_vec, + std::string req_flag = ""); + + /** + * @brief Save blocks to disk. + * + * @param client The vineyard client. + * @param block_hash_vec The vector of block hashes to save. + * + * @return Status indicating success or failure of the operation. 
+ */ + static Status SaveToDisk(std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape, int layer_index, + std::vector& statuses, uint64_t ttl, + bool wait = true, std::string req_flag = ""); + + static Status LoadFromDisk(std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape, int layer_index, + std::vector& statuses, + std::string req_flag = ""); + + static void DumpMonitor() { + DUMP_MONITOR_HEADER(); + DUMP_MONITOR(load_from_disk_io_monitor_); + DUMP_MONITOR(load_memory_copy_monitor_); + DUMP_MONITOR(load_from_disk_monitor_); + DUMP_MONITOR(save_to_disk_io_monitor_); + DUMP_MONITOR(save_to_disk_monitor_); + DUMP_MONITOR(save_memory_copy_monitor_); + } + + static void InitMonitor() { + MONITOR_CLEAR(load_from_disk_io_monitor_, "LoadFromDisk_IO", + monitor::MILLISECONDS); + MONITOR_CLEAR(load_memory_copy_monitor_, "LoadMemoryCopy", + monitor::MILLISECONDS); + MONITOR_CLEAR(load_from_disk_monitor_, "LoadFromDisk", + monitor::MILLISECONDS); + MONITOR_CLEAR(save_to_disk_io_monitor_, "SaveToDisk_IO", + monitor::MILLISECONDS); + MONITOR_CLEAR(save_to_disk_monitor_, "SaveToDisk", monitor::MILLISECONDS); + MONITOR_CLEAR(save_memory_copy_monitor_, "SaveMemoryCopy", + monitor::MILLISECONDS); + } + + static void Hash2PrefixDirAndSuffixFile(const uint64_t hash_num, + std::string& prefix_dir, + std::string& file_path); + + static size_t GetIOProcessingRequestNums(); + + public: + // for mock test + static Status SetClientVec(std::vector> clients, + uint64_t threads) { + threads_ = threads / 2; + if (io_thread_pool_ == nullptr || copy_thread_pool_ == nullptr || + req_thread_vec_.size() == 0) { + io_thread_pool_ = std::make_shared(threads_); + copy_thread_pool_ = std::make_shared(threads_); + fast_opt_thread_pool_ = std::make_shared(threads_); + block_opt_thread_pool_ = std::make_shared(threads_); + for (uint64_t i = 0; i < threads_; ++i) { + req_thread_vec_.emplace_back( + 
std::make_shared(1)); // single thread pool + } + } + vineyard_clients_ = clients; + return Status::OK(); + } + + private: + static Status PutBlockKVCache(Client& client, + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape, int layer_index, + std::vector& statuses, + std::string& req_flag); + + static Status PutBlockKVCache(Client& client, + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, + std::vector& shape, int layer_index, + std::vector>& blocks, + std::vector& statuses, + std::string& req_flag); + + static Status GetBlockKVCacheLayerwise( + Client& client, std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::string rpc_endpoint, + std::shared_ptr& layers, std::string req_flag); + + static Status GetBlockLocation(Client& client, + std::vector& block_hash, + std::vector>& locations, + std::string& req_flag); + + static Status DeleteBlocks(Client& client, + std::vector block_hash_vec, + std::string& req_flag); + /** + * @brief Put the block kv cache object to vineyard. + * + * @param client The vineyard client. + * @param block_hash_vec The vector of block hashes. + * @param block_builders The vector of shared pointers to VLLMBlockBuilder + * objects (Provided by the user). + * @param statuses A map to store the status of each block put operation. + * + * @return Status indicating success or failure of the operation. 
+ */ + static Status PutBlockKVCache( + std::vector& block_hash_vec, + std::vector>& block_builders, + std::vector& statuses, std::string& req_flag); + + static Status PutBlockKVCache( + Client& client, std::vector& block_hash_vec, + std::vector>& block_builders, + std::vector& statuses, std::string& req_flag); + + static Status DeleteBlocks(Client& client, + std::vector>& blocks, + std::string& req_flag); + + static Status DeleteBlockBuilders( + std::vector>& block_builders, + std::string& req_flag); + + static Status DeleteBlockBuilders( + Client& client, + std::vector>& block_builders, + std::string& req_flag); + + static Status CleanBlockBlobs(Client& client, + std::vector block_meta_vec, + std::string& req_flag); + + static Status CleanBlockBuilderBlobs( + Client& client, std::shared_ptr block_builder, + std::string& req_flag); + + static Status GetIOAdaptor( + std::shared_ptr& io_adaptor, + std::string file_name); + + static std::string GetIOPathPrefix(); + + static Status GetIOTag(std::string& tag); + + static Status FilterFiles(std::vector& block_hash_vec, + std::vector& exist_block_vec, + std::vector& filtered_hash_vec, + std::vector& filtered_hash_index); + + static Status UpdateTTL(Client& client, std::vector& block_hash_vec, + uint64_t ttl, std::string& req_flag); + + static Status SaveToDiskWithCopy( + Client& client, std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, uint64_t ttl, bool wait, + std::string& req_flag); + + // without get meta from v6d + static Status SaveToDiskWithoutCopy( + Client& client, std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, uint64_t ttl, + std::string& req_flag); + + static Status SaveToDiskSubmitIO( + Client& client, + std::vector>& + io_adaptor_vec, + std::vector& file_size_vec, + std::vector>& data_ptr_vec, + std::vector& 
block_hash_vec, std::vector& statuses, + std::string req_flag); + + static Status SaveToDiskMoveFile( + Client& client, std::vector& block_hash_vec, + std::vector>& + io_adaptor_vec, + std::vector& file_name_vec, + std::vector& tmp_file_name_vec, + std::vector& status_vec, uint64_t ttl, std::string req_flag); + + // without put to v6d + static Status LoadFromDiskWithCopy( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, std::string& req_flag); + + // without put to v6d + static Status LoadFromDiskWithoutCopy( + std::vector& block_hash_vec, + std::vector>& offsets_vec, + std::vector>& sizes_vec, std::vector& shape, + int layer_index, std::vector& statuses, std::string& req_flag); + + static Status CopyBlockToMemory(Client& client, ObjectMeta& meta, + std::shared_ptr& data_ptr, + size_t& file_size); + + static Status CopyBlockToMemoryInternal(std::string& meta_str, + std::vector& offsets_vec, + std::vector& sizes_vec, + std::shared_ptr& data_ptr, + size_t& file_size); + + static Status WriteBlockToDisk( + Client& client, std::shared_ptr io_adaptor, + ObjectMeta& meta, std::vector>& statuses); + + static Status WriteBlockToDisk( + std::shared_ptr io_adaptor, + std::vector& offset_vec, std::vector& sizes_vec, + std::vector& shape, int layer_index, + std::vector>& statuses); + + static Status ReadBlockFromDisk( + std::shared_ptr io_adaptor, + std::vector& offset_vec, std::vector& size_vec, + std::vector& shape, int layer_index, + std::vector>& statuses); + + // for memcpy + static Status ReadBlockFromMemory(std::shared_ptr data_ptr, + size_t file_size, + std::vector& offsets_vec, + std::vector& sizes_vec, + std::vector& shape, + int layer_index); + + static Status CreateDirectoriesIfNotExists(const std::string& path); + + static std::atomic req_count_; + static uint64_t threads_; + static uint64_t storage_base_pointer_; + + static std::vector> vineyard_clients_; + static 
std::vector> req_thread_vec_; + static std::shared_ptr io_thread_pool_; + static std::shared_ptr copy_thread_pool_; + static std::shared_ptr fast_opt_thread_pool_; + static std::shared_ptr block_opt_thread_pool_; + static bool use_copy_; + static bool direct_io_; + + static monitor::Monitor load_from_disk_io_monitor_; + static monitor::Monitor load_memory_copy_monitor_; + static monitor::Monitor load_from_disk_monitor_; + static monitor::Monitor save_to_disk_io_monitor_; + static monitor::Monitor save_to_disk_monitor_; + static monitor::Monitor save_memory_copy_monitor_; + + static vllm_kv_cache::io::IOAdaptorFactory io_adaptor_factory_; + + friend class VLLMLayers; +}; + +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_SRC_STORAGE_VLLM_KV_STORAGE_H_ diff --git a/modules/vllm-kv-cache/src/vllm_kv_cache_util.cc b/modules/vllm-kv-cache/src/vllm_kv_cache_util.cc new file mode 100644 index 000000000..e3396120c --- /dev/null +++ b/modules/vllm-kv-cache/src/vllm_kv_cache_util.cc @@ -0,0 +1,154 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include + +#include "common/util/env.h" + +#include "common/util/logging.h" +#include "vllm-kv-cache/src/env.h" +#include "vllm-kv-cache/src/vllm_kv_cache_util.h" + +namespace vineyard { + +extern std::shared_ptr KVCacheHelper::construct_helper_pool_; +extern int KVCacheHelper::log_level_; + +Status KVCacheHelper::Init(size_t concurrency) { + if (construct_helper_pool_ == nullptr) { + construct_helper_pool_ = std::make_shared(concurrency); + } + std::string log_level_str = VineyardEnv::GetVineyardTraceLogLevel(); + try { + log_level_ = std::stoi(log_level_str); + } catch (...) { log_level_ = 0; } + LOG(INFO) << "Set KVCacheHelper log level to " << log_level_; + return Status::OK(); +} + +int KVCacheHelper::GetTraceLogLevel() { return log_level_; } + +std::string KVCacheHelper::GetBLockNamePrefix() { + return VLLMKVCacheEnv::GetVLLMBlockPrefix(); +} + +std::string KVCacheHelper::BuildBlockName(uint64_t hash) { + return GetBLockNamePrefix() + std::to_string(hash); +} + +uint64_t KVCacheHelper::GetLayer(std::vector& shape, + int layer_idnex) { + if (shape.size() == 0 || layer_idnex < 0 || + static_cast(layer_idnex) >= shape.size()) { + return 0; + } + return shape[layer_idnex]; +} + +uint64_t KVCacheHelper::GetBlobNumsPerLayer(std::vector& shape, + int layer_index) { + if (shape.size() < 2) { + return 0; + } + uint64_t blob_nums = 1; + for (size_t i = 0; i < shape.size(); ++i) { + blob_nums *= shape[i]; + } + return blob_nums / shape[layer_index]; +} + +uint64_t KVCacheHelper::GetContinuousBlobNums(std::vector& shape, + int layer_index) { + if (layer_index < 0 || static_cast(layer_index) >= shape.size()) { + return 0; + } + uint64_t continuous_block_nums = 1; + for (size_t i = layer_index + 1; i < shape.size(); ++i) { + continuous_block_nums *= shape[i]; + } + return continuous_block_nums; +} + +void KVCacheHelper::ShuffleBlockToLayer( + std::vector>& local_layer_blobs, + std::vector>& remote_layer_blobs, + std::vector>& 
layer_sizes, + std::vector>& local_blobs, + std::vector>& remote_blobs, + std::vector>& sizes_vec, std::vector& shape, + int layer_index) { + uint64_t layer_num = GetLayer(shape, layer_index); + local_layer_blobs.resize(layer_num); + remote_layer_blobs.resize(layer_num); + layer_sizes.resize(layer_num); + uint64_t layer_continuous_blob_num = + GetContinuousBlobNums(shape, layer_index); + /* + * time complexity: + * O(cycle) + * = O(block_num * blobs_per_block / (layer_num * layer_continuous_blob_num) * + * layer_index * layer_continuous_blob_num) = O(num_blobs) + */ + for (size_t block_index = 0; block_index < local_blobs.size(); + block_index++) { + for (size_t blob_index_base = 0; blob_index_base < local_blobs[0].size(); + blob_index_base += layer_continuous_blob_num * layer_num) { + for (uint64_t layer_index = 0; layer_index < layer_num; layer_index++) { + for (uint64_t blob_index = 0; blob_index < layer_continuous_blob_num; + blob_index++) { + local_layer_blobs[layer_index].push_back( + local_blobs[block_index][blob_index_base + + layer_index * layer_continuous_blob_num + + blob_index]); + remote_layer_blobs[layer_index].push_back( + remote_blobs[block_index] + [blob_index_base + + layer_index * layer_continuous_blob_num + + blob_index]); + layer_sizes[layer_index].push_back( + sizes_vec[block_index] + [blob_index_base + + layer_index * layer_continuous_blob_num + blob_index]); + } + } + } + } +} + +std::string KVCacheHelper::MicrosecondToTimestamp(int64_t microsecond) { + const uint64_t microseconds_per_second = 1000000; + std::time_t seconds_part = microsecond / microseconds_per_second; + uint64_t microseconds_part = microsecond % microseconds_per_second; + + struct tm time_info; + localtime_r(&seconds_part, &time_info); + + char buffer[32]; + size_t written_len = + strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", &time_info); + std::stringstream ss; + ss << std::setw(6) << std::setfill('0') << microseconds_part; + + if (written_len > 0) { + return 
std::string(buffer, written_len) + "." + ss.str(); + } + + return ""; +} + +} // namespace vineyard diff --git a/modules/vllm-kv-cache/src/vllm_kv_cache_util.h b/modules/vllm-kv-cache/src/vllm_kv_cache_util.h new file mode 100644 index 000000000..9112d8116 --- /dev/null +++ b/modules/vllm-kv-cache/src/vllm_kv_cache_util.h @@ -0,0 +1,73 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef MODULES_VLLM_KV_CACHE_SRC_VLLM_KV_CACHE_UTIL_H_ +#define MODULES_VLLM_KV_CACHE_SRC_VLLM_KV_CACHE_UTIL_H_ + +#include +#include +#include +#include +#include + +#include "client/ds/object_meta.h" +#include "common/util/logging.h" +#include "common/util/uuid.h" + +#include "thread-pool/thread_pool.h" + +namespace vineyard { + +class KVCacheHelper { + public: + static Status Init(size_t concurrency = std::thread::hardware_concurrency()); + + static int GetTraceLogLevel(); + + static std::string GetBLockNamePrefix(); + + static std::string BuildBlockName(uint64_t hash); + + static uint64_t GetLayer(std::vector& shape, int layer_idnex); + + static uint64_t GetBlobNumsPerLayer(std::vector& shape, + int layer_index); + + static uint64_t GetContinuousBlobNums(std::vector& shape, + int layer_index); + + static void ShuffleBlockToLayer( + std::vector>& local_layer_offsets, + std::vector>& remote_layer_blobs, + std::vector>& layer_sizes, + std::vector>& local_offsets, + std::vector>& remote_blobs, + std::vector>& sizes_vec, std::vector& shape, + 
int layer_index); + + static std::shared_ptr GetConstructThreadPool() { + return construct_helper_pool_; + } + + static std::string MicrosecondToTimestamp(int64_t microsecond); + + private: + static std::shared_ptr construct_helper_pool_; + static int log_level_; +}; + +} // namespace vineyard + +#endif // MODULES_VLLM_KV_CACHE_SRC_VLLM_KV_CACHE_UTIL_H_ diff --git a/modules/vllm-kv-cache/tests/vllm_storage_local_test.cc b/modules/vllm-kv-cache/tests/vllm_storage_local_test.cc new file mode 100644 index 000000000..1042efda9 --- /dev/null +++ b/modules/vllm-kv-cache/tests/vllm_storage_local_test.cc @@ -0,0 +1,433 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include + +#include +#include + +#include "client/client.h" +#include "common/util/logging.h" +#include "vllm-kv-cache/ds/vllm_block.h" +#include "vllm-kv-cache/ds/vllm_layer.h" +#include "vllm-kv-cache/src/storage/vllm_kv_storage.h" + +using namespace vineyard; // NOLINT(build/namespaces) + +std::string ipc_socket; + +int fd = -1; +uint64_t base = 0; +size_t map_size = 0; +size_t memory_size = 0; + +// Model config, here just use a fake config for testing +uint64_t layer_num = 1; +uint64_t kv_num = 2; +uint64_t buffer_num = 1; +uint64_t buffer_size = 128 * 1024; // 128KB +std::vector shape = {layer_num, kv_num, + buffer_num}; // layer, kv, buffers +int layer_index = 0; // means that the layer index in shape + +Client client; + +struct ModelConfig { + uint64_t layer_num; + uint64_t kv_num; + uint64_t buffer_num; + uint64_t buffer_size; + std::vector shape; + int layer_index; +}; + +struct VLLMBLock_ { + uint64_t block_hash_; + std::vector offsets_; + std::vector sizes_; + std::vector shape_ = shape; + int layer_index_ = layer_index; +}; + +// Simulate a VLLMBLockAllocator for testing. +class FakeVLLMBLockAllocator { + public: + FakeVLLMBLockAllocator() = default; + + // Init the whole memory region and separate into blocks. + // For testing, we just create not more than 1000 blocks. 
+ void Init(void* memory_addr, size_t size, ModelConfig model_config) { + LOG(INFO) << "FakeVLLMBLockAllocator Init called with config:" + << " layer_num=" << model_config.layer_num + << ", kv_num=" << model_config.kv_num + << ", buffer_num=" << model_config.buffer_num + << ", buffer_size=" << model_config.buffer_size; + memory_addr_ = memory_addr; + size_ = size; + model_config_ = model_config; + uint64_t block_size = model_config_.layer_num * model_config_.kv_num * + model_config_.buffer_num * model_config_.buffer_size; + num_blocks_ = size_ / block_size; + num_blocks_ = + std::min(num_blocks_, + static_cast(1000)); // limit to 1k blocks for fake + + LOG(INFO) << "Calculating " << num_blocks_ << " blocks for memory size " + << size_ << " with block size " << block_size << "."; + LOG(INFO) << "Buffer num per blocks:" + << model_config_.buffer_num * model_config_.kv_num * + model_config_.layer_num + << ", each buffer size: " << model_config_.buffer_size; + + uint64_t per_layer_size = model_config_.kv_num * model_config_.buffer_num * + model_config_.buffer_size; + uint64_t per_kv_size = model_config_.buffer_num * model_config_.buffer_size; + LOG(INFO) << "Per layer size: " << per_layer_size + << ", per kv size: " << per_kv_size; + for (uint64_t i = 0; i < num_blocks_; ++i) { + VLLMBLock_ block; + // calculate offsets and sizes + // whole memory laout is layer * kv * block * buffer + for (uint64_t l = 0; l < model_config_.layer_num; ++l) { + for (uint64_t k = 0; k < model_config_.kv_num; ++k) { + for (uint64_t b = 0; b < model_config_.buffer_num; ++b) { + uint64_t offset = i * block_size + l * per_layer_size + + k * per_kv_size + b * model_config_.buffer_size; + block.offsets_.push_back(offset); + block.sizes_.push_back(model_config_.buffer_size); + } + } + } + free_block_queue_.push(block); + } + LOG(INFO) << "FakeVLLMBLockAllocator initialized with " << num_blocks_ + << " blocks."; + } + + // Release all resources. 
+ void Release() { + LOG(INFO) << "FakeVLLMBLockAllocator Release called."; + memory_addr_ = nullptr; + size_ = 0; + model_config_ = ModelConfig(); + num_blocks_ = 0; + while (!free_block_queue_.empty()) { + free_block_queue_.pop(); + } + LOG(INFO) << "FakeVLLMBLockAllocator released."; + } + + // Allocate blocks from the free queue. + Status AllocateBlocks(uint64_t block_nums, std::vector& blocks) { + LOG(INFO) << "AllocateBlocks called for " << block_nums << " blocks."; + if (block_nums > free_block_queue_.size()) { + LOG(ERROR) << "Not enough blocks to allocate, requested: " << block_nums + << ", available: " << (num_blocks_ - free_block_queue_.size()); + return Status::Invalid("Not enough blocks to allocate"); + } + + while (block_nums > 0) { + VLLMBLock_ block; + if (!free_block_queue_.empty()) { + block = free_block_queue_.front(); + block.block_hash_ = block_nums; // just a fake hash + free_block_queue_.pop(); + } else { + return Status::Invalid("No more free blocks available"); + } + blocks.push_back(block); + --block_nums; + } + LOG(INFO) << "Allocated " << blocks.size() << " blocks."; + return Status::OK(); + } + + // Free blocks back to the free queue. + Status FreeBlocks(std::vector& blocks) { + LOG(INFO) << "FreeBlocks called for " << blocks.size() << " blocks."; + while (!blocks.empty()) { + free_block_queue_.push(blocks.back()); + blocks.pop_back(); + } + LOG(INFO) << "Freed blocks."; + return Status::OK(); + } + + private: + void* memory_addr_; + size_t size_; + ModelConfig model_config_; + uint64_t num_blocks_; + std::queue free_block_queue_; + + std::vector layer_offsets; // size equals to layer_num, means the + // offsets of each layer in the buffer + std::vector kv_offsets; // size equals to kv_num +}; + +std::shared_ptr fake_block_allocator; + +/* + * Initialize the VLLMKVStorage with mmaped memory. Then initialize the + * FakeVLLMBLockAllocator. 
Allocator will manage the mmap memory region + * so that when data swap to host memory, it can be seened by vineyardd, + * which means that vineyardd can send the data to other vineyardd with + * out extra copy. + */ +void init() { + VINEYARD_CHECK_OK(client.Connect(ipc_socket)); + LOG(INFO) << "Connected to IPCServer: " << ipc_socket; + uint64_t offset; + VINEYARD_CHECK_OK(client.GetVineyardMmapFd(fd, map_size, offset)); + LOG(INFO) << "Mmaped fd: " << fd << ", length: " << map_size + << ", offset: " << offset; + base = reinterpret_cast( + mmap(nullptr, map_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); + if (base == reinterpret_cast(MAP_FAILED)) { + LOG(ERROR) << "Failed to mmap received fd as a writable buffer: " + << strerror(errno); + throw std::runtime_error("Failed to mmap memory"); + } + + base = reinterpret_cast(base) + offset; // Adjust base address + Status status = + VLLMKVStorage::InitStorage(base, ipc_socket, "aio", true, false); + if (!status.ok()) { + LOG(ERROR) << "InitStorage failed: " << status; + exit(-1); + } + memory_size = map_size - offset; + LOG(INFO) << "Mmaped fd: " << fd << ", length: " << map_size + << ", free size: " << memory_size << ", offset: " << offset + << ", base address: " << std::hex << base; + + fake_block_allocator = std::make_shared(); + fake_block_allocator->Init(reinterpret_cast(base), memory_size, + ModelConfig{layer_num, kv_num, buffer_num, + buffer_size, shape, layer_index}); + + VLLMKVStorage::InitStorage(base, ipc_socket); + LOG(INFO) << "VLLMKVStorage initialized."; +} + +/* + * This function simulates swapping data from GPU to host memory. + * It fills the allocated blocks with fake data for testing. 
+ */ +Status fake_swap_from_gpu(std::vector& blocks) { + LOG(INFO) << "Fake swap from GPU for " << blocks.size() << " blocks."; + for (auto& block : blocks) { + for (auto offset : block.offsets_) { + LOG(INFO) << "Filling fake data at offset: " << offset + << ", size: " << buffer_size; + uint8_t* ptr = reinterpret_cast(base + offset); + for (size_t i = 0; i < buffer_size; ++i) { + ptr[i] = static_cast(i); // fake data + } + } + } + return Status::OK(); +} + +/* + * This function show how to save blocks to vineyardd via VLLMKVStorage. + * When a put operation is returned ok, it means that vineyardd can access + * the data in the mmaped memory region directly. + */ +Status save_to_v6d(std::vector& blocks) { + LOG(INFO) << "Save to v6d for " << blocks.size() << " blocks."; + std::vector block_hash_vec; + std::vector> offsets_vec; + std::vector> sizes_vec; + + for (auto& block : blocks) { + block_hash_vec.push_back(block.block_hash_); + offsets_vec.push_back(block.offsets_); + sizes_vec.push_back(block.sizes_); + } + + std::vector statuses; + VINEYARD_CHECK_OK(VLLMKVStorage::PutBlockKVCache( + block_hash_vec, offsets_vec, sizes_vec, shape, layer_index, statuses, + "test-request")); + for (const auto& status : statuses) { + if (!status.ok()) { + LOG(ERROR) << "Failed to put block to v6d, error: " << status.ToString(); + return status; + } + } + + return Status::OK(); +} + +/* + * This function show how to delete blocks from vineyardd via VLLMKVStorage. + * Because the memory management is at client side, this API just detele + * the metadata in vineyardd. After delete, vineyardd do not have the block + * info so that this block is unvisible to other vineyardd. + */ +Status delete_from_v6d(std::vector& block_hashes) { + return VLLMKVStorage::DeleteBlocks(block_hashes, "test-delete-request"); +} + +// This function show how to save blocks to disk via VLLMKVStorage. 
+Status save_to_disk(std::vector& blocks) { + LOG(INFO) << "Save to disk for " << blocks.size() << " blocks."; + std::vector block_hash_vec; + std::vector> offsets_vec; + std::vector> sizes_vec; + + for (auto& block : blocks) { + block_hash_vec.push_back(block.block_hash_); + offsets_vec.push_back(block.offsets_); + sizes_vec.push_back(block.sizes_); + } + + std::vector statuses; + VINEYARD_CHECK_OK(VLLMKVStorage::SaveToDisk( + block_hash_vec, offsets_vec, sizes_vec, shape, layer_index, statuses, 5, + true, "test-disk-request")); + for (const auto& status : statuses) { + if (!status.ok()) { + LOG(ERROR) << "Failed to put block to disk, error: " << status.ToString(); + return status; + } + } + + return Status::OK(); +} + +// This function show how to load blocks from disk via VLLMKVStorage. +Status load_from_disk(std::vector& block_hashes, + std::vector& blocks) { + LOG(INFO) << "Load from disk for " << block_hashes.size() << " blocks."; + RETURN_ON_ASSERT(block_hashes.size() == blocks.size(), + "block_hashes.size() and blocks.size() must be equal"); + std::vector> offsets_vec; + std::vector> sizes_vec; + + for (size_t i = 0; i < block_hashes.size(); ++i) { + std::vector offsets = blocks[i].offsets_; + std::vector sizes = blocks[i].sizes_; + offsets_vec.push_back(offsets); + sizes_vec.push_back(sizes); + } + std::vector statuses; + VINEYARD_CHECK_OK(VLLMKVStorage::LoadFromDisk( + block_hashes, offsets_vec, sizes_vec, shape, layer_index, statuses, + "test-load-disk-request")); + for (size_t i = 0; i < statuses.size(); ++i) { + if (!statuses[i].ok()) { + LOG(ERROR) << "Failed to load block from disk, error: " + << statuses[i].ToString(); + return statuses[i]; + } + } + + return Status::OK(); +} + +Status check_blocks_equal(std::vector& blocks1, + std::vector& blocks2) { + RETURN_ON_ASSERT(blocks1.size() == blocks2.size(), + "blocks1.size() and blocks2.size() must be equal"); + for (size_t i = 0; i < blocks1.size(); ++i) { + auto& block1 = blocks1[i]; + auto& block2 = 
blocks2[i]; + RETURN_ON_ASSERT(block1.block_hash_ == block2.block_hash_, + "block hashes are not equal"); + RETURN_ON_ASSERT(block1.offsets_.size() == block2.offsets_.size(), + "offsets sizes are not equal"); + RETURN_ON_ASSERT(block1.sizes_.size() == block2.sizes_.size(), + "sizes sizes are not equal"); + for (size_t j = 0; j < block1.offsets_.size(); ++j) { + uint8_t* ptr1 = reinterpret_cast(base + block1.offsets_[j]); + uint8_t* ptr2 = reinterpret_cast(base + block2.offsets_[j]); + for (size_t k = 0; k < block1.sizes_[j]; ++k) { + RETURN_ON_ASSERT( + ptr1[k] == ptr2[k], + "block data are not equal at block " + std::to_string(i) + + ", buffer " + std::to_string(j) + ", byte " + + std::to_string(k) + ". Block 1 data is " + + std::to_string(static_cast(ptr1[k])) + + ", Block 2 data is " + + std::to_string(static_cast(ptr2[k])) + + ". Block 1 offset is " + std::to_string(block1.offsets_[j]) + + ", Block 2 offset is " + std::to_string(block2.offsets_[j])); + } + } + } + return Status::OK(); +} + +int main(int argc, char** argv) { + ipc_socket = std::string("/tmp/vineyard1.sock"); + std::string disk_file_path = "/tmp/vllm_kv_cache_disk_dir"; + if (setenv("VINEYARD_VLLM_KV_CACHE_DISK_PATH", disk_file_path.c_str(), 1) != + 0) { + LOG(ERROR) << "Failed to set VINEYARD_VLLM_KV_CACHE_DISK_PATH environment " + "variable"; + return -1; + } else { + LOG(INFO) << "Set VINEYARD_VLLM_KV_CACHE_DISK_PATH to " << disk_file_path + << ", read env:" + << VLLMKVCacheEnv::GetVineyardVLLMKVCacheDiskPath(); + if (!std::filesystem::exists(disk_file_path)) { + std::filesystem::create_directories(disk_file_path); + } + } + + uint64_t num_blocks = 4; + init(); + + std::vector blocks; + fake_block_allocator->AllocateBlocks(num_blocks, blocks); + + VINEYARD_CHECK_OK(fake_swap_from_gpu(blocks)); + save_to_v6d(blocks); + + std::vector block_hashes; + for (auto& block : blocks) { + block_hashes.push_back(block.block_hash_); + } + VINEYARD_CHECK_OK(delete_from_v6d(block_hashes)); + + 
VINEYARD_CHECK_OK(save_to_disk(blocks)); + + std::vector loaded_blocks; + VINEYARD_CHECK_OK( + fake_block_allocator->AllocateBlocks(num_blocks, loaded_blocks)); + VINEYARD_CHECK_OK(load_from_disk(block_hashes, loaded_blocks)); + + VINEYARD_CHECK_OK(check_blocks_equal(blocks, loaded_blocks)); + + LOG(INFO) << "All tests passed!"; + + fake_block_allocator->FreeBlocks(blocks); + fake_block_allocator->FreeBlocks(loaded_blocks); + + fake_block_allocator->Release(); + + if (munmap(reinterpret_cast(base), map_size) != 0) { + LOG(ERROR) << "Failed to munmap memory: " << strerror(errno); + return -1; + } + + client.Disconnect(); + + std::filesystem::remove_all(disk_file_path); + + return 0; +} diff --git a/python/client.cc b/python/client.cc index 6f0f5fc62..ed280e244 100644 --- a/python/client.cc +++ b/python/client.cc @@ -363,6 +363,34 @@ void bind_client(py::module& mod) { } }, "stream"_a, py::arg("drop_metadata") = true) + .def( + "create_fixed_stream", + [](ClientBase* self, std::string stream_name, int blob_num, + size_t blob_size) -> ObjectIDWrapper { + ObjectID stream_id; + throw_on_error(self->CreateFixedStream(stream_id, stream_name, + blob_num, blob_size)); + return stream_id; + }, + "stream_name"_a, "blob_num"_a, "blob_size"_a) + .def( + "push_next_stream_chunk_by_offset", + [](ClientBase* self, ObjectID const id, uint64_t offset) { + throw_on_error(self->PushNextStreamChunkByOffset(id, offset)); + }, + "id"_a, "offset"_a, py::call_guard()) + .def( + "close_stream", + [](ClientBase* self, ObjectID const id) { + throw_on_error(self->CloseStream(id)); + }, + "id"_a, py::call_guard()) + .def( + "delete_stream", + [](ClientBase* self, ObjectID const id) { + throw_on_error(self->DeleteStream(id)); + }, + "id"_a) .def( "persist", [](ClientBase* self, const ObjectIDWrapper object_id) { @@ -759,6 +787,113 @@ void bind_client(py::module& mod) { } }, "stream"_a) + .def( + "vineyard_open_remote_fixed_stream_with_id", + [](Client* self, ObjectID remote_id, ObjectID 
local_id, int blob_nums, + size_t size, std::string remote_endpoint, std::string mode, + bool wait, uint64_t timeout) { + int fd = -1; + if (mode == "r") { + throw_on_error(self->VineyardOpenRemoteFixedStream( + remote_id, local_id, fd, blob_nums, size, remote_endpoint, + StreamOpenMode::read, wait, timeout)); + } else { + throw_on_error(Status::AssertionFailed("Mode can only be 'r'")); + } + return fd; + }, + "remote_id"_a, "local_id"_a, "blob_nums"_a, "sizes"_a, + "remote_endpoint"_a, "mode"_a, "wait"_a, "timeout"_a, + py::call_guard()) + .def( + "vineyard_open_remote_fixed_stream_with_name", + [](Client* self, std::string remote_stream_name, ObjectID local_id, + int blob_nums, size_t size, std::string remote_endpoint, + std::string mode, bool wait, uint64_t timeout) { + int fd = -1; + if (mode == "r") { + throw_on_error(self->VineyardOpenRemoteFixedStream( + remote_stream_name, local_id, fd, blob_nums, size, + remote_endpoint, StreamOpenMode::read, wait, timeout)); + } else { + throw_on_error(Status::AssertionFailed("Mode can only be 'r'")); + } + return fd; + }, + "remote_stream_name"_a, "local_id"_a, "blob_nums"_a, "sizes"_a, + "remote_endpoint"_a, "mode"_a, "wait"_a, "timeout"_a, + py::call_guard()) + .def( + "vineyard_activate_remote_fixed_stream_with_offset", + [](Client* self, ObjectID local_id, std::vector& offsets) { + throw_on_error(self->VineyardActivateRemoteFixedStreamWithOffset( + local_id, offsets)); + }, + "local_id"_a, "offsets"_a, py::call_guard()) + .def( + "vineyard_get_next_fixed_stream_chunk", + [](Client* self) -> int { + int index = -1; + throw_on_error(self->VineyardGetNextFixedStreamChunk(index)); + return index; + }, + py::call_guard()) + .def( + "open_fixed_stream", + [](Client* self, ObjectID stream_id, std::string mode) { + int fd = -1; + if (mode == "w") { + throw_on_error( + self->OpenFixedStream(stream_id, StreamOpenMode::write, fd)); + } else { + throw_on_error(Status::AssertionFailed("Mode can only be 'w'")); + } + return fd; + 
}, + "stream"_a, "mode"_a, py::call_guard()) + .def( + "vineyard_close_remote_fixed_stream", + [](Client* self, ObjectID local_id) { + throw_on_error(self->VineyardCloseRemoteFixedStream(local_id)); + }, + "local_id"_a, py::call_guard()) + .def( + "vineyard_abort_remote_stream", + [](Client* self, ObjectID local_id) { + bool success = false; + throw_on_error(self->VineyardAbortRemoteStream(local_id, success)); + return success; + }, + "local_id"_a, py::call_guard()) + .def( + "abort_stream", + [](Client* self, ObjectID local_id) { + bool success = false; + throw_on_error(self->AbortStream(local_id, success)); + return success; + }, + "local_id"_a, py::call_guard()) + .def( + "check_fixed_stream_received", + [](Client* self, ObjectID local_id, int index) -> bool { + bool received = false; + throw_on_error( + self->CheckFixedStreamReceived(local_id, index, received)); + return received; + }, + "local_id"_a, "index"_a) + .def("get_vineyard_mmap_fd", + [](Client* self) -> std::vector { + int fd = -1; + size_t size = 0; + size_t offset = 0; + std::vector ret; + throw_on_error(self->GetVineyardMmapFd(fd, size, offset)); + ret.push_back(fd); + ret.push_back(size); + ret.push_back(offset); + return ret; + }) .def( "allocated_size", [](Client* self, const ObjectID id) -> size_t { diff --git a/python/vineyard/core/client.py b/python/vineyard/core/client.py index 2bb0b1c2f..0f2207d10 100644 --- a/python/vineyard/core/client.py +++ b/python/vineyard/core/client.py @@ -21,6 +21,8 @@ import warnings from concurrent.futures import ThreadPoolExecutor from concurrent.futures import as_completed +from concurrent.futures import Future +import threading from typing import Any from typing import Dict from typing import List @@ -43,9 +45,67 @@ from vineyard._C import _connect from vineyard.core.builder import BuilderContext from vineyard.core.builder import put +from vineyard.core.resolver import get_current_resolvers from vineyard.core.resolver import ResolverContext from 
vineyard.core.resolver import get +class AsyncFixedStreamChunk: + + def __init__(self, client, chunk_nums): + self.client = client + self._chunk_nums = chunk_nums + self.future : Optional[Future] = None + self._reader_index = 0 + self._writer_index = 0 + self._ready_list = [] + self._exception : Optional[Exception] = None + self._lock = threading.RLock() + self._start_fetch() + + + def _start_fetch(self): + self.future = self.client._async_task_thread_pool.submit(self._fetch) + self.future.add_done_callback(self._callback) + + def _fetch(self): + try: + try: + self._lock.release() + except Exception as e: + print("Error in AsyncFixedStreamChunk fetch release lock:", e) + pass + + index = self.client.vineyard_get_next_fixed_stream_chunk() + with self._lock: + self._ready_list.append(index) + self._writer_index += 1 + except Exception as e: + self._exception = e + + def _callback(self, future): + try: + future.result() + except Exception as e: + self._exception = e + finally: + with self._lock: + if self._writer_index < self._chunk_nums: + self._start_fetch() + + def get(self) -> int: + if self._exception: + raise self._exception + + try: + with self._lock: + self._ready_list.index(self._reader_index) + self._reader_index += 1 + index = self._reader_index - 1 + return index + except Exception as e: + print("Error in AsyncFixedStreamChunk get:", e) + pass + return -1 def _apply_docstring(func): def _apply(fn): @@ -168,6 +228,7 @@ def __init__( session: int = None, username: str = None, password: str = None, + max_workers: int = 8, config: str = None, ): """Connects to the vineyard IPC socket and RPC socket. @@ -211,6 +272,8 @@ def __init__( is enabled. password: Optional, the required password of vineyardd when authentication is enabled. + max_workers: Optional, the maximum number of threads that can be used to + asynchronously get/put objects from/to vineyard. Default is 8. 
config: Optional, can either be a path to a YAML configuration file or a path to a directory containing the default config file `vineyard-config.yaml`. Also, the environment variable @@ -292,6 +355,10 @@ def __init__( self._spread = False self._compression = True + + # Initialize thread pool for lazy_get + self._async_task_thread_pool = ThreadPoolExecutor(max_workers=max_workers) + if self._ipc_client is None and self._rpc_client is None: raise ConnectionError( "Failed to connect to vineyard via both IPC and RPC connection. " @@ -385,10 +452,28 @@ def create_stream(self, id: ObjectID) -> None: def open_stream(self, id: ObjectID, mode: str) -> None: return self.default_client().open_stream(id, mode) + @_apply_docstring(IPCClient.close_stream) + def close_stream(self, id: ObjectID) -> None: + return self.default_client().close_stream(id) + + @_apply_docstring(IPCClient.delete_stream) + def delete_stream(self, id: ObjectID) -> None: + return self.default_client().delete_stream(id) + + @_apply_docstring(IPCClient.create_fixed_stream) + def create_fixed_stream(self, stream_name: str, blob_num: int, size: int) -> ObjectID: + return self.default_client().create_fixed_stream(stream_name, blob_num, size) + @_apply_docstring(IPCClient.push_chunk) def push_chunk(self, stream_id: ObjectID, chunk: ObjectID) -> None: return self.default_client().push_chunk(stream_id, chunk) + @_apply_docstring(IPCClient.push_next_stream_chunk_by_offset) + def push_next_stream_chunk_by_offset( + self, stream_id: ObjectID, offset: int + ) -> None: + return self.default_client().push_next_stream_chunk_by_offset(stream_id, offset) + @_apply_docstring(IPCClient.next_chunk_id) def next_chunk_id(self, stream_id: ObjectID) -> ObjectID: return self.default_client().next_chunk_id(stream_id) @@ -652,6 +737,47 @@ def new_buffer_chunk(self, stream: ObjectID, size: int) -> memoryview: def next_buffer_chunk(self, stream: ObjectID) -> memoryview: return self.ipc_client.next_buffer_chunk(stream) + 
@_apply_docstring(IPCClient.vineyard_open_remote_fixed_stream_with_id) + def vineyard_open_remote_fixed_stream_with_id(self, remote_id: ObjectID, local_id: ObjectID, blob_nums: int, size: int, remote_endpoint: str, mode: str, wait: bool, timeout: int) -> int: + return self.ipc_client.vineyard_open_remote_fixed_stream_with_id(remote_id, local_id, blob_nums, size, remote_endpoint, mode, wait, timeout) + + @_apply_docstring(IPCClient.vineyard_open_remote_fixed_stream_with_name) + def vineyard_open_remote_fixed_stream_with_name(self, remote_name: str, local_id: ObjectID, blob_nums: int, size: int, remote_endpoint: str, mode: str, wait: bool, timeout: int) -> int: + return self.ipc_client.vineyard_open_remote_fixed_stream_with_name(remote_name, local_id, blob_nums, size, remote_endpoint, mode, wait, timeout) + + @_apply_docstring(IPCClient.vineyard_activate_remote_fixed_stream_with_offset) + def vineyard_activate_remote_fixed_stream_with_offset(self, stream_id: ObjectID, offsets: List[int]) -> None: + return self.ipc_client.vineyard_activate_remote_fixed_stream_with_offset(stream_id, offsets) + + # List[0]: fd, List[1]: size, List[2]: offset + @_apply_docstring(IPCClient.get_vineyard_mmap_fd) + def get_vineyard_mmap_fd(self) -> List[int]: + return self.ipc_client.get_vineyard_mmap_fd() + + @_apply_docstring(IPCClient.vineyard_get_next_fixed_stream_chunk) + def vineyard_get_next_fixed_stream_chunk(self) -> int: + return self.ipc_client.vineyard_get_next_fixed_stream_chunk() + + @_apply_docstring(IPCClient.open_fixed_stream) + def open_fixed_stream(self, stream_id: ObjectID, mode: str) -> None: + return self.ipc_client.open_fixed_stream(stream_id, mode) + + @_apply_docstring(IPCClient.vineyard_close_remote_fixed_stream) + def vineyard_close_remote_fixed_stream(self, stream_id: ObjectID) -> None: + return self.ipc_client.vineyard_close_remote_fixed_stream(stream_id) + + @_apply_docstring(IPCClient.vineyard_abort_remote_stream) + def vineyard_abort_remote_stream(self, 
stream_id: ObjectID) -> bool: + return self.ipc_client.vineyard_abort_remote_stream(stream_id) + + @_apply_docstring(IPCClient.abort_stream) + def abort_stream(self, stream_id: ObjectID) -> bool: + return self.ipc_client.abort_stream(stream_id) + + @_apply_docstring(IPCClient.check_fixed_stream_received) + def check_fixed_stream_received(self, stream_id: ObjectID, index: int) -> bool: + return self.ipc_client.check_fixed_stream_received(stream_id, index) + @_apply_docstring(IPCClient.allocated_size) def allocated_size(self, object_id: Union[Object, ObjectID]) -> int: return self.ipc_client.allocated_size(object_id) @@ -874,5 +1000,7 @@ def with_spread(self, enabled: bool = True): yield self.spread = tmp_spread + def vineyard_get_next_fixed_stream_chunk_async(self, nums) -> AsyncFixedStreamChunk: + return AsyncFixedStreamChunk(self, nums) __all__ = ['Client'] diff --git a/python/vineyard/core/tests/fixed_stream_receiver.py b/python/vineyard/core/tests/fixed_stream_receiver.py new file mode 100644 index 000000000..dc302083e --- /dev/null +++ b/python/vineyard/core/tests/fixed_stream_receiver.py @@ -0,0 +1,105 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright 2020-2023 Alibaba Group Holding Limited. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

from datetime import datetime
import mmap
import sys
import time

import vineyard
from vineyard._C import ObjectID
from vineyard.io.fixed_blob import FixedBlobStream

blob_num = 10
blob_size = 1024 * 1024 * 2


def run_receiver(client: vineyard.Client, mm: mmap.mmap,
                 ipc_socket: str, rpc_endpoint: str):
    """Open the remote fixed stream, wait for every chunk, verify payloads.

    Each received chunk is expected to contain the byte pattern
    0,1,...,255,0,... written by the sender.  Aborts the stream if any
    chunk fails to arrive.
    """
    fixed_blob_stream = FixedBlobStream.new(
        client, "test-stream-5", blob_num, blob_size, True, rpc_endpoint
    )
    stream_reader = fixed_blob_stream.open_reader(client, True, 10000)

    offset_list = [i * blob_size for i in range(blob_num)]
    stream_reader.activate_stream_with_offset(offset_list)

    # Index -1 queries completion of the whole stream.
    total_finished = stream_reader.check_block_received(-1)
    print("Stream is :", "finished" if total_finished else "not finished")

    for i in range(blob_num):
        finished = False
        while not finished:
            try:
                finished = stream_reader.check_block_received(i)
            except Exception as e:
                print(f"Error checking block {i}: {e}")
                break
            # Bug fix: only wait when the chunk is not yet here (the original
            # printed and slept 0.2s even after a successful check).
            if not finished:
                print(f"Waiting for chunk {i}...")
                time.sleep(0.2)

        if finished is not True:
            # Receiving failed; keep retrying the abort until acknowledged.
            while True:
                if stream_reader.abort():
                    print("Stream aborted, bye...")
                    return

        for j in range(blob_size):
            assert mm.read_byte() == j % 256
        print("Chunk ", i, " received successfully")

    # Timing pass.  Bug fix: the original measured with
    # datetime.now().microsecond differences, which wrap (go negative)
    # across second boundaries; use the monotonic perf counter instead.
    for i in range(blob_num):
        finished = False
        while not finished:
            start = time.perf_counter()
            finished = stream_reader.check_block_received(i)
            elapsed_us = (time.perf_counter() - start) * 1_000_000
            print(f"check used time: {elapsed_us:.0f} us")

    start = time.perf_counter()
    total_finished = stream_reader.check_block_received(-1)
    elapsed_us = (time.perf_counter() - start) * 1_000_000
    print("Stream is :", "finished" if total_finished else "not finished")
    print("check all use time: ", elapsed_us, " us")
    stream_reader.finish_and_delete()


def __main__():
    """Connect, mmap the vineyard shared-memory fd and run the receiver."""
    arguments = sys.argv[1:]
    if len(arguments) < 2:
        print("Usage: fixed_stream_receiver.py <ipc_socket> <rpc_endpoint>")
        return 1

    ipc_socket = arguments[0]
    rpc_endpoint = arguments[1]
    client = vineyard.connect(ipc_socket)
    client.timeout_seconds = 5

    # Bug fix: do not shadow the ``list`` builtin.
    fd_info = client.get_vineyard_mmap_fd()  # [fd, size, offset]
    fd = fd_info[0]
    offset = fd_info[2]

    mm = mmap.mmap(fd, 0)
    mm.seek(offset)

    run_receiver(client, mm, ipc_socket, rpc_endpoint)


if __name__ == "__main__":
    __main__()
#
import sys
import mmap
import threading
from time import sleep

import vineyard
from vineyard.io.fixed_blob import FixedBlobStream

blob_num = 10
blob_size = 1024 * 1024 * 2


def check_received(client: vineyard.Client, stream_id: vineyard.ObjectID,
                   stream_writer):
    """Poll until the peer reports the whole stream received, then tear down.

    Runs on a helper thread.  ``client`` and ``stream_id`` are unused but
    kept for signature compatibility with existing callers.
    """
    finished = False
    while not finished:
        finished = stream_writer.check_block_received(-1)
        # Bug fix: only log/sleep while still waiting (the original slept
        # two extra seconds after the stream had already finished).
        if not finished:
            print("Waiting for stream to finish...")
            sleep(2)

    success = stream_writer.abort()
    print("Stream aborted: ", success)
    stream_writer.finish_and_delete()


def run_sender(client: vineyard.Client, mm: mmap.mmap):
    """Fill shared memory with the test pattern and push chunk offsets."""
    fixed_blob_stream = FixedBlobStream.new(
        client, "test-stream-5", blob_num, blob_size, False, ""
    )
    stream_writer = fixed_blob_stream.open_writer(client)

    # Write blob_num chunks of the pattern 0,1,...,255,0,...
    for _ in range(blob_num):
        for j in range(blob_size):
            mm.write_byte(j % 256)

    offset_list = [i * blob_size for i in range(blob_num)]

    # Bug fix: the original passed the *builtin* ``id`` function as the
    # stream id argument; pass the actual stream object id.
    watcher = threading.Thread(
        target=check_received,
        args=(client, fixed_blob_stream.meta.id, stream_writer),
    )
    watcher.start()

    for offset in offset_list:
        stream_writer.append(offset)
        sleep(1)

    watcher.join()


def __main__():
    arguments = sys.argv[1:]
    if len(arguments) < 1:
        # Bug fix: the usage line previously named the receiver script.
        print("Usage: fixed_stream_sender.py <ipc_socket>")
        return 1

    ipc_socket = arguments[0]
    client = vineyard.connect(ipc_socket)
    client.timeout_seconds = 5

    # Bug fix: do not shadow the ``list`` builtin.
    fd_info = client.get_vineyard_mmap_fd()  # [fd, size, offset]
    fd = fd_info[0]
    offset = fd_info[2]

    mm = mmap.mmap(fd, 0)
    mm.seek(offset)

    run_sender(client, mm)


if __name__ == "__main__":
    __main__()
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

''' This module exposes support for FixedBlobStream.
'''

import contextlib
import mmap
import os
from typing import List
from typing import Optional

from vineyard._C import InvalidException
from vineyard._C import ObjectMeta
from vineyard._C import ObjectID
from vineyard.core import context
from vineyard.io.stream import BaseStream


class FixedBlobStream(BaseStream):
    """A stream of a fixed number of fixed-size blobs, optionally backed by
    a remote vineyard instance.

    The first ``nums`` bytes of the shared status page mark per-block
    receipt; the last byte flags an error, with the error message stored in
    the ``error_msg_len`` bytes immediately before it.
    """

    def __init__(self, meta: ObjectMeta):
        super().__init__(meta)
        self.nums_ = meta['nums']            # number of blocks in the stream
        self.size_ = meta['size']            # size of each block, in bytes
        self.is_remote_ = meta['is_remote']  # whether backed by a remote peer
        self.rpc_endpoint_ = meta['rpc_endpoint']
        self.stream_name_ = meta['stream_name']
        self.mmap_size = 4096                # size of the shared status page
        self.error_msg_len = 256             # bytes reserved for error text

    @staticmethod
    def new(client,
            stream_name: str,
            nums: int,
            size: int,
            is_remote: bool = False,
            rpc_endpoint: Optional[str] = "") -> "FixedBlobStream":
        """Create a new fixed blob stream on the server and wrap it."""
        meta = ObjectMeta()
        meta['typename'] = 'vineyard::FixedBlobStream'
        meta['nums'] = nums
        meta['size'] = size
        meta['is_remote'] = is_remote
        meta['rpc_endpoint'] = rpc_endpoint
        meta['stream_name'] = stream_name

        meta.id = client.create_fixed_stream(stream_name, nums, size)
        return FixedBlobStream(meta)

    class Reader(BaseStream.Reader):
        """Read-side handle; chunk reads are not supported yet."""

        def __init__(self, stream: "FixedBlobStream"):
            self.stream_ = stream

        def next(self) -> object:
            raise NotImplementedError(
                "FixedBlobStream does not support read yet.")

        def next_metadata(self) -> ObjectMeta:
            raise NotImplementedError(
                "FixedBlobStream does not support read yet.")

        def activate_stream_with_offset(self, offsets: List[int]):
            self.stream_.activate_stream_with_offset(offsets)

        def abort(self) -> bool:
            return self.stream_.abort()

        def finish(self):
            self.stream_.close()

        def finish_and_delete(self):
            # Capture the client before close() drops the reference.
            client_ = self.stream_.client_
            self.stream_.close()
            FixedBlobStream.delete(client_, self.stream_)

        def check_block_received(self, index: int) -> bool:
            return self.stream_.check_block_received(index)

    class Writer(BaseStream.Writer):
        """Write-side handle; raw chunk writes are not supported yet."""

        def __init__(self, stream: "FixedBlobStream"):
            self.stream_ = stream

        def next(self, size: int) -> memoryview:
            raise NotImplementedError(
                "FixedBlobStream does not support write yet.")

        def append(self, offset: int):
            self.stream_.push_offset_block(offset)

        def fail(self):
            raise NotImplementedError(
                "FixedBlobStream does not support write yet.")

        def abort(self) -> bool:
            return self.stream_.abort()

        def finish(self):
            self.stream_.close()

        def finish_and_delete(self):
            # Capture the client before close() drops the reference.
            client_ = self.stream_.client_
            self.stream_.close()
            FixedBlobStream.delete(client_, self.stream_)

        def check_block_received(self, index: int) -> bool:
            return self.stream_.check_block_received(index)

    def open_reader(self, client, wait: bool = False, timeout: int = 0):
        self.open(client, "r", wait, timeout)
        return FixedBlobStream.Reader(self)

    def open_writer(self, client):
        self.open(client, "w")
        return FixedBlobStream.Writer(self)

    def open(self,
             client,
             mode,
             wait: bool = False,
             timeout: int = 0):
        """Open the stream and map its shared status page.

        Raises ValueError if the server refuses to hand out a valid fd.
        """
        self.client_ = client
        # Initialize the handles up-front so that close() is safe to call
        # even if one of the steps below fails (BUG FIX: the original
        # error path called close() before recv_mem_ existed, raising an
        # AttributeError that masked the real mmap failure).
        self.recv_mem_fd_ = -1
        self.recv_mem_ = None
        if self.is_remote_:
            self.recv_mem_fd_ = \
                self.client_.vineyard_open_remote_fixed_stream_with_name(
                    self.stream_name_, self.meta.id, self.nums_, self.size_,
                    self.rpc_endpoint_, mode, wait, timeout)
        else:
            self.recv_mem_fd_ = self.client_.open_fixed_stream(
                self.meta.id, mode)
        if self.recv_mem_fd_ < 0:
            raise ValueError("Failed to open remote fixed stream")
        try:
            self.recv_mem_ = mmap.mmap(
                self.recv_mem_fd_, self.mmap_size, access=mmap.ACCESS_READ)
        except Exception:
            self.close()
            raise

    def activate_stream_with_offset(self, offsets: List[int]):
        if not self.is_remote_:
            raise ValueError("The stream is not remote stream")
        self.client_.vineyard_activate_remote_fixed_stream_with_offset(
            self.meta.id, offsets)

    def push_offset_block(self, offsets: int):
        self.client_.push_next_stream_chunk_by_offset(self.meta.id, offsets)

    def check_block_received(self, index: int) -> bool:
        """Return whether block ``index`` (or, for -1, every block) has
        been received; raise InvalidException if the stream reported an
        error in the shared status page."""
        # Last byte of the page is the error flag; the message sits in the
        # error_msg_len bytes just before it, NUL-terminated.
        if self.recv_mem_[self.mmap_size - 1] != 0:
            self.recv_mem_.seek(self.mmap_size - self.error_msg_len - 1)
            error_msg = self.recv_mem_.read(self.error_msg_len)
            null_byte_index = error_msg.find(b'\0')
            if null_byte_index != -1:
                error_msg = error_msg[:null_byte_index]
            raise InvalidException(error_msg.decode('ascii'))

        if index == -1:
            # All blocks must be marked received.
            return all(self.recv_mem_[i] != 0 for i in range(self.nums_))
        if index < 0 or index >= self.nums_:
            raise ValueError("Invalid index")
        return self.recv_mem_[index] == 1

    def close(self):
        """Close the stream on the server and release the local mapping.

        Safe to call even if open() failed part-way; server-side close is
        best-effort.
        """
        try:
            if self.is_remote_:
                self.client_.vineyard_close_remote_fixed_stream(self.meta.id)
            else:
                self.client_.close_stream(self.meta.id)
        except Exception as e:
            print("error:", e)

        # BUG FIX: unmap before closing the backing fd, and guard both so a
        # half-completed open() doesn't trip over missing attributes.
        recv_mem = getattr(self, 'recv_mem_', None)
        if recv_mem is not None:
            recv_mem.close()
            self.recv_mem_ = None
        fd = getattr(self, 'recv_mem_fd_', -1)
        if fd is not None and fd >= 0:
            os.close(fd)
            self.recv_mem_fd_ = -1
        self.client_ = None

    def abort(self) -> bool:
        if self.is_remote_:
            return self.client_.vineyard_abort_remote_stream(self.meta.id)
        else:
            return self.client_.abort_stream(self.meta.id)

    @staticmethod
    def delete(client, fixed_blob_stream: "FixedBlobStream"):
        client.delete_stream(fixed_blob_stream.meta.id)


def fixed_blob_stream_resolver(obj, resolver):  # pylint: disable=unused-argument
    """Resolve a vineyard object into a FixedBlobStream wrapper."""
    meta = obj.meta
    return FixedBlobStream(meta)


def register_fixed_blob_stream_types(_builder_ctx, resolver_ctx):
    """Register the FixedBlobStream resolver into the given context."""
    if resolver_ctx is not None:
        resolver_ctx.register(
            'vineyard::FixedBlobStream', fixed_blob_stream_resolver
        )
resolver_ctx.register( + 'vineyard::FixedBlobStream', fixed_blob_stream_resolver + ) + + +@contextlib.contextmanager +def recordbatch_stream_context(): + with context() as (builder_ctx, resolver_ctx): + register_fixed_blob_stream_types(builder_ctx, resolver_ctx) + yield builder_ctx, resolver_ctx diff --git a/src/client/client.cc b/src/client/client.cc index 4ea6d0709..5ae35a100 100644 --- a/src/client/client.cc +++ b/src/client/client.cc @@ -19,6 +19,8 @@ limitations under the License. #include #include +#include +#include #include #include #include @@ -32,8 +34,11 @@ limitations under the License. #include "client/utils.h" #include "common/memory/cuda_ipc.h" #include "common/memory/fling.h" +#include "common/memory/memcpy.h" #include "common/util/env.h" +#include "common/util/get_tid.h" #include "common/util/protocols.h" +#include "common/util/sidecar.h" #include "common/util/status.h" #include "common/util/uuid.h" #include "common/util/version.h" @@ -108,7 +113,14 @@ Status BasicIPCClient::Open(std::string const& ipc_socket, return Status::OK(); } -Client::~Client() { Disconnect(); } +Client::~Client() { + Disconnect(); + if (extra_request_memory_addr_ != nullptr) { + munmap(extra_request_memory_addr_, extra_request_mem_size_); + extra_request_memory_addr_ = nullptr; + extra_request_mem_size_ = 0; + } +} Status Client::Connect() { auto ep = read_env("VINEYARD_IPC_SOCKET"); @@ -189,6 +201,75 @@ Status Client::GetMetaData(const ObjectID id, ObjectMeta& meta, return Status::OK(); } +Status Client::CreateHugeMetaData(std::vector& meta_datas, + std::vector& ids, + std::string& req_flag) { + ENSURE_CONNECTED(this); + std::vector computed_instance_ids(meta_datas.size(), + this->instance_id_); + + Signature signature; + InstanceID instance_id; + std::vector trees; + std::vector json_lengths; + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + for (auto& meta_data : meta_datas) { + 
meta_data.SetInstanceId(this->instance_id_); + // TODO: here do not support k8s + trees.emplace_back(std::move(json_to_string(meta_data.MetaData()))); + json_lengths.emplace_back(trees.back().size()); + } + + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + + RETURN_ON_ERROR(WriteExtraMsg(json_lengths.data(), + json_lengths.size() * sizeof(uint64_t))); + for (size_t i = 0; i < trees.size(); ++i) { + RETURN_ON_ERROR(WriteExtraMsg(trees[i].data(), trees[i].size())); + } + + std::string message_out; + WriteCreateHugeDatasRequest(trees.size(), message_out); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". CreateHugeMetaData Construct IPC msg cost:" << (end - start) + << " us." << std::endl; + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". CreateHugeMetaData IPC cost:" << (end - start) << " us." 
+ << std::endl; + size_t id_num; + RETURN_ON_ERROR( + ReadCreateHugeDatasReply(message_in, id_num, signature, instance_id)); + RETURN_ON_ASSERT(id_num == meta_datas.size(), + "mismatched number of created objects"); + + ids.resize(id_num); + RETURN_ON_ERROR(LseekExtraMsgReadPos(0)); + RETURN_ON_ERROR(ReadExtraMsg(ids.data(), id_num * sizeof(ObjectID))); + + for (size_t i = 0; i < meta_datas.size(); ++i) { + meta_datas[i].SetId(ids[i]); + meta_datas[i].SetSignature(signature); + meta_datas[i].SetClient(this); + meta_datas[i].SetInstanceId(instance_id); + } + return Status::OK(); +} + Status Client::FetchAndGetMetaData(const ObjectID id, ObjectMeta& meta, const bool sync_remote) { ObjectID local_object_id = InvalidObjectID(); @@ -227,6 +308,97 @@ Status Client::GetMetaData(const std::vector& ids, return Status::OK(); } +Status Client::GetHugeMetaData(const std::vector& ids, + std::vector& metas, + std::string& req_flag, const bool sync_remote, + bool fast_path) { + ENSURE_CONNECTED(this); + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + RETURN_ON_ERROR(WriteExtraMsg(ids.data(), ids.size() * sizeof(ObjectID))); + + std::string message_out; + WriteGetHugeDataRequest(ids.size(), sync_remote, false, message_out); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetHugeMetaData construct IPC msg cost:" << (end - start) + << " us." << std::endl; + + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetHugeMetaData IPC cost:" << (end - start) << " us." 
+ << std::endl; + + start = end; + size_t json_length; + json tree; + RETURN_ON_ERROR(ReadGetHugeDataReply(message_in, json_length)); + std::string json_str(json_length, 0); + + RETURN_ON_ERROR(LseekExtraMsgReadPos(0)); + RETURN_ON_ERROR(ReadExtraMsg(json_str.data(), json_length)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetHugeMetaData read reply cost:" << (end - start) << " us." + << std::endl; + start = end; + try { + tree = + json_from_buf(static_cast(json_str.data()), json_length); + } catch (std::exception& e) { + return Status::Invalid("Failed to parse json: " + std::string(e.what())); + } + auto items = tree.items(); + + std::set blob_ids; + for (auto kv = items.begin(); kv != items.end(); ++kv) { + ObjectMeta meta; + meta.SetMetaData(this, kv.value()); + metas.emplace_back(meta); + if (!fast_path) { + for (const auto& id : meta.GetBufferSet()->AllBufferIds()) { + blob_ids.emplace(id); + } + } + } + + if (!fast_path) { + std::map> buffers; + RETURN_ON_ERROR(GetBuffers(blob_ids, buffers)); + + for (auto& meta : metas) { + for (auto const id : meta.GetBufferSet()->AllBufferIds()) { + const auto& buffer = buffers.find(id); + if (buffer != buffers.end()) { + meta.SetBuffer(id, buffer->second); + } + } + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetHugeMetaData decode meta cost:" << (end - start) << " us." 
+ << std::endl; + return Status::OK(); +} + Status Client::CreateBlob(size_t size, std::unique_ptr& blob) { ENSURE_CONNECTED(this); ObjectID object_id = InvalidObjectID(); @@ -252,6 +424,117 @@ Status Client::CreateBlobs(const std::vector& sizes, return Status::OK(); } +Status Client::CreateUserBlobs( + const std::vector& offsets, const std::vector& sizes, + std::vector>& blobs, + std::string& req_flag) { + ENSURE_CONNECTED(this); + RETURN_ON_ASSERT(offsets.size() == sizes.size(), + "The number of offsets and sizes should be the same"); + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::string message_out; + WriteCreateUserBuffersRequest(offsets, sizes, message_out); + + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + + RETURN_ON_ERROR( + WriteExtraMsg(offsets.data(), offsets.size() * sizeof(uint64_t))); + RETURN_ON_ERROR(WriteExtraMsg(sizes.data(), sizes.size() * sizeof(size_t))); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". CreateUserBlobs construct IPC msg cost:" << (end - start) + << " us." << std::endl; + + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". CreateUserBlobs IPC cost:" << (end - start) << " us." 
+ << std::endl; + + start = end; + std::vector ids; + RETURN_ON_ERROR(ReadCreateUserBuffersReply(message_in, ids)); + + RETURN_ON_ERROR(LseekExtraMsgReadPos(0)); + RETURN_ON_ERROR(ReadExtraMsg(ids.data(), ids.size() * sizeof(ObjectID))); + + RETURN_ON_ASSERT(ids.size() == sizes.size(), + "The number of ids and sizes should be the same"); + for (size_t i = 0; i < ids.size(); i++) { + std::shared_ptr buffer = + std::make_shared(offsets[i], sizes[i]); + std::unique_ptr blob = std::unique_ptr( + new UserBlobBuilder(ids[i], sizes[i], offsets[i])); + blobs.emplace_back(std::move(blob)); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". CreateUserBlobs total cost:" << (end - start) << " us." + << std::endl; + return Status::OK(); +} + +Status Client::GetUserBlobs(std::vector& ids, + std::vector>& blobs) { + ENSURE_CONNECTED(this); + if (ids.empty()) { + return Status::OK(); + } + + std::string message_out; + WriteGetUserBuffersRequest(ids, message_out); + json message_in; + RETURN_ON_ERROR(doWrite(message_out)); + + std::vector payloads; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadGetUserBuffersReply(message_in, payloads)); + RETURN_ON_ASSERT(payloads.size() == ids.size(), + "The number of payloads and ids should be the same"); + for (size_t i = 0; i < ids.size(); ++i) { + std::shared_ptr blob = std::shared_ptr(new UserBlob{}); + blob->id_ = ids[i]; + blob->offset_ = payloads[i].user_offset; + blob->size_ = payloads[i].data_size; + blobs.push_back(blob); + } + + return Status::OK(); +} + +Status Client::DeleteUserBlobs(std::vector& ids, + std::string& req_flag) { + ENSURE_CONNECTED(this); + if (ids.empty()) { + return Status::OK(); + } + std::string message_out; + WriteDeleteUserBuffersRequest(ids, message_out); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + 
RETURN_ON_ERROR(WriteExtraMsg(ids.data(), ids.size() * sizeof(ObjectID))); + + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadDeleteUserBuffersReply(message_in)); + return Status::OK(); +} + Status Client::GetBlob(ObjectID const id, std::shared_ptr& blob) { return this->GetBlob(id, false, blob); } @@ -388,6 +671,329 @@ Status Client::PullNextStreamChunk(ObjectID const id, buffer->meta().GetTypeName() + "'"); } +Status Client::AbortStream(ObjectID const id, bool& success) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteAbortStreamRequest(id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadAbortStreamReply(message_in, success)); + return Status::OK(); +} + +Status Client::CheckFixedStreamReceived(ObjectID const id, int index, + bool& finished) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteCheckFixedStreamReceivedRequest(id, index, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadCheckFixedStreamReceivedReply(finished, message_in)); + return Status::OK(); +} + +Status Client::VineyardOpenRemoteFixedStream(ObjectID remote_id, + ObjectID local_id, int& fd, + int blob_nums, size_t size, + std::string remote_endpoint, + StreamOpenMode mode, bool wait, + uint64_t timeout) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteVineyardOpenRemoteFixedStreamRequest( + remote_id, "", local_id, blob_nums, size, remote_endpoint, + static_cast(mode), wait, timeout, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + ObjectID id; + RETURN_ON_ERROR(ReadVineyardOpenRemoteFixedStreamReply(message_in, id)); + + fd = recv_fd(vineyard_conn_); + return Status::OK(); +} + +Status Client::VineyardOpenRemoteFixedStream(std::string remote_stream_name, + 
ObjectID local_id, int& fd, + int blob_nums, size_t size, + std::string remote_endpoint, + StreamOpenMode mode, bool wait, + uint64_t timeout) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteVineyardOpenRemoteFixedStreamRequest( + InvalidObjectID(), remote_stream_name, local_id, blob_nums, size, + remote_endpoint, static_cast(mode), wait, timeout, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + ObjectID id; + RETURN_ON_ERROR(ReadVineyardOpenRemoteFixedStreamReply(message_in, id)); + fd = recv_fd(vineyard_conn_); + return Status::OK(); +} + +Status Client::VineyardActivateRemoteFixedStream(ObjectID local_id, + std::vector& buffers) { + ENSURE_CONNECTED(this); + std::string message_out; + std::vector blob_list; + WriteVineyardActivateRemoteFixedStreamRequest(local_id, true, blob_list, + message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + std::vector payload_list; + std::vector fds_sent, fds_recv; + RETURN_ON_ERROR(ReadVineyardActivateRemoteFixedStreamReply( + message_in, payload_list, fds_sent)); + + std::set fds_recv_set; + for (size_t i = 0; i < payload_list.size(); ++i) { + int fd_recv = shm_->PreMmap(payload_list[i].store_fd); + if (fd_recv != -1) { + fds_recv_set.emplace(fd_recv); + } + } + fds_recv.assign(fds_recv_set.begin(), fds_recv_set.end()); + + if (message_in.contains("fds") && fds_recv != fds_sent) { + json error = json::object(); + error["error"] = + "VineyardActivateRemoteFixedStream: the fds are not matched " + "between client and server"; + error["fds_sent"] = fds_sent; + error["fds_recv"] = fds_recv; + error["response"] = message_in; + return Status::Invalid(error.dump()); + } + + for (size_t i = 0; i < payload_list.size(); ++i) { + uint8_t *shared = nullptr, *dist = nullptr; + RETURN_ON_ERROR( + shm_->Mmap(payload_list[i].store_fd, payload_list[i].object_id, + payload_list[i].map_size, 
payload_list[i].data_size, + payload_list[i].data_offset, + payload_list[i].pointer - payload_list[i].data_offset, false, + false, &shared)); + dist = shared + payload_list[i].data_offset; + buffers.push_back(reinterpret_cast(dist)); + } + + return Status::OK(); +} + +Status Client::VineyardActivateRemoteFixedStream( + ObjectID local_id, std::vector& blob_list) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteVineyardActivateRemoteFixedStreamRequest(local_id, false, blob_list, + message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + std::vector payload_list; + std::vector fds_sent, fds_recv; + RETURN_ON_ERROR(ReadVineyardActivateRemoteFixedStreamReply( + message_in, payload_list, fds_sent)); + return Status::OK(); +} + +Status Client::VineyardActivateRemoteFixedStreamWithOffset( + ObjectID local_id, std::vector& offset_list) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteVineyardActivateRemoteFixedStreamWithOffsetRequest(local_id, offset_list, + message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR( + ReadVineyardActivateRemoteFixedStreamWithOffsetReply(message_in)); + return Status::OK(); +} + +Status Client::OpenFixedStream(ObjectID stream_id, StreamOpenMode mode, + int& fd) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteOpenFixedStreamRequest(stream_id, static_cast(mode), + message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadOpenFixedStreamReply(message_in)); + fd = recv_fd(vineyard_conn_); + return Status::OK(); +} + +Status Client::VineyardCloseRemoteFixedStream(ObjectID local_id) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteVineyardCloseRemoteFixedStreamRequest(local_id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + 
RETURN_ON_ERROR(ReadVineyardCloseRemoteFixedStreamReply(message_in)); + return Status::OK(); +} + +Status Client::GetVineyardMmapFd(int& fd, size_t& size, size_t& offset) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteGetVineyardMmapFdRequest(message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadGetVineyardMmapFdReply(message_in, size, offset)); + fd = recv_fd(vineyard_conn_); + return Status::OK(); +} + +Status Client::VineyardStopStream(ObjectID local_id) { + // TBD + return Status::NotImplemented("Not implemented yet"); +} + +Status Client::VineyardDropStream(ObjectID local_id) { + // TBD + return Status::NotImplemented("Not implemented yet"); +} + +Status Client::VineyardAbortRemoteStream(ObjectID local_id, bool& success) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteVineyardAbortRemoteStreamRequest(local_id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadVineyardAbortRemoteStreamReply(message_in, success)); + return Status::OK(); +} + +Status Client::VineyardGetNextFixedStreamChunk(int& index) { + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadStreamReadyAckReply(message_in, index)); + return Status::OK(); +} + +Status Client::VineyardGetMetasByNames(std::vector& names, + std::string rpc_encpoint, + std::vector& metas, + std::string req_flag) { + ENSURE_CONNECTED(this); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + + std::string message_out; + WriteVineyardGetMetasByNamesRequest(names, rpc_encpoint, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + std::vector contents; + RETURN_ON_ERROR(ReadVineyardGetMetasByNamesReply(message_in, contents)); + for (auto const& content : contents) { + ObjectMeta meta; + 
meta.SetMetaData(this, content); + metas.emplace_back(meta); + } + return Status::OK(); +} + +Status Client::VineyardGetRemoteBlobs( + std::vector> local_id_vec, + std::vector> remote_id_vec, std::string rpc_endpoint, + int& fd, std::string& req_flag) { + ENSURE_CONNECTED(this); + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::string message_out; + WriteVineyardGetRemoteBlobsWithRDMARequest(local_id_vec, remote_id_vec, + rpc_endpoint, message_out); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + logger_ << LogPrefix() << "Request: " << req_flag + << ". VineyardGetRemoteBlobsWithRDMA Construct IPC msg cost:" + << (end - start) << " us." << std::endl; + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". VineyardGetRemoteBlobsWithRDMA IPC cost:" << (end - start) + << " us." 
<< std::endl; + RETURN_ON_ERROR(ReadVineyardGetRemoteBlobsWithRDMAReply(message_in)); + + fd = recv_fd(vineyard_conn_); + return Status::OK(); +} + +Status Client::VineyardGetRemoteBlobsWithOffset( + std::vector>& local_offset_vec, + std::vector>& remote_id_vec, + std::vector>& size_vec, std::string rpc_endpoint, + int& fd, std::string& req_flag) { + ENSURE_CONNECTED(this); + uint64_t start = 0, end = 0; + size_t batch_num = 0, batch_size = 0; + batch_num = local_offset_vec.size(); + if (batch_num == 0) { + return Status::OK(); + } + batch_size = local_offset_vec[0].size(); + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::string message_out; + WriteVineyardGetRemoteBlobsWithOffsetRequest(batch_num, batch_size, + rpc_endpoint, message_out); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + for (size_t i = 0; i < batch_num; ++i) { + RETURN_ON_ERROR(WriteExtraMsg(local_offset_vec[i].data(), + local_offset_vec[i].size() * sizeof(size_t))); + RETURN_ON_ERROR(WriteExtraMsg(remote_id_vec[i].data(), + remote_id_vec[i].size() * sizeof(ObjectID))); + RETURN_ON_ERROR( + WriteExtraMsg(size_vec[i].data(), size_vec[i].size() * sizeof(size_t))); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + logger_ << LogPrefix() << "Request: " << req_flag + << ". VineyardGetRemoteBlobsWithOffset Construct IPC msg cost:" + << (end - start) << " us." << std::endl; + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". VineyardGetRemoteBlobsWithOffset IPC cost:" << (end - start) + << " us." 
<< std::endl; + RETURN_ON_ERROR(ReadVineyardGetRemoteBlobsWithOffsetReply(message_in)); + + fd = recv_fd(vineyard_conn_); + return Status::OK(); +} + std::shared_ptr Client::GetObject(const ObjectID id, const bool sync_remote) { ObjectMeta meta; @@ -965,6 +1571,32 @@ Status Client::DelData(const std::vector& ids, const bool force, return Status::OK(); } +Status Client::DelHugeData(const std::vector& ids, + const bool release_blob, const bool force, + const bool deep, std::string req_flag) { + ENSURE_CONNECTED(this); + if (release_blob) { + for (auto id : ids) { + // May contain duplicated blob ids. + VINEYARD_DISCARD(Release(id)); + } + } + + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + RETURN_ON_ERROR(WriteExtraMsg(ids.data(), ids.size() * sizeof(ObjectID))); + + std::string message_out; + WriteDelHugeDataRequest(ids.size(), force, deep, /*memory_trim*/ false, + /*fastpath=*/false, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadDelHugeDataReply(message_in)); + + return Status::OK(); +} + Status Client::GetBufferSizes(const std::set& ids, std::map& sizes) { return this->GetBufferSizes(ids, false, sizes); @@ -1064,6 +1696,19 @@ Status Client::Seal(ObjectID const& object_id) { return Status::OK(); } +Status Client::SealUserBlob(ObjectID const& object_id) { + ENSURE_CONNECTED(this); + RETURN_ON_ASSERT(IsBlob(object_id)); + std::string message_out; + WriteSealRequest(object_id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadSealReply(message_in)); + return Status::OK(); +} + Status Client::ShallowCopy(ObjectID const id, ObjectID& target_id, Client& source_client) { ENSURE_CONNECTED(this); @@ -1150,6 +1795,194 @@ Status Client::ShallowCopy(PlasmaID const plasma_id, ObjectID& target_id, return Status::OK(); } +Status Client::GetObjectLocation(const 
std::vector& names, + std::vector>& locations, + std::string& req_flag) { + ENSURE_CONNECTED(this); + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::string message_out; + WriteGetObjectLocationRequest(names, message_out); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetObjectLocation prepare cost:" << (end - start) << " us." + << std::endl; + + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetObjectLocation IPC cost:" << (end - start) << " us." + << std::endl; + + start = end; + std::vector> location_vector; + RETURN_ON_ERROR(ReadGetObjectLocationReply(message_in, location_vector)); + for (size_t i = 0; i < location_vector.size(); i++) { + std::set location_set; + for (auto const& location : location_vector[i]) { + location_set.insert(location); + } + locations.push_back(location_set); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetObjectLocation parse cost:" << (end - start) << " us." 
+ << std::endl; + + return Status::OK(); +} + +Status Client::PutObjectLocation(const std::vector& names, + const std::vector& locations, + int ttl_seconds, std::string& req_flag) { + ENSURE_CONNECTED(this); + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + std::string message_out; + WritePutObjectLocationRequest(names, locations, ttl_seconds, message_out); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". PutObjectLocation prepare cost:" << (end - start) << " us." + << std::endl; + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". PutObjectLocation IPC cost:" << (end - start) << " us." 
+ << std::endl; + RETURN_ON_ERROR(ReadPutObjectLocationReply(message_in)); + return Status::OK(); +} + +Status Client::PutNames(const std::vector& ids, + const std::vector& names, + std::string& req_flag, const bool overwrite) { + ENSURE_CONNECTED(this); + RETURN_ON_ASSERT(ids.size() == names.size(), + "ids and names should have the same size"); + uint64_t start = 0, end = 0; + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + + std::string message_out; + WritePutNamesRequest(ids, names, overwrite, message_out); + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". PutNames IPC cost:" << (end - start) << " us" << std::endl; + RETURN_ON_ERROR(ReadPutNamesReply(message_in)); + return Status::OK(); +} + +Status Client::GetNames(const std::vector& name_vec, + std::vector& id_vec, std::string& req_flag) { + ENSURE_CONNECTED(this); + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + std::string message_out; + WriteGetNamesRequest(name_vec, false, message_out); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetNames write msg cost:" << (end - start) << " us." 
+ << std::endl; + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetNames IPC cost:" << (end - start) << " us." << std::endl; + start = end; + RETURN_ON_ERROR(ReadGetNamesReply(message_in, id_vec)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". GetNames parse cost:" << (end - start) << " us." << std::endl; + return Status::OK(); +} + +Status Client::DropNames(std::vector& names, + std::string& req_flag) { + ENSURE_CONNECTED(this); + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::string message_out; + WriteDropNamesRequest(names, message_out); + std::vector name_lengths(names.size()); + for (size_t i = 0; i < names.size(); ++i) { + name_lengths[i] = names[i].size(); + } + + RETURN_ON_ERROR(LseekExtraMsgWritePos(0)); + RETURN_ON_ERROR(AttachReqFlag(req_flag)); + RETURN_ON_ERROR( + WriteExtraMsg(name_lengths.data(), name_lengths.size() * sizeof(size_t))); + + for (size_t i = 0; i < names.size(); ++i) { + RETURN_ON_ERROR(WriteExtraMsg(names[i].data(), names[i].size())); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". DropNames prepare msg cost:" << (end - start) << " us." + << std::endl; + start = end; + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + logger_ << LogPrefix() << "Request: " << req_flag + << ". DropNames IPC cost:" << (end - start) << " us." 
<< std::endl; + RETURN_ON_ERROR(ReadDropNamesReply(message_in)); + return Status::OK(); +} + Status Client::IsInUse(ObjectID const& id, bool& is_in_use) { ENSURE_CONNECTED(this); @@ -1203,6 +2036,108 @@ Status Client::TryReleaseLock(std::string key, bool& result) { return Status::OK(); } +Status Client::RequireExtraRequestMemory(size_t size) { + ENSURE_CONNECTED(this); + + std::string message_out; + WriteRequireExtraRequestMemoryRequest(size, message_out); + VINEYARD_CHECK_OK(doWrite(message_out)); + + json message_in; + VINEYARD_CHECK_OK(doRead(message_in)); + VINEYARD_CHECK_OK(ReadRequireExtraRequestMemoryReply(message_in)); + + int fd = recv_fd(vineyard_conn_); + void* mmap_addr = + mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mmap_addr == MAP_FAILED) { + return Status::IOError("mmap failed"); + } + extra_request_memory_addr_ = mmap_addr; + extra_request_mem_size_ = size; + + std::cout << "Extra request memory addr: " << extra_request_memory_addr_ + << ", size: " << extra_request_mem_size_ << std::endl; + close(fd); + return Status::OK(); +} + +Status Client::WriteExtraMsg(const void* data, size_t size) { + if (!extra_request_memory_addr_ || size > extra_request_mem_size_) { + return Status::Invalid("Invalid extra request memory"); + } + if (write_pos_ + size > extra_request_mem_size_) { + return Status::Invalid("Invalid extra request memory"); + } + memory::concurrent_memcpy( + static_cast(extra_request_memory_addr_) + write_pos_, data, size); + write_pos_ += size; + return Status::OK(); +} + +Status Client::ReadExtraMsg(void* data, size_t size) { + if (!extra_request_memory_addr_ || size > extra_request_mem_size_) { + return Status::Invalid("Invalid extra request memory"); + } + if (read_pos_ + size > extra_request_mem_size_) { + return Status::Invalid("Invalid extra request memory"); + } + memory::concurrent_memcpy( + data, static_cast(extra_request_memory_addr_) + read_pos_, size); + read_pos_ += size; + return Status::OK(); +} + 
+Status Client::LseekExtraMsgWritePos(uint64_t offset) { + if (!extra_request_memory_addr_ || offset > extra_request_mem_size_) { + return Status::Invalid("Invalid extra request memory"); + } + write_pos_ = offset; + return Status::OK(); +} + +Status Client::LseekExtraMsgReadPos(uint64_t offset) { + if (!extra_request_memory_addr_ || offset > extra_request_mem_size_) { + return Status::Invalid("Invalid extra request memory"); + } + read_pos_ = offset; + return Status::OK(); +} + +Status Client::AttachReqFlag(const std::string& req_flag) { + std::string attr_str = + ClientAttributes::Default().SetReqName(req_flag).ToJsonString(); + size_t length = attr_str.length(); + RETURN_ON_ERROR(WriteExtraMsg(&length, sizeof(size_t))); + RETURN_ON_ERROR(WriteExtraMsg(attr_str.data(), length)); + return Status::OK(); +} + +std::string Client::LogPrefix() { + auto now = std::chrono::system_clock::now(); + auto now_time_t = std::chrono::system_clock::to_time_t(now); + auto now_tm = *std::localtime(&now_time_t); + + auto time_since_epoch = now.time_since_epoch(); + auto microseconds = + std::chrono::duration_cast(time_since_epoch) + .count() % + 1000000; + + std::stringstream log_prefix_ss; + log_prefix_ss << 'I'; + log_prefix_ss << std::put_time(&now_tm, "%Y%m%d %H:%M:%S"); + log_prefix_ss << '.' << std::setw(6) << std::setfill('0') << microseconds; + log_prefix_ss << ' ' << gettid(); + std::string file_name = (__FILE__); + size_t pos = file_name.find_last_of('/'); + if (pos != std::string::npos) { + file_name = file_name.substr(pos + 1); + } + log_prefix_ss << ' ' << file_name << ':' << __LINE__ << "] "; + return log_prefix_ss.str(); +} + PlasmaClient::~PlasmaClient() {} // dummy implementation diff --git a/src/client/client.h b/src/client/client.h index c96cb27d7..d84ef0c44 100644 --- a/src/client/client.h +++ b/src/client/client.h @@ -32,14 +32,20 @@ limitations under the License. 
#include "common/util/lifecycle.h" #include "common/util/protocols.h" #include "common/util/status.h" +#include "common/util/trace.h" #include "common/util/uuid.h" +#include "thread-pool/thread_pool.h" + namespace vineyard { class Blob; class BlobWriter; +class UserBlob; +class UserBlobBuilder; class Buffer; class MutableBuffer; +class UserBuffer; namespace detail { @@ -248,10 +254,10 @@ class PlasmaClient; * vineyard server. Vineyard's IPC Client talks to vineyard server * and manipulate objects in vineyard. */ -class Client final : public BasicIPCClient, - protected detail::UsageTracker { +class Client : public BasicIPCClient, + protected detail::UsageTracker { public: - Client() {} + Client() { logger_ = Logger(stoi(VineyardEnv::GetVineyardTraceLogLevel())); } ~Client() override; @@ -357,6 +363,9 @@ class Client final : public BasicIPCClient, Status FetchAndGetMetaData(const ObjectID id, ObjectMeta& meta_data, const bool sync_remote = false); + Status CreateHugeMetaData(std::vector& meta_datas, + std::vector& ids, std::string& req_flag); + /** * @brief Obtain multiple metadatas from vineyard server. * @@ -371,6 +380,11 @@ class Client final : public BasicIPCClient, Status GetMetaData(const std::vector& ids, std::vector&, const bool sync_remote = false); + Status GetHugeMetaData(const std::vector& ids, + std::vector& metas, std::string& req_flag, + const bool sync_remote = false, + bool fast_path = false); + /** * @brief Create a blob in vineyard server. 
When creating a blob, vineyard * server's bulk allocator will prepare a block of memory of the requested @@ -398,6 +412,15 @@ class Client final : public BasicIPCClient, Status CreateBlobs(const std::vector& sizes, std::vector>& blobs); + Status CreateUserBlobs(const std::vector& offsets, + const std::vector& sizes, + std::vector>& blobs, + std::string& req_flag); + + Status GetUserBlobs(std::vector& ids, + std::vector>& blobs); + + Status DeleteUserBlobs(std::vector& ids, std::string& req_flag); /** * @brief Get a blob from vineyard server. * @@ -489,6 +512,62 @@ class Client final : public BasicIPCClient, */ Status PullNextStreamChunk(ObjectID const id, std::unique_ptr& chunk); + Status AbortStream(ObjectID const id, bool& success); + + Status VineyardOpenRemoteFixedStream(ObjectID remote_id, ObjectID local_id, + int& fd, int blob_nums, size_t size, + std::string remote_endpoint, + StreamOpenMode mode, bool wait = false, + uint64_t timeout = 0); + + Status VineyardOpenRemoteFixedStream(std::string remote_stream_name, + ObjectID local_id, int& fd, + int blob_nums, size_t size, + std::string remote_endpoint, + StreamOpenMode mode, bool wait = false, + uint64_t timeout = 0); + + Status VineyardActivateRemoteFixedStream(ObjectID local_id, + std::vector& buffers); + + Status VineyardActivateRemoteFixedStream(ObjectID local_id, + std::vector& buffer_list); + + Status VineyardActivateRemoteFixedStreamWithOffset( + ObjectID local_id, std::vector& offset_list); + + Status VineyardCloseRemoteFixedStream(ObjectID local_id); + + Status OpenFixedStream(ObjectID stream_id, StreamOpenMode mode, int& fd); + + Status VineyardStopStream(ObjectID local_id); + + Status VineyardDropStream(ObjectID local_id); + + Status VineyardAbortRemoteStream(ObjectID local_id, bool& success); + + Status VineyardGetNextFixedStreamChunk(int& index); + + Status VineyardGetMetasByNames(std::vector& names, + std::string rpc_encpoint, + std::vector& metas, + std::string req_flag); + + virtual Status 
VineyardGetRemoteBlobs( + std::vector> local_id_vec, + std::vector> remote_id_vec, + std::string rpc_endpoint, int& fd, std::string& req_flag); + + virtual Status VineyardGetRemoteBlobsWithOffset( + std::vector>& local_offset_vec, + std::vector>& remote_id_vec, + std::vector>& size_vec, std::string rpc_endpoint, + int& fd, std::string& req_flag); + + Status GetVineyardMmapFd(int& fd, size_t& size, size_t& offset); + + Status CheckFixedStreamReceived(ObjectID const id, int index, bool& finished); + /** * @brief Get an object from vineyard. The ObjectFactory will be used to * resolve the constructor of the object. @@ -777,6 +856,23 @@ class Client final : public BasicIPCClient, Status ShallowCopy(PlasmaID const plasma_id, ObjectID& target_id, PlasmaClient& source_client); + Status GetObjectLocation(const std::vector& names, + std::vector>& locations, + std::string& req_flag); + + Status PutObjectLocation(const std::vector& names, + const std::vector& locations, + int ttl_seconds, std::string& req_flag); + + Status PutNames(const std::vector& ids, + const std::vector& names, std::string& req_flag, + const bool overwrite = true); + + Status GetNames(const std::vector& name_vec, + std::vector& id_vec, std::string& req_flag); + + Status DropNames(std::vector& names, std::string& req_flag); + /** * @brief Decrease the reference count of the object. It will trigger * `OnRelease` behavior when reference count reaches zero. See UsageTracker. @@ -824,6 +920,10 @@ class Client final : public BasicIPCClient, Status DelData(const std::vector& ids, const bool force, const bool deep, const bool memory_trim); + Status DelHugeData(const std::vector& ids, + const bool release_blob = true, const bool force = false, + const bool deep = true, std::string req_flag = ""); + /** * @brief Create a GPU buffer on vineyard server. See also `CreateBuffer`. 
* @@ -877,6 +977,8 @@ class Client final : public BasicIPCClient, */ Status TryReleaseLock(std::string key, bool& result) override; + Status RequireExtraRequestMemory(size_t size); + protected: /** * @brief Required by `UsageTracker`. When reference count reaches zero, send @@ -968,6 +1070,8 @@ class Client final : public BasicIPCClient, */ Status Seal(ObjectID const& object_id); + Status SealUserBlob(ObjectID const& object_id); + /** * @brief Check if the client is an IPC client. * @@ -982,8 +1086,28 @@ class Client final : public BasicIPCClient, Status GetBufferSizes(const std::set& ids, const bool unsafe, std::map& sizes); + Status WriteExtraMsg(const void* data, size_t size); + + Status ReadExtraMsg(void* data, size_t size); + + Status LseekExtraMsgWritePos(uint64_t offset); + + Status LseekExtraMsgReadPos(uint64_t offset); + + Status AttachReqFlag(const std::string& req_flag); + + std::string LogPrefix(); + + void* extra_request_memory_addr_ = nullptr; + size_t extra_request_mem_size_ = 0; + uint64_t write_pos_ = 0; + uint64_t read_pos_ = 0; + Logger logger_; + friend class Blob; friend class BlobWriter; + friend class UserBlob; + friend class UserBlobBuilder; friend class ObjectBuilder; friend class detail::UsageTracker; }; diff --git a/src/client/client_base.cc b/src/client/client_base.cc index 8ce89390f..a1f69ef65 100644 --- a/src/client/client_base.cc +++ b/src/client/client_base.cc @@ -13,10 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "client/client_base.h" - #include +#include + +#include "client/client_base.h" #include "client/ds/i_object.h" #include "client/ds/object_factory.h" #include "client/io.h" @@ -260,14 +261,50 @@ Status ClientBase::CreateStream(const ObjectID& id) { return Status::OK(); } +Status ClientBase::CreateFixedStream(ObjectID& id, std::string stream_name, + int blob_num, size_t blob_size) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteCreateFixedStreamRequest(stream_name, blob_num, blob_size, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadCreateFixedStreamReply(message_in, id)); + return Status::OK(); +} + +Status ClientBase::PutStreamName(const ObjectID& id, const std::string& name) { + std::string message_out; + WritePutStreamNameRequest(id, name, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadPutStreamNameReply(message_in)); + return Status::OK(); +} + +Status ClientBase::GetStreamIDByName(std::string name, ObjectID& id) { + std::string message_out; + WriteGetStreamIDByNameRequest(name, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadGetStreamIDByNameReply(message_in, id)); + return Status::OK(); +} + Status ClientBase::OpenStream(const ObjectID& id, StreamOpenMode mode) { ENSURE_CONNECTED(this); + ObjectID ret_id; std::string message_out; - WriteOpenStreamRequest(id, static_cast(mode), message_out); + WriteOpenStreamRequest(id, "", static_cast(mode), false, 0, + message_out); RETURN_ON_ERROR(doWrite(message_out)); json message_in; RETURN_ON_ERROR(doRead(message_in)); - RETURN_ON_ERROR(ReadOpenStreamReply(message_in)); + RETURN_ON_ERROR(ReadOpenStreamReply(message_in, ret_id)); return Status::OK(); } @@ -283,6 +320,40 @@ Status ClientBase::PushNextStreamChunk(ObjectID const id, 
return Status::OK(); } +Status ClientBase::PushNextStreamChunkByOffset(ObjectID const id, + uint64_t offset) { + ENSURE_CONNECTED(this); + std::string message_out; + WritePushNextStreamChunkByOffsetRequest(id, offset, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadPushNextStreamChunkByOffsetReply(message_in)); + return Status::OK(); +} + +Status ClientBase::CloseStream(ObjectID id) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteCloseStreamRequest(id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadCloseStreamReply(message_in)); + return Status::OK(); +} + +Status ClientBase::DeleteStream(ObjectID id) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteDeleteStreamRequest(id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadDeleteStreamReply(message_in)); + return Status::OK(); +} + Status ClientBase::PullNextStreamChunk(ObjectID const id, ObjectID& chunk) { ENSURE_CONNECTED(this); std::string message_out; @@ -346,6 +417,17 @@ Status ClientBase::Persist(const ObjectID id) { return Status::OK(); } +Status ClientBase::Persist(const std::vector& ids) { + ENSURE_CONNECTED(this); + std::string message_out; + WriteBatchPersistRequest(ids, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadBatchPersistReply(message_in)); + return Status::OK(); +} + Status ClientBase::IfPersist(const ObjectID id, bool& persist) { ENSURE_CONNECTED(this); std::string message_out; @@ -391,10 +473,11 @@ Status ClientBase::ShallowCopy(const ObjectID id, json const& extra_metadata, return Status::OK(); } -Status ClientBase::PutName(const ObjectID id, std::string const& name) { +Status ClientBase::PutName(const ObjectID id, 
std::string const& name, + bool overwrite) { ENSURE_CONNECTED(this); std::string message_out; - WritePutNameRequest(id, name, message_out); + WritePutNameRequest(id, name, overwrite, message_out); RETURN_ON_ERROR(doWrite(message_out)); json message_in; RETURN_ON_ERROR(doRead(message_in)); diff --git a/src/client/client_base.h b/src/client/client_base.h index 34955a341..d6e484719 100644 --- a/src/client/client_base.h +++ b/src/client/client_base.h @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -248,6 +249,13 @@ class ClientBase { */ Status CreateStream(const ObjectID& id); + Status CreateFixedStream(ObjectID& id, std::string stream_name, int blob_num, + size_t blob_size); + + Status PutStreamName(const ObjectID& id, const std::string& name); + + Status GetStreamIDByName(std::string name, ObjectID& id); + /** * @brief open a stream on vineyard. Failed if the stream is already opened on * the given mode. @@ -272,6 +280,12 @@ class ClientBase { */ Status PushNextStreamChunk(ObjectID const id, ObjectID const chunk); + Status PushNextStreamChunkByOffset(ObjectID const id, uint64_t offset); + + Status CloseStream(ObjectID id); + + Status DeleteStream(ObjectID id); + /** * @brief Pull a chunk from a stream. When there's no more chunk available in * the stream, i.e., the stream has been stopped, a status code @@ -341,6 +355,8 @@ class ClientBase { */ Status Persist(const ObjectID id); + Status Persist(const std::vector& ids); + /** * @brief Check if the given object has been persist to etcd. * @@ -403,7 +419,8 @@ class ClientBase { * * @return Status that indicates whether the request has succeeded. */ - Status PutName(const ObjectID id, std::string const& name); + Status PutName(const ObjectID id, std::string const& name, + bool overwrite = true); /** * @brief Retrieve the object ID by associated name. 
@@ -657,11 +674,11 @@ class ClientBase { int get_timeout_seconds() const { return timeout_seconds_; } protected: - Status doWrite(const std::string& message_out); + virtual Status doWrite(const std::string& message_out); - Status doRead(std::string& message_in); + virtual Status doRead(std::string& message_in); - Status doRead(json& root); + virtual Status doRead(json& root); mutable bool connected_; std::string ipc_socket_; diff --git a/src/client/ds/blob.cc b/src/client/ds/blob.cc index 86d6eb02b..1aae3dc8d 100644 --- a/src/client/ds/blob.cc +++ b/src/client/ds/blob.cc @@ -345,6 +345,70 @@ Status BlobWriter::_Seal(Client& client, std::shared_ptr& object) { return Status::OK(); } +void UserBlob::Construct(ObjectMeta const& meta) { + std::string __type_name = type_name(); + VINEYARD_ASSERT(meta.GetTypeName() == __type_name, + "Expect typename '" + __type_name + "', but got '" + + meta.GetTypeName() + "'"); + this->meta_ = meta; + this->id_ = meta.GetId(); + if (this->buffer_ != nullptr) { + return; + } + if (this->id_ == EmptyBlobID()) { + this->size_ = 0; + return; + } + if (!meta.IsLocal()) { + return; + } + auto buffer = std::dynamic_pointer_cast(this->buffer_); + if (meta.GetBuffer(meta.GetId(), buffer).ok()) { + if (this->buffer_ == nullptr) { + throw std::runtime_error( + "Blob::Construct(): Invalid internal state: local blob found but it " + "is nullptr: " + + ObjectIDToString(meta.GetId())); + } + this->size_ = this->buffer_->size(); + } else { + throw std::runtime_error( + "Blob::Construct(): Invalid internal state: failed to construct local " + "blob since payload is missing: " + + ObjectIDToString(meta.GetId())); + } +} + +Status UserBlobBuilder::_Seal(Client& client, std::shared_ptr& object) { + RETURN_ON_ASSERT(!this->sealed(), + "The user blob builder has been already sealed."); + auto buffer = std::make_shared(this->offset_, this->size_); + + std::shared_ptr blob(new UserBlob()); + object = blob; + + blob->id_ = object_id_; + blob->size_ = size(); + 
blob->meta_.SetId(object_id_); // blob's id is the address + + // create meta in vineyardd + blob->meta_.SetTypeName(type_name()); + blob->meta_.AddKeyValue("length", size()); + blob->meta_.SetNBytes(size()); + blob->meta_.AddKeyValue("instance_id", client.instance_id()); + blob->meta_.AddKeyValue("transient", true); + + blob->buffer_ = buffer; // assign the readonly buffer. + + RETURN_ON_ERROR(client.SealUserBlob(object_id_)); + // associate extra key-value metadata + for (auto const& kv : metadata_) { + blob->meta_.AddKeyValue(kv.first, kv.second); + } + this->set_sealed(true); + return Status::OK(); +} + Status BufferSet::EmplaceBuffer(ObjectID const id) { auto p = buffers_.find(id); if (p != buffers_.end() && p->second != nullptr) { diff --git a/src/client/ds/blob.h b/src/client/ds/blob.h index 4e9b7e75c..c6eb71bf1 100644 --- a/src/client/ds/blob.h +++ b/src/client/ds/blob.h @@ -104,6 +104,21 @@ class Buffer { Buffer& operator=(Buffer&&) = delete; }; +class UserBuffer : public Buffer { + public: + UserBuffer(size_t offset, const int64_t size) : Buffer(0, size, true) { + is_mutable_ = false; + offset_ = offset; + } + + size_t offset() const { return offset_; } + + protected: + UserBuffer() : Buffer(nullptr, 0, true) {} + + size_t offset_ = 0; +}; + class MutableBuffer : public Buffer { public: MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) { @@ -416,6 +431,8 @@ class BlobWriter : public ObjectBuilder { void Dump() const; protected: + BlobWriter() = default; + Status _Seal(Client& client, std::shared_ptr& object) override; private: @@ -438,6 +455,62 @@ class BlobWriter : public ObjectBuilder { friend class RPCClient; }; +class UserBlob : public Registered { + public: + size_t size() const { return size_; } + + const size_t offset() const { return offset_; } + + void Construct(ObjectMeta const& meta) override; + + static std::unique_ptr Create() __attribute__((used)) { + return std::static_pointer_cast( + std::unique_ptr{new UserBlob()}); + 
} + + private: + UserBlob() { + this->id_ = InvalidObjectID(); + this->size_ = std::numeric_limits::max(); + this->offset_ = 0; + } + + size_t size_ = 0; + size_t offset_ = 0; + std::shared_ptr buffer_ = nullptr; + + friend class Client; + friend class UserBlobBuilder; + friend class ObjectMeta; +}; + +class UserBlobBuilder : public ObjectBuilder { + public: + uint64_t offset() const { return offset_; } + + size_t size() const { return size_; } + + ObjectID id() const { return object_id_; } + + protected: + Status _Seal(Client& client, std::shared_ptr& object) override; + + private: + Status Build(Client& client) override { return Status::OK(); }; + + UserBlobBuilder(ObjectID const object_id, const size_t size, + const size_t offset) + : object_id_(object_id), size_(size), offset_(offset) {} + + ObjectID object_id_; + std::shared_ptr buffer_; + uint64_t size_ = 0; + uint64_t offset_ = 0; + std::unordered_map metadata_; + + friend class Client; +}; + /** * @brief A set of (readonly) buffers that been associated with an object and * its members (recursively). 
diff --git a/src/client/ds/object_meta.h b/src/client/ds/object_meta.h index 3928cc399..bf554dc48 100644 --- a/src/client/ds/object_meta.h +++ b/src/client/ds/object_meta.h @@ -814,6 +814,8 @@ class ObjectMeta { friend class Blob; friend class RemoteBlob; friend class BlobWriter; + friend class UserBlob; + friend class UserBlobBuilder; }; template <> diff --git a/src/client/ds/stream.h b/src/client/ds/stream.h index 487e8b84c..82048e844 100644 --- a/src/client/ds/stream.h +++ b/src/client/ds/stream.h @@ -39,6 +39,9 @@ struct stream_type { template using stream_type_t = typename stream_type::type; +#define STREAM_PAGE_SIZE 4096 +#define STREAM_ERROR_LENGTH 256 + template class Stream : public Object { public: diff --git a/src/client/rpc_client.cc b/src/client/rpc_client.cc index 0ed089a8a..83e1e6528 100644 --- a/src/client/rpc_client.cc +++ b/src/client/rpc_client.cc @@ -1051,6 +1051,7 @@ Status RPCClient::GetRemoteBlobs( Status RPCClient::doReleaseBlobsWithRDMARequest( std::unordered_set id_set) { + ENSURE_CONNECTED(this); std::string message_out; WriteReleaseBlobsWithRDMARequest(id_set, message_out); RETURN_ON_ERROR(doWrite(message_out)); diff --git a/src/common/memory/payload.cc b/src/common/memory/payload.cc index 84a366017..0acbcba12 100644 --- a/src/common/memory/payload.cc +++ b/src/common/memory/payload.cc @@ -24,6 +24,7 @@ Payload::Payload() store_fd(-1), arena_fd(-1), data_offset(0), + user_offset(0), data_size(0), map_size(0), ref_cnt(0), @@ -41,6 +42,7 @@ Payload::Payload(ObjectID object_id, int64_t size, uint8_t* ptr, int fd, store_fd(fd), arena_fd(-1), data_offset(offset), + user_offset(0), data_size(size), map_size(msize), ref_cnt(0), @@ -58,6 +60,7 @@ Payload::Payload(ObjectID object_id, int64_t size, uint8_t* ptr, int fd, store_fd(fd), arena_fd(arena_fd), data_offset(offset), + user_offset(0), data_size(size), map_size(msize), ref_cnt(0), @@ -74,6 +77,8 @@ Payload::Payload(const Payload& payload) { store_fd = payload.store_fd; arena_fd = 
payload.arena_fd; data_offset = payload.data_offset; + user_offset = payload.user_offset; + is_user_created = payload.is_user_created; data_size = payload.data_size; map_size = payload.map_size; ref_cnt = payload.ref_cnt; @@ -90,6 +95,8 @@ Payload& Payload::operator=(const Payload& payload) { store_fd = payload.store_fd; arena_fd = payload.arena_fd; data_offset = payload.data_offset; + user_offset = payload.user_offset; + is_user_created = payload.is_user_created; data_size = payload.data_size; map_size = payload.map_size; ref_cnt = payload.ref_cnt; @@ -122,6 +129,7 @@ void Payload::ToJSON(json& tree) const { tree["object_id"] = object_id; tree["store_fd"] = store_fd; tree["data_offset"] = data_offset; + tree["user_offset"] = user_offset; tree["data_size"] = data_size; tree["map_size"] = map_size; tree["pointer"] = reinterpret_cast(pointer); @@ -134,6 +142,7 @@ void Payload::FromJSON(const json& tree) { object_id = tree["object_id"].get(); store_fd = tree["store_fd"].get(); data_offset = tree["data_offset"].get(); + user_offset = tree["user_offset"].get(); data_size = tree["data_size"].get(); map_size = tree["map_size"].get(); pointer = reinterpret_cast(tree["pointer"].get()); @@ -160,6 +169,7 @@ void PlasmaPayload::ToJSON(json& tree) const { tree["plasma_size"] = plasma_size; tree["store_fd"] = store_fd; tree["data_offset"] = data_offset; + tree["user_offset"] = user_offset; tree["data_size"] = data_size; tree["map_size"] = map_size; tree["ref_cnt"] = ref_cnt; @@ -174,6 +184,7 @@ void PlasmaPayload::FromJSON(const json& tree) { plasma_size = tree["plasma_size"].get(); store_fd = tree["store_fd"].get(); data_offset = tree["data_offset"].get(); + user_offset = tree["user_offset"].get(); data_size = tree["data_size"].get(); map_size = tree["map_size"].get(); ref_cnt = tree["ref_cnt"].get(); diff --git a/src/common/memory/payload.h b/src/common/memory/payload.h index b327fe00c..78ba8f5b3 100644 --- a/src/common/memory/payload.h +++ b/src/common/memory/payload.h @@ 
-35,6 +35,7 @@ struct Payload { int store_fd; int arena_fd; ptrdiff_t data_offset; + uint64_t user_offset; int64_t data_size; int64_t map_size; int64_t ref_cnt; @@ -42,7 +43,9 @@ struct Payload { bool is_sealed; bool is_owner; bool is_spilled; - bool is_gpu; // indicate if the blob is on the GPU + bool is_gpu; // indicate if the blob is on the GPU + bool is_user_created; // if the object pointer is created by user(means that + // the object is not created by vineyard) std::atomic_int pinned; // indicate if the blob is spillable @@ -85,6 +88,8 @@ struct Payload { inline bool IsGPU() { return is_gpu; } + bool IsUserCreated() { return is_user_created; } + /** * @brief Pin the payload, return true is the payload is already pinned. */ diff --git a/src/common/util/env.cc b/src/common/util/env.cc index 0efd16298..28b60889e 100644 --- a/src/common/util/env.cc +++ b/src/common/util/env.cc @@ -354,4 +354,19 @@ int64_t read_physical_memory_limit() { return limit_in_bytes; } +std::string VineyardEnv::GetVineyardTraceLogLevel() { + static std::string log_level = read_env("VINEYARD_TRACE_LOG_LEVEL", "100"); + return log_level; +} + +std::string VineyardEnv::GetVineyardStreamIdleTimeoutMS() { + static std::string timeout = read_env("VINEYARD_STREAM_IDLE_TIMEOUT_MS", "0"); + return timeout; +} + +std::string VineyardEnv::GetVineyardRPCCallTimeoutMS() { + static std::string timeout = read_env("VINEYARD_RPC_TIMEOUT_MS", "5000"); + return timeout; +} + } // namespace vineyard diff --git a/src/common/util/env.h b/src/common/util/env.h index fb1229ab6..ae293f63c 100644 --- a/src/common/util/env.h +++ b/src/common/util/env.h @@ -153,6 +153,15 @@ int64_t parse_memory_size(std::string const& nbytes); */ int64_t read_physical_memory_limit(); +class VineyardEnv { + public: + static std::string GetVineyardTraceLogLevel(); + + static std::string GetVineyardStreamIdleTimeoutMS(); + + static std::string GetVineyardRPCCallTimeoutMS(); +}; + } // namespace vineyard #endif // 
SRC_COMMON_UTIL_ENV_H_ diff --git a/src/common/util/get_tid.h b/src/common/util/get_tid.h new file mode 100644 index 000000000..2fed1869a --- /dev/null +++ b/src/common/util/get_tid.h @@ -0,0 +1,32 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SRC_COMMON_UTIL_GET_TID_H_ +#define SRC_COMMON_UTIL_GET_TID_H_ + +#include + +namespace vineyard { + +#if defined(__linux__) +#include +static pid_t gettid(void) { return syscall(SYS_gettid); } +#else +static pid_t gettid(void) { return 0; } +#endif + +} // namespace vineyard + +#endif // SRC_COMMON_UTIL_GET_TID_H_ diff --git a/src/common/util/json.h b/src/common/util/json.h index 07e91d73e..a9e27d6ec 100644 --- a/src/common/util/json.h +++ b/src/common/util/json.h @@ -224,6 +224,11 @@ inline std::string unescape_json_pointer(std::string& s) { return s; } +inline json json_from_buf(const void* buf, size_t request_length) { + return json::parse(reinterpret_cast(buf), + reinterpret_cast(buf) + request_length); +} + } // namespace vineyard #endif // SRC_COMMON_UTIL_JSON_H_ diff --git a/src/common/util/monitor.h b/src/common/util/monitor.h new file mode 100644 index 000000000..d20975963 --- /dev/null +++ b/src/common/util/monitor.h @@ -0,0 +1,201 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
/** Copyright 2020-2023 Alibaba Group Holding Limited.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef SRC_COMMON_UTIL_MONITOR_H_
#define SRC_COMMON_UTIL_MONITOR_H_

#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#ifdef ENABLE_VINEYARD_MONITOR

#define MONITOR_START(__monitor) __monitor.StartTick();
#define MONITOR_END(__monitor) __monitor.EndTick()
#define DUMP_MONITOR(__monitors) __monitors.Dump()
#define DUMP_MONITOR_HEADER() vineyard::monitor::Monitor::DumpHeader()
#define MONITOR_AUTO(__monitor) \
  vineyard::monitor::MonitorGuard monitor##guard(__monitor)

#define MONITOR_CLEAR(__monitor, __name, __unit) \
  do {                                           \
    __monitor.Clear();                           \
    __monitor.SetName(__name);                   \
    __monitor.SetUnit(__unit);                   \
  } while (0)

#else

// When monitoring is disabled all macros expand to nothing.
#define MONITOR_START(__monitor)
#define MONITOR_END(__monitor)
#define DUMP_MONITOR(__monitors)
#define DUMP_MONITOR_HEADER()
#define MONITOR_AUTO(__monitor)
#define MONITOR_CLEAR(__monitor, __name, __unit)

#endif

namespace vineyard {

namespace monitor {

enum UNIT { MICROSECONDS, MILLISECONDS, NANOSECONDS, SECONDS };

/**
 * Collects wall-clock intervals between StartTick()/EndTick() pairs and
 * prints simple latency statistics (count/avg/min/max/p50/p99/total) via
 * Dump().
 *
 * FIX: the original implementation called mutex_.lock() in StartTick() and
 * mutex_.unlock() in EndTick(), holding the lock across the measured region.
 * Unlocking a std::mutex from a thread other than the locker is undefined
 * behaviour, and Dump()/Clear() while a tick was in flight would deadlock.
 * Internal state is now protected by a scoped lock inside each call.
 */
class Monitor {
 public:
  Monitor(std::string name, UNIT unit) : name_(std::move(name)), unit_(unit) {}

  Monitor() : name_("default_monitor"), unit_(MICROSECONDS) {}

  // Record the start of an interval. A second StartTick() before the
  // matching EndTick() is ignored.
  void StartTick() {
    std::lock_guard<std::mutex> guard(mutex_);
    if (started_) {
      return;
    }
    started_ = true;
    last_time_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::system_clock::now().time_since_epoch());
  }

  // Record the end of an interval; a no-op without a preceding StartTick().
  void EndTick() {
    std::lock_guard<std::mutex> guard(mutex_);
    if (!started_) {
      return;
    }
    auto now = std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::system_clock::now().time_since_epoch());
    timestamp_.push_back(now - last_time_);
    last_time_ = now;
    started_ = false;
  }

  // Print the column header matching the rows emitted by Dump().
  static void DumpHeader() {
    std::cout << "Dump Monitor Info:" << std::endl;
    const int name_width = 20;
    const int count_width = 10;
    const int time_width = 18;
    std::cout << std::left << std::setw(name_width) << "Name"
              << std::setw(count_width) << "Count" << std::setw(time_width)
              << "Avg" << std::setw(time_width) << "Min"
              << std::setw(time_width) << "Max" << std::setw(time_width)
              << "P50" << std::setw(time_width) << "P99"
              << std::setw(time_width) << "Total" << std::endl;
  }

  // Sort the recorded samples and print one statistics row; does nothing
  // when no sample has been recorded yet.
  void Dump() {
    std::lock_guard<std::mutex> guard(mutex_);
    if (timestamp_.empty()) {
      return;
    }
    std::sort(timestamp_.begin(), timestamp_.end());
    size_t count = timestamp_.size();
    std::chrono::nanoseconds total = std::chrono::nanoseconds::zero();
    for (const auto& t : timestamp_) {
      total += t;
    }
    // `factor` converts the raw nanosecond counts to the configured unit.
    std::string unit = "us";
    uint64_t factor = 1;
    switch (unit_) {
      case NANOSECONDS:
        unit = "ns";
        break;
      case MICROSECONDS:
        unit = "us";
        factor = 1000;
        break;
      case MILLISECONDS:
        unit = "ms";
        factor = 1000 * 1000;
        break;
      case SECONDS:
        unit = "s";
        factor = 1000 * 1000 * 1000;
        break;
      default:
        // Unknown unit: label as "us" but keep factor 1, matching the
        // original fallback behaviour.
        unit = "us";
        break;
    }
    auto avg = total / count;
    auto min = timestamp_.front();
    auto max = timestamp_.back();
    auto p50 = timestamp_[count / 2];
    auto p99 =
        timestamp_[std::min(count - 1, static_cast<size_t>(count * 0.99))];

    const int name_width = 20;
    const int count_width = 10;
    const int time_width = 18;

    // Long names are truncated to 17 characters plus "...".
    std::cout << std::left << std::setw(name_width)
              << name_.substr(0, 17) + (name_.length() > 17 ? "..." : "")
              << std::setw(count_width) << count << std::setw(time_width)
              << format(static_cast<double>(avg.count()) / factor, unit)
              << std::setw(time_width)
              << format(static_cast<double>(min.count()) / factor, unit)
              << std::setw(time_width)
              << format(static_cast<double>(max.count()) / factor, unit)
              << std::setw(time_width)
              << format(static_cast<double>(p50.count()) / factor, unit)
              << std::setw(time_width)
              << format(static_cast<double>(p99.count()) / factor, unit)
              << std::setw(time_width)
              << format(static_cast<double>(total.count()) / factor, unit)
              << std::endl;
  }

  // Discard all recorded samples and reset the in-flight tick state.
  void Clear() {
    std::lock_guard<std::mutex> guard(mutex_);
    timestamp_.clear();
    last_time_ = std::chrono::nanoseconds::zero();
    started_ = false;
  }

  void SetUnit(UNIT unit) { unit_ = unit; }

  void SetName(const std::string& name) { name_ = name; }

  ~Monitor() = default;

 private:
  // Render a value with three decimals followed by its unit suffix.
  std::string format(double value, std::string unit) {
    std::stringstream ss;
    ss << std::fixed << std::setprecision(3) << value << unit;
    return ss.str();
  }

  std::vector<std::chrono::nanoseconds> timestamp_;
  // FIX: value-initialized; the original left the duration uninitialized.
  std::chrono::nanoseconds last_time_{};
  bool started_ = false;
  std::string name_;
  UNIT unit_;
  std::mutex mutex_;
};

// RAII helper: StartTick() on construction, EndTick() on destruction.
class MonitorGuard {
 public:
  explicit MonitorGuard(Monitor& monitor) : monitor_(monitor) {
    MONITOR_START(monitor_);
  }

  ~MonitorGuard() { MONITOR_END(monitor_); }

 private:
  Monitor& monitor_;
};

}  // namespace monitor

}  // namespace vineyard

#endif  // SRC_COMMON_UTIL_MONITOR_H_
command_t::REGISTER_REQUEST = "register_request"; const std::string command_t::REGISTER_REPLY = "register_reply"; const std::string command_t::EXIT_REQUEST = "exit_request"; const std::string command_t::EXIT_REPLY = "exit_reply"; +const std::string command_t::REQUIRE_EXTRA_REQUEST_MEMORY_REQUEST = + "require_extra_request_memory_request"; +const std::string command_t::REQUIRE_EXTRA_REQUEST_MEMORY_REPLY = + "require_extra_request_memory_reply"; // Blobs APIs const std::string command_t::CREATE_BUFFER_REQUEST = "create_buffer_request"; @@ -80,6 +88,9 @@ const std::string command_t::CREATE_REMOTE_BUFFERS_REQUEST = "create_remote_buffers_request"; const std::string command_t::GET_REMOTE_BUFFERS_REQUEST = "get_remote_buffers_request"; +const std::string command_t::GET_USER_BUFFERS_REQUEST = + "get_user_buffers_request"; +const std::string command_t::GET_USER_BUFFERS_REPLY = "get_user_buffers_reply"; const std::string command_t::INCREASE_REFERENCE_COUNT_REQUEST = "increase_reference_count_request"; @@ -91,6 +102,8 @@ const std::string command_t::DEL_DATA_WITH_FEEDBACKS_REQUEST = "del_data_with_feedbacks_request"; const std::string command_t::DEL_DATA_WITH_FEEDBACKS_REPLY = "del_data_with_feedbacks_reply"; +const std::string command_t::DEL_HUGE_DATA_REQUEST = "del_huge_data_request"; +const std::string command_t::DEL_HUGE_DATA_REPLY = "del_huge_data_reply"; const std::string command_t::CREATE_BUFFER_PLASMA_REQUEST = "create_buffer_by_plasma_request"; @@ -135,6 +148,27 @@ const std::string command_t::RELEASE_BLOBS_WITH_RDMA_REQUEST = "release_blobs_with_rdma_request"; const std::string command_t::RELEASE_BLOBS_WITH_RDMA_REPLY = "release_blobs_with_rdma_reply"; +const std::string command_t::BATCH_PERSIST_REQUEST = "batch_persist_request"; +const std::string command_t::BATCH_PERSIST_REPLY = "batch_persist_reply"; +const std::string command_t::CREATE_HUGE_DATAS_REQUEST = + "create_huge_datas_request"; +const std::string command_t::CREATE_HUGE_DATAS_REPLY = + 
"create_huge_datas_reply"; +const std::string command_t::GET_HUGE_DATA_REQUEST = "get_huge_data_request"; +const std::string command_t::GET_HUGE_DATA_REPLY = "get_huge_data_reply"; + +const std::string command_t::CREATE_USER_BUFFERS_REQUEST = + "create_user_buffers_request"; +const std::string command_t::CREATE_USER_BUFFERS_REPLY = + "create_user_buffers_reply"; +const std::string command_t::GET_REMOTE_BLOBS_WITH_RDMA_REQUEST = + "get_remote_blobs_with_rdma_request"; +const std::string command_t::GET_REMOTE_BLOBS_WITH_RDMA_REPLY = + "get_remote_blobs_with_rdma_reply"; +const std::string command_t::DELETE_USER_BUFFERS_REQUEST = + "delete_user_buffers_request"; +const std::string command_t::DELETE_USER_BUFFERS_REPLY = + "delete_user_buffers_reply"; // Stream APIs const std::string command_t::CREATE_STREAM_REQUEST = "create_stream_request"; @@ -149,6 +183,10 @@ const std::string command_t::PUSH_NEXT_STREAM_CHUNK_REQUEST = "push_next_stream_chunk_request"; const std::string command_t::PUSH_NEXT_STREAM_CHUNK_REPLY = "push_next_stream_chunk_reply"; +const std::string command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REQUEST = + "push_next_stream_chunk_by_offset_request"; +const std::string command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REPLY = + "push_next_stream_chunk_by_offset_reply"; const std::string command_t::PULL_NEXT_STREAM_CHUNK_REQUEST = "pull_next_stream_chunk_request"; const std::string command_t::PULL_NEXT_STREAM_CHUNK_REPLY = @@ -157,6 +195,82 @@ const std::string command_t::STOP_STREAM_REQUEST = "stop_stream_request"; const std::string command_t::STOP_STREAM_REPLY = "stop_stream_reply"; const std::string command_t::DROP_STREAM_REQUEST = "drop_stream_request"; const std::string command_t::DROP_STREAM_REPLY = "drop_stream_reply"; +const std::string command_t::ABORT_STREAM_REQUEST = "abort_stream_request"; +const std::string command_t::ABORT_STREAM_REPLY = "abort_stream_reply"; +const std::string command_t::PUT_STREAM_NAME_REQUEST = + "put_stream_name_request"; +const 
std::string command_t::PUT_STREAM_NAME_REPLY = "put_stream_name_reply"; +const std::string command_t::GET_STREAM_ID_BY_NAME_REQUEST = + "get_stream_id_by_name_request"; +const std::string command_t::GET_STREAM_ID_BY_NAME_REPLY = + "get_stream_id_by_name_reply"; +const std::string command_t::ACTIVATE_REMOTE_FIXED_STREAM_REQUEST = + "activate_remote_fixed_stream_request"; +const std::string command_t::ACTIVATE_REMOTE_FIXED_STREAM_REPLY = + "activate_remote_fixed_stream_reply"; +const std::string command_t::STREAM_READY_ACK = "stream_ready_ack"; +const std::string command_t::CREATE_FIXED_STREAM_REQUEST = + "create_fixed_stream_request"; +const std::string command_t::CREATE_FIXED_STREAM_REPLY = + "create_fixed_stream_reply"; +const std::string command_t::OPEN_FIXED_STREAM_REQUEST = + "open_fixed_stream_request"; +const std::string command_t::OPEN_FIXED_STREAM_REPLY = + "open_fixed_stream_reply"; +const std::string command_t::CLOSE_STREAM_REQUEST = "close_stream_request"; +const std::string command_t::CLOSE_STREAM_REPLY = "close_stream_reply"; +const std::string command_t::DELETE_STREAM_REQUEST = "delete_stream_request"; +const std::string command_t::DELETE_STREAM_REPLY = "delete_stream_reply"; +const std::string command_t::CHECK_FIXED_STREAM_RECEIVED_REQUEST = + "check_fixed_stream_received_request"; +const std::string command_t::CHECK_FIXED_STREAM_RECEIVED_REPLY = + "check_fixed_stream_received_reply"; + +// sidecar operation +const std::string command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REQUEST = + "VINEYARD_OPEN_REMOTE_FIXED_STREAM_REQUEST"; +const std::string command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REPLY = + "VINEYARD_OPEN_REMOTE_FIXED_STREAM_REPLY"; +const std::string command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REQUEST = + "vineyard_activate_remote_fixed_stream_request"; +const std::string command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REPLY = + "vineyard_activate_remote_fixed_stream_reply"; +const std::string + 
command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REQUEST = + "vineyard_activate_remote_fixed_stream_with_offset_request"; +const std::string + command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REPLY = + "vineyard_activate_remote_fixed_stream_with_offset_reply"; +const std::string command_t::VINEYARD_STOP_STREAM_REQUEST = + "vineyard_stop_stream_request"; +const std::string command_t::VINEYARD_STOP_STREAM_REPLY = + "vineyard_stop_stream_reply"; +const std::string command_t::VINEYARD_DROP_STREAM_REQUEST = + "vineyard_drop_stream_request"; +const std::string command_t::VINEYARD_DROP_STREAM_REPLY = + "vineyard_drop_stream_reply"; +const std::string command_t::VINEYARD_ABORT_REMOTE_STREAM_REQUEST = + "vineyard_abort_remote_stream_request"; +const std::string command_t::VINEYARD_ABORT_REMOTE_STREAM_REPLY = + "vineyard_abort_remote_stream_reply"; +const std::string command_t::VINEYARD_STREAM_READY_ACK = + "vineyard_stream_ready_ack"; +const std::string command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REQUEST = + "vineyard_close_remote_fixed_stream_request"; +const std::string command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REPLY = + "vineyard_close_remote_fixed_stream_reply"; +const std::string command_t::VINEYARD_GET_METAS_BY_NAMES_REQUEST = + "vineyard_get_metas_by_names_request"; +const std::string command_t::VINEYARD_GET_METAS_BY_NAMES_REPLY = + "vineyard_get_metas_by_names_reply"; +const std::string command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REQUEST = + "vineyard_get_remote_blobs_with_rdma_request"; +const std::string command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REPLY = + "vineyard_get_remote_blobs_with_rdma_reply"; +const std::string command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REQUEST = + "vineyard_get_remote_blobs_with_offset_request"; +const std::string command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REPLY = + "vineyard_get_remote_blobs_with_offset_reply"; // Names APIs const std::string command_t::PUT_NAME_REQUEST = 
"put_name_request"; @@ -167,6 +281,24 @@ const std::string command_t::LIST_NAME_REQUEST = "list_name_request"; const std::string command_t::LIST_NAME_REPLY = "list_name_reply"; const std::string command_t::DROP_NAME_REQUEST = "drop_name_request"; const std::string command_t::DROP_NAME_REPLY = "drop_name_reply"; +const std::string command_t::GET_NAME_LOCATION_REQUEST = + "get_name_location_request"; +const std::string command_t::GET_NAME_LOCATION_REPLY = + "get_name_location_reply"; +const std::string command_t::GET_METAS_BY_NAMES_REQUEST = + "get_metas_by_names_request"; +const std::string command_t::GET_METAS_BY_NAMES_REPLY = + "get_metas_by_names_reply"; +const std::string command_t::PUT_NAME_LOCATION_REQUEST = + "put_name_location_request"; +const std::string command_t::PUT_NAME_LOCATION_REPLY = + "put_name_location_reply"; +const std::string command_t::GET_NAMES_REQUEST = "get_names_request"; +const std::string command_t::GET_NAMES_REPLY = "get_names_reply"; +const std::string command_t::DROP_NAMES_REQUEST = "drop_names_request"; +const std::string command_t::DROP_NAMES_REPLY = "drop_names_reply"; +const std::string command_t::PUT_NAMES_REQUEST = "put_names_request"; +const std::string command_t::PUT_NAMES_REPLY = "put_names_reply"; // Arena APIs const std::string command_t::MAKE_ARENA_REQUEST = "make_arena_request"; @@ -198,6 +330,10 @@ const std::string command_t::IS_IN_USE_REQUEST = "is_in_use_request"; const std::string command_t::IS_IN_USE_REPLY = "is_in_use_reply"; // Meta APIs +const std::string command_t::GET_VINEYARD_MMAP_FD_REQUEST = + "get_vineyard_mmap_fd_request"; +const std::string command_t::GET_VINEYARD_MMAP_FD_REPLY = + "get_vineyard_mmap_fd_reply"; const std::string command_t::CLUSTER_META_REQUEST = "cluster_meta"; const std::string command_t::CLUSTER_META_REPLY = "cluster_meta"; const std::string command_t::INSTANCE_STATUS_REQUEST = @@ -216,6 +352,117 @@ const std::string command_t::ACQUIRE_LOCK_REPLY = "acquire_lock_reply"; const std::string 
// Numeric dispatch ids for each request command, grouped by subsystem.
// NOTE(review): the mapped type was stripped in extraction; uint32_t is
// assumed here (largest value is 100002) -- confirm against protocols.h.
std::map<std::string, uint32_t> CommandMap = {
    // Connecting APIs
    {command_t::REGISTER_REQUEST, 1},
    {command_t::EXIT_REQUEST, 2},

    // Blobs APIs
    {command_t::CREATE_BUFFER_REQUEST, 10001},
    {command_t::CREATE_BUFFERS_REQUEST, 10002},
    {command_t::CREATE_DISK_BUFFER_REQUEST, 10003},
    {command_t::CREATE_GPU_BUFFER_REQUEST, 10004},
    {command_t::SEAL_BUFFER_REQUEST, 10005},
    {command_t::GET_BUFFERS_REQUEST, 10006},
    {command_t::GET_GPU_BUFFERS_REQUEST, 10007},
    {command_t::DROP_BUFFER_REQUEST, 10008},
    {command_t::SHRINK_BUFFER_REQUEST, 10009},
    {command_t::REQUEST_FD_REQUEST, 10010},
    {command_t::CREATE_REMOTE_BUFFER_REQUEST, 10011},
    {command_t::CREATE_REMOTE_BUFFERS_REQUEST, 10012},
    {command_t::GET_REMOTE_BUFFERS_REQUEST, 10013},
    {command_t::INCREASE_REFERENCE_COUNT_REQUEST, 10014},
    {command_t::RELEASE_REQUEST, 10015},
    {command_t::DEL_DATA_WITH_FEEDBACKS_REQUEST, 10016},
    {command_t::CREATE_BUFFER_PLASMA_REQUEST, 10017},
    {command_t::GET_BUFFERS_PLASMA_REQUEST, 10018},
    {command_t::PLASMA_SEAL_REQUEST, 10019},
    {command_t::PLASMA_RELEASE_REQUEST, 10020},
    {command_t::PLASMA_DEL_DATA_REQUEST, 10021},
    {command_t::CREATE_USER_BUFFERS_REQUEST, 10022},
    {command_t::GET_REMOTE_BLOBS_WITH_RDMA_REQUEST, 10023},
    {command_t::DELETE_USER_BUFFERS_REQUEST, 10024},

    // Metadata APIs
    {command_t::CREATE_DATA_REQUEST, 20001},
    {command_t::CREATE_DATAS_REQUEST, 20002},
    {command_t::GET_DATA_REQUEST, 20003},
    {command_t::LIST_DATA_REQUEST, 20004},
    {command_t::DELETE_DATA_REQUEST, 20005},
    {command_t::EXISTS_REQUEST, 20006},
    {command_t::PERSIST_REQUEST, 20007},
    {command_t::IF_PERSIST_REQUEST, 20008},
    {command_t::LABEL_REQUEST, 20009},
    {command_t::CLEAR_REQUEST, 20010},
    {command_t::MEMORY_TRIM_REQUEST, 20011},
    {command_t::RELEASE_BLOBS_WITH_RDMA_REQUEST, 20012},
    {command_t::GET_METAS_BY_NAMES_REQUEST, 20013},

    // Stream APIs
    {command_t::CREATE_STREAM_REQUEST, 30001},
    {command_t::OPEN_STREAM_REQUEST, 30002},
    {command_t::GET_NEXT_STREAM_CHUNK_REQUEST, 30003},
    {command_t::PUSH_NEXT_STREAM_CHUNK_REQUEST, 30004},
    {command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REQUEST, 30005},
    {command_t::PULL_NEXT_STREAM_CHUNK_REQUEST, 30006},
    {command_t::STOP_STREAM_REQUEST, 30007},
    {command_t::DROP_STREAM_REQUEST, 30008},
    {command_t::ABORT_STREAM_REQUEST, 30009},
    {command_t::PUT_STREAM_NAME_REQUEST, 30010},
    {command_t::GET_STREAM_ID_BY_NAME_REQUEST, 30011},
    {command_t::ACTIVATE_REMOTE_FIXED_STREAM_REQUEST, 30012},
    {command_t::CREATE_FIXED_STREAM_REQUEST, 30014},
    {command_t::OPEN_FIXED_STREAM_REQUEST, 30015},
    {command_t::CLOSE_STREAM_REQUEST, 30016},
    {command_t::DELETE_STREAM_REQUEST, 30017},
    {command_t::CHECK_FIXED_STREAM_RECEIVED_REQUEST, 30018},

    // stream operation by vineyardd
    {command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REQUEST, 40001},
    {command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REQUEST, 40002},
    {command_t::VINEYARD_STOP_STREAM_REQUEST, 40004},
    {command_t::VINEYARD_DROP_STREAM_REQUEST, 40005},
    {command_t::VINEYARD_ABORT_REMOTE_STREAM_REQUEST, 40006},
    {command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REQUEST, 40007},
    {command_t::VINEYARD_GET_METAS_BY_NAMES_REQUEST, 40008},
    {command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REQUEST, 40009},

    // Names APIs
    {command_t::PUT_NAME_REQUEST, 50001},
    {command_t::GET_NAME_REQUEST, 50002},
    {command_t::LIST_NAME_REQUEST, 50003},
    {command_t::DROP_NAME_REQUEST, 50004},
    // FIX: GET_METAS_BY_NAMES_REQUEST was listed a second time here (with
    // value 50005). std::map keeps only the first occurrence of a duplicate
    // key, so the 50005 entry was silently dead; the command keeps its
    // Metadata-section id 20013.

    // Arena APIs
    {command_t::MAKE_ARENA_REQUEST, 60001},
    {command_t::FINALIZE_ARENA_REQUEST, 60002},

    // Session APIs
    {command_t::NEW_SESSION_REQUEST, 70001},
    {command_t::DELETE_SESSION_REQUEST, 70002},
    {command_t::MOVE_BUFFERS_OWNERSHIP_REQUEST, 70003},

    // Spill APIs
    {command_t::EVICT_REQUEST, 80001},
    {command_t::LOAD_REQUEST, 80002},
    {command_t::UNPIN_REQUEST, 80003},
    {command_t::IS_SPILLED_REQUEST, 80004},
    {command_t::IS_IN_USE_REQUEST, 80005},

    // Meta APIs
    {command_t::GET_VINEYARD_MMAP_FD_REQUEST, 90001},
    {command_t::CLUSTER_META_REQUEST, 90002},
    {command_t::INSTANCE_STATUS_REQUEST, 90003},
    {command_t::MIGRATE_OBJECT_REQUEST, 90004},
    {command_t::SHALLOW_COPY_REQUEST, 90005},
    {command_t::DEBUG_REQUEST, 90006},

    // distributed lock
    {command_t::ACQUIRE_LOCK_REQUEST, 100001},
    {command_t::RELEASE_LOCK_REQUEST, 100002},
};
CHECK_IPC_ERROR(root, command_t::GET_USER_BUFFERS_REQUEST); + ids = root["ids"].get>(); + + return Status::OK(); +} + +void WriteGetUserBuffersReply( + const std::vector>& objects, std::string& msg) { + json root; + root["type"] = command_t::GET_USER_BUFFERS_REPLY; + json payloads = json::array(); + for (size_t i = 0; i < objects.size(); ++i) { + json tree; + objects[i]->ToJSON(tree); + root[std::to_string(i)] = tree; + payloads.push_back(tree); + } + root["payloads"] = payloads; + root["num"] = objects.size(); + + encode_msg(root, msg); +} + +Status ReadGetUserBuffersReply(const json& root, + std::vector& objects) { + CHECK_IPC_ERROR(root, command_t::GET_USER_BUFFERS_REPLY); + + if (root.contains("payloads") && root["payloads"].is_array()) { + for (auto const& payload : root["payloads"]) { + Payload object; + object.FromJSON(payload); + objects.emplace_back(object); + } + } else { + for (size_t i = 0; i < root.value("num", static_cast(0)); ++i) { + json tree = root[std::to_string(i)]; + Payload object; + object.FromJSON(tree); + objects.emplace_back(object); + } + } + + return Status::OK(); +} + void WriteGetGPUBuffersRequest(const std::set& ids, const bool unsafe, std::string& msg) { json root; @@ -922,6 +1250,43 @@ Status ReadDelDataWithFeedbacksReply(json const& root, return Status::OK(); } +void WriteDelHugeDataRequest(const size_t id_num, const bool force, + const bool deep, const bool memory_trim, + const bool fastpath, std::string& msg) { + json root; + root["type"] = command_t::DEL_HUGE_DATA_REQUEST; + root["id_num"] = id_num; + root["force"] = force; + root["deep"] = deep; + root["memory_trim"] = memory_trim; + root["fastpath"] = fastpath; + + encode_msg(root, msg); +} + +Status ReadDelHugeDataRequest(json const& root, size_t& id_num, bool& force, + bool& deep, bool& memory_trim, bool& fastpath) { + CHECK_IPC_ERROR(root, command_t::DEL_HUGE_DATA_REQUEST); + id_num = root["id_num"].get(); + force = root.value("force", false); + deep = root.value("deep", 
false); + memory_trim = root.value("memory_trim", false); + fastpath = root.value("fastpath", false); + return Status::OK(); +} + +void WriteDelHugeDataReply(std::string& msg) { + json root; + root["type"] = command_t::DEL_HUGE_DATA_REPLY; + + encode_msg(root, msg); +} + +Status ReadDelHugeDataReply(json const& root) { + CHECK_IPC_ERROR(root, command_t::DEL_HUGE_DATA_REPLY); + return Status::OK(); +} + void WriteCreateBufferByPlasmaRequest(PlasmaID const plasma_id, size_t const size, size_t const plasma_size, @@ -1083,6 +1448,102 @@ Status ReadPlasmaDelDataReply(json const& root) { return Status::OK(); } +void WriteCreateUserBuffersRequest(const std::vector& offsets, + const std::vector& sizes, + std::string& msg) { + json root; + + root["type"] = command_t::CREATE_USER_BUFFERS_REQUEST; + root["offsets_size"] = offsets.size(); + root["sizes_size"] = sizes.size(); + + encode_msg(root, msg); +} + +Status ReadCreateUserBuffersRequest(const json& root, size_t& offsets_num, + size_t& sizes_num) { + CHECK_IPC_ERROR(root, command_t::CREATE_USER_BUFFERS_REQUEST); + offsets_num = root["offsets_size"].get(); + sizes_num = root["sizes_size"].get(); + + return Status::OK(); +} + +void WriteCreateUserBuffersReply(const std::vector& ids, + std::string& msg) { + json root; + root["type"] = command_t::CREATE_USER_BUFFERS_REPLY; + root["ids_size"] = ids.size(); + + encode_msg(root, msg); +} + +Status ReadCreateUserBuffersReply(const json& root, + std::vector& ids) { + CHECK_IPC_ERROR(root, command_t::CREATE_USER_BUFFERS_REPLY); + uint64_t ids_size = root["ids_size"].get(); + ids.resize(ids_size); + return Status::OK(); +} + +void WriteDeleteUserBuffersRequest(const std::vector& ids, + std::string& msg) { + json root; + root["type"] = command_t::DELETE_USER_BUFFERS_REQUEST; + root["id_num"] = ids.size(); + + encode_msg(root, msg); +} + +Status ReadDeleteUserBuffersRequest(const json& root, + std::vector& ids) { + CHECK_IPC_ERROR(root, command_t::DELETE_USER_BUFFERS_REQUEST); + 
ids.resize(root["id_num"].get()); + return Status::OK(); +} + +void WriteDeleteUserBuffersReply(std::string& msg) { + json root; + root["type"] = command_t::DELETE_USER_BUFFERS_REPLY; + + encode_msg(root, msg); +} + +Status ReadDeleteUserBuffersReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::DELETE_USER_BUFFERS_REPLY); + return Status::OK(); +} + +void WriteGetRemoteBlobsWithRDMARequest( + std::vector>& remote_ids, std::string& msg) { + json root; + root["type"] = command_t::GET_REMOTE_BLOBS_WITH_RDMA_REQUEST; + root["remote_ids"] = remote_ids; + encode_msg(root, msg); +} + +Status ReadGetRemoteBlobsWithRDMARequest( + const json& root, + + std::vector>& remote_ids) { + CHECK_IPC_ERROR(root, command_t::GET_REMOTE_BLOBS_WITH_RDMA_REQUEST); + remote_ids = root["remote_ids"].get>>(); + return Status::OK(); +} + +void WriteGetRemoteBlobsWithRDMAReply(std::string& msg, int index) { + json root; + root["type"] = command_t::GET_REMOTE_BLOBS_WITH_RDMA_REPLY; + root["index"] = index; + encode_msg(root, msg); +} + +Status ReadGetRemoteBlobsWithRDMAReply(const json& root, int& index) { + CHECK_IPC_ERROR(root, command_t::GET_REMOTE_BLOBS_WITH_RDMA_REPLY); + index = root["index"].get(); + return Status::OK(); +} + void WriteCreateDataRequest(const json& content, std::string& msg) { json root; root["type"] = command_t::CREATE_DATA_REQUEST; @@ -1218,6 +1679,77 @@ Status ReadGetDataReply(const json& root, return Status::OK(); } +void WriteCreateHugeDatasRequest(const size_t& json_num, std::string& msg) { + json root; + root["type"] = command_t::CREATE_HUGE_DATAS_REQUEST; + root["json_num"] = json_num; + + encode_msg(root, msg); +} + +Status ReadCreateHugeDatasRequest(const json& root, size_t& json_num) { + CHECK_IPC_ERROR(root, command_t::CREATE_HUGE_DATAS_REQUEST); + json_num = root["json_num"].get(); + return Status::OK(); +} + +void WriteCreateHugeDatasReply(const size_t& ids_num, + const Signature& signature, + const InstanceID& instance_id, + std::string& msg) { + 
json root; + root["type"] = command_t::CREATE_HUGE_DATAS_REPLY; + root["ids_num"] = ids_num; + root["signature"] = signature; + root["instance_id"] = instance_id; + + encode_msg(root, msg); +} + +Status ReadCreateHugeDatasReply(const json& root, size_t& ids_num, + Signature& signatures, + InstanceID& instance_id) { + CHECK_IPC_ERROR(root, command_t::CREATE_HUGE_DATAS_REPLY); + ids_num = root["ids_num"].get(); + signatures = root["signature"].get(); + instance_id = root["instance_id"].get(); + return Status::OK(); +} + +void WriteGetHugeDataRequest(const size_t id_num, const bool sync_remote, + const bool wait, std::string& msg) { + json root; + root["type"] = command_t::GET_HUGE_DATA_REQUEST; + root["id_num"] = id_num; + root["sync_remote"] = sync_remote; + root["wait"] = wait; + + encode_msg(root, msg); +} + +Status ReadGetHugeDataRequest(const json& root, size_t& id_num, + bool& sync_remote, bool& wait) { + CHECK_IPC_ERROR(root, command_t::GET_HUGE_DATA_REQUEST); + id_num = root["id_num"].get(); + sync_remote = root.value("sync_remote", false); + wait = root.value("wait", false); + return Status::OK(); +} + +void WriteGetHugeDataReply(size_t json_length, std::string& msg) { + json root; + root["type"] = command_t::GET_HUGE_DATA_REPLY; + root["json_length"] = json_length; + + encode_msg(root, msg); +} + +Status ReadGetHugeDataReply(const json& root, size_t& json_length) { + CHECK_IPC_ERROR(root, command_t::GET_HUGE_DATA_REPLY); + json_length = root["json_length"].get(); + return Status::OK(); +} + void WriteListDataRequest(std::string const& pattern, bool const regex, size_t const limit, std::string& msg) { json root; @@ -1344,6 +1876,40 @@ Status ReadPersistReply(const json& root) { return Status::OK(); } +void WriteBatchPersistRequest(const std::vector& ids, + std::string& msg) { + json root; + root["type"] = command_t::BATCH_PERSIST_REQUEST; + std::string buffer(ids.size() * sizeof(ObjectID), 0); + memcpy(buffer.data(), ids.data(), ids.size() * 
sizeof(ObjectID)); + root["ids_size"] = ids.size(); + root["buffer"] = base64_encode(buffer); + + encode_msg(root, msg); +} + +Status ReadBatchPersistRequest(const json& root, std::vector& ids) { + CHECK_IPC_ERROR(root, command_t::BATCH_PERSIST_REQUEST); + uint64_t ids_size = root["ids_size"].get(); + std::string encoded = root["buffer"].get(); + ids.resize(ids_size); + std::string decoded = base64_decode(encoded); + memcpy(ids.data(), decoded.data(), ids_size * sizeof(ObjectID)); + return Status::OK(); +} + +void WriteBatchPersistReply(std::string& msg) { + json root; + root["type"] = command_t::BATCH_PERSIST_REPLY; + + encode_msg(root, msg); +} + +Status ReadBatchPersistReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::BATCH_PERSIST_REPLY); + return Status::OK(); +} + void WriteIfPersistRequest(const ObjectID id, std::string& msg) { json root; root["type"] = command_t::IF_PERSIST_REQUEST; @@ -1514,6 +2080,39 @@ Status ReadReleaseBlobsWithRDMAReply(const json& root) { return Status::OK(); } +void WriteGetMetasByNamesRequest(const std::vector& names, + std::string& msg) { + json root; + root["type"] = command_t::GET_METAS_BY_NAMES_REQUEST; + root["names"] = names; + encode_msg(root, msg); +} + +Status ReadGetMetasByNamesRequest(const json& root, + std::vector& names) { + CHECK_IPC_ERROR(root, command_t::GET_METAS_BY_NAMES_REQUEST); + names = root["names"].get>(); + return Status::OK(); +} + +void WriteGetMetasByNamesReply(std::vector& ids, json& contents, + std::string& msg) { + json root; + root["type"] = command_t::GET_METAS_BY_NAMES_REPLY; + root["ids"] = ids; + root["contents"] = contents; + + encode_msg(root, msg); +} + +Status ReadGetMetasByNamesReply(const json& root, std::vector& ids, + json& contents) { + CHECK_IPC_ERROR(root, command_t::GET_METAS_BY_NAMES_REPLY); + ids = root["ids"].get>(); + contents = root["contents"]; + return Status::OK(); +} + void WriteCreateStreamRequest(const ObjectID& object_id, std::string& msg) { json root; 
root["type"] = command_t::CREATE_STREAM_REQUEST; @@ -1540,33 +2139,77 @@ Status ReadCreateStreamReply(const json& root) { return Status::OK(); } -void WriteOpenStreamRequest(const ObjectID& object_id, const int64_t& mode, +void WriteCreateFixedStreamRequest(std::string stream_name, int blob_nums, + size_t blob_size, std::string& msg) { + json root; + root["type"] = command_t::CREATE_FIXED_STREAM_REQUEST; + root["stream_name"] = stream_name; + root["blob_nums"] = blob_nums; + root["blob_size"] = blob_size; + + encode_msg(root, msg); +} + +Status ReadCreateFixedStreamRequest(const json& root, std::string& stream_name, + int& blob_nums, size_t& blob_size) { + CHECK_IPC_ERROR(root, command_t::CREATE_FIXED_STREAM_REQUEST); + stream_name = root["stream_name"].get(); + blob_nums = root["blob_nums"].get(); + blob_size = root["blob_size"].get(); + return Status::OK(); +} + +void WriteCreateFixedStreamReply(std::string& msg, ObjectID& stream_id) { + json root; + root["type"] = command_t::CREATE_FIXED_STREAM_REPLY; + root["stream_id"] = stream_id; + + encode_msg(root, msg); +} + +Status ReadCreateFixedStreamReply(const json& root, ObjectID& stream_id) { + CHECK_IPC_ERROR(root, command_t::CREATE_FIXED_STREAM_REPLY); + stream_id = root["stream_id"].get(); + return Status::OK(); +} + +void WriteOpenStreamRequest(const ObjectID& object_id, std::string stream_name, + const int64_t& mode, bool wait, uint64_t timeout, std::string& msg) { json root; root["type"] = command_t::OPEN_STREAM_REQUEST; root["object_id"] = object_id; root["mode"] = mode; + root["stream_name"] = stream_name; + root["wait"] = wait; + root["timeout"] = timeout; encode_msg(root, msg); } Status ReadOpenStreamRequest(const json& root, ObjectID& object_id, - int64_t& mode) { + std::string& stream_name, int64_t& mode, + bool& wait, uint64_t& timeout) { CHECK_IPC_ERROR(root, command_t::OPEN_STREAM_REQUEST); object_id = root["object_id"].get(); mode = root["mode"].get(); + stream_name = root["stream_name"].get(); + 
wait = root["wait"].get(); + timeout = root["timeout"].get(); return Status::OK(); } -void WriteOpenStreamReply(std::string& msg) { +void WriteOpenStreamReply(std::string& msg, ObjectID& id) { json root; root["type"] = command_t::OPEN_STREAM_REPLY; + root["id"] = id; encode_msg(root, msg); } -Status ReadOpenStreamReply(const json& root) { +Status ReadOpenStreamReply(const json& root, ObjectID& ret_id) { CHECK_IPC_ERROR(root, command_t::OPEN_STREAM_REPLY); + ret_id = root["id"].get(); return Status::OK(); } @@ -1637,6 +2280,38 @@ Status ReadPushNextStreamChunkReply(const json& root) { return Status::OK(); } +void WritePushNextStreamChunkByOffsetRequest(const ObjectID stream_id, + const size_t offset, + std::string& msg) { + json root; + root["type"] = command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REQUEST; + root["id"] = stream_id; + root["offset"] = offset; + + encode_msg(root, msg); +} + +Status ReadPushNextStreamChunkByOffsetRequest(const json& root, + ObjectID& stream_id, + size_t& offset) { + CHECK_IPC_ERROR(root, command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REQUEST); + stream_id = root["id"].get(); + offset = root["offset"].get(); + + return Status::OK(); +} + +void WritePushNextStreamChunkByOffsetReply(std::string& msg) { + json root; + root["type"] = command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REPLY; + encode_msg(root, msg); +} + +Status ReadPushNextStreamChunkByOffsetReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REPLY); + return Status::OK(); +} + void WritePullNextStreamChunkRequest(const ObjectID stream_id, std::string& msg) { json root; @@ -1666,17 +2341,49 @@ Status ReadPullNextStreamChunkReply(const json& root, ObjectID& chunk) { return Status::OK(); } -void WriteStopStreamRequest(const ObjectID stream_id, const bool failed, - std::string& msg) { +void WriteCheckFixedStreamReceivedRequest(const ObjectID stream_id, int index, + std::string& msg) { json root; - root["type"] = command_t::STOP_STREAM_REQUEST; + 
root["type"] = command_t::CHECK_FIXED_STREAM_RECEIVED_REQUEST; root["id"] = stream_id; - root["failed"] = failed; + root["index"] = index; encode_msg(root, msg); } -Status ReadStopStreamRequest(const json& root, ObjectID& stream_id, +Status ReadCheckFixedStreamReceivedRequest(const json& root, + ObjectID& stream_id, int& index) { + CHECK_IPC_ERROR(root, command_t::CHECK_FIXED_STREAM_RECEIVED_REQUEST); + stream_id = root["id"].get(); + index = root["index"].get(); + return Status::OK(); +} + +void WriteCheckFixedStreamReceivedReply(bool finished, std::string& msg) { + json root; + root["type"] = command_t::CHECK_FIXED_STREAM_RECEIVED_REPLY; + root["finished"] = finished; + + encode_msg(root, msg); +} + +Status ReadCheckFixedStreamReceivedReply(bool& finished, const json& root) { + CHECK_IPC_ERROR(root, command_t::CHECK_FIXED_STREAM_RECEIVED_REPLY); + finished = root["finished"].get(); + return Status::OK(); +} + +void WriteStopStreamRequest(const ObjectID stream_id, const bool failed, + std::string& msg) { + json root; + root["type"] = command_t::STOP_STREAM_REQUEST; + root["id"] = stream_id; + root["failed"] = failed; + + encode_msg(root, msg); +} + +Status ReadStopStreamRequest(const json& root, ObjectID& stream_id, bool& failed) { CHECK_IPC_ERROR(root, command_t::STOP_STREAM_REQUEST); stream_id = root["id"].get(); @@ -1722,21 +2429,599 @@ Status ReadDropStreamReply(const json& root) { return Status::OK(); } +void WriteAbortStreamRequest(const ObjectID stream_id, std::string& msg) { + json root; + root["type"] = command_t::ABORT_STREAM_REQUEST; + root["id"] = stream_id; + + encode_msg(root, msg); +} + +Status ReadAbortStreamRequest(const json& root, ObjectID& stream_id) { + CHECK_IPC_ERROR(root, command_t::ABORT_STREAM_REQUEST); + stream_id = root["id"].get(); + return Status::OK(); +} + +void WriteAbortStreamReply(std::string& msg, bool success) { + json root; + root["type"] = command_t::ABORT_STREAM_REPLY; + root["success"] = success; + + encode_msg(root, msg); 
+} + +Status ReadAbortStreamReply(const json& root, bool& success) { + CHECK_IPC_ERROR(root, command_t::ABORT_STREAM_REPLY); + success = root["success"].get(); + return Status::OK(); +} + +void WritePutStreamNameRequest(const ObjectID stream_id, std::string name, + std::string& msg) { + json root; + root["type"] = command_t::PUT_STREAM_NAME_REQUEST; + root["id"] = stream_id; + root["name"] = name; + + encode_msg(root, msg); +} + +Status ReadPutStreamNameRequest(const json& root, ObjectID& stream_id, + std::string& name) { + CHECK_IPC_ERROR(root, command_t::PUT_STREAM_NAME_REQUEST); + stream_id = root["id"].get(); + name = root["name"].get(); + return Status::OK(); +} + +void WritePutStreamNameReply(std::string& msg) { + json root; + root["type"] = command_t::PUT_STREAM_NAME_REPLY; + + encode_msg(root, msg); +} + +Status ReadPutStreamNameReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::PUT_STREAM_NAME_REPLY); + return Status::OK(); +} + +void WriteGetStreamIDByNameRequest(const std::string name, std::string& msg) { + json root; + root["type"] = command_t::GET_STREAM_ID_BY_NAME_REQUEST; + root["name"] = name; + + encode_msg(root, msg); +} + +Status ReadGetStreamIDByNameRequest(const json& root, std::string& name) { + CHECK_IPC_ERROR(root, command_t::GET_STREAM_ID_BY_NAME_REQUEST); + name = root["name"].get(); + return Status::OK(); +} + +void WriteGetStreamIDByNameReply(const ObjectID stream_id, std::string& msg) { + json root; + root["type"] = command_t::GET_STREAM_ID_BY_NAME_REPLY; + root["id"] = stream_id; + + encode_msg(root, msg); +} + +Status ReadGetStreamIDByNameReply(const json& root, ObjectID& stream_id) { + CHECK_IPC_ERROR(root, command_t::GET_STREAM_ID_BY_NAME_REPLY); + stream_id = root["id"].get(); + return Status::OK(); +} + +void WriteActivateRemoteFixedStreamRequest( + ObjectID stream_id, std::vector>& buffer_list, + std::vector>& rkeys_list, + std::vector>& sizes_list, std::string advice_device, + int port, std::string& msg) { + json root; 
+ root["type"] = command_t::ACTIVATE_REMOTE_FIXED_STREAM_REQUEST; + root["id"] = stream_id; + root["buffer_list"] = buffer_list; + root["rkeys_list"] = rkeys_list; + root["sizes_list"] = sizes_list; + root["advice_device"] = advice_device; + root["port"] = port; + + encode_msg(root, msg); +} + +Status ReadActivateRemoteFixedStreamRequest( + const json& root, ObjectID& stream_id, + std::vector>& buffer_list, + std::vector>& rkeys, + std::vector>& sizes_list, std::string& advice_device, + int& port) { + CHECK_IPC_ERROR(root, command_t::ACTIVATE_REMOTE_FIXED_STREAM_REQUEST); + stream_id = root["id"].get(); + buffer_list = root["buffer_list"].get>>(); + rkeys = root["rkeys_list"].get>>(); + sizes_list = root["sizes_list"].get>>(); + advice_device = root["advice_device"].get(); + port = root["port"].get(); + return Status::OK(); +} + +void WriteActivateRemoteFixedStreamReply(std::string& msg) { + json root; + root["type"] = command_t::ACTIVATE_REMOTE_FIXED_STREAM_REPLY; + + encode_msg(root, msg); +} + +Status ReadActivateRemoteFixedStreamReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::ACTIVATE_REMOTE_FIXED_STREAM_REPLY); + return Status::OK(); +} + +void WriteStreamReadyAckReply(std::string& msg, int index) { + json root; + root["type"] = command_t::STREAM_READY_ACK; + root["index"] = index; + + encode_msg(root, msg); +} + +Status ReadStreamReadyAckReply(const json& root, int& index) { + CHECK_IPC_ERROR(root, command_t::STREAM_READY_ACK); + index = root["index"].get(); + return Status::OK(); +} + +Status WriteOpenFixedStreamRequest(const ObjectID stream_id, + const uint64_t mode, std::string& msg) { + json root; + root["type"] = command_t::OPEN_FIXED_STREAM_REQUEST; + root["id"] = stream_id; + root["mode"] = mode; + + encode_msg(root, msg); + return Status::OK(); +} + +Status ReadOpenFixedStreamRequest(const json& root, ObjectID& stream_id, + int64_t& mode) { + CHECK_IPC_ERROR(root, command_t::OPEN_FIXED_STREAM_REQUEST); + stream_id = root["id"].get(); + 
mode = root["mode"].get(); + return Status::OK(); +} + +void WriteOpenFixedStreamReply(std::string& msg) { + json root; + root["type"] = command_t::OPEN_FIXED_STREAM_REPLY; + + encode_msg(root, msg); +} + +Status ReadOpenFixedStreamReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::OPEN_FIXED_STREAM_REPLY); + return Status::OK(); +} + +void WriteCloseStreamRequest(ObjectID stream_id, std::string& msg) { + json root; + root["type"] = command_t::CLOSE_STREAM_REQUEST; + root["id"] = stream_id; + + encode_msg(root, msg); +} + +Status ReadCloseStreamRequest(const json& root, ObjectID& stream_id) { + CHECK_IPC_ERROR(root, command_t::CLOSE_STREAM_REQUEST); + stream_id = root["id"].get(); + return Status::OK(); +} + +void WriteCloseStreamReply(std::string& msg) { + json root; + root["type"] = command_t::CLOSE_STREAM_REPLY; + + encode_msg(root, msg); +} + +Status ReadCloseStreamReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::CLOSE_STREAM_REPLY); + return Status::OK(); +} + +void WriteDeleteStreamRequest(ObjectID stream_id, std::string& msg) { + json root; + root["type"] = command_t::DELETE_STREAM_REQUEST; + root["id"] = stream_id; + + encode_msg(root, msg); +} + +Status ReadDeleteStreamRequest(const json& root, ObjectID& stream_id) { + CHECK_IPC_ERROR(root, command_t::DELETE_STREAM_REQUEST); + stream_id = root["id"].get(); + return Status::OK(); +} + +void WriteDeleteStreamReply(std::string& msg) { + json root; + root["type"] = command_t::DELETE_STREAM_REPLY; + + encode_msg(root, msg); +} + +Status ReadDeleteStreamReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::DELETE_STREAM_REPLY); + return Status::OK(); +} + +void WriteVineyardOpenRemoteFixedStreamRequest( + ObjectID const remote_id, std::string stream_name, ObjectID local_id, + int blob_nums, size_t size, std::string remote_endpoint, uint64_t mode, + bool wait, uint64_t timeout, std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REQUEST; + 
root["remote_id"] = remote_id; + root["stream_name"] = stream_name; + root["local_id"] = local_id; + root["blob_nums"] = blob_nums; + root["size"] = size; + root["remote_endpoint"] = remote_endpoint; + root["mode"] = mode; + root["wait"] = wait; + root["timeout"] = timeout; + + encode_msg(root, msg); +} + +Status ReadVineyardOpenRemoteFixedStreamRequest( + const json& root, ObjectID& remote_id, std::string& remote_stream_name, + ObjectID& local_id, int& blob_nums, size_t& size, + std::string& remote_endpoint, uint64_t& mode, bool& wait, + uint64_t& timeout) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REQUEST); + remote_id = root["remote_id"].get(); + remote_stream_name = root["stream_name"].get(); + local_id = root["local_id"].get(); + blob_nums = root["blob_nums"].get(); + size = root["size"].get(); + remote_endpoint = root["remote_endpoint"].get(); + mode = root["mode"].get(); + wait = root["wait"].get(); + timeout = root["timeout"].get(); + return Status::OK(); +} + +void WriteVineyardOpenRemoteFixedStreamReply(std::string& msg, + ObjectID const local_stream_id) { + json root; + root["type"] = command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REPLY; + root["local_stream_id"] = local_stream_id; + + encode_msg(root, msg); +} + +Status ReadVineyardOpenRemoteFixedStreamReply(const json& root, + ObjectID& local_id) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REPLY); + local_id = root["local_stream_id"].get(); + return Status::OK(); +} + +void WriteVineyardActivateRemoteFixedStreamRequest( + ObjectID stream_id, bool create, std::vector& blob_list, + std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REQUEST; + root["id"] = stream_id; + root["create"] = create; + root["blob_list"] = blob_list; + + encode_msg(root, msg); +} + +Status ReadVineyardActivateRemoteFixedStreamRequest( + const json& root, ObjectID& stream_id, bool& create, + std::vector& blob_list) { + 
CHECK_IPC_ERROR(root, + command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REQUEST); + stream_id = root["id"].get(); + create = root["create"].get(); + blob_list = root["blob_list"].get>(); + return Status::OK(); +} + +void WriteVineyardActivateRemoteFixedStreamReply( + std::string& msg, std::vector>& payload_list, + std::vector& fds_to_send) { + json root; + root["type"] = command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REPLY; + json payloads = json::array(); + for (size_t i = 0; i < payload_list.size(); ++i) { + json buffer_meta; + payload_list[i]->ToJSON(buffer_meta); + root[std::to_string(i)] = buffer_meta; + payloads.push_back(buffer_meta); + } + root["payloads"] = payloads; + root["fds"] = fds_to_send; + root["num"] = payload_list.size(); + + encode_msg(root, msg); +} + +Status ReadVineyardActivateRemoteFixedStreamReply(const json& root, + std::vector& objects, + std::vector& fds_sent) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REPLY); + for (size_t i = 0; i < root["num"]; i++) { + json tree = root[std::to_string(i)]; + Payload object; + object.FromJSON(tree); + objects.push_back(object); + } + fds_sent = root["fds"].get>(); + return Status::OK(); +} + +void WriteVineyardActivateRemoteFixedStreamWithOffsetRequest( + ObjectID stream_id, std::vector& offsets, std::string& msg) { + json root; + root["type"] = + command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REQUEST; + root["id"] = stream_id; + root["offsets"] = offsets; + + encode_msg(root, msg); +} + +Status ReadVineyardActivateRemoteFixedStreamWithOffsetRequest( + const json& root, ObjectID& stream_id, std::vector& offsets) { + CHECK_IPC_ERROR( + root, + command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REQUEST); + stream_id = root["id"].get(); + offsets = root["offsets"].get>(); + return Status::OK(); +} + +void WriteVineyardActivateRemoteFixedStreamWithOffsetReply(std::string& msg) { + json root; + root["type"] = + 
command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REPLY; + + encode_msg(root, msg); +} + +Status ReadVineyardActivateRemoteFixedStreamWithOffsetReply(const json& root) { + CHECK_IPC_ERROR( + root, command_t::VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REPLY); + return Status::OK(); +} + +void WriteVineyardCloseRemoteFixedStreamRequest(ObjectID stream_id, + std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REQUEST; + root["id"] = stream_id; + + encode_msg(root, msg); +} + +Status ReadVineyardCloseRemoteFixedStreamRequest(const json& root, + ObjectID& stream_id) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REQUEST); + stream_id = root["id"].get(); + return Status::OK(); +} + +void WriteVineyardCloseRemoteFixedStreamReply(std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REPLY; + + encode_msg(root, msg); +} + +Status ReadVineyardCloseRemoteFixedStreamReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REPLY); + return Status::OK(); +} + +void WriteVineyardGetMetasByNamesRequest(std::vector& names, + std::string rpc_endpoint, + std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_GET_METAS_BY_NAMES_REQUEST; + root["names"] = names; + root["rpc_endpoint"] = rpc_endpoint; + encode_msg(root, msg); +} + +Status ReadVineyardGetMetasByNamesRequest(const json& root, + std::vector& names, + std::string& rpc_endpoint) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_GET_METAS_BY_NAMES_REQUEST); + names = root["names"].get>(); + rpc_endpoint = root["rpc_endpoint"].get(); + return Status::OK(); +} + +void WriteVineyardGetMetasByNamesReply(const std::vector& contents, + std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_GET_METAS_BY_NAMES_REPLY; + root["contents"] = contents; + + encode_msg(root, msg); +} + +Status ReadVineyardGetMetasByNamesReply(const json& root, + 
std::vector& contents) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_GET_METAS_BY_NAMES_REPLY); + contents = root["contents"].get>(); + return Status::OK(); +} + +void WriteVineyardGetRemoteBlobsWithRDMARequest( + std::vector>& local_ids, + std::vector>& remote_ids, std::string& rpc_endpoint, + std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REQUEST; + root["local_ids"] = local_ids; + root["remote_ids"] = remote_ids; + root["rpc_endpoint"] = rpc_endpoint; + + encode_msg(root, msg); +} + +Status ReadVineyardGetRemoteBlobsWithRDMARequest( + const json& root, std::vector>& local_ids, + std::vector>& remote_ids, std::string& rpc_endpoint) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REQUEST); + local_ids = root["local_ids"].get>>(); + remote_ids = root["remote_ids"].get>>(); + rpc_endpoint = root["rpc_endpoint"].get(); + return Status::OK(); +} + +void WriteVineyardGetRemoteBlobsWithRDMAReply(std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REPLY; + + encode_msg(root, msg); +} + +Status ReadVineyardGetRemoteBlobsWithRDMAReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REPLY); + return Status::OK(); +} + +void WriteVineyardGetRemoteBlobsWithOffsetRequest(size_t batch_nums, + size_t batch_size, + std::string& rpc_endpoint, + std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REQUEST; + root["batch_nums"] = batch_nums; + root["batch_size"] = batch_size; + root["rpc_endpoint"] = rpc_endpoint; + + encode_msg(root, msg); +} + +Status ReadVineyardGetRemoteBlobsWithOffsetRequest(const json& root, + size_t& batch_nums, + size_t& batch_size, + std::string& rpc_endpoint) { + CHECK_IPC_ERROR(root, + command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REQUEST); + batch_nums = root["batch_nums"].get(); + batch_size = root["batch_size"].get(); + rpc_endpoint = 
root["rpc_endpoint"].get(); + return Status::OK(); +} + +void WriteVineyardGetRemoteBlobsWithOffsetReply(std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REPLY; + + encode_msg(root, msg); +} + +Status ReadVineyardGetRemoteBlobsWithOffsetReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REPLY); + return Status::OK(); +} + +void WriteVineyardStopStreamRequest(ObjectID stream_id, bool failed, + std::string& msg) { + // TBD +} + +Status ReadVineyardStopStreamRequest(const json& root, ObjectID& stream_id, + bool& failed) { + // TBD + return Status::NotImplemented("Not implemented yet"); +} + +void WriteVineyardStopStreamReply(std::string& msg) { + // TBD +} + +Status ReadVineyardStopStreamReply(const json& root) { + // TBD + return Status::NotImplemented("Not implemented yet"); +} + +void WriteVineyardDropStreamRequest(ObjectID stream_id, std::string& msg) { + // TBD +} + +Status ReadVineyardDropStreamRequest(const json& root, ObjectID& stream_id) { + // TBD + return Status::NotImplemented("Not implemented yet"); +} + +void WriteVineyardDropStreamReply(std::string& msg) { + // TBD +} + +Status ReadVineyardDropStreamReply(const json& root) { + // TBD + return Status::NotImplemented("Not implemented yet"); +} + +void WriteVineyardAbortRemoteStreamRequest(ObjectID stream_id, + std::string& msg) { + json root; + root["type"] = command_t::VINEYARD_ABORT_REMOTE_STREAM_REQUEST; + root["id"] = stream_id; + + encode_msg(root, msg); +} + +Status ReadVineyardAbortRemoteStreamRequest(const json& root, + ObjectID& stream_id) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_ABORT_REMOTE_STREAM_REQUEST); + stream_id = root["id"].get(); + return Status::OK(); +} + +void WriteVineyardAbortRemoteStreamReply(std::string& msg, bool success) { + json root; + root["type"] = command_t::VINEYARD_ABORT_REMOTE_STREAM_REPLY; + root["success"] = success; + + encode_msg(root, msg); +} + +Status 
ReadVineyardAbortRemoteStreamReply(const json& root, bool& success) { + CHECK_IPC_ERROR(root, command_t::VINEYARD_ABORT_REMOTE_STREAM_REPLY); + success = root["success"].get(); + + return Status::OK(); +} + void WritePutNameRequest(const ObjectID object_id, const std::string& name, - std::string& msg) { + bool overwrite, std::string& msg) { json root; root["type"] = command_t::PUT_NAME_REQUEST; root["object_id"] = object_id; root["name"] = name; + root["overwrite"] = overwrite; encode_msg(root, msg); } Status ReadPutNameRequest(const json& root, ObjectID& object_id, - std::string& name) { + std::string& name, bool& overwrite) { CHECK_IPC_ERROR(root, command_t::PUT_NAME_REQUEST); object_id = root["object_id"].get(); name = root["name"].get_ref(); + overwrite = root.value("overwrite", true); return Status::OK(); } @@ -1752,6 +3037,39 @@ Status ReadPutNameReply(const json& root) { return Status::OK(); } +void WritePutNamesRequest(const std::vector object_ids, + const std::vector& names, bool overwrite, + std::string& msg) { + json root; + root["type"] = command_t::PUT_NAMES_REQUEST; + root["object_ids"] = object_ids; + root["names"] = names; + root["overwrite"] = overwrite; + + encode_msg(root, msg); +} + +Status ReadPutNamesRequest(const json& root, std::vector& object_id, + std::vector& name, bool& overwrite) { + CHECK_IPC_ERROR(root, command_t::PUT_NAMES_REQUEST); + object_id = root["object_ids"].get>(); + name = root["names"].get>(); + overwrite = root.value("overwrite", true); + return Status::OK(); +} + +void WritePutNamesReply(std::string& msg) { + json root; + root["type"] = command_t::PUT_NAMES_REPLY; + + encode_msg(root, msg); +} + +Status ReadPutNamesReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::PUT_NAMES_REPLY); + return Status::OK(); +} + void WriteGetNameRequest(const std::string& name, const bool wait, std::string& msg) { json root; @@ -1783,6 +3101,107 @@ Status ReadGetNameReply(const json& root, ObjectID& object_id) { return 
Status::OK(); } +void WriteGetNamesRequest(const std::vector& name, const bool wait, + std::string& msg) { + json root; + root["type"] = command_t::GET_NAMES_REQUEST; + root["names"] = name; + root["wait"] = wait; + + encode_msg(root, msg); +} + +Status ReadGetNamesRequest(const json& root, std::vector& name, + bool& wait) { + CHECK_IPC_ERROR(root, command_t::GET_NAMES_REQUEST); + name = root["names"].get>(); + wait = root["wait"].get(); + return Status::OK(); +} + +void WriteGetNamesReply(const std::vector& object_id, + std::string& msg) { + json root; + root["type"] = command_t::GET_NAMES_REPLY; + root["object_ids"] = object_id; + + encode_msg(root, msg); +} + +Status ReadGetNamesReply(const json& root, std::vector& object_id) { + CHECK_IPC_ERROR(root, command_t::GET_NAMES_REPLY); + object_id = root["object_ids"].get>(); + return Status::OK(); +} + +void WriteGetObjectLocationRequest(const std::vector& names, + std::string& msg) { + json root; + root["type"] = command_t::GET_NAME_LOCATION_REQUEST; + root["names"] = names; + + encode_msg(root, msg); +} + +Status ReadGetObjectLocationRequest(const json& root, + std::vector& names) { + CHECK_IPC_ERROR(root, command_t::GET_NAME_LOCATION_REQUEST); + names = root["names"].get>(); + return Status::OK(); +} + +void WriteGetObjectLocationReply( + std::vector>& locations, std::string& msg) { + json root; + root["type"] = command_t::GET_NAME_LOCATION_REPLY; + root["locations"] = locations; + + encode_msg(root, msg); +} + +Status ReadGetObjectLocationReply( + const json& root, std::vector>& locations) { + CHECK_IPC_ERROR(root, command_t::GET_NAME_LOCATION_REPLY); + locations = root["locations"].get>>(); + return Status::OK(); +} + +void WritePutObjectLocationRequest(const std::vector& names, + const std::vector& locations, + int ttl_seconds, std::string& msg) { + json root; + root["type"] = command_t::PUT_NAME_LOCATION_REQUEST; + root["names"] = names; + root["locations"] = locations; + root["ttl_seconds"] = ttl_seconds; + + 
encode_msg(root, msg); +} + +Status ReadPutObjectLocationRequest(const json& root, + std::vector& names, + std::vector& locations, + int& ttl_seconds) { + CHECK_IPC_ERROR(root, command_t::PUT_NAME_LOCATION_REQUEST); + names = root["names"].get>(); + locations = root["locations"].get>(); + ttl_seconds = root.value("ttl_seconds", 300); + + return Status::OK(); +} + +void WritePutObjectLocationReply(std::string& msg) { + json root; + root["type"] = command_t::PUT_NAME_LOCATION_REPLY; + + encode_msg(root, msg); +} + +Status ReadPutObjectLocationReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::PUT_NAME_LOCATION_REPLY); + return Status::OK(); +} + void WriteListNameRequest(std::string const& pattern, bool const regex, size_t const limit, std::string& msg) { json root; @@ -1846,6 +3265,35 @@ Status ReadDropNameReply(const json& root) { return Status::OK(); } +void WriteDropNamesRequest(const std::vector& name, + std::string& msg) { + json root; + root["type"] = command_t::DROP_NAMES_REQUEST; + // root["names"] = name; + root["name_num"] = name.size(); + + encode_msg(root, msg); +} + +Status ReadDropNamesRequest(const json& root, std::vector& name) { + CHECK_IPC_ERROR(root, command_t::DROP_NAMES_REQUEST); + // name = root["names"].get>(); + name.resize(root["name_num"].get()); + return Status::OK(); +} + +void WriteDropNamesReply(std::string& msg) { + json root; + root["type"] = command_t::DROP_NAMES_REPLY; + + encode_msg(root, msg); +} + +Status ReadDropNamesReply(const json& root) { + CHECK_IPC_ERROR(root, command_t::DROP_NAMES_REPLY); + return Status::OK(); +} + void WriteMakeArenaRequest(const size_t size, std::string& msg) { json root; root["type"] = command_t::MAKE_ARENA_REQUEST; @@ -2151,6 +3599,33 @@ Status ReadIsInUseReply(json const& root, bool& is_in_use) { return Status::OK(); } +void WriteGetVineyardMmapFdRequest(std::string& msg) { + json root; + root["type"] = command_t::GET_VINEYARD_MMAP_FD_REQUEST; + encode_msg(root, msg); +} + +Status 
ReadGetVineyardMmapFdRequest(const json& root) { + CHECK_IPC_ERROR(root, command_t::GET_VINEYARD_MMAP_FD_REQUEST); + return Status::OK(); +} + +void WriteGetVineyardMmapFdReply(size_t size, size_t offset, std::string& msg) { + json root; + root["type"] = command_t::GET_VINEYARD_MMAP_FD_REPLY; + root["size"] = size; + root["offset"] = offset; + encode_msg(root, msg); +} + +Status ReadGetVineyardMmapFdReply(const json& root, size_t& size, + size_t& offset) { + CHECK_IPC_ERROR(root, command_t::GET_VINEYARD_MMAP_FD_REPLY); + size = root["size"].get(); + offset = root["offset"].get(); + return Status::OK(); +} + void WriteClusterMetaRequest(std::string& msg) { json root; root["type"] = command_t::CLUSTER_META_REQUEST; diff --git a/src/common/util/protocols.h b/src/common/util/protocols.h index e79dc6aa4..154c12db4 100644 --- a/src/common/util/protocols.h +++ b/src/common/util/protocols.h @@ -37,6 +37,8 @@ struct command_t { static const std::string REGISTER_REPLY; static const std::string EXIT_REQUEST; static const std::string EXIT_REPLY; + static const std::string REQUIRE_EXTRA_REQUEST_MEMORY_REQUEST; + static const std::string REQUIRE_EXTRA_REQUEST_MEMORY_REPLY; // Blobs APIs static const std::string CREATE_BUFFER_REQUEST; @@ -57,6 +59,8 @@ struct command_t { static const std::string DROP_BUFFER_REPLY; static const std::string SHRINK_BUFFER_REQUEST; static const std::string SHRINK_BUFFER_REPLY; + static const std::string GET_USER_BUFFERS_REQUEST; + static const std::string GET_USER_BUFFERS_REPLY; static const std::string REQUEST_FD_REQUEST; static const std::string REQUEST_FD_REPLY; @@ -71,6 +75,8 @@ struct command_t { static const std::string RELEASE_REPLY; static const std::string DEL_DATA_WITH_FEEDBACKS_REQUEST; static const std::string DEL_DATA_WITH_FEEDBACKS_REPLY; + static const std::string DEL_HUGE_DATA_REQUEST; + static const std::string DEL_HUGE_DATA_REPLY; static const std::string CREATE_BUFFER_PLASMA_REQUEST; static const std::string 
CREATE_BUFFER_PLASMA_REPLY; @@ -83,6 +89,14 @@ struct command_t { static const std::string PLASMA_DEL_DATA_REQUEST; static const std::string PLASMA_DEL_DATA_REPLY; + static const std::string CREATE_USER_BUFFERS_REQUEST; + static const std::string CREATE_USER_BUFFERS_REPLY; + static const std::string DELETE_USER_BUFFERS_REQUEST; + static const std::string DELETE_USER_BUFFERS_REPLY; + + static const std::string GET_REMOTE_BLOBS_WITH_RDMA_REQUEST; + static const std::string GET_REMOTE_BLOBS_WITH_RDMA_REPLY; + // Metadata APIs static const std::string CREATE_DATA_REQUEST; static const std::string CREATE_DATA_REPLY; @@ -108,6 +122,12 @@ struct command_t { static const std::string MEMORY_TRIM_REPLY; static const std::string RELEASE_BLOBS_WITH_RDMA_REQUEST; static const std::string RELEASE_BLOBS_WITH_RDMA_REPLY; + static const std::string BATCH_PERSIST_REQUEST; + static const std::string BATCH_PERSIST_REPLY; + static const std::string CREATE_HUGE_DATAS_REQUEST; + static const std::string CREATE_HUGE_DATAS_REPLY; + static const std::string GET_HUGE_DATA_REQUEST; + static const std::string GET_HUGE_DATA_REPLY; // Stream APIs static const std::string CREATE_STREAM_REQUEST; @@ -118,12 +138,58 @@ struct command_t { static const std::string GET_NEXT_STREAM_CHUNK_REPLY; static const std::string PUSH_NEXT_STREAM_CHUNK_REQUEST; static const std::string PUSH_NEXT_STREAM_CHUNK_REPLY; + static const std::string PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REQUEST; + static const std::string PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REPLY; static const std::string PULL_NEXT_STREAM_CHUNK_REQUEST; static const std::string PULL_NEXT_STREAM_CHUNK_REPLY; static const std::string STOP_STREAM_REQUEST; static const std::string STOP_STREAM_REPLY; static const std::string DROP_STREAM_REQUEST; static const std::string DROP_STREAM_REPLY; + static const std::string ABORT_STREAM_REQUEST; + static const std::string ABORT_STREAM_REPLY; + static const std::string PUT_STREAM_NAME_REQUEST; + static const std::string 
PUT_STREAM_NAME_REPLY; + static const std::string GET_STREAM_ID_BY_NAME_REQUEST; + static const std::string GET_STREAM_ID_BY_NAME_REPLY; + static const std::string ACTIVATE_REMOTE_FIXED_STREAM_REQUEST; + static const std::string ACTIVATE_REMOTE_FIXED_STREAM_REPLY; + static const std::string STREAM_READY_ACK; + static const std::string CREATE_FIXED_STREAM_REQUEST; + static const std::string CREATE_FIXED_STREAM_REPLY; + static const std::string OPEN_FIXED_STREAM_REQUEST; + static const std::string OPEN_FIXED_STREAM_REPLY; + static const std::string CLOSE_STREAM_REQUEST; + static const std::string CLOSE_STREAM_REPLY; + static const std::string DELETE_STREAM_REQUEST; + static const std::string DELETE_STREAM_REPLY; + static const std::string CHECK_FIXED_STREAM_RECEIVED_REQUEST; + static const std::string CHECK_FIXED_STREAM_RECEIVED_REPLY; + + // sidecar api + static const std::string VINEYARD_OPEN_REMOTE_FIXED_STREAM_REQUEST; + static const std::string VINEYARD_OPEN_REMOTE_FIXED_STREAM_REPLY; + static const std::string VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REQUEST; + static const std::string VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_REPLY; + static const std::string + VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REQUEST; + static const std::string + VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REPLY; + static const std::string VINEYARD_STOP_STREAM_REQUEST; + static const std::string VINEYARD_STOP_STREAM_REPLY; + static const std::string VINEYARD_DROP_STREAM_REQUEST; + static const std::string VINEYARD_DROP_STREAM_REPLY; + static const std::string VINEYARD_ABORT_REMOTE_STREAM_REQUEST; + static const std::string VINEYARD_ABORT_REMOTE_STREAM_REPLY; + static const std::string VINEYARD_STREAM_READY_ACK; + static const std::string VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REQUEST; + static const std::string VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REPLY; + static const std::string VINEYARD_GET_METAS_BY_NAMES_REQUEST; + static const std::string VINEYARD_GET_METAS_BY_NAMES_REPLY; + 
static const std::string VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REQUEST; + static const std::string VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REPLY; + static const std::string VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REQUEST; + static const std::string VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REPLY; // Names APIs static const std::string PUT_NAME_REQUEST; @@ -134,6 +200,18 @@ struct command_t { static const std::string LIST_NAME_REPLY; static const std::string DROP_NAME_REQUEST; static const std::string DROP_NAME_REPLY; + static const std::string GET_NAME_LOCATION_REQUEST; + static const std::string GET_NAME_LOCATION_REPLY; + static const std::string GET_METAS_BY_NAMES_REQUEST; + static const std::string GET_METAS_BY_NAMES_REPLY; + static const std::string PUT_NAME_LOCATION_REQUEST; + static const std::string PUT_NAME_LOCATION_REPLY; + static const std::string GET_NAMES_REQUEST; + static const std::string GET_NAMES_REPLY; + static const std::string DROP_NAMES_REQUEST; + static const std::string DROP_NAMES_REPLY; + static const std::string PUT_NAMES_REQUEST; + static const std::string PUT_NAMES_REPLY; // Arena APIs static const std::string MAKE_ARENA_REQUEST; @@ -163,6 +241,8 @@ struct command_t { static const std::string IS_IN_USE_REPLY; // Meta APIs + static const std::string GET_VINEYARD_MMAP_FD_REQUEST; + static const std::string GET_VINEYARD_MMAP_FD_REPLY; static const std::string CLUSTER_META_REQUEST; static const std::string CLUSTER_META_REPLY; static const std::string INSTANCE_STATUS_REQUEST; @@ -181,6 +261,8 @@ struct command_t { static const std::string RELEASE_LOCK_REPLY; }; +extern std::map CommandMap; + enum class StoreType { kDefault = 1, kPlasma = 2, @@ -214,6 +296,14 @@ Status ReadRegisterReply(const json& msg, std::string& ipc_socket, void WriteExitRequest(std::string& msg); +void WriteRequireExtraRequestMemoryRequest(const size_t size, std::string& msg); + +Status ReadRequireExtraRequestMemoryRequest(const json& root, size_t& size); + +void 
WriteRequireExtraRequestMemoryReply(std::string& msg); + +Status ReadRequireExtraRequestMemoryReply(const json& root); + void WriteCreateBufferRequest(const size_t size, std::string& msg); Status ReadCreateBufferRequest(const json& root, size_t& size); @@ -291,6 +381,15 @@ Status ReadGetBuffersReply(const json& root, std::vector& objects, Status ReadGetBuffersReply(const json& root, std::vector& objects, std::vector& fd_sent, bool& compress); +void WriteGetUserBuffersRequest(std::vector& ids, std::string& msg); + +Status ReadGetUserBuffersRequest(const json& root, std::vector& ids); + +void WriteGetUserBuffersReply( + const std::vector>& objects, std::string& msg); + +Status ReadGetUserBuffersReply(const json& root, std::vector& objects); + void WriteGetGPUBuffersRequest(const std::set& ids, const bool unsafe, std::string& msg); @@ -403,6 +502,17 @@ void WriteDelDataWithFeedbacksReply(const std::vector& deleted_bids, Status ReadDelDataWithFeedbacksReply(json const& root, std::vector& deleted_bids); +void WriteDelHugeDataRequest(const size_t id_num, const bool force, + const bool deep, const bool memory_trim, + const bool fastpath, std::string& msg); + +Status ReadDelHugeDataRequest(json const& root, size_t& id_num, bool& force, + bool& deep, bool& memory_trim, bool& fastpath); + +void WriteDelHugeDataReply(std::string& msg); + +Status ReadDelHugeDataReply(json const& root); + void WriteCreateBufferByPlasmaRequest(PlasmaID const plasma_id, size_t const size, size_t const plasma_size, @@ -461,6 +571,38 @@ void WritePlasmaDelDataReply(std::string& msg); Status ReadPlasmaDelDataReply(json const& root); +void WriteCreateUserBuffersRequest(const std::vector& offsets, + const std::vector& sizes, + std::string& msg); + +Status ReadCreateUserBuffersRequest(const json& root, size_t& offsets_num, + size_t& sizes_num); + +void WriteCreateUserBuffersReply(const std::vector& ids, + std::string& msg); + +Status ReadCreateUserBuffersReply(const json& root, std::vector& ids); + 
+void WriteDeleteUserBuffersRequest(const std::vector& ids, + std::string& msg); + +Status ReadDeleteUserBuffersRequest(const json& root, + std::vector& ids); + +void WriteDeleteUserBuffersReply(std::string& msg); + +Status ReadDeleteUserBuffersReply(const json& root); + +void WriteGetRemoteBlobsWithRDMARequest( + std::vector>& remote_ids, std::string& msg); + +Status ReadGetRemoteBlobsWithRDMARequest( + const json& root, std::vector>& remote_ids); + +void WriteGetRemoteBlobsWithRDMAReply(std::string& msg, int index); + +Status ReadGetRemoteBlobsWithRDMAReply(const json& root, int& index); + void WriteCreateDataRequest(const json& content, std::string& msg); Status ReadCreateDataRequest(const json& root, json& content); @@ -502,6 +644,28 @@ Status ReadGetDataReply(const json& root, json& content); Status ReadGetDataReply(const json& root, std::unordered_map& content); +void WriteCreateHugeDatasRequest(const size_t& json_num, std::string& msg); + +Status ReadCreateHugeDatasRequest(const json& root, size_t& json_num); + +void WriteCreateHugeDatasReply(const size_t& ids_num, + const Signature& signature, + const InstanceID& instance_id, std::string& msg); + +Status ReadCreateHugeDatasReply(const json& root, size_t& ids_num, + Signature& signatures, + InstanceID& instance_ids); + +void WriteGetHugeDataRequest(const size_t id_num, const bool sync_remote, + const bool wait, std::string& msg); + +Status ReadGetHugeDataRequest(const json& root, size_t& id_num, + bool& sync_remote, bool& wait); + +void WriteGetHugeDataReply(size_t json_length, std::string& msg); + +Status ReadGetHugeDataReply(const json& root, size_t& json_length); + void WriteListDataRequest(std::string const& pattern, bool const regex, size_t const limit, std::string& msg); @@ -540,6 +704,15 @@ void WritePersistReply(std::string& msg); Status ReadPersistReply(const json& root); +void WriteBatchPersistRequest(const std::vector& ids, + std::string& msg); + +Status ReadBatchPersistRequest(const json& root, 
std::vector& ids); + +void WriteBatchPersistReply(std::string& msg); + +Status ReadBatchPersistReply(const json& root); + void WriteIfPersistRequest(const ObjectID id, std::string& msg); Status ReadIfPersistRequest(const json& root, ObjectID& id); @@ -593,6 +766,18 @@ void WriteReleaseBlobsWithRDMAReply(std::string& msg); Status ReadReleaseBlobsWithRDMAReply(const json& root); +void WriteGetMetasByNamesRequest(const std::vector& names, + std::string& msg); + +Status ReadGetMetasByNamesRequest(const json& root, + std::vector& names); + +void WriteGetMetasByNamesReply(std::vector& ids, json& contents, + std::string& msg); + +Status ReadGetMetasByNamesReply(const json& root, std::vector& ids, + json& contents); + void WriteCreateStreamRequest(const ObjectID& object_id, std::string& msg); Status ReadCreateStreamRequest(const json& root, ObjectID& object_id); @@ -601,15 +786,27 @@ void WriteCreateStreamReply(std::string& msg); Status ReadCreateStreamReply(const json& root); -void WriteOpenStreamRequest(const ObjectID& object_id, const int64_t& mode, - std::string& msg); +void WriteCreateFixedStreamRequest(std::string stream_name, int blob_nums, + size_t blob_size, std::string& msg); + +Status ReadCreateFixedStreamRequest(const json& root, std::string& stream_name, + int& blob_nums, size_t& blob_size); + +void WriteCreateFixedStreamReply(std::string& msg, ObjectID& stream_id); + +Status ReadCreateFixedStreamReply(const json& root, ObjectID& stream_id); + +void WriteOpenStreamRequest(const ObjectID& object_id, + const std::string stream_name, const int64_t& mode, + bool wait, uint64_t timeout, std::string& msg); Status ReadOpenStreamRequest(const json& root, ObjectID& object_id, - int64_t& mode); + std::string& stream_name, int64_t& mode, + bool& wait, uint64_t& timeout); -void WriteOpenStreamReply(std::string& msg); +void WriteOpenStreamReply(std::string& msg, ObjectID& ret_id); -Status ReadOpenStreamReply(const json& root); +Status ReadOpenStreamReply(const json& root, 
ObjectID& ret_id); void WriteGetNextStreamChunkRequest(const ObjectID stream_id, const size_t size, std::string& msg); @@ -633,6 +830,18 @@ void WritePushNextStreamChunkReply(std::string& msg); Status ReadPushNextStreamChunkReply(const json& root); +void WritePushNextStreamChunkByOffsetRequest(const ObjectID stream_id, + const size_t offset, + std::string& msg); + +Status ReadPushNextStreamChunkByOffsetRequest(const json& root, + ObjectID& stream_id, + size_t& offset); + +void WritePushNextStreamChunkByOffsetReply(std::string& msg); + +Status ReadPushNextStreamChunkByOffsetReply(const json& root); + void WritePullNextStreamChunkRequest(const ObjectID stream_id, std::string& msg); @@ -642,6 +851,16 @@ void WritePullNextStreamChunkReply(ObjectID const chunk, std::string& msg); Status ReadPullNextStreamChunkReply(const json& root, ObjectID& chunk); +void WriteCheckFixedStreamReceivedRequest(const ObjectID stream_id, int index, + std::string& msg); + +Status ReadCheckFixedStreamReceivedRequest(const json& root, + ObjectID& stream_id, int& index); + +void WriteCheckFixedStreamReceivedReply(bool finished, std::string& msg); + +Status ReadCheckFixedStreamReceivedReply(bool& finished, const json& root); + void WriteStopStreamRequest(const ObjectID stream_id, const bool failed, std::string& msg); @@ -660,16 +879,246 @@ void WriteDropStreamReply(std::string& msg); Status ReadDropStreamReply(const json& root); +void WriteAbortStreamRequest(const ObjectID stream_id, std::string& msg); + +Status ReadAbortStreamRequest(const json& root, ObjectID& stream_id); + +void WriteAbortStreamReply(std::string& msg, bool success); + +Status ReadAbortStreamReply(const json& root, bool& success); + +void WritePutStreamNameRequest(const ObjectID stream_id, std::string name, + std::string& msg); + +Status ReadPutStreamNameRequest(const json& root, ObjectID& stream_id, + std::string& name); + +void WritePutStreamNameReply(std::string& msg); + +Status ReadPutStreamNameReply(const json& root); + 
+void WriteGetStreamIDByNameRequest(const std::string name, std::string& msg); + +Status ReadGetStreamIDByNameRequest(const json& root, std::string& name); + +void WriteGetStreamIDByNameReply(const ObjectID stream_id, std::string& msg); + +Status ReadGetStreamIDByNameReply(const json& root, ObjectID& stream_id); + +void WriteActivateRemoteFixedStreamRequest(ObjectID stream_id, + std::vector& buffer_list, + std::vector& rkeys, + std::string& msg); + +Status ReadActivateRemoteFixedStreamRequest(const json& root, + ObjectID& stream_id, + std::vector& buffer_list, + std::vector& rkeys); + +void WriteActivateRemoteFixedStreamReply(std::string& msg); + +Status ReadActivateRemoteFixedStreamReply(const json& root); + +void WriteStreamReadyAckReply(std::string& msg, int index); + +Status ReadStreamReadyAckReply(const json& root, int& index); + +void WriteActivateRemoteFixedStreamRequest( + ObjectID stream_id, std::vector>& buffer_list, + std::vector>& rkeys_list, + std::vector>& sizes, std::string advice_device, + int port, std::string& msg); + +Status ReadActivateRemoteFixedStreamRequest( + const json& root, ObjectID& stream_id, + std::vector>& buffer_list, + std::vector>& rkeys, + std::vector>& sizes_list, std::string& advice_device, + int& port); + +void WriteActivateRemoteFixedStreamRequest(ObjectID stream_id, + std::vector& buffer_list, + std::vector& sizes, + std::string& msg); + +Status ReadActivateRemoteFixedStreamRequest(const json& root, + ObjectID& stream_id, + std::vector& buffer_list, + std::vector& sizes_list); + +void WriteActivateRemoteFixedStreamReply(std::string& msg); + +Status ReadActivateRemoteFixedStreamReply(const json& root); + +Status WriteOpenFixedStreamRequest(const ObjectID stream_id, + const uint64_t mode, std::string& msg); + +Status ReadOpenFixedStreamRequest(const json& root, ObjectID& stream_id, + int64_t& mode); + +void WriteOpenFixedStreamReply(std::string& msg); + +Status ReadOpenFixedStreamReply(const json& root); + +void 
WriteCloseStreamRequest(ObjectID stream_id, std::string& msg); + +Status ReadCloseStreamRequest(const json& root, ObjectID& stream_id); + +void WriteCloseStreamReply(std::string& msg); + +Status ReadCloseStreamReply(const json& root); + +void WriteDeleteStreamRequest(ObjectID stream_id, std::string& msg); + +Status ReadDeleteStreamRequest(const json& root, ObjectID& stream_id); + +void WriteDeleteStreamReply(std::string& msg); + +Status ReadDeleteStreamReply(const json& root); + +void WriteVineyardOpenRemoteFixedStreamRequest( + ObjectID const remote_id, std::string stream_name, ObjectID local_id, + int blob_nums, size_t size, std::string remote_endpoint, uint64_t mode, + bool wait, uint64_t timeout, std::string& msg); + +Status ReadVineyardOpenRemoteFixedStreamRequest( + const json& root, ObjectID& remote_id, std::string& remote_stream_name, + ObjectID& local_id, int& blob_nums, size_t& size, + std::string& remote_endpoint, uint64_t& mode, bool& wait, + uint64_t& timeout); + +void WriteVineyardOpenRemoteFixedStreamReply(std::string& msg, + ObjectID const local_stream_id); + +Status ReadVineyardOpenRemoteFixedStreamReply(const json& root, + ObjectID& local_id); + +void WriteVineyardActivateRemoteFixedStreamRequest( + ObjectID stream_id, bool create, std::vector& blob_list, + std::string& msg); + +Status ReadVineyardActivateRemoteFixedStreamRequest( + const json& root, ObjectID& stream_id, bool& create, + std::vector& blob_list); + +void WriteVineyardActivateRemoteFixedStreamReply( + std::string& msg, std::vector>& payload_list, + std::vector& fds_to_send); + +Status ReadVineyardActivateRemoteFixedStreamReply(const json& root, + std::vector& objects, + std::vector& fds_sent); + +void WriteVineyardActivateRemoteFixedStreamWithOffsetRequest( + ObjectID stream_id, std::vector& offsets, std::string& msg); + +Status ReadVineyardActivateRemoteFixedStreamWithOffsetRequest( + const json& root, ObjectID& stream_id, std::vector& offsets); + +void 
WriteVineyardActivateRemoteFixedStreamWithOffsetReply(std::string& msg); + +Status ReadVineyardActivateRemoteFixedStreamWithOffsetReply(const json& root); + +void WriteVineyardCloseRemoteFixedStreamRequest(ObjectID stream_id, + std::string& msg); + +Status ReadVineyardCloseRemoteFixedStreamRequest(const json& root, + ObjectID& stream_id); + +void WriteVineyardCloseRemoteFixedStreamReply(std::string& msg); + +Status ReadVineyardCloseRemoteFixedStreamReply(const json& root); + +void WriteVineyardGetMetasByNamesRequest(std::vector& names, + std::string rpc_endpoint, + std::string& msg); + +Status ReadVineyardGetMetasByNamesRequest(const json& root, + std::vector& names, + std::string& rpc_endpoint); + +void WriteVineyardGetMetasByNamesReply(const std::vector& contents, + std::string& msg); + +Status ReadVineyardGetMetasByNamesReply(const json& root, + std::vector& contents); + +void WriteVineyardGetRemoteBlobsWithRDMARequest( + std::vector>& local_ids, + std::vector>& remote_ids, std::string& rpc_endpoint, + std::string& msg); + +Status ReadVineyardGetRemoteBlobsWithRDMARequest( + const json& root, std::vector>& local_ids, + std::vector>& remote_ids, std::string& rpc_endpoint); + +void WriteVineyardGetRemoteBlobsWithRDMAReply(std::string& msg); + +Status ReadVineyardGetRemoteBlobsWithRDMAReply(const json& root); + +void WriteVineyardGetRemoteBlobsWithOffsetRequest(size_t batch_nums, + size_t batch_size, + std::string& rpc_endpoint, + std::string& msg); + +Status ReadVineyardGetRemoteBlobsWithOffsetRequest(const json& root, + size_t& batch_nums, + size_t& batch_size, + std::string& rpc_endpoint); + +void WriteVineyardGetRemoteBlobsWithOffsetReply(std::string& msg); + +Status ReadVineyardGetRemoteBlobsWithOffsetReply(const json& root); + +void WriteVineyardStopStreamRequest(ObjectID stream_id, bool failed, + std::string& msg); + +Status ReadVineyardStopStreamRequest(const json& root, ObjectID& stream_id, + bool& failed); + +void WriteVineyardStopStreamReply(std::string& 
msg); + +Status ReadVineyardStopStreamReply(const json& root); + +void WriteVineyardDropStreamRequest(ObjectID stream_id, std::string& msg); + +Status ReadVineyardDropStreamRequest(const json& root, ObjectID& stream_id); + +void WriteVineyardDropStreamReply(std::string& msg); + +Status ReadVineyardDropStreamReply(const json& root); + +void WriteVineyardAbortRemoteStreamRequest(ObjectID stream_id, + std::string& msg); + +Status ReadVineyardAbortRemoteStreamRequest(const json& root, + ObjectID& stream_id); + +void WriteVineyardAbortRemoteStreamReply(std::string& msg, bool success); + +Status ReadVineyardAbortRemoteStreamReply(const json& root, bool& success); + void WritePutNameRequest(const ObjectID object_id, const std::string& name, - std::string& msg); + bool overwrite, std::string& msg); Status ReadPutNameRequest(const json& root, ObjectID& object_id, - std::string& name); + std::string& name, bool& overwrite); void WritePutNameReply(std::string& msg); Status ReadPutNameReply(const json& root); +void WritePutNamesRequest(const std::vector object_ids, + const std::vector& names, bool overwrite, + std::string& msg); + +Status ReadPutNamesRequest(const json& root, std::vector& object_id, + std::vector& name, bool& overwrite); + +void WritePutNamesReply(std::string& msg); + +Status ReadPutNamesReply(const json& root); + void WriteGetNameRequest(const std::string& name, const bool wait, std::string& msg); @@ -679,6 +1128,17 @@ void WriteGetNameReply(const ObjectID& object_id, std::string& msg); Status ReadGetNameReply(const json& root, ObjectID& object_id); +void WriteGetNamesRequest(const std::vector& name, const bool wait, + std::string& msg); + +Status ReadGetNamesRequest(const json& root, std::vector& name, + bool& wait); + +void WriteGetNamesReply(const std::vector& object_id, + std::string& msg); + +Status ReadGetNamesReply(const json& root, std::vector& object_id); + void WriteListNameRequest(std::string const& pattern, bool const regex, size_t const limit, 
std::string& msg); @@ -699,6 +1159,40 @@ void WriteDropNameReply(std::string& msg); Status ReadDropNameReply(const json& root); +void WriteDropNamesRequest(const std::vector& name, + std::string& msg); + +Status ReadDropNamesRequest(const json& root, std::vector& name); + +void WriteDropNamesReply(std::string& msg); + +Status ReadDropNamesReply(const json& root); + +void WriteGetObjectLocationRequest(const std::vector& names, + std::string& msg); + +Status ReadGetObjectLocationRequest(const json& root, + std::vector& names); + +void WriteGetObjectLocationReply( + std::vector>& locations, std::string& msg); + +Status ReadGetObjectLocationReply( + const json& root, std::vector>& locations); + +void WritePutObjectLocationRequest(const std::vector& names, + const std::vector& locations, + int ttl_seconds, std::string& msg); + +Status ReadPutObjectLocationRequest(const json& root, + std::vector& names, + std::vector& locations, + int& ttl_seconds); + +void WritePutObjectLocationReply(std::string& msg); + +Status ReadPutObjectLocationReply(const json& root); + void WriteMakeArenaRequest(const size_t size, std::string& msg); Status ReadMakeArenaRequest(const json& root, size_t& size); @@ -794,6 +1288,15 @@ void WriteIsInUseReply(const bool is_in_use, std::string& msg); Status ReadIsInUseReply(json const& root, bool& is_in_use); +void WriteGetVineyardMmapFdRequest(std::string& msg); + +Status ReadGetVineyardMmapFdRequest(const json& root); + +void WriteGetVineyardMmapFdReply(size_t size, size_t offset, std::string& msg); + +Status ReadGetVineyardMmapFdReply(const json& root, size_t& size, + size_t& offset); + void WriteClusterMetaRequest(std::string& msg); Status ReadClusterMetaRequest(const json& root); diff --git a/src/common/util/sidecar.cc b/src/common/util/sidecar.cc new file mode 100644 index 000000000..7d54f1d4e --- /dev/null +++ b/src/common/util/sidecar.cc @@ -0,0 +1,153 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include +#include + +#include + +#include "common/util/sidecar.h" +#include "common/util/status.h" +#include "common/util/uuid.h" + +namespace vineyard { + +#ifndef GET_BLOB_RECV_MEM_SIZE +#define GET_BLOB_RECV_MEM_SIZE (4096) +#endif // GET_BLOB_RECV_MEM_SIZE + +#ifndef ERROR_MSG_LENGTH +#define ERROR_MSG_LENGTH (256) +#endif // ERROR_MSG_LENGTH + +#ifndef MAX_METAS_FROM_NAME +#define MAX_METAS_FROM_NAME (1000) +#endif // MAX_METAS_FROM_NAME + +Status CreateMmapMemory(int& fd, size_t size, void*& base) { + std::string file_name = + std::string("/tmp/" + ObjectIDToString(GenerateObjectID(0))) + ".mmap"; + return CreateMmapMemory(file_name, fd, size, base); +} + +Status CreateMmapMemory(std::string file_name, int& fd, size_t size, + void*& base) { + fd = open(file_name.c_str(), O_RDWR | O_CREAT | O_NONBLOCK, 0666); + if (fd < 0) { + std::cout << "Failed to create mmap file: '" << file_name << "', " + << strerror(errno); + return Status::IOError("Failed to open file '" + file_name + "', " + + strerror(errno)); + } + + unlink(file_name.c_str()); + if (ftruncate64(fd, size) != 0) { + std::cout << "Failed to ftruncate file " << file_name; + close(fd); + return Status::IOError("Failed to ftruncate file " + file_name); + } + + base = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (base == MAP_FAILED) { + std::cout << "Failed to mmap file '" << file_name << "', " + << strerror(errno); + close(fd); + return 
Status::IOError("Failed to mmap file '" + file_name + "', " +
+                           strerror(errno));
+  }
+
+  memset(base, 0, size);
+  return Status::OK();
+}
+
+Status WriteErrorMsg(Status error, void* base, size_t size) {
+  if (base == nullptr) {
+    std::cout << "Base pointer is null, cannot write error message."
+              << std::endl;
+    return Status::Invalid("Base pointer is null, cannot write error message.");
+  }
+  std::string error_str = error.ToString().substr(0, ERROR_MSG_LENGTH);
+  memcpy(reinterpret_cast<char*>(base) +
+             (size - ERROR_MSG_LENGTH - sizeof(unsigned char)),
+         error_str.c_str(), error_str.size());
+  reinterpret_cast<unsigned char*>(base)[size - sizeof(unsigned char)] =
+      static_cast<unsigned char>(error.code());
+  return Status::OK();
+}
+
+Status CheckBlobReceived(void* base, size_t size, int index, bool& finished) {
+  if (base == nullptr) {
+    std::cout << "Base pointer is null, cannot check blob received."
+              << std::endl;
+    return Status::Invalid("Base pointer is null, cannot check blob received.");
+  }
+  if (size >= static_cast<size_t>(GET_BLOB_RECV_MEM_SIZE - ERROR_MSG_LENGTH -
+                                  sizeof(unsigned char))) {
+    return Status::Invalid("Size is too large to check blob received.");
+  }
+
+  finished = false;
+  if (index == -1) {
+    for (size_t i = 0; i < size; i++) {
+      if (reinterpret_cast<unsigned char*>(base)[i] == 0) {
+        return Status::OK();
+      }
+    }
+    finished = true;
+    return Status::OK();
+  } else if (index >= 0 && index < static_cast<int>(size)) {
+    finished = reinterpret_cast<unsigned char*>(base)[index] == 1;
+    return Status::OK();
+  }
+  return Status::Invalid("Index is out of bounds for checking blob received.");
+}
+
+Status SetBlobReceived(void* base, int index) {
+  if (base == nullptr) {
+    std::cout << "Base pointer is null, cannot set blob received."
<< std::endl;
+    return Status::Invalid("Base pointer is null, cannot set blob received.");
+  }
+  if (index < 0 ||
+      index >= static_cast<int>(GET_BLOB_RECV_MEM_SIZE - ERROR_MSG_LENGTH -
+                                sizeof(unsigned char))) {
+    return Status::Invalid("Index is out of bounds for setting blob received.");
+  }
+
+  reinterpret_cast<unsigned char*>(base)[index] = 1;  // mark as received
+  return Status::OK();
+}
+
+Status ReleaseMmapMemory(int fd, void* base, size_t size) {
+  if (base != nullptr) {
+    if (munmap(base, size) != 0) {
+      std::cout << "Failed to munmap memory, " << strerror(errno);
+      return Status::IOError("Failed to munmap memory, " +
+                             std::string(strerror(errno)));
+    }
+    int ret = close(fd);
+    if (ret != 0) {
+      std::cout << "Failed to close file descriptor, error:" << strerror(errno)
+                << ", it may cause resource leak.";
+      return Status::IOError("Failed to close file descriptor, " +
+                             std::string(strerror(errno)));
+    }
+    std::cout << "Released mmap memory: fd = " << fd << ", base = " << base
+              << ", size = " << size << std::endl;
+  }
+  return Status::OK();
+}
+
+}  // namespace vineyard
diff --git a/src/common/util/sidecar.h b/src/common/util/sidecar.h
new file mode 100644
index 000000000..cd134d8a5
--- /dev/null
+++ b/src/common/util/sidecar.h
@@ -0,0 +1,95 @@
+/** Copyright 2020-2023 Alibaba Group Holding Limited.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef SRC_COMMON_UTIL_SIDECAR_H_
+#define SRC_COMMON_UTIL_SIDECAR_H_
+
+#include <string>
+
+#include "common/util/json.h"
+#include "common/util/status.h"
+
+namespace vineyard {
+
+#ifndef GET_BLOB_RECV_MEM_SIZE
+#define GET_BLOB_RECV_MEM_SIZE (4096)
+#endif  // GET_BLOB_RECV_MEM_SIZE
+
+#ifndef ERROR_MSG_LENGTH
+#define ERROR_MSG_LENGTH (256)
+#endif  // ERROR_MSG_LENGTH
+
+#ifndef MAX_METAS_FROM_NAME
+#define MAX_METAS_FROM_NAME (1000)
+#endif  // MAX_METAS_FROM_NAME
+
+struct ClientAttributes {
+  std::string req_name;
+
+  static ClientAttributes Default() { return ClientAttributes{.req_name = ""}; }
+
+  ClientAttributes SetReqName(std::string name) {
+    this->req_name = name;
+    return *this;
+  }
+
+  json ToJson() {
+    json j;
+    j["req_name"] = req_name;
+    return j;
+  }
+
+  static ClientAttributes FromJson(const json& j) {
+    ClientAttributes attr;
+    if (j.contains("req_name")) {
+      attr.req_name = j["req_name"].get<std::string>();
+    }
+    return attr;
+  }
+
+  void ToBinary(void* data, size_t& size) {
+    std::string str = json_to_string(ToJson());
+    memcpy(data, str.c_str(), str.size());
+    size = str.length();
+  }
+
+  void FromBinary(void* data, size_t size, ClientAttributes& attr) {
+    std::string str(static_cast<char*>(data), size);
+    attr = FromJson(json_from_buf(str.c_str(), str.length()));
+  }
+
+  std::string ToJsonString() { return json_to_string(ToJson()); }
+
+  static ClientAttributes FromJsonString(std::string s) {
+    return FromJson(json_from_buf(s.c_str(), s.length()));
+  }
+};
+
+Status CreateMmapMemory(int& fd, size_t size, void*& base);
+
+Status CreateMmapMemory(std::string file_name, int& fd, size_t size,
+                        void*& base);
+
+Status WriteErrorMsg(Status status, void* base, size_t size);
+
+Status ReleaseMmapMemory(int fd, void* base, size_t size);
+
+Status CheckBlobReceived(void* base, size_t size, int index, bool& finished);
+
+Status SetBlobReceived(void* base, int index);
+
+}  // namespace vineyard
+
+#endif  // SRC_COMMON_UTIL_SIDECAR_H_
diff --git a/src/common/util/trace.h
b/src/common/util/trace.h new file mode 100644 index 000000000..bdf81d5fa --- /dev/null +++ b/src/common/util/trace.h @@ -0,0 +1,69 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SRC_COMMON_UTIL_TRACE_H_ +#define SRC_COMMON_UTIL_TRACE_H_ + +#include +#include +#include +#include + +#include "common/util/env.h" + +namespace vineyard { + +class Logger { + public: + Logger() : enable_(false) {} + + explicit Logger(bool enable) : enable_(enable) {} + + explicit Logger(int log_level) { + int glog_level = 0; + try { + glog_level = stoi(read_env("GLOG_v", "0")); + } catch (...) { glog_level = 0; } + enable_ = log_level <= glog_level ? 
true : false; + } + + template + Logger& operator<<(const T& msg) { + if (!enable_) { + return *this; + } + ss_ << msg; + return *this; + } + + using endl_type = std::ostream& (*) (std::ostream&); + + Logger& operator<<(endl_type e) { + if (!enable_) { + return *this; + } + std::cout << ss_.str() << std::endl; + ss_.str(""); + return *this; + } + + private: + bool enable_; + std::stringstream ss_; +}; + +} // namespace vineyard + +#endif // SRC_COMMON_UTIL_TRACE_H_ diff --git a/src/common/util/uuid.h b/src/common/util/uuid.h index b8f7251f5..294f3d1a2 100644 --- a/src/common/util/uuid.h +++ b/src/common/util/uuid.h @@ -155,14 +155,18 @@ class IDGenerator { auto instance_id = id & 0x3FFUL; uint64_t sequence = sequence_.fetch_add(1) & sequence_mask; - return ((timestamp << timestamp_shift) | - (instance_id << instance_id_shift) | sequence); + // instance_id | sequence | timestamp + return ((instance_id & instance_id_mask) << instance_id_shift) | + ((sequence & sequence_mask) << sequence_shift) | + (timestamp & timestamp_mask); } private: - const uint64_t timestamp_shift = 22; // 41 bits for timestamp - const uint64_t instance_id_shift = 12; // 10 bits for instance id - const uint64_t sequence_mask = 0xFFFUL; // 12 bits for sequence number + const uint64_t instance_id_shift = 53; + const uint64_t sequence_shift = 41; + const uint64_t instance_id_mask = 0x3FFUL; // 10 bits for instance id + const uint64_t sequence_mask = 0xFFFUL; // 12 bits for sequence number + const uint64_t timestamp_mask = 0x1FFFFFFFFFFUL; // 41 bits for timestamp std::atomic sequence_{0}; @@ -183,12 +187,13 @@ class IDGenerator { * (0x8000000000000000UL,0xFFFFFFFFFFFFFFFFUL) exclusively. 
*/ inline ObjectID GenerateBlobID(const uintptr_t ptr) { - static IDGenerator& idGenerator = IDGenerator::getInstance(); if (ptr == 0x8000000000000000UL || ptr == std::numeric_limits::max()) { return static_cast(ptr) | 0x8000000000000000UL; } - return (idGenerator.GenerateID() | 0x8000000000000000UL); + auto ts = detail::cycleclock::now() % (0x7FFFFFFFFFFFFFFFUL - 2) + 1; + return (0x7FFFFFFFFFFFFFFFUL & static_cast(ts)) | + 0x8000000000000000UL; } inline SessionID GenerateSessionID() { diff --git a/src/server/async/rpc_server.cc b/src/server/async/rpc_server.cc index 0ea981905..f7e4348ea 100644 --- a/src/server/async/rpc_server.cc +++ b/src/server/async/rpc_server.cc @@ -17,11 +17,13 @@ limitations under the License. #include #include +#include "common/memory/memcpy.h" #include "common/rdma/util.h" #include "common/util/json.h" #include "common/util/logging.h" // IWYU pragma: keep #include "server/async/rpc_server.h" #include "server/server/vineyard_server.h" +#include "server/util/utils.h" namespace vineyard { @@ -31,6 +33,8 @@ RPCServer::RPCServer(std::shared_ptr vs_ptr) acceptor_(vs_ptr_->GetContext()), socket_(vs_ptr_->GetContext()) { auto endpoint = getEndpoint(vs_ptr_->GetContext()); + VINEYARD_ASSERT(is_port_available(endpoint.port()), + "Use another port for vineyard rpc service."); acceptor_.open(endpoint.protocol()); using reuse_port = asio::detail::socket_option::boolean; @@ -99,6 +103,12 @@ Status RPCServer::InitRDMA() { return Status::OK(); } +Status RPCServer::SendDataWithRDMA(int tcp_conn, uint64_t addr, + uint64_t local_addr, size_t size, + uint64_t rkey) { + return Status::NotImplemented("SendDataWithRDMA is not implemented"); +} + void RPCServer::Start() { vs_ptr_->RPCReady(); SocketServer::Start(); @@ -148,10 +158,13 @@ void RPCServer::doAccept() { if (self->stopped_.load() || self->closable_.load()) { return; } + self->socket_.set_option(boost::asio::ip::tcp::no_delay(true)); + + std::string host = 
self->socket_.remote_endpoint().address().to_string(); std::shared_ptr conn = std::make_shared(std::move(self->socket_), self->vs_ptr_, self, - self->next_conn_id_); + self->next_conn_id_, host); conn->Start(); self->connections_.emplace(self->next_conn_id_, conn); ++self->next_conn_id_; diff --git a/src/server/async/rpc_server.h b/src/server/async/rpc_server.h index 567fbb56d..aea43e6b7 100644 --- a/src/server/async/rpc_server.h +++ b/src/server/async/rpc_server.h @@ -87,6 +87,9 @@ class RPCServer : public SocketServer, void doNothing(VineyardRecvContext* recv_context); + Status SendDataWithRDMA(int tcp_conn, uint64_t addr, uint64_t local_addr, + size_t size, uint64_t rkey); + const json rpc_spec_; asio::ip::tcp::acceptor acceptor_; asio::ip::tcp::socket socket_; diff --git a/src/server/async/socket_server.cc b/src/server/async/socket_server.cc index 95e26c616..3e880d4b0 100644 --- a/src/server/async/socket_server.cc +++ b/src/server/async/socket_server.cc @@ -15,6 +15,9 @@ limitations under the License. #include "server/async/socket_server.h" +#include +#include + #include #include #include @@ -25,14 +28,18 @@ limitations under the License. 
#include "common/memory/cuda_ipc.h" #include "common/memory/fling.h" +#include "common/memory/memcpy.h" #include "common/util/callback.h" #include "common/util/functions.h" #include "common/util/json.h" #include "common/util/protocols.h" +#include "common/util/sidecar.h" #include "server/server/vineyard_server.h" #include "server/util/metrics.h" #include "server/util/remote.h" +#include "thread-pool/thread_pool.h" + namespace vineyard { // We set a hard limit for the message buffer size, since an evil client, @@ -44,11 +51,13 @@ constexpr size_t MESSAGE_HEADER_LIMIT = 256 * 1024 * 1024; // 256M bytes SocketConnection::SocketConnection( stream_protocol::socket socket, std::shared_ptr server_ptr, - std::shared_ptr socket_server_ptr, int conn_id) + std::shared_ptr socket_server_ptr, int conn_id, + std::string host) : socket_(std::move(socket)), server_ptr_(server_ptr), socket_server_ptr_(socket_server_ptr), - conn_id_(conn_id) { + conn_id_(conn_id), + peer_host(std::move(host)) { // hold the references of bulkstore using `shared_from_this()`. 
auto bulk_store = server_ptr_->GetBulkStore(); if (bulk_store != nullptr) { @@ -60,8 +69,11 @@ SocketConnection::SocketConnection( } // initializing this->registered_.store(false); + this->trace_log_level_ = server_ptr_->GetTraceLogLevel(); } +bool SocketConnection::sendFd(int fd) { return send_fd(nativeHandle(), fd); } + bool SocketConnection::Start() { running_.store(true); doReadHeader(); @@ -111,6 +123,7 @@ void SocketConnection::doReadHeader() { doReadBody(); } else { doStop(); + // ThrowException(); } }); } @@ -119,6 +132,7 @@ void SocketConnection::doReadBody() { if (read_msg_header_ > MESSAGE_HEADER_LIMIT) { VLOG(10) << "invalid message header value: " << read_msg_header_; doStop(); + // ThrowException(); return; } read_msg_body_.resize(read_msg_header_ + 1); @@ -130,10 +144,14 @@ void SocketConnection::doReadBody() { bool exit = processMessage(read_msg_body_); if (exit || ec == asio::error::eof) { doStop(); + if (!exit) { + // ThrowException(); + } return; } } else { doStop(); + // ThrowException(); return; } }); @@ -231,7 +249,13 @@ bool SocketConnection::processMessage(const std::string& message_in) { auto self(shared_from_this()); // DON'T let vineyardd crash when the client is malicious. 
+ uint64_t start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); TRY_READ_FROM_JSON(root = json::parse(message_in), message_in); + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); if (!root.contains("type")) { RESPONSE_ON_ERROR(Status::Invalid("Invalid message: no 'type' field")); } @@ -241,8 +265,16 @@ bool SocketConnection::processMessage(const std::string& message_in) { RESPONSE_ON_ERROR(Status::Invalid( "The connection is not registered yet, command is: " + cmd)); } + + if (end - start > 500) { + LOG(INFO) << "parse cmd: " << cmd << " json time: " << (end - start) + << " us"; + } + if (cmd == command_t::REGISTER_REQUEST) { return doRegister(root); + } else if (cmd == command_t::REQUIRE_EXTRA_REQUEST_MEMORY_REQUEST) { + return doRequireExtraRequestMemory(root); } else if (cmd == command_t::EXIT_REQUEST) { return true; } else if (cmd == command_t::CREATE_BUFFER_REQUEST) { @@ -275,6 +307,8 @@ bool SocketConnection::processMessage(const std::string& message_in) { return doRelease(root); } else if (cmd == command_t::DEL_DATA_WITH_FEEDBACKS_REQUEST) { return doDelDataWithFeedbacks(root); + } else if (cmd == command_t::DEL_HUGE_DATA_REQUEST) { + return doDelHugeData(root); } else if (cmd == command_t::CREATE_BUFFER_PLASMA_REQUEST) { return doCreateBufferByPlasma(root); } else if (cmd == command_t::GET_BUFFERS_PLASMA_REQUEST) { @@ -285,12 +319,22 @@ bool SocketConnection::processMessage(const std::string& message_in) { return doPlasmaRelease(root); } else if (cmd == command_t::PLASMA_DEL_DATA_REQUEST) { return doPlasmaDelData(root); + } else if (cmd == command_t::CREATE_USER_BUFFERS_REQUEST) { + return doCreateUserBuffers(root); + } else if (cmd == command_t::GET_USER_BUFFERS_REQUEST) { + return doGetUserBuffers(root); + } else if (cmd == command_t::DELETE_USER_BUFFERS_REQUEST) { + return doDeleteUserBuffers(root); } else if (cmd == 
command_t::CREATE_DATA_REQUEST) { return doCreateData(root); } else if (cmd == command_t::CREATE_DATAS_REQUEST) { return doCreateDatas(root); + } else if (cmd == command_t::CREATE_HUGE_DATAS_REQUEST) { + return doCreatehugeDatas(root); } else if (cmd == command_t::GET_DATA_REQUEST) { return doGetData(root); + } else if (cmd == command_t::GET_HUGE_DATA_REQUEST) { + return doGetHugeData(root); } else if (cmd == command_t::DELETE_DATA_REQUEST) { return doDelData(root); } else if (cmd == command_t::LIST_DATA_REQUEST) { @@ -301,6 +345,8 @@ bool SocketConnection::processMessage(const std::string& message_in) { return doPersist(root); } else if (cmd == command_t::IF_PERSIST_REQUEST) { return doIfPersist(root); + } else if (cmd == command_t::BATCH_PERSIST_REQUEST) { + return doBatchPersist(root); } else if (cmd == command_t::LABEL_REQUEST) { return doLabelObject(root); } else if (cmd == command_t::CLEAR_REQUEST) { @@ -309,22 +355,72 @@ bool SocketConnection::processMessage(const std::string& message_in) { return doMemoryTrim(root); } else if (cmd == command_t::CREATE_STREAM_REQUEST) { return doCreateStream(root); + } else if (cmd == command_t::CREATE_FIXED_STREAM_REQUEST) { + return doCreateFixedStream(root); } else if (cmd == command_t::OPEN_STREAM_REQUEST) { return doOpenStream(root); } else if (cmd == command_t::GET_NEXT_STREAM_CHUNK_REQUEST) { return doGetNextStreamChunk(root); } else if (cmd == command_t::PUSH_NEXT_STREAM_CHUNK_REQUEST) { return doPushNextStreamChunk(root); + } else if (cmd == command_t::PUSH_NEXT_STREAM_CHUNK_BY_OFFSET_REQUEST) { + return doPushNextStreamChunkByOffset(root); } else if (cmd == command_t::PULL_NEXT_STREAM_CHUNK_REQUEST) { return doPullNextStreamChunk(root); + } else if (cmd == command_t::CHECK_FIXED_STREAM_RECEIVED_REQUEST) { + return doCheckFixedStreamReceived(root); } else if (cmd == command_t::STOP_STREAM_REQUEST) { return doStopStream(root); } else if (cmd == command_t::DROP_STREAM_REQUEST) { return doDropStream(root); + } else if 
(cmd == command_t::ABORT_STREAM_REQUEST) { + return doAbortStream(root); + } else if (cmd == command_t::PUT_STREAM_NAME_REQUEST) { + return doPutStreamName(root); + } else if (cmd == command_t::GET_STREAM_ID_BY_NAME_REQUEST) { + return doGetStreamIDByName(root); + } else if (cmd == command_t::ACTIVATE_REMOTE_FIXED_STREAM_REQUEST) { + return doActivateRemoteFixedStream(root); + } else if (cmd == + command_t:: + VINEYARD_ACTIVATE_REMOTE_FIXED_STREAM_WITH_OFFSET_REQUEST) { + return doVineyardActivateRemoteFixedStreamWithOffset(root); + } else if (cmd == command_t::OPEN_FIXED_STREAM_REQUEST) { + return doOpenFixedStream(root); + } else if (cmd == command_t::CLOSE_STREAM_REQUEST) { + return doCloseStream(root); + } else if (cmd == command_t::DELETE_STREAM_REQUEST) { + return doDeleteStream(root); + } else if (cmd == command_t::VINEYARD_OPEN_REMOTE_FIXED_STREAM_REQUEST) { + return doVineyardOpenRemoteFixedStream(root); + } else if (cmd == command_t::VINEYARD_STOP_STREAM_REQUEST) { + return doVineyardStopStream(root); + } else if (cmd == command_t::VINEYARD_ABORT_REMOTE_STREAM_REQUEST) { + return doVineyardAbortRemoteStream(root); + } else if (cmd == command_t::VINEYARD_DROP_STREAM_REQUEST) { + return doVineyardDropStream(root); + } else if (cmd == command_t::VINEYARD_CLOSE_REMOTE_FIXED_STREAM_REQUEST) { + return doVineyardCloseRemoteFixedStream(root); + } else if (cmd == command_t::VINEYARD_GET_METAS_BY_NAMES_REQUEST) { + return doVineyardGetMetasByNames(root); + } else if (cmd == command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_RDMA_REQUEST) { + return doVineyardGetRemoteBlobs(root); + } else if (cmd == command_t::VINEYARD_GET_REMOTE_BLOBS_WITH_OFFSET_REQUEST) { + return doVineyardGetRemoteBlobsWithOffset(root); } else if (cmd == command_t::PUT_NAME_REQUEST) { return doPutName(root); + } else if (cmd == command_t::PUT_NAMES_REQUEST) { + return doPutNames(root); } else if (cmd == command_t::GET_NAME_REQUEST) { return doGetName(root); + } else if (cmd == 
command_t::GET_NAME_LOCATION_REQUEST) { + return doGetObjectLocation(root); + } else if (cmd == command_t::PUT_NAME_LOCATION_REQUEST) { + return doPutObjectLocation(root); + } else if (cmd == command_t::GET_NAMES_REQUEST) { + return doGetNames(root); + } else if (cmd == command_t::DROP_NAMES_REQUEST) { + return doDropNames(root); } else if (cmd == command_t::LIST_NAME_REQUEST) { return doListName(root); } else if (cmd == command_t::DROP_NAME_REQUEST) { @@ -349,6 +445,8 @@ bool SocketConnection::processMessage(const std::string& message_in) { return doIsSpilled(root); } else if (cmd == command_t::IS_IN_USE_REQUEST) { return doIsInUse(root); + } else if (cmd == command_t::GET_VINEYARD_MMAP_FD_REQUEST) { + return doGetVineyardMmapFd(root); } else if (cmd == command_t::CLUSTER_META_REQUEST) { return doClusterMeta(root); } else if (cmd == command_t::INSTANCE_STATUS_REQUEST) { @@ -406,6 +504,28 @@ bool SocketConnection::doRegister(const json& root) { return false; } +bool SocketConnection::doRequireExtraRequestMemory(json const& root) { + auto self(shared_from_this()); + size_t size = 0; + TRY_READ_REQUEST(ReadRequireExtraRequestMemoryRequest, root, size); + if (size == 0 || size > std::numeric_limits::max() / 2) { + RESPONSE_ON_ERROR(Status::Invalid( + "require extra request memory: invalid size: " + std::to_string(size))); + } + + int fd = -1; + RESPONSE_ON_ERROR(socket_server_ptr_->RequireExtraRequestMemory( + this->getConnId(), size, fd)); + std::string message_out; + WriteRequireExtraRequestMemoryReply(message_out); + this->doWrite(message_out); + LOG(INFO) << "require extra request memory: size = " << size + << ", fd = " << fd; + sendFd(fd); + + return false; +} + bool SocketConnection::doCreateBuffer(const json& root) { auto self(shared_from_this()); size_t size; @@ -427,7 +547,7 @@ bool SocketConnection::doCreateBuffer(const json& root) { this->doWrite(message_out, [this, self, fd_to_send](const Status& status) { if (fd_to_send != -1) { - 
send_fd(self->nativeHandle(), fd_to_send); + sendFd(fd_to_send); } LOG_SUMMARY("instances_memory_usage_bytes", server_ptr_->instance_id(), bulk_store_->Footprint()); @@ -473,7 +593,7 @@ bool SocketConnection::doCreateBuffers(const json& root) { this->doWrite(message_out, [this, self, fds_to_send](const Status& status) { for (auto const& fd_to_send : fds_to_send) { if (fd_to_send != -1) { - send_fd(self->nativeHandle(), fd_to_send); + sendFd(fd_to_send); } } LOG_SUMMARY("instances_memory_usage_bytes", server_ptr_->instance_id(), @@ -511,7 +631,7 @@ bool SocketConnection::doCreateDiskBuffer(const json& root) { this->doWrite(message_out, [this, self, fd_to_send](const Status& status) { if (fd_to_send != -1) { - send_fd(self->nativeHandle(), fd_to_send); + sendFd(fd_to_send); } LOG_SUMMARY("instances_memory_usage_bytes", server_ptr_->instance_id(), bulk_store_->Footprint()); @@ -606,7 +726,7 @@ bool SocketConnection::doGetBuffers(const json& root) { */ this->doWrite(message_out, [self, objects, fd_to_send](const Status& status) { for (int store_fd : fd_to_send) { - send_fd(self->nativeHandle(), store_fd); + self->sendFd(store_fd); } return Status::OK(); }); @@ -906,6 +1026,41 @@ bool SocketConnection::doDelDataWithFeedbacks(json const& root) { return false; } +bool SocketConnection::doDelHugeData(json const& root) { + auto self(shared_from_this()); + std::vector ids; + bool force, deep, memory_trim, fastpath; + + size_t total_id = 0; + TRY_READ_REQUEST(ReadDelHugeDataRequest, root, total_id, force, deep, + memory_trim, fastpath); + ids.resize(total_id); + VINEYARD_DISCARD(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR( + self->socket_server_ptr_->GetClientAttributeMsg(self->conn_id_, attr)); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + ids.data(), total_id * sizeof(ObjectID), this->getConnId())); + + RESPONSE_ON_ERROR(server_ptr_->DelData( + ids, force, deep, memory_trim, fastpath, + [self, 
attr](const Status& status) { + VLOG(2) << "Delete huge data request completed. Request id: " + << attr.req_name << ", status: " << status.ToString(); + std::string message_out; + if (status.ok()) { + WriteDelHugeDataReply(message_out); + } else { + LOG(ERROR) << "Error: " << status.ToString() + << ". Request id: " << attr.req_name; + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + bool SocketConnection::doCreateBufferByPlasma(json const& root) { auto self(shared_from_this()); PlasmaID plasma_id; @@ -934,7 +1089,7 @@ bool SocketConnection::doCreateBufferByPlasma(json const& root) { this->doWrite(message_out, [this, self, fd_to_send](const Status& status) { if (fd_to_send != -1) { - send_fd(self->nativeHandle(), fd_to_send); + self->sendFd(fd_to_send); } LOG_SUMMARY("instances_memory_usage_bytes", server_ptr_->instance_id(), plasma_bulk_store_->Footprint()); @@ -976,7 +1131,7 @@ bool SocketConnection::doGetBuffersByPlasma(json const& root) { if (data_size > 0 && self->used_fds_.find(store_fd) == self->used_fds_.end()) { self->used_fds_.emplace(store_fd); - send_fd(self->nativeHandle(), store_fd); + self->sendFd(store_fd); } } return Status::OK(); @@ -1051,6 +1206,149 @@ bool SocketConnection::doCreateData(const json& root) { return false; } +bool SocketConnection::doCreateUserBuffers(json const& root) { + auto self(shared_from_this()); + size_t offsets_num = 0, sizes_num = 0; + TRY_READ_REQUEST(ReadCreateUserBuffersRequest, root, offsets_num, sizes_num); + + boost::asio::post( + server_ptr_->GetIOContext(), [self, offsets_num, sizes_num]() { + std::vector offsets(offsets_num); + std::vector sizes(sizes_num); + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + VINEYARD_DISCARD( + self->socket_server_ptr_->LseekExtraMsgReadPos(0, self->conn_id_)); + ClientAttributes attr; + 
RESPONSE_ON_ERROR(self->socket_server_ptr_->GetClientAttributeMsg( + self->conn_id_, attr)); + RESPONSE_ON_ERROR(self->socket_server_ptr_->ReadExtraMessage( + offsets.data(), offsets.size() * sizeof(size_t), self->conn_id_)); + RESPONSE_ON_ERROR(self->socket_server_ptr_->ReadExtraMessage( + sizes.data(), sizes.size() * sizeof(size_t), self->conn_id_)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ", doCreateUserBuffers read extra message time consumed = " + << (end - start) << " us"; + + start = end; + std::vector object_ids; + object_ids.reserve(offsets.size()); + for (size_t i = 0; i < offsets.size(); i++) { + ObjectID object_id; + std::shared_ptr object; + Status status = self->server_ptr_->GetBulkStore()->CreateUserBlob( + offsets[i], sizes[i], object_id, object); + if (!status.ok()) { + for (auto const& object : object_ids) { + self->server_ptr_->GetBulkStore()->Delete(object); + } + RESPONSE_ON_ERROR(status); + } + object_ids.emplace_back(object_id); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << " create " << object_ids.size() + << " user buffers, time consumed = " << (end - start) << " us"; + + start = end; + std::string message_out; + WriteCreateUserBuffersReply(object_ids, message_out); + VINEYARD_DISCARD( + self->socket_server_ptr_->LseekExtraMsgWritePos(0, self->conn_id_)); + RESPONSE_ON_ERROR(self->socket_server_ptr_->WriteExtraMessage( + object_ids.data(), object_ids.size() * sizeof(ObjectID), + self->conn_id_)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ", doCreateUserBuffers write extra message time consumed = " + << (end - start) << " us"; + self->doWrite(message_out); 
+ return false; + }); + return false; +} + +bool SocketConnection::doGetUserBuffers(json const& root) { + auto self(shared_from_this()); + std::vector object_ids; + std::vector> objects; + std::string message_out; + + TRY_READ_REQUEST(ReadGetUserBuffersRequest, root, object_ids); + for (auto const& object_id : object_ids) { + VLOG(2) << "GetUserBuffers: object_id = " << ObjectIDToString(object_id); + } + VLOG(2) << "GetUserBuffers: object_ids.size() = " << object_ids.size(); + RESPONSE_ON_ERROR(bulk_store_->GetUnsafe(object_ids, true, objects)); + WriteGetUserBuffersReply(objects, message_out); + self->doWrite(message_out); + return false; +} + +bool SocketConnection::doDeleteUserBuffers(const json& root) { + auto self(shared_from_this()); + std::vector object_ids; + TRY_READ_REQUEST(ReadDeleteUserBuffersRequest, root, object_ids); + VINEYARD_DISCARD(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + VLOG(2) << "DeleteUserBuffers: object_ids.size() = " << object_ids.size(); + uint64_t start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + object_ids.data(), object_ids.size() * sizeof(ObjectID), this->conn_id_)); + boost::asio::post(self->server_ptr_->GetIOContext(), [self, object_ids, + start]() { + std::string message_out; + std::vector transmissions, non_transmissions; + std::vector> to_delete_user_buffers; + self->server_ptr_->GetBulkStore()->GetUnsafe(object_ids, true, + to_delete_user_buffers); + for (auto const& object_id : object_ids) { + // make the user blob invisible. 
+ VINEYARD_DISCARD( + self->server_ptr_->GetBulkStore()->DeleteUserBlob(object_id)); + } + do { + transmissions.clear(); + non_transmissions.clear(); + std::unique_lock lock = + self->server_ptr_->FindTransmissionObjects(object_ids, transmissions, + non_transmissions); + if (!transmissions.empty()) { + // sleep + lock.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } else { + self->server_ptr_->RemoveFromMigrationList(non_transmissions); + WriteDeleteUserBuffersReply(message_out); + self->doWrite(message_out); + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "delete " << object_ids.size() + << " user buffers, time consumed = " << (end - start) << " us"; + } + } while (!transmissions.empty()); + }); + return false; +} + bool SocketConnection::doCreateDatas(const json& root) { auto self(shared_from_this()); std::vector tree; @@ -1078,6 +1376,84 @@ bool SocketConnection::doCreateDatas(const json& root) { return false; } +bool SocketConnection::doCreatehugeDatas(json const& root) { + auto self(shared_from_this()); + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::vector tree; + size_t total_json; + std::vector json_lengths; + TRY_READ_REQUEST(ReadCreateHugeDatasRequest, root, total_json); + json_lengths.resize(total_json); + VINEYARD_DISCARD(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + json_lengths.data(), total_json * sizeof(size_t), this->getConnId())); + tree.resize(total_json); + for (size_t i = 0; i < total_json; i++) { + std::string json_str; + json_str.resize(json_lengths[i]); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + &json_str[0], json_lengths[i], 
this->getConnId())); + tree[i] = json_from_buf(json_str.c_str(), json_str.size()); + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". Read doCreatehugeDatas IPC msg cost:" << (end - start) << " us."; + + start = end; + RESPONSE_ON_ERROR(server_ptr_->CreateData( + tree, + [self, start, attr](const Status& status, const std::vector ids, + const std::vector signatures, + const std::vector instance_ids) { + uint64_t end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". CreateData cost:" << (end_ - start) << " us."; + + uint64_t start_ = end_; + std::string message_out; + if (status.ok()) { + VINEYARD_DISCARD(self->socket_server_ptr_->LseekExtraMsgWritePos( + 0, self->conn_id_)); + Status status_ = self->socket_server_ptr_->WriteExtraMessage( + ids.data(), ids.size() * sizeof(ObjectID), self->getConnId()); + if (status_.ok()) { + WriteCreateHugeDatasReply(ids.size(), signatures[0], + instance_ids[0], message_out); + } else { + LOG(ERROR) << "Error: " << status_.ToString() + << ". Request: " << attr.req_name; + WriteErrorReply(status_, message_out); + } + } else { + LOG(ERROR) << "Error: " << status.ToString() + << ". Request: " << attr.req_name; + WriteErrorReply(status, message_out); + } + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". 
Write doCreatehugeDatas IPC reply cost:" << (end_ - start_) + << " us."; + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + bool SocketConnection::doGetData(const json& root) { auto self(shared_from_this()); std::vector ids; @@ -1105,6 +1481,75 @@ bool SocketConnection::doGetData(const json& root) { return false; } +bool SocketConnection::doGetHugeData(json const& root) { + auto self(shared_from_this()); + std::vector ids; + size_t ids_num; + bool sync_remote = false, wait = false; + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + TRY_READ_REQUEST(ReadGetHugeDataRequest, root, ids_num, sync_remote, wait); + ids.resize(ids_num); + json tree; + VINEYARD_DISCARD(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + ids.data(), ids_num * sizeof(ObjectID), this->getConnId())); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". Read doGetHugeData IPC msg cost:" << (end - start) << " us."; + + start = end; + RESPONSE_ON_ERROR(server_ptr_->GetData( + ids, sync_remote, wait, [self]() { return self->running_.load(); }, + [self, start, attr](const Status& status, const json& tree) { + uint64_t start_ = start, end_ = 0; + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". 
GetData cost:" << (end_ - start_) << " us."; + start_ = end_; + std::string message_out; + if (status.ok()) { + std::string json_str = json_to_string(tree); + VINEYARD_DISCARD(self->socket_server_ptr_->LseekExtraMsgWritePos( + 0, self->conn_id_)); + Status status_ = self->socket_server_ptr_->WriteExtraMessage( + json_str.data(), json_str.size(), self->getConnId()); + if (status_.ok()) { + WriteGetHugeDataReply(json_str.size(), message_out); + } else { + LOG(ERROR) << "Error: " << status_.ToString() + << ". Request: " << attr.req_name; + WriteErrorReply(status_, message_out); + } + } else { + LOG(ERROR) << "Error: " << status.ToString() + << ". Request: " << attr.req_name; + WriteErrorReply(status, message_out); + } + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". Write doGetHugeData IPC reply cost:" << (end_ - start_) + << " us."; + + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + bool SocketConnection::doListData(const json& root) { auto self(shared_from_this()); std::string pattern; @@ -1198,6 +1643,33 @@ bool SocketConnection::doPersist(const json& root) { return false; } +bool SocketConnection::doBatchPersist(const json& root) { + auto self(shared_from_this()); + std::vector ids; + TRY_READ_REQUEST(ReadBatchPersistRequest, root, ids); + + RESPONSE_ON_ERROR( + self->server_ptr_->Persist(ids, [self, root](const Status& status) { + std::string message_out; + if (status.ok()) { + WriteBatchPersistReply(message_out); + self->doWrite(message_out); + } else if (status.IsEtcdError()) { + // retry on etcd error: reprocess the message + VLOG(100) << "Warning: " + << "Retry persist on etcd error: " << status.ToString(); + self->server_ptr_->GetIOContext().post( + [self, root]() { self->doPersist(root); }); + } else { + VLOG(100) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + 
self->doWrite(message_out); + } + return Status::OK(); + })); + return false; +} + bool SocketConnection::doIfPersist(const json& root) { auto self(shared_from_this()); ObjectID id; @@ -1294,8 +1766,12 @@ bool SocketConnection::doMemoryTrim(const json& root) { bool SocketConnection::doCreateStream(const json& root) { auto self(shared_from_this()); ObjectID stream_id; + bool fixed_size = false; + int nums = 0; + size_t size = 0; TRY_READ_REQUEST(ReadCreateStreamRequest, root, stream_id); - auto status = server_ptr_->GetStreamStore()->Create(stream_id); + auto status = + server_ptr_->GetStreamStore()->Create(stream_id, fixed_size, nums, size); std::string message_out; if (status.ok()) { WriteCreateStreamReply(message_out); @@ -1307,23 +1783,102 @@ bool SocketConnection::doCreateStream(const json& root) { return false; } -bool SocketConnection::doOpenStream(const json& root) { +bool SocketConnection::doCreateFixedStream(json const& root) { auto self(shared_from_this()); + std::string stream_name; + int nums = 0; + size_t size = 0; ObjectID stream_id; - int64_t mode; - TRY_READ_REQUEST(ReadOpenStreamRequest, root, stream_id, mode); - auto status = server_ptr_->GetStreamStore()->Open(stream_id, mode); + TRY_READ_REQUEST(ReadCreateFixedStreamRequest, root, stream_name, nums, size); + VLOG(2) << "Create fixed stream: " << stream_name << ", nums: " << nums + << ", size: " << size; + stream_id = GenerateObjectID(server_ptr_->instance_id()); + if (stream_id == InvalidObjectID()) { + LOG(ERROR) << "Error: Failed to generate stream id"; + std::string message_out; + WriteErrorReply(Status::Invalid("Failed to generate stream id"), + message_out); + this->doWrite(message_out); + return false; + } + + Status status = + server_ptr_->GetStreamStore()->Create(stream_id, true, nums, size); + if (!status.ok()) { + std::string message_out; + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + this->doWrite(message_out); + return false; + } + + if 
(!stream_name.empty()) { + status = server_ptr_->GetStreamStore()->PutName(stream_name, stream_id); + } std::string message_out; if (status.ok()) { - WriteOpenStreamReply(message_out); + WriteCreateFixedStreamReply(message_out, stream_id); } else { - VLOG(100) << "Error: " << status.ToString(); + LOG(ERROR) << "Error: " << status.ToString(); + if (!server_ptr_->GetStreamStore()->Delete(stream_id).ok()) { + LOG(ERROR) << "Failed to cleanup stream: " << stream_id; + } WriteErrorReply(status, message_out); } this->doWrite(message_out); return false; } +bool SocketConnection::doOpenStream(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; + int64_t mode; + std::string stream_name; + bool wait = false; + uint64_t timeout = 0; + TRY_READ_REQUEST(ReadOpenStreamRequest, root, stream_id, stream_name, mode, + wait, timeout); + VLOG(2) << "Open stream: " << stream_id << ", name: " << stream_name + << ", mode: " << mode << ", wait: " << wait + << ", timeout: " << timeout; + std::string owner = StreamStore::BuildOwner(self->peer_host, self->conn_id_); + if (stream_id != InvalidObjectID()) { + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Open( + stream_id, mode, owner, wait, timeout, + [self, stream_name, stream_id](Status& status, ObjectID id) { + VLOG(2) << "doOpenStream callback, stream_id:" << stream_id + << " stream_name:" << stream_name + << " status:" << status.ToString(); + std::string message_out; + if (status.ok()) { + WriteOpenStreamReply(message_out, id); + } else { + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + })); + } else { + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Open( + stream_name, mode, owner, wait, timeout, + [self, stream_name, stream_id](Status& status, ObjectID id) { + VLOG(2) << "doOpenStream callback, stream_id:" << stream_id + << " stream_name:" << stream_name + << " status:" << status.ToString(); + std::string message_out; + if 
(status.ok()) { + WriteOpenStreamReply(message_out, id); + } else { + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + })); + } + + return false; +} + bool SocketConnection::doGetNextStreamChunk(const json& root) { auto self(shared_from_this()); ObjectID stream_id; @@ -1346,7 +1901,7 @@ bool SocketConnection::doGetNextStreamChunk(const json& root) { WriteGetNextStreamChunkReply(object, fd_to_send, message_out); self->doWrite(message_out, [self, fd_to_send](const Status& status) { if (fd_to_send != -1) { - send_fd(self->nativeHandle(), fd_to_send); + self->sendFd(fd_to_send); } return Status::OK(); }); @@ -1379,6 +1934,35 @@ bool SocketConnection::doPushNextStreamChunk(const json& root) { return false; } +bool SocketConnection::doPushNextStreamChunkByOffset(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; + size_t offset; + TRY_READ_REQUEST(ReadPushNextStreamChunkByOffsetRequest, root, stream_id, + offset); + ObjectID blob_id; + size_t size = 0; + int blob_nums; + std::shared_ptr blob_payload; + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->GetFixedStreamSizeInfo( + stream_id, size, blob_nums)); + RESPONSE_ON_ERROR(server_ptr_->GetBulkStore()->CreateUserBlob( + offset, size, blob_id, blob_payload)); + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Push( + stream_id, blob_id, [self](const Status& status, const ObjectID) { + std::string message_out; + if (status.ok()) { + WritePushNextStreamChunkByOffsetReply(message_out); + } else { + VLOG(100) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + bool SocketConnection::doPullNextStreamChunk(const json& root) { auto self(shared_from_this()); ObjectID stream_id; @@ -1401,6 +1985,21 @@ bool SocketConnection::doPullNextStreamChunk(const json& root) { return false; } +bool 
SocketConnection::doCheckFixedStreamReceived(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id = InvalidObjectID(); + int index = -1; + TRY_READ_REQUEST(ReadCheckFixedStreamReceivedRequest, root, stream_id, index); + bool finished = false; + RESPONSE_ON_ERROR(self->server_ptr_->GetStreamStore()->CheckBlobReceived( + stream_id, index, finished)); + std::string message_out; + WriteCheckFixedStreamReceivedReply(finished, message_out); + self->doWrite(message_out); + + return false; +} + bool SocketConnection::doStopStream(const json& root) { auto self(shared_from_this()); ObjectID stream_id; @@ -1426,32 +2025,689 @@ bool SocketConnection::doDropStream(const json& root) { return false; } -bool SocketConnection::doPutName(const json& root) { +bool SocketConnection::doAbortStream(const json& root) { auto self(shared_from_this()); - ObjectID object_id; + ObjectID stream_id; + TRY_READ_REQUEST(ReadAbortStreamRequest, root, stream_id); + /* + * Currently, abort only occurs before the stream is activated or after the + * stream is transferred. So we don't need to wait the transfer to be done. 
+ */ + bool success = false; + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Abort(stream_id, success)); + std::string message_out; + WriteAbortStreamReply(message_out, success); + this->doWrite(message_out); + return false; +} + +bool SocketConnection::doPutStreamName(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; std::string name; - TRY_READ_REQUEST(ReadPutNameRequest, root, object_id, name); - name = escape_json_pointer(name); + TRY_READ_REQUEST(ReadPutStreamNameRequest, root, stream_id, name); + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->PutName(name, stream_id)); + std::string message_out; + WritePutStreamNameReply(message_out); + this->doWrite(message_out); + return false; +} + +bool SocketConnection::doGetStreamIDByName(const json& root) { + auto self(shared_from_this()); + std::string name; + ObjectID stream_id; + TRY_READ_REQUEST(ReadGetStreamIDByNameRequest, root, name); RESPONSE_ON_ERROR( - server_ptr_->PutName(object_id, name, [self](const Status& status) { + server_ptr_->GetStreamStore()->GetStreamIDByName(name, stream_id)); + + std::string message_out; + WriteGetStreamIDByNameReply(stream_id, message_out); + this->doWrite(message_out); + + return false; +} + +bool SocketConnection::doActivateRemoteFixedStream(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id = InvalidObjectID(); + std::vector> recv_addr_list; + std::vector> rkeys_list; + std::vector> sizes_list; + std::string advice_device; + int port; + TRY_READ_REQUEST(ReadActivateRemoteFixedStreamRequest, root, stream_id, + recv_addr_list, rkeys_list, sizes_list, advice_device, port); + VLOG(100) << "remote device:" << advice_device << " with port:" << port; + + std::string message_out; + WriteActivateRemoteFixedStreamReply(message_out); + this->doWrite(message_out); + + boost::asio::post(server_ptr_->GetIOContext(), [self, recv_addr_list, + rkeys_list, sizes_list, + advice_device, port, + stream_id]() { + 
self->server_ptr_->GetStreamStore()->ActivateRemoteFixedStream( + stream_id, recv_addr_list, rkeys_list, sizes_list, + [self, advice_device, port, stream_id]( + const Status& status_, ObjectID chunk, + std::vector addr_list, std::vector rkey_list, + std::vector size_list, int index) { + std::string message_out; + Status status = status_; + if (!status.ok()) { + VLOG(100) << "Error: " << status.ToString(); + self->server_ptr_->GetStreamStore()->SetErrorFlag(stream_id, + status); + WriteErrorReply(status, message_out); + self->doWriteWithoutRead(message_out); + return status; + } + + // RDMA write + uint64_t local_addr; + size_t size; + std::shared_ptr object; + status = self->bulk_store_->GetUnsafe(chunk, true, object); + if (!status.ok()) { + VLOG(100) << "Error: failed to get object"; + self->server_ptr_->GetStreamStore()->SetErrorFlag(stream_id, + status); + WriteErrorReply(Status::KeyError("Failed to get object"), + message_out); + self->doWriteWithoutRead(message_out); + return status; + } + self->server_ptr_->GetBulkStore()->DeleteUserBlob(chunk); + local_addr = reinterpret_cast(object->pointer); + size = object->data_size; + void* base_addr = nullptr; + status = self->server_ptr_->GetBulkStoreMmapAddr(base_addr); + if (!status.ok()) { + VLOG(100) << "Error: failed to get bulk store mmap addr"; + self->server_ptr_->GetStreamStore()->SetErrorFlag(stream_id, + status); + WriteErrorReply( + Status::KeyError("Failed to get bulk store mmap addr"), + message_out); + self->doWriteWithoutRead(message_out); + return status; + } + uint64_t offset = local_addr - reinterpret_cast(base_addr); + status = self->socket_server_ptr_->SendDataWithRDMA( + addr_list, local_addr, offset, size_list, size, rkey_list, + self->peer_host, port, advice_device); + if (!status.ok()) { + VLOG(100) << "Error: failed to send data with RDMA"; + self->server_ptr_->GetStreamStore()->SetErrorFlag(stream_id, + status); + WriteErrorReply(Status::IOError("Failed to send data with RDMA"), + 
message_out); + self->doWriteWithoutRead(message_out); + return status; + } + + VLOG(100) << "SendDataWithRDMA success, index:" << index; + self->server_ptr_->GetStreamStore()->SetBlobReceived(stream_id, + index); + WriteStreamReadyAckReply(message_out, index); + bool finished = false; + status = self->server_ptr_->GetStreamStore() + ->IsFixedStreamTransferFinished(stream_id, finished); + if (!status.ok()) { + VLOG(100) + << "Error: failed to check fixed stream transfer finished"; + self->server_ptr_->GetStreamStore()->SetErrorFlag(stream_id, + status); + WriteErrorReply(status, message_out); + self->doWriteWithoutRead(message_out); + return status; + } + + self->doWriteWithoutRead(message_out); + return Status::OK(); + }); + }); + + return false; +} + +bool SocketConnection::doOpenFixedStream(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; + int64_t mode; + TRY_READ_REQUEST(ReadOpenFixedStreamRequest, root, stream_id, mode); + std::string owner = StreamStore::BuildOwner(self->peer_host, self->conn_id_); + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Open( + stream_id, mode, owner, false, 0, [&](Status& status, ObjectID id) { std::string message_out; + VLOG(100) << "Open stream return!"; + int fd = -1; if (status.ok()) { - WritePutNameReply(message_out); + if (!server_ptr_->GetStreamStore()->GetRecvFd(stream_id, fd).ok()) { + WriteErrorReply(Status::KeyError("Failed to get recv fd"), + message_out); + VINEYARD_DISCARD( + server_ptr_->GetStreamStore()->Close(stream_id, owner)); + this->doWrite(message_out); + } else { + WriteOpenFixedStreamReply(message_out); + this->doWrite(message_out); + self->sendFd(fd); + } } else { - VLOG(100) << "Error: failed to put name: " << status.ToString(); + VLOG(100) << "Error: " << status.ToString(); WriteErrorReply(status, message_out); + this->doWrite(message_out); } - self->doWrite(message_out); - return Status::OK(); })); + return false; } -bool SocketConnection::doGetName(const json& root) { +bool 
SocketConnection::doCloseStream(const json& root) { auto self(shared_from_this()); - std::string name; - bool wait; - TRY_READ_REQUEST(ReadGetNameRequest, root, name, wait); + ObjectID stream_id; + TRY_READ_REQUEST(ReadCloseStreamRequest, root, stream_id); + std::string owner = StreamStore::BuildOwner(self->peer_host, self->conn_id_); + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Close(stream_id, owner)); + std::string message_out; + WriteCloseStreamReply(message_out); + this->doWrite(message_out); + return false; +} + +bool SocketConnection::doDeleteStream(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; + TRY_READ_REQUEST(ReadDeleteStreamRequest, root, stream_id); + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Delete(stream_id)); + std::string message_out; + WriteDeleteStreamReply(message_out); + this->doWrite(message_out); + return false; +} + +bool SocketConnection::doVineyardOpenRemoteFixedStream(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; + std::string stream_name; + ObjectID local_id; + std::string endpoint; + uint64_t mode; + size_t size; + int blob_num; + bool wait = false; + uint64_t timeout = 0; + TRY_READ_REQUEST(ReadVineyardOpenRemoteFixedStreamRequest, root, stream_id, + stream_name, local_id, blob_num, size, endpoint, mode, wait, + timeout); + VLOG(2) << "Stream ID: " << ObjectIDToString(stream_id) + << ", Stream Name: " << stream_name + << ", Local ID: " << ObjectIDToString(local_id) + << ", Blob Num: " << blob_num << ", Size: " << size + << ", Endpoint: " << endpoint << ", Mode: " << mode + << ", Wait: " << wait << ", Timeout: " << timeout; + std::string owner = StreamStore::BuildOwner(self->peer_host, self->conn_id_); + RESPONSE_ON_ERROR(server_ptr_->VineyardOpenRemoteFixedStream( + stream_id, stream_name, local_id, blob_num, size, endpoint, mode, owner, + wait, timeout, [self, local_id](const Status& status) { + VLOG(2) << "VineyardOpenRemoteFixedStream done callback, local_id:" + 
<< ObjectIDToString(local_id) + << ", status: " << status.ToString(); + std::string message_out; + if (status.ok()) { + int fd = -1; + if (!self->server_ptr_->GetStreamStore() + ->GetRecvFd(local_id, fd) + .ok()) { + WriteErrorReply(Status::KeyError("Failed to get recv fd"), + message_out); + VINEYARD_DISCARD( + self->server_ptr_->GetStreamStore()->Close(local_id, "")); + self->doWrite(message_out); + } else { + WriteVineyardOpenRemoteFixedStreamReply(message_out, local_id); + self->doWrite(message_out); + self->sendFd(fd); + } + } else { + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + self->doWrite(message_out); + } + return Status::OK(); + })); + + return false; +} + +bool SocketConnection::doVineyardActivateRemoteFixedStreamWithOffset( + const json& root) { + auto self(shared_from_this()); + std::vector offset_list; + ObjectID stream_id = InvalidObjectID(); + TRY_READ_REQUEST(ReadVineyardActivateRemoteFixedStreamWithOffsetRequest, root, + stream_id, offset_list); + VLOG(2) << "Vineyard activate remote fixed stream with offset, local id: " + << ObjectIDToString(stream_id); + + boost::asio::post(server_ptr_->GetIOContext(), [self, stream_id, + offset_list]() { + VLOG(2) << "doVineyardActivateRemoteFixedStreamWithOffset post task"; + std::vector blob_list; + size_t size = 0; + int blob_num = 0; + RESPONSE_ON_ERROR( + self->server_ptr_->GetStreamStore()->GetFixedStreamSizeInfo( + stream_id, size, blob_num)); + for (uint64_t offset : offset_list) { + ObjectID blob_id = InvalidObjectID(); + std::shared_ptr blob_payload; + Status status = self->server_ptr_->GetBulkStore()->CreateUserBlob( + offset, size, blob_id, blob_payload); + // currently, create user blob will never fail + if (!status.ok()) { + for (auto const& id : blob_list) { + self->server_ptr_->GetBulkStore()->DeleteUserBlob(id); + } + RESPONSE_ON_ERROR(status); + } + blob_list.push_back(blob_id); + } + + 
RESPONSE_ON_ERROR(self->server_ptr_->VineyardActivateRemoteFixedStream( + stream_id, self->conn_id_, false, blob_list, + [self, blob_list, stream_id]( + const Status& status, + std::vector>& payload_list) { + VLOG(2) << "VineyardActivateRemoteFixedStreamWithOffset done " + << "callback, local_id:" << ObjectIDToString(stream_id) + << ", status: " << status.ToString(); + for (auto blob_id : blob_list) { + self->server_ptr_->GetBulkStore()->DeleteUserBlob(blob_id); + } + std::string message_out; + if (status.ok()) { + WriteVineyardActivateRemoteFixedStreamWithOffsetReply(message_out); + } else { + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + // release the user blobs + } + self->doWrite(message_out); + })); + return false; + }); + return false; +} + +bool SocketConnection::doVineyardCloseRemoteFixedStream(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; + TRY_READ_REQUEST(ReadVineyardCloseRemoteFixedStreamRequest, root, stream_id); + VLOG(2) << "Vineyard close remote fixed stream, local id: " + << ObjectIDToString(stream_id); + RESPONSE_ON_ERROR(server_ptr_->VineyardCloseRemoteFixedStream( + stream_id, [self, stream_id](const Status& status) { + VLOG(2) << "VineyardCloseRemoteFixedStream done callback, local_id:" + << ObjectIDToString(stream_id) + << ", status: " << status.ToString(); + std::string message_out; + if (status.ok() || status.IsObjectNotExists()) { + // if the stream is not exists, it means that the remote node + // is restarted. So we need to close the stream in local node. 
+ WriteVineyardCloseRemoteFixedStreamReply(message_out); + } else { + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + std::string owner = + StreamStore::BuildOwner(self->peer_host, self->conn_id_); + self->server_ptr_->GetStreamStore()->UnbindRemoteStream(stream_id); + self->server_ptr_->GetStreamStore()->Close(stream_id, owner); + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doVineyardGetMetasByNames(const json& root) { + auto self(shared_from_this()); + uint64_t start = 0, end = 0; + std::vector names; + std::string rpc_endpoint; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + TRY_READ_REQUEST(ReadVineyardGetMetasByNamesRequest, root, names, + rpc_endpoint); + RESPONSE_ON_ERROR(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". 
Read get metas by names request cost: " << (end - start) << " us."; + + start = end; + RESPONSE_ON_ERROR(server_ptr_->VineyardGetMetasByNames( + names, rpc_endpoint, attr, + [self, attr, start](const Status& status, + const std::vector& metas) { + uint64_t start_ = start, end_ = 0; + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ", get metas size: " << metas.size() + << ", get metas by names request cost: " << (end_ - start_) + << " us."; + + VLOG(2) << "VineyardGetMetasByNames done callback" + << ", request:" << attr.req_name + << ", status: " << status.ToString() + << ", request id: " << attr.req_name; + start_ = end_; + std::string message_out; + if (status.ok()) { + WriteVineyardGetMetasByNamesReply(metas, message_out); + } else { + LOG(ERROR) << "Error: " << status.ToString() + << ", request: " << attr.req_name; + WriteErrorReply(status, message_out); + } + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ", write reply cost: " << (end_ - start_) << " us."; + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doVineyardGetRemoteBlobs(const json& root) { + auto self(shared_from_this()); + std::vector> local_ids; + std::vector> remote_ids; + std::string rpc_endpoint; + + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + TRY_READ_REQUEST(ReadVineyardGetRemoteBlobsWithRDMARequest, root, local_ids, + remote_ids, rpc_endpoint); + RESPONSE_ON_ERROR(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + end = std::chrono::duration_cast( + 
std::chrono::system_clock::now().time_since_epoch()) + .count(); + + // VLOG(2) << "Vineyard get remote blobs, local ids size: " << + // local_ids.size() + // << ", remote ids size: " << remote_ids.size() + // << ", rpc endpoint: " << rpc_endpoint; + // for (size_t i = 0; i < local_ids.size(); i++) { + // VLOG(3) << "layer " << i << " :"; + // for (size_t j = 0; j < local_ids[i].size(); ++j) { + // VLOG(3) << "local id: " << ObjectIDToString(local_ids[i][j]) + // << ", remote id: " << ObjectIDToString(remote_ids[i][j]); + // } + // } + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". Read get remote blobs request cost: " << (end - start) << " us."; + start = end; + RESPONSE_ON_ERROR(self->server_ptr_->VineyardGetRemoteBlobs( + local_ids, remote_ids, rpc_endpoint, attr, + [self, local_ids, start, attr](const Status& status, int fd) { + uint64_t start_ = 0, end_ = 0; + start_ = start; + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ", local_ids size: " << local_ids.size() + << ". 
Start get remote blobs request cost: " << (end_ - start_) + << " us."; + VLOG(2) << "VineyardGetRemoteBlobs ready callback, status: " + << status.ToString() << ", request: " << attr.req_name; + + std::string message_out; + if (status.ok()) { + WriteVineyardGetRemoteBlobsWithRDMAReply(message_out); + } else { + LOG(ERROR) << "Error: " << status.ToString() + << ", request: " << attr.req_name; + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + if (fd != -1) { + VLOG(2) << "Send fd: " << fd; + self->sendFd(fd); + } else { + VLOG(2) << "No fd to send"; + } + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doVineyardGetRemoteBlobsWithOffset(const json& root) { + auto self(shared_from_this()); + std::vector> local_offsets; + std::vector> remote_ids; + std::vector> sizes_vec; + size_t batch_nums = 0, batch_size = 0; + std::string rpc_endpoint; + + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + TRY_READ_REQUEST(ReadVineyardGetRemoteBlobsWithOffsetRequest, root, + batch_nums, batch_size, rpc_endpoint); + local_offsets.resize(batch_nums); + remote_ids.resize(batch_nums); + sizes_vec.resize(batch_nums); + RESPONSE_ON_ERROR(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + for (size_t i = 0; i < batch_nums; ++i) { + local_offsets[i].resize(batch_size); + remote_ids[i].resize(batch_size); + sizes_vec[i].resize(batch_size); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + reinterpret_cast(local_offsets[i].data()), + sizeof(size_t) * batch_size, conn_id_)); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + reinterpret_cast(remote_ids[i].data()), + sizeof(ObjectID) * batch_size, conn_id_)); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + reinterpret_cast(sizes_vec[i].data()), + sizeof(uint64_t) * 
batch_size, conn_id_)); + for (size_t j = 0; j < batch_size; ++j) { + local_offsets[i][j] = + reinterpret_cast( + server_ptr_->GetBulkStore()->GetUserBasePointer()) + + local_offsets[i][j]; + } + } + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ". Read get remote blobs request cost: " << (end - start) << " us."; + start = end; + RESPONSE_ON_ERROR(self->server_ptr_->VineyardGetRemoteBlobsWithOffset( + local_offsets, remote_ids, sizes_vec, rpc_endpoint, attr, + [self, batch_nums, batch_size, start, attr](const Status& status, + int fd) { + uint64_t start_ = 0, end_ = 0; + start_ = start; + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name + << ", buffer nums: " << batch_nums * batch_size + << ". Start get remote blobs request cost: " << (end_ - start_) + << " us."; + VLOG(2) << "VineyardGetRemoteBlobs ready callback, status: " + << status.ToString(); + + std::string message_out; + if (status.ok()) { + WriteVineyardGetRemoteBlobsWithOffsetReply(message_out); + } else { + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + if (fd != -1) { + VLOG(2) << "Send fd: " << fd << " for request: " << attr.req_name; + self->sendFd(fd); + } else { + VLOG(2) << "No fd to send for request: " << attr.req_name; + } + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doVineyardStopStream(const json& root) { + // TBD + return false; +} + +bool SocketConnection::doVineyardDropStream(const json& root) { + // TBD + return false; +} + +bool SocketConnection::doVineyardAbortRemoteStream(const json& root) { + auto self(shared_from_this()); + ObjectID stream_id; + TRY_READ_REQUEST(ReadVineyardAbortRemoteStreamRequest, root, stream_id); + VLOG(2) << "Vineyard 
abort remote stream, local id: " + << ObjectIDToString(stream_id); + bool success; + RESPONSE_ON_ERROR(server_ptr_->GetStreamStore()->Abort(stream_id, success)); + if (!success) { + std::string message_out; + WriteVineyardAbortRemoteStreamReply(message_out, success); + this->doWrite(message_out); + return false; + } + RESPONSE_ON_ERROR(server_ptr_->VineyardAbortRemoteStream( + stream_id, [self, stream_id](const Status& status, bool success) { + VLOG(2) << "VineyardAbortRemoteStream done callback, local_id:" + << ObjectIDToString(stream_id) + << ", status: " << status.ToString(); + std::string message_out; + if (status.ok()) { + WriteVineyardAbortRemoteStreamReply(message_out, success); + } else { + LOG(ERROR) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doPutName(const json& root) { + auto self(shared_from_this()); + ObjectID object_id; + std::string name; + bool overwrite = true; + TRY_READ_REQUEST(ReadPutNameRequest, root, object_id, name, overwrite); + name = escape_json_pointer(name); + RESPONSE_ON_ERROR(server_ptr_->PutName( + object_id, name, overwrite, [self](const Status& status) { + std::string message_out; + if (status.ok()) { + WritePutNameReply(message_out); + } else { + VLOG(100) << "Error: failed to put name: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doPutNames(const json& root) { + auto self(shared_from_this()); + uint64_t start = 0, end = 0; + std::vector object_id_vec; + std::vector name_vec; + bool overwrite = true; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + TRY_READ_REQUEST(ReadPutNamesRequest, root, object_id_vec, name_vec, + overwrite); + VINEYARD_DISCARD(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + 
ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + for (auto& name : name_vec) { + name = escape_json_pointer(name); + } + + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". Read put names reqest " + << "cost: " << (end - start) << " us."; + start = end; + RESPONSE_ON_ERROR(server_ptr_->PutNames( + object_id_vec, name_vec, overwrite, + [self, start, attr](const Status& status) { + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". Process put names " + << "request cost: " << (end - start) << " us."; + std::string message_out; + if (status.ok()) { + WritePutNamesReply(message_out); + } else { + LOG(ERROR) << "Error: failed to put names: " << status.ToString() + << ", request: " << attr.req_name; + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doGetName(const json& root) { + auto self(shared_from_this()); + std::string name; + bool wait; + TRY_READ_REQUEST(ReadGetNameRequest, root, name, wait); // n.b.: no need for escape for `get`, as the translation has been handled // by nlohmann/json when compare keys. // @@ -1472,6 +2728,134 @@ bool SocketConnection::doGetName(const json& root) { return false; } +bool SocketConnection::doGetNames(const json& root) { + auto self(shared_from_this()); + std::vector name_vec; + bool wait; + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + TRY_READ_REQUEST(ReadGetNamesRequest, root, name_vec, wait); + // n.b.: no need for escape for `get`, as the translation has been handled + // by nlohmann/json when compare keys. 
+ // + RESPONSE_ON_ERROR(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". Read get names request " + << "cost: " << (end - start) << " us."; + + start = end; + RESPONSE_ON_ERROR(server_ptr_->GetNames( + name_vec, wait, [self]() { return self->running_.load(); }, + [self, start, attr](const Status& status, + const std::vector& object_id_vec) { + uint64_t start_ = start, end_ = 0; + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". Process get names " + << "request cost: " << (end_ - start_) << " us."; + start_ = end_; + std::string message_out; + if (status.ok()) { + WriteGetNamesReply(object_id_vec, message_out); + } else { + VLOG(100) << "Error: failed to get name: " << status.ToString(); + WriteErrorReply(status, message_out); + } + end_ = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". 
Write get names " + << "reply cost: " << (end_ - start_) << " us."; + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + +bool SocketConnection::doGetObjectLocation(const json& root) { + auto self(shared_from_this()); + std::vector names; + TRY_READ_REQUEST(ReadGetObjectLocationRequest, root, names); + VINEYARD_DISCARD(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + + boost::asio::post(server_ptr_->GetIOContext(), [self, names]() { + self->server_ptr_->GetObjectLocation( + names, [self](const Status& status, + std::vector>& locations_vector) { + std::string message_out; + if (status.ok()) { + WriteGetObjectLocationReply(locations_vector, message_out); + } else { + VLOG(100) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + }); + }); + return false; +} + +bool SocketConnection::doPutObjectLocation(const json& root) { + auto self(shared_from_this()); + std::vector names; + std::vector locations; + int ttl_seconds = 300; + uint64_t start = 0, end = 0; + + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + TRY_READ_REQUEST(ReadPutObjectLocationRequest, root, names, locations, + ttl_seconds); + RESPONSE_ON_ERROR(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". 
Read put object location " + << "request cost: " << (end - start) << " us."; + start = end; + boost::asio::post(server_ptr_->GetIOContext(), [self, names, locations, + ttl_seconds, start]() { + self->server_ptr_->PutObjectLocation( + names, locations, ttl_seconds, [self, start](Status status) { + uint64_t end = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << self->peer_host << ":" << self->conn_id_ + << ". Process put object location request cost: " << (end - start) + << " us."; + std::string message_out; + if (status.ok()) { + WritePutObjectLocationReply(message_out); + } else { + VLOG(2) << "Error: " << status.ToString(); + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + }); + }); + return false; +} + bool SocketConnection::doListName(const json& root) { auto self(shared_from_this()); std::string pattern; @@ -1522,6 +2906,57 @@ bool SocketConnection::doDropName(const json& root) { return false; } +bool SocketConnection::doDropNames(const json& root) { + auto self(shared_from_this()); + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + std::vector name_vec; + TRY_READ_REQUEST(ReadDropNamesRequest, root, name_vec); + + std::vector name_lengths; + name_lengths.resize(name_vec.size()); + VINEYARD_DISCARD(socket_server_ptr_->LseekExtraMsgReadPos(0, conn_id_)); + ClientAttributes attr; + RESPONSE_ON_ERROR(socket_server_ptr_->GetClientAttributeMsg(conn_id_, attr)); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + name_lengths.data(), name_lengths.size() * sizeof(size_t), conn_id_)); + for (size_t i = 0; i < name_vec.size(); ++i) { + name_vec[i].resize(name_lengths[i]); + RESPONSE_ON_ERROR(socket_server_ptr_->ReadExtraMessage( + name_vec[i].data(), name_lengths[i], conn_id_)); + } + end = std::chrono::duration_cast( + 
std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". Read drop names request " + << "cost: " << (end - start) << " us."; + + start = end; + RESPONSE_ON_ERROR(server_ptr_->DropNames( + name_vec, [self, start, attr](const Status& status) { + uint64_t end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(self->trace_log_level_) + << "Request: " << attr.req_name << ". Process drop names " + << "request cost: " << (end - start) << " us."; + std::string message_out; + if (status.ok()) { + WriteDropNamesReply(message_out); + } else { + LOG(ERROR) << "Error: failed to drop name: " << status.ToString() + << " for request: " << attr.req_name; + WriteErrorReply(status, message_out); + } + self->doWrite(message_out); + return Status::OK(); + })); + return false; +} + bool SocketConnection::doShallowCopy(const json& root) { auto self(shared_from_this()); ObjectID id; @@ -1563,7 +2998,7 @@ bool SocketConnection::doMakeArena(const json& root) { this->doWrite(message_out, [self, fd_to_send](const Status& status) { if (fd_to_send != -1) { - send_fd(self->nativeHandle(), fd_to_send); + self->sendFd(fd_to_send); } return Status::OK(); }); @@ -1770,6 +3205,29 @@ bool SocketConnection::doIsInUse(json const& root) { return false; } +bool SocketConnection::doGetVineyardMmapFd(const json& root) { + auto self(shared_from_this()); + TRY_READ_REQUEST(ReadGetVineyardMmapFdRequest, root); + int fd = -1; + size_t size = 0; + size_t offset; + std::string message_out; + fd = server_ptr_->GetBulkStore()->GetBaseFd(); + size = server_ptr_->GetBulkStore()->GetBaseSize(); + // offset = server_ptr_->GetBulkStore()->GetBaseOffset(); + offset = server_ptr_->GetBulkStore()->GetUserOffset(); + + WriteGetVineyardMmapFdReply(size, offset, message_out); + this->doWrite(message_out, [self, fd](const Status& status) { + if (status.ok()) { + self->sendFd(fd); + } + return 
Status::OK(); + }); + + return false; +} + bool SocketConnection::doClusterMeta(const json& root) { auto self(shared_from_this()); TRY_READ_REQUEST(ReadClusterMetaRequest, root); @@ -1921,8 +3379,22 @@ void SocketConnection::doWrite(std::string&& buf) { doAsyncWrite(std::move(buf)); } +void SocketConnection::doWriteWithoutRead(std::string& buf) { + std::string to_send; + size_t length = buf.size(); + to_send.resize(length + sizeof(size_t)); + char* ptr = &to_send[0]; + memcpy(ptr, &length, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, buf.data(), length); + doAsyncWriteWithoutRead(std::move(to_send)); +} + void SocketConnection::doStop() { this->ClearLockedObjects(); + this->socket_server_ptr_->ReleaseExtraRequestMemory(conn_id_); + this->server_ptr_->GetStreamStore()->CleanResource( + StreamStore::BuildOwner(this->peer_host, conn_id_)); if (this->Stop()) { // drop connection socket_server_ptr_->RemoveConnection(conn_id_); @@ -1940,6 +3412,21 @@ void SocketConnection::doAsyncWrite(std::string&& buf) { doReadHeader(); } else { doStop(); + // ThrowException(); + } + }); +} + +void SocketConnection::doAsyncWriteWithoutRead(std::string&& buf) { + std::shared_ptr payload = + std::make_shared(std::move(buf)); + auto self(shared_from_this()); + asio::async_write( + socket_, boost::asio::buffer(payload->data(), payload->length()), + [this, self, payload](boost::system::error_code ec, std::size_t) { + if (ec) { + doStop(); + // ThrowException(); } }); } @@ -1960,9 +3447,11 @@ void SocketConnection::doAsyncWrite(std::string&& buf, callback_t<> callback, } } else { doStop(); + // ThrowException(); } } else { doStop(); + // ThrowException(); } }); } @@ -2011,6 +3500,13 @@ void SocketConnection::ClearLockedObjects() { server_ptr_->UnlockTransmissionObjects(ids); } +void SocketConnection::ThrowException() { + LOG(ERROR) << "Connection closed unexpected from " << peer_host + << ", conn_id: " << conn_id_; + throw std::runtime_error("Connection closed unexpected from " + 
peer_host + + ", conn_id: " + std::to_string(conn_id_)); +} + SocketServer::SocketServer(std::shared_ptr vs_ptr) : vs_ptr_(vs_ptr), next_conn_id_(0) {} @@ -2033,6 +3529,13 @@ void SocketServer::Stop() { pair.second->Stop(); } connections_.clear(); + + std::lock_guard lock( + conn_id_to_extra_request_mem_mutex_); + for (auto& item : conn_id_to_extra_request_mem_) { + munmap(item.second.addr_, item.second.size_); + close(item.second.fd_); + } } void SocketServer::Close() { closable_.store(true); } @@ -2076,4 +3579,140 @@ size_t SocketServer::AliveConnections() const { return connections_.size(); } +Status SocketServer::RequireExtraRequestMemory(int conn_id, size_t size, + int& fd) { + { + std::lock_guard scope_lock(this->connections_mutex_); + auto conn = connections_.find(conn_id); + if (conn == connections_.end()) { + return Status::Invalid("connection id is not exists:" + + std::to_string(conn_id)); + } + } + + std::lock_guard lock( + conn_id_to_extra_request_mem_mutex_); + if (conn_id_to_extra_request_mem_.find(conn_id) == + conn_id_to_extra_request_mem_.end()) { + std::string name = "/vineyard_big_request_" + std::to_string(conn_id); + fd = memfd_create(name.c_str(), 0); + if (fd == -1) { + return Status::IOError("fail to create memfd for big request"); + } + if (ftruncate(fd, size) == -1) { + close(fd); + return Status::IOError("fail to truncate memfd for big request"); + } + void* addr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + close(fd); + return Status::IOError("fail to mmap memfd for big request"); + } + conn_id_to_extra_request_mem_[conn_id] = {addr, size, fd}; + } else { + return Status::Invalid( + "big request memory has been allocated for conn_id: " + + std::to_string(conn_id)); + } + return Status::OK(); +} + +Status SocketServer::ReleaseExtraRequestMemory(int conn_id) { + std::lock_guard lock( + conn_id_to_extra_request_mem_mutex_); + auto it = conn_id_to_extra_request_mem_.find(conn_id); + if (it 
!= conn_id_to_extra_request_mem_.end()) { + munmap(it->second.addr_, it->second.size_); + close(it->second.fd_); + conn_id_to_extra_request_mem_.erase(it); + } + return Status::OK(); +} + +Status SocketServer::ReadExtraMessage(void* data, size_t size, int conn_id) { + std::lock_guard lock( + conn_id_to_extra_request_mem_mutex_); + auto it = conn_id_to_extra_request_mem_.find(conn_id); + if (it == conn_id_to_extra_request_mem_.end()) { + return Status::Invalid( + "big request memory has not been allocated for conn_id: " + + std::to_string(conn_id)); + } + if (it->second.read_pos_ + size > it->second.size_) { + return Status::Invalid("big request memory overflow for conn_id: " + + std::to_string(conn_id)); + } + memory::concurrent_memcpy( + data, reinterpret_cast(it->second.addr_) + it->second.read_pos_, + size); + it->second.read_pos_ += size; + return Status::OK(); +} + +Status SocketServer::WriteExtraMessage(const void* data, size_t size, + int conn_id) { + std::lock_guard lock( + conn_id_to_extra_request_mem_mutex_); + auto it = conn_id_to_extra_request_mem_.find(conn_id); + if (it == conn_id_to_extra_request_mem_.end()) { + return Status::Invalid( + "big request memory has not been allocated for conn_id: " + + std::to_string(conn_id)); + } + if (it->second.write_pos_ + size > it->second.size_) { + return Status::Invalid("big request memory overflow for conn_id: " + + std::to_string(conn_id)); + } + memory::concurrent_memcpy( + reinterpret_cast(it->second.addr_) + it->second.write_pos_, data, + size); + it->second.write_pos_ += size; + return Status::OK(); +} + +Status SocketServer::LseekExtraMsgWritePos(uint64_t offset, int conn_id) { + std::lock_guard lock( + conn_id_to_extra_request_mem_mutex_); + auto it = conn_id_to_extra_request_mem_.find(conn_id); + if (it == conn_id_to_extra_request_mem_.end()) { + return Status::Invalid( + "big request memory has not been allocated for conn_id: " + + std::to_string(conn_id)); + } + if (offset > it->second.size_) { + 
return Status::Invalid("big request memory overflow for conn_id: " + + std::to_string(conn_id)); + } + it->second.write_pos_ = offset; + return Status::OK(); +} + +Status SocketServer::LseekExtraMsgReadPos(uint64_t offset, int conn_id) { + std::lock_guard lock( + conn_id_to_extra_request_mem_mutex_); + auto it = conn_id_to_extra_request_mem_.find(conn_id); + if (it == conn_id_to_extra_request_mem_.end()) { + return Status::Invalid( + "big request memory has not been allocated for conn_id: " + + std::to_string(conn_id)); + } + if (offset > it->second.size_) { + return Status::Invalid("big request memory overflow for conn_id: " + + std::to_string(conn_id)); + } + it->second.read_pos_ = offset; + return Status::OK(); +} + +Status SocketServer::GetClientAttributeMsg(uint64_t conn_id, + ClientAttributes& attr) { + size_t length; + std::string attr_str; + RETURN_ON_ERROR(ReadExtraMessage(&length, sizeof(size_t), conn_id)); + attr_str.resize(length); + RETURN_ON_ERROR(ReadExtraMessage(attr_str.data(), length, conn_id)); + attr = ClientAttributes::FromJsonString(attr_str); + return Status::OK(); +} + } // namespace vineyard diff --git a/src/server/async/socket_server.h b/src/server/async/socket_server.h index da2c3dda5..1469bb570 100644 --- a/src/server/async/socket_server.h +++ b/src/server/async/socket_server.h @@ -22,6 +22,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -29,8 +30,13 @@ limitations under the License. 
#include "common/memory/payload.h" #include "common/util/asio.h" // IWYU pragma: keep #include "common/util/callback.h" +#include "common/util/sidecar.h" #include "common/util/uuid.h" +#include "common/rdma/rdma_server.h" + +#include "thread-pool/thread_pool.h" + namespace vineyard { using boost::asio::generic::stream_protocol; @@ -52,8 +58,8 @@ class SocketConnection : public std::enable_shared_from_this { public: SocketConnection(stream_protocol::socket socket, std::shared_ptr server_ptr, - std::shared_ptr socket_server_ptr, - int conn_id); + std::shared_ptr socket_server_ptr, int conn_id, + std::string peer_host = ""); bool Start(); @@ -64,6 +70,7 @@ class SocketConnection : public std::enable_shared_from_this { protected: bool doRegister(json const& root); + bool doRequireExtraRequestMemory(json const& root); bool doCreateBuffer(json const& root); bool doCreateBuffers(json const& root); @@ -91,12 +98,16 @@ class SocketConnection : public std::enable_shared_from_this { bool doIncreaseReferenceCount(json const& root); bool doRelease(json const& root); bool doDelDataWithFeedbacks(json const& root); + bool doDelHugeData(json const& root); bool doCreateBufferByPlasma(json const& root); bool doGetBuffersByPlasma(json const& root); bool doSealPlasmaBlob(json const& root); bool doPlasmaRelease(json const& root); bool doPlasmaDelData(json const& root); + bool doCreateUserBuffers(json const& root); + bool doDeleteUserBuffers(json const& root); + bool doGetUserBuffers(json const& root); bool doCreateData(json const& root); bool doCreateDatas(json const& root); @@ -109,19 +120,47 @@ class SocketConnection : public std::enable_shared_from_this { bool doLabelObject(json const& root); bool doClear(json const& root); bool doMemoryTrim(json const& root); + bool doBatchPersist(json const& root); + bool doCreatehugeDatas(json const& root); + bool doGetHugeData(json const& root); bool doCreateStream(json const& root); bool doOpenStream(json const& root); bool 
doGetNextStreamChunk(json const& root); bool doPushNextStreamChunk(json const& root); + bool doPushNextStreamChunkByOffset(json const& root); bool doPullNextStreamChunk(json const& root); + bool doCheckFixedStreamReceived(json const& root); bool doStopStream(json const& root); bool doDropStream(json const& root); + bool doAbortStream(json const& root); + bool doPutStreamName(json const& root); + bool doGetStreamIDByName(json const& root); + bool doActivateRemoteFixedStream(const json& root); + bool doCreateFixedStream(json const& root); + bool doOpenFixedStream(json const& root); + bool doCloseStream(json const& root); + bool doDeleteStream(json const& root); + + bool doVineyardOpenRemoteFixedStream(const json& root); + bool doVineyardActivateRemoteFixedStreamWithOffset(const json& root); + bool doVineyardStopStream(const json& root); + bool doVineyardDropStream(const json& root); + bool doVineyardAbortRemoteStream(const json& root); + bool doVineyardCloseRemoteFixedStream(const json& root); + bool doVineyardGetMetasByNames(const json& root); + bool doVineyardGetRemoteBlobs(const json& root); + bool doVineyardGetRemoteBlobsWithOffset(const json& root); bool doPutName(json const& root); bool doGetName(json const& root); bool doListName(json const& root); bool doDropName(json const& root); + bool doGetObjectLocation(const json& root); + bool doPutObjectLocation(const json& root); + bool doGetNames(json const& root); + bool doDropNames(json const& root); + bool doPutNames(json const& root); bool doMakeArena(json const& root); bool doFinalizeArena(json const& root); @@ -136,6 +175,7 @@ class SocketConnection : public std::enable_shared_from_this { bool doIsSpilled(json const& root); bool doIsInUse(json const& root); + bool doGetVineyardMmapFd(json const& root); bool doClusterMeta(json const& root); bool doInstanceStatus(json const& root); bool doMigrateObject(json const& root); @@ -153,11 +193,12 @@ class SocketConnection : public std::enable_shared_from_this { Status 
MoveBuffers(std::map mapping, std::shared_ptr& source_session); - private: int nativeHandle() { return socket_.native_handle(); } int getConnId() { return conn_id_; } + virtual bool sendFd(int fd); + /** * @brief Return should be exit after this message. * @@ -170,12 +211,14 @@ class SocketConnection : public std::enable_shared_from_this { void doReadBody(); - void doWrite(const std::string& buf); + virtual void doWrite(const std::string& buf); + + virtual void doWriteWithoutRead(std::string& buf); - void doWrite(std::string&& buf); + virtual void doWrite(std::string&& buf); - void doWrite(const std::string& buf, callback_t<> callback, - const bool partial = false); + virtual void doWrite(const std::string& buf, callback_t<> callback, + const bool partial = false); /** * Being called when the encounter a socket error (in read/write), or by @@ -190,6 +233,8 @@ class SocketConnection : public std::enable_shared_from_this { void doAsyncWrite(std::string&& buf, callback_t<> callback, const bool partial = false); + void doAsyncWriteWithoutRead(std::string&& buf); + void switchSession(std::shared_ptr& session) { this->server_ptr_ = session; } @@ -200,6 +245,9 @@ class SocketConnection : public std::enable_shared_from_this { void ClearLockedObjects(); + // TODO: remove this + void ThrowException(); + // whether the connection has been correctly "registered" std::atomic_bool registered_; @@ -213,6 +261,7 @@ class SocketConnection : public std::enable_shared_from_this { int conn_id_; std::atomic_bool running_; + std::string peer_host; asio::streambuf buf_; @@ -226,6 +275,8 @@ class SocketConnection : public std::enable_shared_from_this { std::unordered_map locked_objects_; std::mutex locked_objects_mutex_; + int trace_log_level_ = 0; + friend class IPCServer; friend class RPCServer; }; @@ -259,19 +310,19 @@ class SocketServer { /** * Check if @conn_id@ exists in the connection pool. 
*/ - bool ExistsConnection(int conn_id) const; + virtual bool ExistsConnection(int conn_id) const; /** * Remove @conn_id@ from connection pool, before removing, the "Stop" * on the connection has already been called. */ - void RemoveConnection(int conn_id); + virtual void RemoveConnection(int conn_id); /** * Invoke the "Stop" on the connection, and then remove it from the * connection pool. */ - void CloseConnection(int conn_id); + virtual void CloseConnection(int conn_id); /** * Inspect the size of current alive connections. @@ -285,7 +336,43 @@ class SocketServer { virtual Status Register(std::shared_ptr conn, const SessionID session_id) = 0; + virtual Status SendDataWithRDMA(int tcp_conn, uint64_t addr, + uint64_t local_addr, size_t size, + uint64_t rkey) { + return Status::NotImplemented("SendDataWithRDMA is not implemented"); + } + + Status SendDataWithRDMA(std::vector& addr_list, uint64_t local_addr, + size_t offset, std::vector& size_list, + size_t& size, std::vector& rkey_list, + const std::string& peer_host, const int port, + const std::string& advice_device) { + return Status::NotImplemented("RDMA is not supported yet."); + } + + Status RequireExtraRequestMemory(int conn_id, size_t size, int& fd); + + Status ReleaseExtraRequestMemory(int conn_id); + + Status ReadExtraMessage(void* data, size_t size, int conn_id); + + Status WriteExtraMessage(const void* data, size_t size, int conn_id); + + Status LseekExtraMsgWritePos(uint64_t offset, int conn_id); + + Status LseekExtraMsgReadPos(uint64_t offset, int conn_id); + + Status GetClientAttributeMsg(uint64_t conn_id, ClientAttributes& attr); + protected: + struct ExtraRequestMem { + void* addr_ = nullptr; + size_t size_ = 0; + int fd_ = -1; + uint64_t write_pos_ = 0; + uint64_t read_pos_ = 0; + }; + std::atomic_bool stopped_; // if the socket server being stopped. 
std::atomic_bool closable_; // if client want to close the session, @@ -293,6 +380,8 @@ class SocketServer { int next_conn_id_; std::unordered_map> connections_; mutable std::recursive_mutex connections_mutex_; // protect `connections_` + std::map conn_id_to_extra_request_mem_; + std::recursive_mutex conn_id_to_extra_request_mem_mutex_; private: virtual void doAccept() = 0; diff --git a/src/server/memory/malloc.cc b/src/server/memory/malloc.cc index 96502e11d..69292006e 100644 --- a/src/server/memory/malloc.cc +++ b/src/server/memory/malloc.cc @@ -60,6 +60,8 @@ constexpr int64_t kMmapRegionsGap = sizeof(size_t); DEFINE_bool(reserve_memory, false, "Reserving enough physical memory pages for vineyardd"); +DEFINE_bool(2M_alignment, false, "Align the mmap address to 2M"); + std::unordered_map mmap_records; static void* pointer_advance(void* p, ptrdiff_t n) { @@ -260,11 +262,44 @@ void* mmap_buffer(int fd, int64_t size, bool gap, bool* is_committed, #endif } - void* pointer = mmap(NULL, size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); - if (pointer == MAP_FAILED) { - LOG(ERROR) << "mmap failed with error: " << strerror(errno); - return pointer; + void* pointer = nullptr; + if (FLAGS_2M_alignment) { + // 2M alignment + const size_t size2MB = 1UL << 21; + void* addr = mmap(NULL, (size + size2MB), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANON, -1, 0); + if (addr == MAP_FAILED) { + LOG(ERROR) << "mmap failed with error: " << strerror(errno); + return addr; + } + + if (munmap(addr, (size + size2MB)) != 0) { + LOG(ERROR) << "munmap failed with error: " << strerror(errno); + return nullptr; + } + addr = reinterpret_cast((uintptr_t(addr) + size2MB - 1) & + ~(size2MB - 1)); + + pointer = mmap(addr, size, PROT_READ | PROT_WRITE, + mmap_flag | MAP_FIXED_NOREPLACE, fd, 0); + if (pointer == MAP_FAILED) { + LOG(ERROR) << "mmap failed with error: " << strerror(errno); + return pointer; + } + if (pointer != addr) { + LOG(ERROR) << "mmap failed with error: " << strerror(errno); + 
munmap(pointer, size); + return nullptr; + } + } else { + pointer = mmap(NULL, size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); + if (pointer == MAP_FAILED) { + LOG(ERROR) << "mmap failed with error: " << strerror(errno); + return pointer; + } } + LOG(INFO) << "mmap addr:" << reinterpret_cast(pointer) + << ", size:" << size; MmapRecord& record = mmap_records[pointer]; record.fd = fd; diff --git a/src/server/memory/memory.cc b/src/server/memory/memory.cc index 7f2e11e09..b75cbb607 100644 --- a/src/server/memory/memory.cc +++ b/src/server/memory/memory.cc @@ -317,6 +317,10 @@ Status BulkStoreBase::Delete(ID const& object_id, if (object->IsSpilled()) { return true; } + if (object->IsUserCreated()) { + // Object created by user, delete it directly + return true; + } if (object->IsGPU()) { // DeleteGPU() will deal with GPU object return false; @@ -522,6 +526,8 @@ Status BulkStoreBase::PreAllocate(const size_t size, auto payload = std::make_shared

( object_id, size, static_cast(pointer), fd, map_size, offset); payload->is_sealed = true; + // TODO: what if offset is larger than 4096? + payload->user_offset = 4096; objects_.insert(object_id, payload); return Status::OK(); } @@ -642,6 +648,80 @@ Status BulkStore::Create(const size_t data_size, ObjectID& object_id, return Status::OK(); } +Status BulkStore::CreateUserBlob(uint64_t offset, size_t size, + ObjectID& object_id, + std::shared_ptr& object) { + if (size == 0) { + object_id = EmptyBlobID(); + object = Payload::MakeEmpty(); + return Status::OK(); + } + + bool insert_success = false; + int times = 0; + while (!insert_success) { + if (times > retry_times) { + break; + } + + int fd = -1; + ptrdiff_t ptrdiff = 0; + int64_t map_size = 0; + uint64_t base_addr = reinterpret_cast(GetUserBasePointer()); + object_id = GenerateBlobID(base_addr + offset); + GetMallocMapInfo(reinterpret_cast(base_addr + offset), &fd, + &map_size, &ptrdiff); + object = std::make_shared( + object_id, size, reinterpret_cast(base_addr + offset), fd, + map_size, ptrdiff); + object->user_offset = offset; + object->is_user_created = true; + try { + insert_success = objects_.insert(object_id, object); + } catch (const std::exception& e) { + LOG(ERROR) << "Failed to insert user blob: " << e.what(); + return Status::Invalid("Failed to insert blob to the object store."); + } + times++; + } + + if (!insert_success) { + return Status::Invalid("Failed to allocated a unique user blob id."); + } + + static uint64_t count = 0; + if (count % 2000 == 0) { + VLOG(3) << "Current map size:" << objects_.size(); + } + count++; + VLOG(3) << "Created user blob with id: " << IDToString(object_id) + << ", size: " << size << ", offset: " << offset; + + return Status::OK(); +} + +Status BulkStore::DeleteUserBlob(ObjectID id) { + if (id == EmptyBlobID()) { + return Status::OK(); + } + + if (id == InvalidObjectID()) { + return Status::OK(); + } + + if (objects_.contains(id) && objects_.find(id)->is_user_created) { 
objects_.erase(id); + } + + static uint64_t count = 0; + if (count % 2000 == 0) { + VLOG(3) << "Current map size:" << objects_.size(); + } + count++; + + return Status::OK(); +} + Status BulkStore::OnRelease(ObjectID const& id) { Status status; objects_.find_fn(id, diff --git a/src/server/memory/memory.h b/src/server/memory/memory.h index 5ae363bd6..00e41c62e 100644 --- a/src/server/memory/memory.h +++ b/src/server/memory/memory.h @@ -82,14 +82,6 @@ class BulkStoreBase { object_map_t& List() { return objects_; } - void* GetBasePointer() { - return objects_.find(PlaceholderBlobID())->pointer; - } - - uint64_t GetBaseSize() { - return objects_.find(PlaceholderBlobID())->data_size; - } - bool MemoryTrim(); size_t Footprint() const; @@ -146,6 +138,8 @@ class BulkStoreBase { int64_t mem_spill_upper_bound_; int64_t mem_spill_lower_bound_; + + int retry_times = 10; }; class BulkStore @@ -159,6 +153,11 @@ class BulkStore Status Create(const size_t size, ObjectID& object_id, std::shared_ptr& object); + Status CreateUserBlob(uint64_t offset, size_t size, ObjectID& object_id, + std::shared_ptr& object); + + Status DeleteUserBlob(ObjectID id); + /* * @brief Decrease the reference count of a blob, when its reference count * reaches zero. It will trigger `OnRelease` behavior. 
See ColdObjectTracker @@ -184,6 +183,31 @@ class BulkStore */ Status Release_GPU(ObjectID const& id, int conn); + void* GetBasePointer() { + return objects_.find(PlaceholderBlobID())->pointer; + } + + void* GetUserBasePointer() { + auto object = objects_.find(PlaceholderBlobID()); + return object->pointer + object->user_offset - object->data_offset; + } + + uint64_t GetBaseSize() { + return objects_.find(PlaceholderBlobID())->data_size; + } + + int GetBaseFd() { + return objects_.find(PlaceholderBlobID())->store_fd; + } + + int64_t GetBaseOffset() { + return objects_.find(PlaceholderBlobID())->data_offset; + } + + int64_t GetUserOffset() { + return objects_.find(PlaceholderBlobID())->user_offset; + } + protected: /** * @brief change the reference count of the object on the client-side cache. diff --git a/src/server/memory/stream_store.cc b/src/server/memory/stream_store.cc index dfce3912b..35d8df798 100644 --- a/src/server/memory/stream_store.cc +++ b/src/server/memory/stream_store.cc @@ -15,9 +15,15 @@ limitations under the License. #include "server/memory/stream_store.h" +#include +#include + +#include #include #include +#include #include +#include #include "common/util/callback.h" #include "common/util/logging.h" // IWYU pragma: keep @@ -37,31 +43,308 @@ namespace vineyard { } while (0) #endif // CHECK_STREAM_STATE +bool DeferredStream::Alive() const { return alive_fn_(); } + +bool DeferredStream::TestThenCall() const { + Status status = Status::IOError("Stream operation timeout."); + ObjectID ret_id = InvalidObjectID(); + if (!Alive()) { + VLOG(100) << "Timeout, rpc return!"; + call_fn_(status, ret_id); + return true; + } else if (test_fn_(status, ret_id)) { + VLOG(100) << "Test and call return!"; + call_fn_(status, ret_id); + return true; + } + return false; +} + // manage a pool of streams. 
-Status StreamStore::Create(ObjectID const stream_id) { +Status StreamStore::Create(ObjectID const stream_id, bool fixed_size, int nums, + size_t size) { + VLOG(2) << "Create stream, id: " << ObjectIDToString(stream_id) + << ", fixed_size: " << fixed_size << ", nums: " << nums + << ", size: " << size; std::lock_guard __guard(this->mutex_); + if (stream_id == InvalidObjectID()) { + LOG(ERROR) << "Failed to create stream with invalid id."; + return Status::Invalid("Failed to create stream with invalid id."); + } if (streams_.find(stream_id) != streams_.end()) { + LOG(ERROR) << "Failed to create the stream as it is already exists: " + << ObjectIDToString(stream_id); return Status::ObjectExists( "Failed to create the stream as it is already exists: " + ObjectIDToString(stream_id)); } - streams_.emplace(stream_id, std::make_shared()); + std::shared_ptr stream_holder = + std::make_shared(); + if (fixed_size) { + stream_holder->SetFixedBlobStream(nums, size); + std::string stream_file_name = "/tmp/vineyard-stream-" + + std::to_string(getpid()) + "-" + + std::to_string(stream_id); + stream_holder->recv_mem_fd = + open(stream_file_name.c_str(), O_RDWR | O_CREAT | O_NONBLOCK, 0666); + if (stream_holder->recv_mem_fd < 0) { + LOG(ERROR) << "failed to create stream file '" << stream_file_name + << "', " << strerror(errno); + return Status::IOError("failed to open file '" + stream_file_name + + "', " + strerror(errno)); + } + + unlink(stream_file_name.c_str()); + if (ftruncate(stream_holder->recv_mem_fd, (off_t) STREAM_PAGE_SIZE) != 0) { + LOG(ERROR) << "failed to ftruncate file " << stream_file_name; + close(stream_holder->recv_mem_fd); + return Status::IOError("failed to ftruncate file " + stream_file_name); + } + + stream_holder->recv_mem_base = + mmap(0, STREAM_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + stream_holder->recv_mem_fd, 0); + if (stream_holder->recv_mem_base == MAP_FAILED) { + LOG(ERROR) << "failed to mmap stream file '" << stream_file_name << "', " + << 
strerror(errno); + close(stream_holder->recv_mem_fd); + return Status::IOError("failed to mmap file '" + stream_file_name + + "', " + strerror(errno)); + } + VLOG(2) << "Create stream file:" << stream_file_name + << " fd:" << stream_holder->recv_mem_fd + << " base:" << stream_holder->recv_mem_base; + + memset(stream_holder->recv_mem_base, 0, STREAM_PAGE_SIZE); + } + std::string ttl_str = + read_env("VINEYARD_STREAM_TTL_S", std::to_string(UINT64_MAX)); + stream_holder->ttl = std::stoull(ttl_str); + streams_.emplace(stream_id, stream_holder); + return Status::OK(); +} + +Status StreamStore::PutName(std::string name, ObjectID stream_id) { + VLOG(2) << "Put name to stream, name: " << name + << ", stream_id: " << ObjectIDToString(stream_id); + std::lock_guard __guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "Failed to put name to stream: " << name + << ", stream not exists."; + return Status::ObjectNotExists("failed to put name to stream: " + name); + } + + if (stream_names_.find(name) != stream_names_.end()) { + LOG(ERROR) << "Failed to put name to stream: " << name + << ". Name already exists."; + return Status::ObjectExists("failed to put name to stream: " + name + + ". 
Name already exists."); + } + + stream_names_[name] = stream_id; + streams_[stream_id]->name = name; + return Status::OK(); +} + +Status StreamStore::BindRemoteStream(ObjectID local_stream_id, + ObjectID remote_stream_id, + std::string endpoint, + std::shared_ptr client) { + VLOG(2) << "BindRemoteStream"; + std::lock_guard __guard(this->mutex_); + if (streams_.find(local_stream_id) == streams_.end()) { + LOG(ERROR) << "Failed to bind remote stream: " + << ObjectIDToString(local_stream_id) << ", stream not exists."; + return Status::ObjectNotExists("failed to bind remote stream: " + + ObjectIDToString(local_stream_id)); + } + + VLOG(2) << "Bind local stream to remote stream, local_id:" + << ObjectIDToString(local_stream_id) + << ", remote_id:" << ObjectIDToString(remote_stream_id) + << ", endpoint:" << endpoint << ", remote client:" << client.get(); + + streams_[local_stream_id]->is_forked = true; + streams_[local_stream_id]->bind_stream_id = remote_stream_id; + streams_[local_stream_id]->endpoint = endpoint; + streams_[local_stream_id]->remote_client = client; + + return Status::OK(); +} + +Status StreamStore::UnbindRemoteStream(ObjectID local_stream_id) { + std::lock_guard __guard(this->mutex_); + if (streams_.find(local_stream_id) == streams_.end()) { + LOG(ERROR) << "Failed to unbind remote stream: " + << ObjectIDToString(local_stream_id) << ", stream not exists."; + return Status::ObjectNotExists("failed to unbind remote stream: " + + ObjectIDToString(local_stream_id)); + } + + VLOG(2) << "Unbind local stream, local_id:" + << ObjectIDToString(local_stream_id) << ", remote_id:" + << ObjectIDToString(streams_[local_stream_id]->bind_stream_id) + << ", endpoint:" << streams_[local_stream_id]->endpoint + << ", remote client:" + << streams_[local_stream_id]->remote_client.get(); + + streams_[local_stream_id]->bind_stream_id = InvalidObjectID(); + streams_[local_stream_id]->endpoint = ""; + streams_[local_stream_id]->remote_client = nullptr; + return Status::OK(); } 
-Status StreamStore::Open(ObjectID const stream_id, int64_t const mode) { +Status StreamStore::GetStreamIDByName(std::string name, ObjectID& stream_id) { std::lock_guard __guard(this->mutex_); + if (stream_names_.find(name) == stream_names_.end()) { + LOG(ERROR) << "Failed to get stream id by name: " << name; + return Status::ObjectNotExists("failed to get stream id by name: " + name); + } + + stream_id = stream_names_.at(name); + return Status::OK(); +} + +Status StreamStore::Open(ObjectID const stream_id, int64_t const mode, + std::string owner) { + VLOG(2) << "Try to open stream by id: " << ObjectIDToString(stream_id) + << ", mode: " << mode << ", owner: " << owner; + std::lock_guard __guard(this->mutex_); + if (stream_id == InvalidObjectID()) { + LOG(ERROR) << "Failed to open stream by id: " + << ObjectIDToString(stream_id); + return Status::ObjectNotExists("failed to open stream by id: " + + ObjectIDToString(stream_id)); + } + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "Failed to open stream by id: " + << ObjectIDToString(stream_id); return Status::ObjectNotExists("stream cannot be open: " + ObjectIDToString(stream_id)); } + + if (streams_[stream_id]->abort) { + LOG(ERROR) << "Failed to open stream by id: " << ObjectIDToString(stream_id) + << ", stream is aborted."; + return Status::InvalidStreamState("stream is aborted"); + } + if (streams_[stream_id]->open_mark & mode) { + LOG(ERROR) << "Failed to open stream by id: " << ObjectIDToString(stream_id) + << ", stream already opened."; return Status::StreamOpened(); } + VLOG(100) << "owner:" << owner << " mode:" << mode + << " read:" << (int64_t) StreamOpenMode::read + << " write:" << (int64_t) StreamOpenMode::write; + if (mode & (int64_t) StreamOpenMode::read) { + streams_[stream_id]->reader_owner = owner; + } + if (mode & (int64_t) StreamOpenMode::write) { + streams_[stream_id]->writer_owner = owner; + } streams_[stream_id]->open_mark |= mode; return Status::OK(); } +Status 
StreamStore::Open(std::string name, ObjectID& ret_id, int64_t const mode, + std::string owner) { + std::lock_guard __guard(this->mutex_); + if (stream_names_.find(name) == stream_names_.end()) { + return Status::ObjectNotExists("failed to open stream by name: " + name); + } + + Status status = Open(stream_names_.at(name), mode, owner); + if (status.ok()) { + ret_id = stream_names_.at(name); + } + return status; +} + +Status StreamStore::Open(ObjectID const stream_id, int64_t const mode, + std::string owner, bool wait, uint64_t timeout, + void_callback_t callback) { + std::lock_guard __guard(this->mutex_); + VLOG(100) << "open owner:" << owner; + auto self(shared_from_this()); + Status status = Open(stream_id, mode, owner); + if (status.IsObjectNotExists() && wait) { + LOG(INFO) << "Stream is not exist, waiting for it to be created"; + uint64_t start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + auto alive = [start, timeout]() -> bool { + uint64_t now = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + return now - start < timeout; + }; + auto test = [self, stream_id, mode, owner](Status& status, + ObjectID& ret_id) -> bool { + VLOG(100) << "Try to open stream: " << stream_id; + status = self->Open(stream_id, mode, owner); + if (status.IsObjectNotExists()) { + VLOG(100) << "Stream is not exist, waiting for it to be created"; + return false; + } + VLOG(100) << "Status:" << status.ToString(); + return true; + }; + auto call = [callback](Status& status, ObjectID id) { + callback(status, id); + }; + deferred_.emplace_back(alive, test, call); + return Status::OK(); + } else { + VLOG(100) << "Stream is already exist, call callback and return"; + callback(status, stream_id); + return Status::OK(); + } +} + +Status StreamStore::Open(std::string stream_name, int64_t const mode, + std::string owner, bool wait, uint64_t timeout, + void_callback_t callback) { + std::lock_guard 
__guard(this->mutex_); + auto self(shared_from_this()); + ObjectID stream_id = InvalidObjectID(); + Status status = Open(stream_name, stream_id, mode, owner); + if (status.IsObjectNotExists() && wait) { + VLOG(2) << "Stream is not exist, waiting for it to be created"; + uint64_t start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + auto alive = [start, timeout]() -> bool { + uint64_t now = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + return now - start < timeout; + }; + auto test = [self, stream_name, mode, owner](Status& status, + ObjectID& ret_id) -> bool { + VLOG(100) << "Try to open stream: " << stream_name; + status = self->Open(stream_name, ret_id, mode, owner); + if (status.IsObjectNotExists()) { + VLOG(100) << "Stream is not exist, waiting for it to be created"; + return false; + } + VLOG(100) << "Status:" << status.ToString(); + return true; + }; + auto call = [callback](Status& status, ObjectID id) { + callback(status, id); + }; + deferred_.emplace_back(alive, test, call); + return Status::OK(); + } else { + VLOG(2) << "Stream is already exist, call callback and return"; + callback(status, stream_id); + return Status::OK(); + } +} + // for producer: return the next chunk to write, and make current chunk // available for consumer to read Status StreamStore::Get(ObjectID const stream_id, size_t const size, @@ -120,6 +403,8 @@ Status StreamStore::Push(ObjectID const stream_id, ObjectID const chunk, callback_t callback) { std::lock_guard __guard(this->mutex_); if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "Failed to push to stream: " << ObjectIDToString(stream_id) + << ", stream not exists."; return callback(Status::ObjectNotExists("failed to push to stream"), InvalidObjectID()); } @@ -129,19 +414,35 @@ Status StreamStore::Push(ObjectID const stream_id, ObjectID const chunk, CHECK_STREAM_STATE(!stream->writer_); 
CHECK_STREAM_STATE(!stream->drained && !stream->failed); + if (stream->type == STREAM_TYPE::FIXED_SIZE_STREAM) { + if (stream->pushed_nums == stream->blob_nums) { + LOG(ERROR) << "Stream is full."; + return callback(Status::IOError("Stream is full."), InvalidObjectID()); + } + if (stream->abort) { + LOG(ERROR) << "Stream is aborted."; + return callback(Status::IOError("Stream is aborted."), InvalidObjectID()); + } + } + // seal current chunk stream->ready_chunks_.push(chunk); + stream->pushed_nums++; // weak up the pending reader - if (stream->reader_) { - // should be no reading chunk - CHECK_STREAM_STATE(!stream->current_reading_); - if (!stream->ready_chunks_.empty()) { - stream->current_reading_ = stream->ready_chunks_.front(); - stream->ready_chunks_.pop(); - VINEYARD_SUPPRESS( - stream->reader_.get()(Status::OK(), stream->current_reading_.get())); - stream->reader_ = boost::none; + if (stream->type == STREAM_TYPE::NOMAL_STREAM) { + if (stream->reader_) { + // should be no reading chunk + CHECK_STREAM_STATE(!stream->current_reading_); + if (!stream->ready_chunks_.empty()) { + stream->current_reading_ = stream->ready_chunks_.front(); + stream->ready_chunks_.pop(); + VINEYARD_SUPPRESS(stream->reader_.get()( + Status::OK(), stream->current_reading_.get())); + stream->reader_ = boost::none; + } + } else { + VLOG(100) << "No reader, push chunk to ready queue"; } } @@ -221,6 +522,316 @@ Status StreamStore::Pull(ObjectID const stream_id, } } +void StreamStore::AutoRead( + std::shared_ptr stream, + callback_t, std::vector, + std::vector, int> + callback) { + std::vector local_buffers; + std::vector> addr_list; + std::vector> rkey_list; + std::vector> size_list; + std::vector index_list; + { + std::lock_guard __guard(this->mutex_); + if (stream == nullptr) { + LOG(ERROR) << "Stream object invalid."; + callback(Status::Invalid("Stream object invalid."), InvalidObjectID(), + {0}, {0}, {0}, -1); + return; + } + + if (stream->abort) { + LOG(ERROR) << "Stream is aborted."; 
+ stream->transfer_finished = true; + callback(Status::IOError("Stream is aborted."), InvalidObjectID(), {0}, + {0}, {0}, -1); + return; + } + + while (!stream->ready_chunks_.empty()) { + stream->current_reading_ = stream->ready_chunks_.front(); + stream->ready_chunks_.pop(); + local_buffers.push_back(stream->current_reading_.get()); + addr_list.push_back(stream->receive_addr_list[stream->read_index]); + rkey_list.push_back(stream->rkeys_list[stream->read_index]); + size_list.push_back(stream->sizes_list[stream->read_index]); + index_list.push_back(stream->read_index); + + stream->read_index++; + stream->current_reading_ = boost::none; + } + } + + for (size_t i = 0; i < local_buffers.size(); i++) { + { + std::lock_guard __guard(this->mutex_); + if (stream->abort) { + LOG(ERROR) << "Stream is aborted. Interrupt the transfer."; + callback(Status::IOError("Stream is aborted."), InvalidObjectID(), {0}, + {0}, {0}, -1); + stream->transfer_finished = true; + return; + } + } + + Status status = callback(Status::OK(), local_buffers[i], addr_list[i], + rkey_list[i], size_list[i], index_list[i]); + + { + std::lock_guard __guard(this->mutex_); + if (!status.ok()) { + LOG(ERROR) << "Failed to send data to remote: " << status.ToString(); + stream->transfer_finished = true; + return; + } + } + } + + { + std::lock_guard __guard(this->mutex_); + if (stream->blob_received_nums == stream->blob_nums) { + stream->transfer_finished = true; + } else { + boost::asio::post( + this->server_->GetIOContext(), + [this, stream, callback]() { this->AutoRead(stream, callback); }); + } + } +} + +void StreamStore::ActivateRemoteFixedStream( + ObjectID stream_id, std::vector> recv_addr_list, + std::vector> rkeys, + std::vector> sizes_list, + callback_t, std::vector, + std::vector, int> + callback) { + VLOG(2) << "Activate remote fixed stream, stream_id: " + << ObjectIDToString(stream_id); + std::lock_guard __guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << 
"failed to pull from stream, stream id: " + << ObjectIDToString(stream_id); + callback(Status::ObjectNotExists("failed to pull from stream, stream id: " + + ObjectIDToString(stream_id)), + InvalidObjectID(), {0}, {0}, {0}, -1); + return; + } + auto stream = streams_.at(stream_id); + + if (stream->type != FIXED_SIZE_STREAM) { + LOG(ERROR) << "Stream is not fixed size stream"; + callback(Status::InvalidStreamState("Stream is not fixed size stream"), + InvalidObjectID(), {0}, {0}, {0}, -1); + return; + } + + if (stream->abort) { + LOG(ERROR) << "Stream is aborted."; + callback(Status::IOError("Stream is aborted."), InvalidObjectID(), {0}, {0}, + {0}, -1); + return; + } + + stream->receive_addr_list = std::move(recv_addr_list); + stream->rkeys_list = std::move(rkeys); + stream->sizes_list = std::move(sizes_list); + + stream->auto_reader_ = callback; + stream->transfer_finished = false; + + VLOG(2) << "Post read task to IOContext, stream_id: " + << ObjectIDToString(stream_id); + auto self(shared_from_this()); + boost::asio::post(server_->GetIOContext(), [callback, stream, self]() { + self->AutoRead(stream, callback); + }); + + // if stream has been stopped, return a proper status. 
+ if (stream->drained) { + callback(Status::StreamDrained(), InvalidObjectID(), {0}, {0}, {0}, -1); + } else if (stream->failed) { + callback(Status::StreamFailed(), InvalidObjectID(), {0}, {0}, {0}, -1); + } +} + +void StreamStore::AutoRead(std::shared_ptr stream, + callback_t callback) { + std::vector local_buffers; + std::vector index_list; + { + std::lock_guard __guard(this->mutex_); + if (stream == nullptr) { + LOG(ERROR) << "Stream object invalid."; + callback(Status::Invalid("Stream object invalid."), InvalidObjectID(), + -1); + return; + } + + if (stream->abort) { + LOG(ERROR) << "Stream is aborted."; + stream->transfer_finished = true; + callback(Status::IOError("Stream is aborted."), InvalidObjectID(), -1); + return; + } + + while (!stream->ready_chunks_.empty()) { + stream->current_reading_ = stream->ready_chunks_.front(); + stream->ready_chunks_.pop(); + local_buffers.push_back(stream->current_reading_.get()); + index_list.push_back(stream->read_index); + + stream->read_index++; + stream->current_reading_ = boost::none; + } + } + + for (size_t i = 0; i < local_buffers.size(); i++) { + { + std::lock_guard __guard(this->mutex_); + if (stream->abort) { + callback(Status::IOError("Stream is aborted."), InvalidObjectID(), -1); + stream->transfer_finished = true; + return; + } + } + + Status status = callback(Status::OK(), local_buffers[i], index_list[i]); + + { + std::lock_guard __guard(this->mutex_); + if (!status.ok()) { + LOG(ERROR) << "Failed to send data to remote: " << status.ToString(); + stream->transfer_finished = true; + return; + } + } + } + + { + std::lock_guard __guard(this->mutex_); + if (stream->blob_received_nums == stream->blob_nums) { + stream->transfer_finished = true; + } else { + boost::asio::post( + this->server_->GetIOContext(), + [this, stream, callback]() { this->AutoRead(stream, callback); }); + } + } +} + +void StreamStore::ActivateRemoteFixedStream( + ObjectID stream_id, callback_t callback) { + std::lock_guard 
__guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "failed to pull from stream, stream id: " + << ObjectIDToString(stream_id); + callback(Status::ObjectNotExists("failed to pull from stream, stream id: " + + ObjectIDToString(stream_id)), + InvalidObjectID(), -1); + return; + } + auto stream = streams_.at(stream_id); + + if (stream->type != FIXED_SIZE_STREAM) { + LOG(ERROR) << "Stream is not fixed size stream"; + callback(Status::InvalidStreamState("Stream is not fixed size stream"), + InvalidObjectID(), -1); + return; + } + + if (stream->abort) { + LOG(ERROR) << "Stream is aborted."; + callback(Status::IOError("Stream is aborted."), InvalidObjectID(), -1); + return; + } + + stream->auto_reader_test_ = callback; + stream->transfer_finished = false; + + auto self(shared_from_this()); + boost::asio::post(server_->GetIOContext(), [callback, stream, self]() { + self->AutoRead(stream, callback); + }); + + // if stream has been stopped, return a proper status. 
+ if (stream->drained) { + callback(Status::StreamDrained(), InvalidObjectID(), -1); + } else if (stream->failed) { + callback(Status::StreamFailed(), InvalidObjectID(), -1); + } +} + +Status StreamStore::CheckBlobReceived(ObjectID stream_id, int index, + bool& finished) { + std::lock_guard __guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "failed to check blob received, stream id: " + << ObjectIDToString(stream_id); + return Status::ObjectNotExists( + "failed to check blob received, stream id: " + + ObjectIDToString(stream_id)); + } + auto stream = streams_.at(stream_id); + + if (stream->type != FIXED_SIZE_STREAM) { + LOG(ERROR) << "Stream is not fixed size stream"; + return Status::InvalidStreamState("Stream is not fixed size stream"); + } + + if (index < 0) { + for (int i = 0; i < stream->blob_nums; i++) { + if (stream->blob_received.find(i) == stream->blob_received.end()) { + finished = false; + return Status::OK(); + } + } + finished = true; + } else { + if (index >= stream->blob_nums) { + LOG(ERROR) << "Index out of range"; + return Status::InvalidStreamState("Index out of range"); + } + + if (stream->blob_received.find(index) == stream->blob_received.end()) { + finished = false; + } else { + finished = true; + } + } + + return Status::OK(); +} + +Status StreamStore::SetBlobReceived(ObjectID stream_id, int index) { + std::lock_guard __guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "failed to set blob received, stream id: " + << ObjectIDToString(stream_id); + return Status::ObjectNotExists("failed to set blob received, stream id: " + + ObjectIDToString(stream_id)); + } + auto stream = streams_.at(stream_id); + + if (stream->type != FIXED_SIZE_STREAM) { + LOG(ERROR) << "Stream is not fixed size stream"; + return Status::InvalidStreamState("Stream is not fixed size stream"); + } + + if (index < 0 || index >= stream->blob_nums || index >= STREAM_PAGE_SIZE) { + LOG(ERROR) << "Index 
out of range"; + return Status::InvalidStreamState("Index out of range"); + } + + stream->blob_received.insert(index); + stream->blob_received_nums++; + + reinterpret_cast(stream->recv_mem_base)[index] = 1; + + return Status::OK(); +} + Status StreamStore::Stop(ObjectID const stream_id, bool failed) { std::lock_guard __guard(this->mutex_); if (streams_.find(stream_id) == streams_.end()) { @@ -329,6 +940,227 @@ Status StreamStore::Drop(ObjectID const stream_id) { return Status::OK(); } +Status StreamStore::Close(ObjectID const stream_id, std::string access_key) { + std::lock_guard __guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "failed to close stream: " << ObjectIDToString(stream_id) + << ", stream is not found."; + return Status::ObjectNotExists("failed to close stream: " + + ObjectIDToString(stream_id)); + } + VLOG(100) << "Close stream: " << stream_id; + auto stream = streams_.at(stream_id); + if (stream->type == FIXED_SIZE_STREAM) { + if (stream->reader_owner == access_key) { + stream->reader_owner = ""; + stream->reader_ = boost::none; + stream->open_mark &= ~(static_cast(StreamOpenMode::read)); + VLOG(100) << "Close reader, open mode:" << stream->open_mark; + } else if (stream->writer_owner == access_key) { + stream->writer_owner = ""; + stream->open_mark &= ~(static_cast(StreamOpenMode::write)); + VLOG(100) << "Close writer, open mode:" << stream->open_mark; + } else { + VLOG(100) << "access key: " << access_key + << " reader: " << stream->reader_owner + << " writer: " << stream->writer_owner; + return Status::Invalid("Invalid access key, access denied."); + } + } else { + LOG(ERROR) << "Close is not supported for this stream."; + return Status::NotImplemented("Close is not supported for this stream."); + } + return Status::OK(); +} + +Status StreamStore::SetErrorFlag(ObjectID const stream_id, Status const error) { + std::lock_guard __guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + 
LOG(ERROR) << "failed to set error flag for stream: " + << ObjectIDToString(stream_id) << ", stream not found."; + return Status::ObjectNotExists("failed to set error flag for stream: " + + ObjectIDToString(stream_id)); + } + + auto stream = streams_.at(stream_id); + if (stream->type != FIXED_SIZE_STREAM) { + LOG(ERROR) << "Set error flag is not supported for this stream."; + return Status::NotImplemented( + "Set error flag is not supported for this stream."); + } + + std::string error_str = error.ToString().substr(0, STREAM_ERROR_LENGTH); + memcpy(reinterpret_cast(stream->recv_mem_base) + + (STREAM_PAGE_SIZE - STREAM_ERROR_LENGTH - sizeof(unsigned char)), + error_str.c_str(), error_str.size()); + reinterpret_cast( + stream->recv_mem_base)[STREAM_PAGE_SIZE - sizeof(unsigned char)] = + static_cast(error.code()); + return Status::OK(); +} + +Status StreamStore::Abort(ObjectID const stream_id, bool& success) { + VLOG(2) << "Try to abort stream: " << ObjectIDToString(stream_id); + std::lock_guard __guard(this->mutex_); + if (streams_.find(stream_id) == streams_.end()) { + LOG(ERROR) << "failed to abort stream: " << ObjectIDToString(stream_id) + << ", stream is not found."; + return Status::ObjectNotExists("failed to abort stream: " + + ObjectIDToString(stream_id)); + } + auto stream = streams_.at(stream_id); + stream->abort = true; + if (stream->auto_reader_ == boost::none) { + // means that the stream is not activated + stream->transfer_finished = true; + } + // To prevent the stream is not activated. To prevent the sender from waiting + // forever after push, we set the error flag here. 
+ SetErrorFlag(stream_id, Status::IOError("Stream is aborted.")); + success = stream->transfer_finished; + VLOG(2) << "Stream id: " << ObjectIDToString(stream_id) + << ", abort result:" << success; + return Status::OK(); +} + +Status StreamStore::Delete(ObjectID const stream_id) { + std::lock_guard __guard(this->mutex_); + auto self(shared_from_this()); + if (streams_.find(stream_id) == streams_.end()) { + LOG(WARNING) << "Delete stream not found: " << stream_id; + return Status::OK(); + } + auto stream = streams_.at(stream_id); + stream->start_time = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(2) << "Try to delete stream:" << stream->name + << ", stream id:" << ObjectIDToString(stream_id) + << ", open mark: " << stream->open_mark << ", type: " << stream->type + << ", ttl: " << stream->ttl; + if (stream->type == FIXED_SIZE_STREAM) { + if (stream->open_mark) { + VLOG(2) << "Stream is still open, defered delete it."; + auto alive_t = [self, stream_id]() -> bool { return true; }; + auto test_t = [self, stream_id](Status& status, + ObjectID& ret_id) -> bool { + std::lock_guard __guard(self->mutex_); + VLOG(100) << "Check if the stream can be deleted"; + if (self->streams_.find(stream_id) == self->streams_.end()) { + status = Status::ObjectNotExists("stream not found"); + return true; + } else if (self->streams_[stream_id]->open_mark) { + if (std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count() - + self->streams_[stream_id]->start_time > + self->streams_[stream_id]->ttl) { + LOG(WARNING) << "Stream is still open, but timeout, force to " + "delete it"; + LOG(WARNING) << "Stream name: " << self->streams_[stream_id]->name + << " open mark: " + << self->streams_[stream_id]->open_mark + << " ttl: " << self->streams_[stream_id]->ttl + << " start time: " + << self->streams_[stream_id]->start_time; + return true; + } + VLOG(100) + << "Stream is still open, waiting for it to be 
closed.open mark: " + << self->streams_[stream_id]->open_mark + << " name:" << self->streams_[stream_id]->name; + VLOG(100) << "stream is abort: " << self->streams_[stream_id]->abort; + return false; + } else { + VLOG(100) << "Stream can be deleted"; + return true; + } + }; + auto call_t = [self, stream_id](Status& status, ObjectID id) { + std::lock_guard __guard(self->mutex_); + self->PrintStreamInfo(); + if (self->streams_.find(stream_id) == self->streams_.end()) { + return; + } + auto stream_ = self->streams_[stream_id]; + if (stream_->type == FIXED_SIZE_STREAM) { + if (stream_->recv_mem_base != nullptr) { + munmap(stream_->recv_mem_base, STREAM_PAGE_SIZE); + close(stream_->recv_mem_fd); + } + while (!stream_->ready_chunks_.empty()) { + stream_->current_reading_ = stream_->ready_chunks_.front(); + stream_->ready_chunks_.pop(); + self->server_->GetBulkStore()->DeleteUserBlob( + stream_->current_reading_.get()); + } + } + for (auto item : self->stream_names_) { + if (item.second == stream_id) { + VLOG(100) << "Delete stream name: " << item.first; + self->stream_names_.erase(item.first); + break; + } + } + VLOG(100) << "Delete stream: " << stream_id; + auto stream = self->streams_[stream_id]; + self->streams_.erase(stream_id); + VLOG(100) << "Stream deleted: " << stream_id; + self->PrintStreamInfo(); + }; + VLOG(100) << "Defered delete stream: " << stream_id; + deferred_.emplace_back(alive_t, test_t, call_t); + } else { + VLOG(2) << "Stream closed, delete it directly."; + PrintStreamInfo(); + if (stream->recv_mem_base != nullptr) { + munmap(stream->recv_mem_base, STREAM_PAGE_SIZE); + close(stream->recv_mem_fd); + } + while (!stream->ready_chunks_.empty()) { + stream->current_reading_ = stream->ready_chunks_.front(); + stream->ready_chunks_.pop(); + self->server_->GetBulkStore()->DeleteUserBlob( + stream->current_reading_.get()); + } + for (auto item : stream_names_) { + if (item.second == stream_id) { + stream_names_.erase(item.first); + VLOG(100) << "Delete stream 
name: " << item.first; + break; + } + } + streams_.erase(stream_id); + VLOG(100) << "Stream deleted: " << stream_id; + PrintStreamInfo(); + } + } else { + LOG(INFO) << "Delete is not supported for this stream."; + return Status::NotImplemented("Delete is not supported for this stream."); + } + + return Status::OK(); +} + +Status StreamStore::CleanResource(std::string owner) { + VLOG(2) << "Clean resource for owner: " << owner; + std::lock_guard __guard(this->mutex_); + for (auto iter = streams_.begin(); iter != streams_.end(); iter++) { + auto stream = iter->second; + if (stream->reader_owner == owner || stream->writer_owner == owner) { + if (stream->type == FIXED_SIZE_STREAM) { + LOG(INFO) << "Clean and abort stream: " << iter->first; + stream->abort = true; + Status status = + SetErrorFlag(iter->first, Status::IOError("Stream is aborted.")); + LOG(INFO) << "set error status:" << status.ToString(); + Close(iter->first, owner); + } + } + } + return Status::OK(); +} + bool StreamStore::allocatable(std::shared_ptr stream, size_t size) { if (store_->Footprint() + size < diff --git a/src/server/memory/stream_store.h b/src/server/memory/stream_store.h index 0add9ddee..7848d4d38 100644 --- a/src/server/memory/stream_store.h +++ b/src/server/memory/stream_store.h @@ -16,22 +16,44 @@ limitations under the License. #ifndef SRC_SERVER_MEMORY_STREAM_STORE_H_ #define SRC_SERVER_MEMORY_STREAM_STORE_H_ +#include +#include #include #include #include +#include +#include #include #include +#include #include "boost/optional/optional.hpp" +#include "common/util/asio.h" #include "common/util/callback.h" +#include "common/util/uuid.h" #include "server/memory/memory.h" +#include "server/util/remote.h" +#include "server/util/utils.h" namespace vineyard { +#define STREAM_PAGE_SIZE 4096 +#define STREAM_ERROR_LENGTH 256 + // forward declarations. 
class VineyardServer; +enum STREAM_TYPE { + NOMAL_STREAM = 0, + FIXED_SIZE_STREAM = 1, +}; + +enum class StreamOpenMode { + read = 1, + write = 2, +}; + /** * @brief StreamHolder aims to maintain all chunks for a single stream. * "Stream" is a special kind of "Object" in vineyard, which represents @@ -43,24 +65,141 @@ struct StreamHolder { boost::optional current_writing_, current_reading_; std::queue ready_chunks_; boost::optional> reader_; + boost::optional, + std::vector, std::vector, int>> + auto_reader_ = boost::none; + + boost::optional> auto_reader_test_ = boost::none; boost::optional>> writer_; bool drained{false}, failed{false}; int64_t open_mark{0}; + STREAM_TYPE type{STREAM_TYPE::NOMAL_STREAM}; + int blob_nums; + int pushed_nums = 0; + size_t blob_size; + int read_index; + bool abort = false; + bool transfer_finished = true; + std::string reader_owner = ""; + std::string writer_owner = ""; + std::vector> receive_addr_list; + std::vector> rkeys_list; + std::vector> sizes_list; + + ObjectID bind_stream_id; + std::string name; + std::string endpoint; + std::shared_ptr remote_client; + bool is_forked = false; + std::set blob_received; + int blob_received_nums = 0; + uint64_t ttl = UINT64_MAX; + uint64_t start_time = 0; + // pointer to the memory region of the received flag array + int recv_mem_fd = -1; + void* recv_mem_base; + + void SetFixedBlobStream(int nums, size_t size) { + type = FIXED_SIZE_STREAM; + blob_nums = nums; + blob_size = size; + read_index = 0; + } +}; + +class DeferredStream { + public: + using alive_t = std::function; + using test_t = std::function; + using call_t = std::function; + + DeferredStream(alive_t alive_fn, test_t test_fn, call_t call_fn) + : alive_fn_(alive_fn), test_fn_(test_fn), call_fn_(call_fn) {} + + bool Alive() const; + + bool TestThenCall() const; + + private: + alive_t alive_fn_; + test_t test_fn_; + call_t call_fn_; }; /** * @brief StreamStore manages a pool of streams. 
* */ -class StreamStore { +class StreamStore : public std::enable_shared_from_this { public: StreamStore(std::shared_ptr server, - std::shared_ptr store, size_t const stream_threshold) - : server_(server), store_(store), threshold_(stream_threshold) {} + std::shared_ptr store, size_t const stream_threshold, + boost::asio::io_context& context) + : server_(server), + store_(store), + threshold_(stream_threshold), + timer_(context, boost::asio::chrono::milliseconds(timer_millseconds_)) { + ProcessDefered(); + } + + ~StreamStore() { + std::lock_guard guard(deferred_mutex_); + deferred_.clear(); + } + + Status Create(ObjectID const stream_id, bool fixed_size = false, int nums = 0, + size_t size = 0); + + Status PutName(std::string name, ObjectID stream_id); + + Status GetStreamIDByName(std::string name, ObjectID& stream_id); + + Status Open(ObjectID const stream_id, int64_t const mode, std::string owner); + + Status Open(std::string name, ObjectID& ret_id, int64_t const mode, + std::string owner); + + Status Open(ObjectID const stream_id, int64_t const mode, std::string owner, + bool wait, uint64_t timeout, + void_callback_t callback); - Status Create(ObjectID const stream_id); + Status Open(std::string stream_name, int64_t const mode, std::string owner, + bool wait, uint64_t timeout, + void_callback_t callback); - Status Open(ObjectID const stream_id, int64_t const mode); + Status BindRemoteStream(ObjectID local_stream_id, ObjectID remote_stream_id, + std::string endpoint, + std::shared_ptr client); + + Status UnbindRemoteStream(ObjectID local_stream_id); + + Status SetErrorFlag(ObjectID const stream_id, Status const error); + + void PrintStreamInfo() { + std::lock_guard guard(mutex_); + VLOG(100) << "-----------------"; + VLOG(100) << "stream_name_list:"; + for (auto item : stream_names_) { + VLOG(100) << "stream_name: " << item.first + << " stream_id: " << item.second; + } + VLOG(100) << "stream_list:"; + for (auto item : streams_) { + VLOG(100) << "stream_id: " << 
item.first + << " stream_name: " << item.second->name; + } + static uint64_t last_time = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + uint64_t now = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + if (now - last_time > SECOND_TO_MILLISECOND(3)) { + LOG(INFO) << "Currently activate stream size:" << streams_.size(); + last_time = now; + } + } /** * @brief This is called by the producer of the stream and it makes current @@ -84,6 +223,78 @@ class StreamStore { */ Status Pull(ObjectID const stream_id, callback_t callback); + void ActivateRemoteFixedStream( + ObjectID stream_id, std::vector> recv_addr_list, + std::vector> rkeys, + std::vector> sizes_list, + callback_t, std::vector, + std::vector, int> + callback); + + void ActivateRemoteFixedStream(ObjectID stream_id, + callback_t callback); + + Status CheckBlobReceived(ObjectID stream_id, int index, bool& finished); + + Status SetBlobReceived(ObjectID stream_id, int index); + + Status GetRemoteInfo(ObjectID stream_id, ObjectID& remote_stream_id, + std::string& endpoint, + std::shared_ptr& client) { + std::lock_guard guard(mutex_); + auto stream = streams_.find(stream_id); + if (stream == streams_.end()) { + return Status::ObjectNotExists("stream not found"); + } + if (stream->second->is_forked) { + remote_stream_id = stream->second->bind_stream_id; + endpoint = stream->second->endpoint; + client = stream->second->remote_client; + return Status::OK(); + } else { + return Status::Invalid("stream is not forked"); + } + } + + Status GetFixedStreamSizeInfo(ObjectID stream_id, size_t& size, int& nums) { + std::lock_guard guard(mutex_); + auto stream = streams_.find(stream_id); + if (stream == streams_.end()) { + return Status::ObjectNotExists("stream not found"); + } + if (stream->second->type == FIXED_SIZE_STREAM) { + size = stream->second->blob_size; + nums = stream->second->blob_nums; + return Status::OK(); + } else { + 
return Status::Invalid("stream is not fixed size stream"); + } + } + + Status ProcessDefered() { + std::lock_guard guard(deferred_mutex_); + auto iter = deferred_.begin(); + while (iter != deferred_.end()) { + if (iter->TestThenCall()) { + VLOG(100) << "Remove from defered stream"; + iter = deferred_.erase(iter); + } else { + VLOG(100) << "Keep defered stream"; + ++iter; + } + } + + timer_.expires_after(boost::asio::chrono::milliseconds(1)); + timer_.async_wait([this](const boost::system::error_code& ec) { + if (ec) { + LOG(ERROR) << "Timer error: " << ec.message(); + return; + } + this->ProcessDefered(); + }); + return Status::OK(); + } + /** * @brief Function stop is called by the vineyard clients. * @@ -97,9 +308,61 @@ class StreamStore { */ Status Drop(ObjectID const stream_id); + Status Close(ObjectID const stream_id, std::string owner); + + Status Delete(ObjectID const stream_id); + + Status CleanResource(std::string owner); + + Status Abort(ObjectID const stream_id, bool& success); + + Status IsFixedStreamTransferFinished(ObjectID const stream_id, + bool& finished) { + std::lock_guard guard(mutex_); + if (streams_.find(stream_id) == streams_.end()) { + return Status::ObjectNotExists("stream not found"); + } + auto stream = streams_.at(stream_id); + if (stream->type == FIXED_SIZE_STREAM) { + finished = stream->blob_received_nums == stream->blob_nums; + return Status::OK(); + } else { + return Status::Invalid("stream is not fixed size stream"); + } + } + + std::string static BuildOwner(std::string host, int conn_id) { + return host + ":" + std::to_string(conn_id); + } + + Status GetRecvFd(ObjectID const stream_id, int& fd) { + std::lock_guard guard(mutex_); + auto stream = streams_.find(stream_id); + if (stream == streams_.end()) { + return Status::ObjectNotExists("stream not found"); + } + if (stream->second->recv_mem_fd == -1) { + return Status::Invalid("stream mmap not ready"); + } + fd = stream->second->recv_mem_fd; + return Status::OK(); + } + private: 
bool allocatable(std::shared_ptr stream, size_t size); + void AutoRead(std::shared_ptr stream, + callback_t, + std::vector, std::vector, int> + callback); + + void AutoRead(std::shared_ptr stream, + callback_t callback); + + bool IsStreamFinished(std::shared_ptr stream) { + return stream->blob_received_nums == stream->blob_nums; + } + // protect the stream store std::recursive_mutex mutex_; @@ -107,6 +370,13 @@ class StreamStore { std::shared_ptr store_; size_t threshold_; std::unordered_map> streams_; + std::unordered_map stream_names_; + uint64_t timer_millseconds_ = 1; + + boost::asio::steady_timer timer_; + + std::list deferred_; + std::recursive_mutex deferred_mutex_; }; } // namespace vineyard diff --git a/src/server/memory/usage.h b/src/server/memory/usage.h index 33f4a1b81..095dae1a5 100644 --- a/src/server/memory/usage.h +++ b/src/server/memory/usage.h @@ -30,13 +30,16 @@ limitations under the License. #include "libcuckoo/cuckoohash_map.hh" #include "common/memory/payload.h" -#include "common/util/arrow.h" #include "common/util/lifecycle.h" #include "common/util/logging.h" // IWYU pragma: keep #include "common/util/status.h" #include "server/memory/allocator.h" + +#ifdef BUILD_VINEYARDD_SPILLING +#include "common/util/arrow.h" #include "server/util/file_io_adaptor.h" #include "server/util/spill_file.h" +#endif // BUILD_VINEYARDD_SPILLING namespace vineyard { @@ -398,10 +401,12 @@ class ColdObjectTracker ColdObjectTracker() {} ~ColdObjectTracker() { +#ifdef BUILD_VINEYARDD_SPILLING if (!spill_path_.empty()) { io::FileIOAdaptor io_adaptor(spill_path_); DISCARD_ARROW_ERROR(io_adaptor.DeleteDir()); } +#endif // BUILD_VINEYARDD_SPILLING } /** @@ -619,6 +624,7 @@ class ColdObjectTracker protected: Status SpillPayload(const std::shared_ptr

& payload) { +#ifdef BUILD_VINEYARDD_SPILLING if (!payload->is_sealed) { return Status::ObjectNotSealed( "payload is not sealed and cannot be spilled: " + @@ -637,9 +643,13 @@ class ColdObjectTracker payload->pointer = nullptr; payload->is_spilled = true; return Status::OK(); +#else + return Status::Invalid("Spilling is not enabled"); +#endif // BUILD_VINEYARDD_SPILLING } Status ReloadPayload(const ID id, const std::shared_ptr

& payload) { +#ifdef BUILD_VINEYARDD_SPILLING if (!payload->is_spilled) { return Status::ObjectNotSpilled(payload->object_id); } @@ -649,15 +659,21 @@ class ColdObjectTracker } payload->is_spilled = false; return this->DeletePayloadFile(id); +#else + return Status::Invalid("Spilling is not enabled"); +#endif // BUILD_VINEYARDD_SPILLING } Status DeletePayloadFile(const ID id) { +#ifdef BUILD_VINEYARDD_SPILLING io::FileIOAdaptor io_adaptor(spill_path_); RETURN_ON_ERROR(io_adaptor.RemoveFile(spill_path_ + std::to_string(id))); +#endif // BUILD_VINEYARDD_SPILLING return Status::OK(); } void SetSpillPath(const std::string& spill_path) { +#ifdef BUILD_VINEYARDD_SPILLING spill_path_ = spill_path; if (spill_path.empty()) { LOG(INFO) << "No spill path set, spill has been disabled ..."; @@ -676,6 +692,7 @@ class ColdObjectTracker << "' doesn't exist, or vineyardd doesn't have the write permission"; spill_path_.clear(); } +#endif // BUILD_VINEYARDD_SPILLING } private: diff --git a/src/server/server/vineyard_runner.cc b/src/server/server/vineyard_runner.cc index 72d397d26..48afd27b1 100644 --- a/src/server/server/vineyard_runner.cc +++ b/src/server/server/vineyard_runner.cc @@ -31,7 +31,9 @@ namespace vineyard { VineyardRunner::VineyardRunner(const json& spec) : spec_template_(spec), - concurrency_(std::thread::hardware_concurrency()), + concurrency_(std::stoi( + read_env("VINEYARD_CONCURRENCY", + std::to_string(std::thread::hardware_concurrency())))), context_(concurrency_), meta_context_(), io_context_(concurrency_), @@ -45,6 +47,11 @@ VineyardRunner::VineyardRunner(const json& spec) io_guard_(asio::make_work_guard(io_context_)) #endif { + LOG(INFO) << "Vineyard runner is created with concurrency: " << concurrency_ + << ", means context threads: " << concurrency_ + << ", meta context threads: " << 1 + << ", io context threads: " << concurrency_ / 2 + << ", spec: " << spec_template_.dump(); } std::shared_ptr VineyardRunner::Get(const json& spec) { diff --git 
a/src/server/server/vineyard_server.cc b/src/server/server/vineyard_server.cc index 982f8d538..5d080f7e1 100644 --- a/src/server/server/vineyard_server.cc +++ b/src/server/server/vineyard_server.cc @@ -26,12 +26,15 @@ limitations under the License. #include #include +#include "common/memory/memcpy.h" #include "common/util/uuid.h" #include "gulrak/filesystem.hpp" +#include #include "common/util/callback.h" #include "common/util/json.h" #include "common/util/logging.h" // IWYU pragma: keep +#include "common/util/sidecar.h" #include "server/async/ipc_server.h" #include "server/async/rpc_server.h" #include "server/services/meta_service.h" @@ -67,6 +70,12 @@ bool DeferredReq::TestThenCall(const json& meta) const { return false; } +std::vector SplitString(const std::string& input) { + std::vector result; + boost::split(result, input, boost::is_any_of(",")); + return result; +} + VineyardServer::VineyardServer(const json& spec, const SessionID& session_id, std::shared_ptr runner, asio::io_context& context, @@ -80,7 +89,9 @@ VineyardServer::VineyardServer(const json& spec, const SessionID& session_id, io_context_(io_context), callback_(callback), runner_(runner), - ready_(0) {} + ready_(0) { + this->trace_log_level_ = stoi(VineyardEnv::GetVineyardTraceLogLevel()); +} template <> std::shared_ptr VineyardServer::GetBulkStore() { @@ -95,10 +106,16 @@ std::shared_ptr VineyardServer::GetBulkStore() { Status VineyardServer::Serve(StoreType const& bulk_store_type, const bool create_new_instance) { stopped_.store(false); + char* ld_library_path = getenv("LD_LIBRARY_PATH"); + LOG(INFO) << "LD_LIBRARY_PATH: " + << (ld_library_path == nullptr ? 
"nullptr" + : std::string(ld_library_path)); + this->bulk_store_type_ = bulk_store_type; this->meta_service_ptr_ = IMetaService::Get(shared_from_this()); - RETURN_ON_ERROR(this->meta_service_ptr_->Start(create_new_instance)); + RETURN_ON_ERROR(this->meta_service_ptr_->Start( + create_new_instance)); // temporary solution // Initialize the ipc/rpc server ptr after the meta service. // It's useful to probe whether the vineyardd and meta service are both @@ -109,6 +126,8 @@ Status VineyardServer::Serve(StoreType const& bulk_store_type, // of "Register" request in RPC server the session will be set as the // request session as expected. rpc_server_ptr_ = std::make_shared(shared_from_this()); + remote_client_pool_ = + std::make_shared(shared_from_this(), rpc_server_ptr_); } auto memory_limit = spec_["bulkstore_spec"]["memory_size"].get(); @@ -151,7 +170,8 @@ Status VineyardServer::Serve(StoreType const& bulk_store_type, // setup stream store stream_store_ = std::make_shared( shared_from_this(), bulk_store_, - spec_["bulkstore_spec"]["stream_threshold"].get()); + spec_["bulkstore_spec"]["stream_threshold"].get(), + GetIOContext()); } BulkReady(); @@ -664,6 +684,47 @@ Status VineyardServer::Persist(const ObjectID id, callback_t<> callback) { return Status::OK(); } +Status VineyardServer::Persist(const std::vector& ids, + callback_t<> callback) { + ENSURE_VINEYARDD_READY(); + auto self(shared_from_this()); + for (auto& id : ids) { + RETURN_ON_ASSERT(!IsBlob(id), "The blobs cannot be persisted"); + } + meta_service_ptr_->RequestToPersist( + [self, ids](const Status& status, const json& meta, + std::vector& ops) { + if (status.ok()) { + for (auto const& id : ids) { + Status s; + VCATCH_JSON_ERROR( + meta, s, + meta_tree::PersistOps(meta, self->instance_name(), id, ops)); + if (s.ok() && !ops.empty() && + self->spec_["sync_crds"].get()) { + json tree; + Status s; + VCATCH_JSON_ERROR( + meta, s, + meta_tree::GetData(meta, self->instance_name(), id, tree)); + if (s.ok() && 
tree.is_object() && !tree.empty()) { + auto kube = std::make_shared(self->GetMetaContext()); + kube->CreateObject(meta["instances"], tree); + kube->Finish(); + } + } + RETURN_ON_ERROR(s); + } + return Status::OK(); + } else { + VLOG(100) << "Error: " << status.ToString(); + return status; + } + }, + callback); + return Status::OK(); +} + Status VineyardServer::IfPersist(const ObjectID id, callback_t callback) { ENSURE_VINEYARDD_READY(); @@ -868,12 +929,13 @@ Status VineyardServer::DeleteAllAt(const json& meta, } Status VineyardServer::PutName(const ObjectID object_id, - const std::string& name, callback_t<> callback) { + const std::string& name, bool overwrite, + callback_t<> callback) { ENSURE_VINEYARDD_READY(); auto self(shared_from_this()); meta_service_ptr_->RequestToPersist( - [object_id, name](const Status& status, const json& meta, - std::vector& ops) { + [object_id, name, overwrite](const Status& status, const json& meta, + std::vector& ops) { if (status.ok()) { // TODO: do proper validation: // 1. global objects can have name, local ones cannot. 
@@ -909,6 +971,10 @@ Status VineyardServer::PutName(const ObjectID object_id, "transient objects cannot have name, please persist it first"); } + if (!overwrite && + meta.contains(json::json_pointer("/names/" + name))) { + return Status::Invalid("name already exists, name: " + name); + } ops.emplace_back(meta_tree::op_t::Put("/names/" + name, object_id)); ops.emplace_back(meta_tree::op_t::Put( "/data/" + ObjectIDToString(object_id) + "/__name", @@ -923,6 +989,77 @@ Status VineyardServer::PutName(const ObjectID object_id, return Status::OK(); } +Status VineyardServer::PutNames(const std::vector& object_ids, + const std::vector& names, + bool overwrite, callback_t<> callback) { + ENSURE_VINEYARDD_READY(); + auto self(shared_from_this()); + RETURN_ON_ASSERT(object_ids.size() == names.size(), + "object_ids and names should have the same size"); + meta_service_ptr_->RequestToPersist( + [object_ids, names, overwrite](const Status& status, const json& meta, + std::vector& ops) { + if (status.ok()) { + // TODO: do proper validation: + // 1. global objects can have name, local ones cannot. + // 2. the name-object_id mapping shouldn't be overwrite. 
+ + // blob cannot have name + for (size_t i = 0; i < object_ids.size(); ++i) { + std::string name = names[i]; + ObjectID object_id = object_ids[i]; + if (IsBlob(object_id)) { + return Status::Invalid("blobs cannot have name"); + } + + bool exists = false; + { + Status s; + VCATCH_JSON_ERROR(meta, s, + meta_tree::Exists(meta, object_id, exists)); + VINEYARD_DISCARD(s); + } + if (!exists) { + return Status::ObjectNotExists("failed to put name: object " + + ObjectIDToString(object_id) + + " doesn't exist"); + } + + bool persist = false; + { + Status s; + VCATCH_JSON_ERROR(meta, s, + meta_tree::IfPersist(meta, object_id, persist)); + VINEYARD_DISCARD(s); + } + // FIXME: add a new type for meta(user defined blob need not + // persist) if (!persist) { + // return Status::Invalid( + // "transient objects cannot have name, please persist it + // first"); + // } + + // if one name exists and overwrite is not permitted, all the + // operation will be aborted. + if (!overwrite && + meta.contains(json::json_pointer("/names/" + name))) { + return Status::Invalid("name already exists, name: " + name); + } + ops.emplace_back(meta_tree::op_t::Put("/names/" + name, object_id)); + ops.emplace_back(meta_tree::op_t::Put( + "/data/" + ObjectIDToString(object_id) + "/__name", + meta_tree::EncodeValue(name))); + } + return Status::OK(); + } else { + VLOG(100) << "Error: " << status.ToString(); + return status; + } + }, + callback); + return Status::OK(); +} + Status VineyardServer::GetName(const std::string& name, const bool wait, DeferredReq::alive_t alive, callback_t callback) { @@ -933,16 +1070,17 @@ Status VineyardServer::GetName(const std::string& name, const bool wait, const json& meta) { if (status.ok()) { auto test_task = [name](const json& meta) -> bool { - auto names = meta.value("names", json(nullptr)); - if (names.is_object()) { - return names.contains(name); + auto names_iter = meta.find("names"); + if (names_iter != meta.end() && names_iter->is_object()) { + return 
names_iter->contains(name); } return false; }; auto eval_task = [name, callback](const json& meta) -> Status { - auto names = meta.value("names", json(nullptr)); - if (names.is_object() && names.contains(name)) { - auto entry = names[name]; + auto names_iter = meta.find("names"); + if (names_iter != meta.end() && names_iter->is_object() && + names_iter->contains(name)) { + auto entry = (*names_iter)[name]; if (!entry.is_null()) { return callback(Status::OK(), entry.get()); } @@ -964,6 +1102,48 @@ Status VineyardServer::GetName(const std::string& name, const bool wait, return Status::OK(); } +Status VineyardServer::GetNames( + const std::vector& name_vec, const bool wait, + DeferredReq::alive_t alive, // if connection is still alive + callback_t> callback) { + ENSURE_VINEYARDD_READY(); + auto self(shared_from_this()); + meta_service_ptr_->RequestToGetData(true, [self, name_vec, wait, alive, + callback](const Status& status, + const json& meta) { + if (status.ok()) { + auto eval_task = [name_vec, callback](const json& meta) -> Status { + std::vector object_ids(name_vec.size(), InvalidObjectID()); + auto names_iter = meta.find("names"); + if (names_iter != meta.end() && names_iter->is_object()) { + for (size_t i = 0; i < name_vec.size(); ++i) { + if (names_iter != meta.end() && names_iter->is_object() && + names_iter->contains(name_vec[i])) { + auto entry = (*names_iter)[name_vec[i]]; + if (!entry.is_null()) { + // return callback(Status::OK(), entry.get()); + object_ids[i] = entry.get(); + } + } + } + } + return callback(Status::OK(), object_ids); + }; + if (!wait) { + return eval_task(meta); + } else { + VINEYARD_ASSERT(false, + "GetNames should not be used with wait=true, " + "otherwise it will not work with deferred requests."); + } + } else { + VLOG(100) << "Error: " << status.ToString(); + return status; + } + }); + return Status::OK(); +} + Status VineyardServer::DropName(const std::string& name, callback_t<> callback) { ENSURE_VINEYARDD_READY(); @@ -972,10 
+1152,10 @@ Status VineyardServer::DropName(const std::string& name, [name](const Status& status, const json& meta, std::vector& ops) { if (status.ok()) { - auto names = meta.value("names", json(nullptr)); - if (names.is_object()) { - auto iter = names.find(name); - if (iter != names.end()) { + auto names_iter = meta.find("names"); + if (names_iter != meta.end() && names_iter->is_object()) { + auto iter = names_iter->find(name); + if (iter != names_iter->end()) { ops.emplace_back( meta_tree::op_t::Del("/names/" + escape_json_pointer(name))); auto object_id = iter->get(); @@ -1004,6 +1184,111 @@ Status VineyardServer::DropName(const std::string& name, return Status::OK(); } +Status VineyardServer::DropNames(std::vector& name_vec, + callback_t<> callback) { + ENSURE_VINEYARDD_READY(); + auto self(shared_from_this()); + meta_service_ptr_->RequestToPersist( + [name_vec_ = std::move(name_vec)](const Status& status, const json& meta, + std::vector& ops) { + if (status.ok()) { + auto names_iter = meta.find("names"); + if (names_iter != meta.end() && names_iter->is_object()) { + for (size_t i = 0; i < name_vec_.size(); ++i) { + auto const& name = name_vec_[i]; + auto iter = names_iter->find(name); + if (iter != names_iter->end()) { + ops.emplace_back(meta_tree::op_t::Del( + "/names/" + escape_json_pointer(name))); + auto object_id = iter->get(); + // delete the name in the object meta as well. 
+ bool exists = false; + { + Status s; + VCATCH_JSON_ERROR(meta, s, + meta_tree::Exists(meta, object_id, exists)); + VINEYARD_DISCARD(s); + } + + if (exists) { + ops.emplace_back(meta_tree::op_t::Del( + "/data/" + ObjectIDToString(object_id) + "/__name")); + } else { + LOG(WARNING) << "Object " << ObjectIDToString(object_id) + << " does not exist when dropping name " << name; + } + } + } + } + return Status::OK(); + } else { + LOG(ERROR) << "Drop name failed!Error: " << status.ToString(); + return status; + } + }, + callback); + return Status::OK(); +} + +Status VineyardServer::GetObjectLocation( + const std::vector& names, + callback_t>&> callback) { + ENSURE_VINEYARDD_READY(); + auto self(shared_from_this()); + + // Create a vector with just the name we're looking for + meta_service_ptr_->DirectGetFromMetaService( + names, [self, names, callback](const Status& status, const json& result) { + std::vector> locations; + if (status.ok()) { + for (auto const& name : names) { + auto iter = result.find(name); + if (iter != result.end()) { + auto entry = + iter->get>>(); + std::vector location; + for (auto const& pair : entry) { + location.emplace_back(pair.first); + } + locations.emplace_back(location); + } else { + locations.emplace_back(std::vector()); + } + } + return callback(status, locations); + } else { + // Propagate any errors from RequestToMetaService + VLOG(100) << "Error: " << status.ToString(); + return callback(status, locations); + } + }); + + return Status::OK(); +} + +Status VineyardServer::PutObjectLocation( + const std::vector& names, + const std::vector& locations, int ttl_seconds, + callback_t<> callback) { + ENSURE_VINEYARDD_READY(); + auto self(shared_from_this()); + + // Create a vector with just the name we're looking for + meta_service_ptr_->DirectPutToMetaService( + names, locations, ttl_seconds, + [self, names, callback](const Status& status) { + if (status.ok()) { + return callback(status); + } else { + // Propagate any errors from 
RequestToMetaService + VLOG(100) << "Error: " << status.ToString(); + return callback(status); + } + }); + + return Status::OK(); +} + Status VineyardServer::MigrateObject(const ObjectID object_id, DeferredReq::alive_t alive, callback_t callback) { @@ -1060,7 +1345,7 @@ Status VineyardServer::MigrateObject(const ObjectID object_id, std::string rdma_endpoint = ""; if ((*instance).contains("rdma_endpoint") && !(*instance)["rdma_endpoint"].is_null()) { - std::string rdma_endpoint = + rdma_endpoint = (*instance)["rdma_endpoint"].get_ref(); } @@ -1426,6 +1711,335 @@ Status VineyardServer::Verify(const std::string& username, return callback(Status::IOError(m.str())); } +Status VineyardServer::VineyardOpenRemoteFixedStream( + ObjectID remote_id, std::string stream_name, ObjectID local_id, + int blob_nums, size_t size, std::string& endpoint, uint64_t mode, + std::string owner, bool wait, uint64_t timeout, callback_t<> callback) { + if (local_id == InvalidObjectID()) { + return Status::Invalid("Invalid local id"); + } + + // get remote client + auto self(shared_from_this()); + std::shared_ptr remote; + Status status = remote_client_pool_->BorrowClient(endpoint, remote); + if (!status.ok()) { + return callback(status); + } + + // call open stream + boost::asio::post(GetIOContext(), [remote_id, stream_name, local_id, + blob_nums, size, endpoint, mode, owner, + wait, timeout, callback, remote, self]() { + ObjectID ret_id = InvalidObjectID(); + Status status = remote->OpenRemoteStream(remote_id, stream_name, ret_id, + mode, wait, timeout); + LOG(INFO) << "Open done, remote_id is:" << remote_id + << ", local_id is:" << local_id << ", ret_id is:" << ret_id + << ", endpoint is:" << endpoint; + + if (!status.ok()) { + callback(status); + return; + } + + status = self->GetStreamStore()->Open(local_id, mode, owner); + if (!status.ok()) { + callback(status); + status = remote->CloseRemoteStream(remote_id); + if (!status.ok()) { + LOG(ERROR) + << "Open local stream error and fail to 
close remote stream: " + << status.ToString() + << ", remote_id is:" << ObjectIDToString(remote_id) + << ". May cause resource leak."; + } + return; + } + status = self->GetStreamStore()->BindRemoteStream(local_id, ret_id, + endpoint, remote); + if (!status.ok()) { + callback(status); + status = remote->CloseRemoteStream(remote_id); + if (!status.ok()) { + LOG(ERROR) + << "Open local stream error and fail to close remote stream: " + << status.ToString() + << ", remote_id is:" << ObjectIDToString(remote_id) + << ". May cause resource leak."; + } + self->GetStreamStore()->Close(local_id, owner); + self->GetStreamStore()->UnbindRemoteStream(local_id); + return; + } + + callback(Status::OK()); + }); + + // return result + return Status::OK(); +} + +Status VineyardServer::VineyardCloseRemoteFixedStream(ObjectID stream_id, + callback_t<> callback) { + VLOG(2) << "VineyardCloseRemoteFixedStream, stream_id: " + << ObjectIDToString(stream_id); + auto self(shared_from_this()); + std::shared_ptr remote; + ObjectID remote_id = InvalidObjectID(); + std::string endpoint; + RETURN_ON_ERROR(self->GetStreamStore()->GetRemoteInfo(stream_id, remote_id, + endpoint, remote)); + + if (remote_id == InvalidObjectID()) { + // Remote stream is already closed. 
+ callback(Status::OK()); + } + + boost::asio::post(GetIOContext(), [remote_id, remote, callback]() { + callback(remote->CloseRemoteStream(remote_id)); + }); + + return Status::OK(); +} + +Status VineyardServer::VineyardAbortRemoteStream(ObjectID stream_id, + callback_t callback) { + VLOG(2) << "VineyardAbortRemoteStream, stream_id: " + << ObjectIDToString(stream_id); + auto self(shared_from_this()); + std::shared_ptr remote; + ObjectID remote_id = InvalidObjectID(); + std::string endpoint; + RETURN_ON_ERROR(self->GetStreamStore()->GetRemoteInfo(stream_id, remote_id, + endpoint, remote)); + + if (remote_id == InvalidObjectID()) { + LOG(ERROR) << "Abort remote stream error: stream id is invalid, please " + "check if the stream is opened as fork stream of remote " + "stream."; + return Status::Invalid( + "Stream id is invalid, please check if the stream is opened as fork " + "stream of remote stream."); + } + + bool success = false; + Status status = remote->AbortRemoteStream(remote_id, success); + if ((!status.ok()) && + ((!remote->IsConnected()) || status.IsObjectNotExists())) { + // If the remote is not connected, we consider the remote node is down. 
+ callback(Status::OK(), true); + } else { + callback(status, success); + } + + return Status::OK(); +} + +Status VineyardServer::VineyardGetMetasByNames( + std::vector& names, std::string rpc_endpoint, + ClientAttributes attr, callback_t&> callback) { + Status status = + Status::NotImplemented("VineyardGetMetasByNames is not implemented yet."); + std::vector json; + callback(status, json); + return Status::OK(); +} + +Status VineyardServer::VineyardGetRemoteBlobs( + std::vector> local_id_vec, + std::vector> remote_id_vec, std::string rpc_endpoint, + ClientAttributes attr, callback_t callback) { + VLOG(2) << "Vineyard get remote blob from:" << rpc_endpoint + << ", local_id_vec size: " << local_id_vec.size() + << ", remote_id_vec size: " << remote_id_vec.size() + << ", request name: " << attr.req_name; + uint64_t start = 0, end = 0; + start = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + RETURN_ON_ASSERT(local_id_vec.size() == remote_id_vec.size(), + "local_id_vec and remote_id_vec size not match"); + // fetch local blobs + std::vector> local_buffer_vec; + std::vector> sizes_vec; + for (const auto& local_ids : local_id_vec) { + std::vector local_buffers; + std::vector sizes; + for (const auto& local_id : local_ids) { + std::shared_ptr payload; + RETURN_ON_ERROR(GetBulkStore()->GetUnsafe(local_id, true, payload)); + if (payload->pointer == nullptr) { + LOG(ERROR) << "Local blob is invalid, id: " + << ObjectIDToString(local_id); + return Status::Invalid("Local blob is invalid, id: " + + ObjectIDToString(local_id)); + } + local_buffers.push_back(reinterpret_cast(payload->pointer)); + sizes.push_back(payload->data_size); + } + local_buffer_vec.push_back(local_buffers); + sizes_vec.push_back(sizes); + } + + end = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + VLOG(trace_log_level_) << "Request: " << attr.req_name + << ", Prepare data for GetRemoteBlobs, time cost: " + << 
(end - start) << " us"; + + return VineyardGetRemoteBlobsWithOffset( + local_buffer_vec, remote_id_vec, sizes_vec, rpc_endpoint, attr, callback); +} + +Status VineyardServer::VineyardActivateRemoteFixedStream( + ObjectID stream_id, int conn_id, bool create_buffer, + std::vector& blob_list, + void_callback_t>&> callback) { + auto self(shared_from_this()); + boost::asio::post(GetIOContext(), [stream_id, conn_id, callback, self, + create_buffer, blob_list]() { + std::vector> payload_list; + size_t blob_size = 0; + int blob_nums = 0; + std::string endpoint; + ObjectID remote_id = InvalidObjectID(); + std::shared_ptr remote; + Status status = self->GetStreamStore()->GetRemoteInfo(stream_id, remote_id, + endpoint, remote); + if (!status.ok()) { + LOG(ERROR) << "Get remote info error: " << status.ToString() + << ", stream_id is:" << ObjectIDToString(stream_id); + callback(status, payload_list); + return; + } + + VLOG(2) << "Get remote info done, remote_id is:" + << ObjectIDToString(remote_id) << ", endpoint is:" << endpoint; + + status = self->GetStreamStore()->GetFixedStreamSizeInfo( + stream_id, blob_size, blob_nums); + if (!status.ok()) { + callback(status, payload_list); + return; + } + VLOG(2) << "Get fixed stream size info done, sizes is:" << blob_size; + + std::vector addr_list; + // for RUNC + std::vector local_buffers; + if (create_buffer) { + VLOG(2) << "Create bulk object is needed."; + for (int i = 0; i < blob_nums; i++) { + ObjectID blob_id; + std::shared_ptr object; + status = self->GetBulkStore()->Create(blob_size, blob_id, object); + if (!status.ok()) { + for (std::shared_ptr payload : payload_list) { + self->GetBulkStore()->Delete(payload->id()); + } + callback(status, payload_list); + return; + } + payload_list.push_back(object); + // TODO: send fd + } + VLOG(100) << "Create bulk object done, payload list size:" + << payload_list.size() << " addr_list is:"; + for (auto payload : payload_list) { + VLOG(100) << reinterpret_cast(payload->pointer); + 
addr_list.push_back(reinterpret_cast(payload->pointer)); + } + } else { + VLOG(2) << "Create bulk object not needed."; + for (ObjectID blob_id : blob_list) { + std::shared_ptr object; + status = self->GetBulkStore()->GetUnsafe(blob_id, true, object); + if (!status.ok()) { + LOG(ERROR) << "Get bulk object error: " << status.ToString() + << ", blob_id is:" << ObjectIDToString(blob_id); + callback(status, payload_list); + return; + } + addr_list.push_back(reinterpret_cast(object->pointer)); + } + } + + // prepare done + VLOG(2) << "Activate remote fixed stream done, vineyard will listen on ack"; + callback(status, payload_list); + + // add a new task of receiving ack + boost::asio::post(self->GetIOContext(), [remote, self, stream_id, remote_id, + addr_list, blob_size, blob_nums, + local_buffers, create_buffer, + conn_id, endpoint]() { + VLOG(2) << "VineyardActivateRemoteFixedStream wait ack task, local_id:" + << ObjectIDToString(stream_id) + << ", remote_id:" << ObjectIDToString(remote_id) + << ", addr_list size:" << addr_list.size() + << ", blob_size:" << blob_size << ", blob_nums:" << blob_nums + << ", local_buffers size:" << local_buffers.size() + << ", conn_id:" << conn_id; + std::vector local_buffers_ = std::move(local_buffers); + if (create_buffer) { + Status status = remote->ActivateRemoteFixedStream( + remote_id, addr_list, blob_size, local_buffers_, conn_id); + if (!status.ok()) { + for (uint64_t addr : addr_list) { + self->GetBulkStore()->Delete(addr); + } + self->GetStreamStore()->SetErrorFlag(stream_id, status); + return; + } + } else { + LOG(INFO) << "activate remote addr:" << remote.get(); + Status status = remote->ActivateRemoteFixedStream( + remote_id, addr_list, blob_size, local_buffers_, conn_id); + if (!status.ok()) { + LOG(ERROR) << "Activate remote fixed stream error: " + << status.ToString() + << ", remote_id is:" << ObjectIDToString(remote_id); + self->GetStreamStore()->SetErrorFlag(stream_id, status); + return; + } + } + LOG(INFO) << "get next 
stream chunk remote addr:" << remote.get(); + for (int i = 0; i < blob_nums; i++) { + int index = -1; + Status status = remote->GetNextFixedStreamChunk(index); + if (!status.ok()) { + LOG(ERROR) << "Get next fixed stream chunk error: " + << status.ToString() + << ", remote_id is:" << ObjectIDToString(remote_id); + self->GetStreamStore()->SetErrorFlag(stream_id, status); + break; + } + + status = self->GetStreamStore()->SetBlobReceived(stream_id, index); + if (!status.ok()) { + self->GetStreamStore()->SetErrorFlag(stream_id, status); + break; + } + } + }); + }); + + return Status::OK(); +} + +Status VineyardServer::VineyardGetRemoteBlobsWithOffset( + std::vector> local_buffer_vec, + std::vector> remote_id_vec, + std::vector> sizes_vec, std::string rpc_endpoint, + ClientAttributes attr, callback_t callback) { + callback(Status::NotImplemented( + "VineyardGetRemoteBlobsWithOffset is not implemented yet."), + 0); + return Status::OK(); +} + const std::string VineyardServer::IPCSocket() { if (this->ipc_server_ptr_) { return ipc_server_ptr_->Socket(); diff --git a/src/server/server/vineyard_server.h b/src/server/server/vineyard_server.h index 085310aad..effb9d9c2 100644 --- a/src/server/server/vineyard_server.h +++ b/src/server/server/vineyard_server.h @@ -32,12 +32,16 @@ limitations under the License. 
#include "common/util/callback.h" #include "common/util/json.h" #include "common/util/protocols.h" +#include "common/util/sidecar.h" #include "common/util/status.h" #include "common/util/uuid.h" +#include "server/async/rpc_server.h" #include "server/memory/memory.h" - #include "server/memory/stream_store.h" +#include "server/util/remote_pool.h" + #include "server/server/vineyard_runner.h" +#include "server/util/remote.h" namespace vineyard { @@ -46,7 +50,6 @@ namespace asio = boost::asio; class IMetaService; class IPCServer; -class RPCServer; /** * @brief DeferredReq aims to defer a socket request such that the request @@ -147,6 +150,8 @@ class VineyardServer : public std::enable_shared_from_this { Status Persist(const ObjectID id, callback_t<> callback); + Status Persist(const std::vector& ids, callback_t<> callback); + Status IfPersist(const ObjectID id, callback_t callback); Status Exists(const ObjectID id, callback_t callback); @@ -168,14 +173,32 @@ class VineyardServer : public std::enable_shared_from_this { Status DeleteAllAt(const json& meta, InstanceID const instance_id); Status PutName(const ObjectID object_id, const std::string& name, - callback_t<> callback); + bool overwrite, callback_t<> callback); + + Status PutNames(const std::vector& object_ids, + const std::vector& names, bool overwrite, + callback_t<> callback); Status GetName(const std::string& name, const bool wait, DeferredReq::alive_t alive, // if connection is still alive callback_t callback); + Status GetNames(const std::vector& name_vec, const bool wait, + DeferredReq::alive_t alive, // if connection is still alive + callback_t> callback); + + Status GetObjectLocation( + const std::vector& names, + callback_t>&> callback); + + Status PutObjectLocation(const std::vector& names, + const std::vector& locations, + int ttl_seconds, callback_t<> callback); + Status DropName(const std::string& name, callback_t<> callback); + Status DropNames(std::vector& name_vec, callback_t<> callback); + Status 
MigrateObject( const ObjectID object_id, DeferredReq::alive_t alive, // if connection is still alive @@ -207,7 +230,42 @@ class VineyardServer : public std::enable_shared_from_this { Status TryReleaseLock(std::string& key, callback_t callback); - inline SessionID session_id() const { return session_id_; } + // stream + Status VineyardOpenRemoteFixedStream( + ObjectID remote_id, std::string stream_name, ObjectID local_id, + int blob_nums, size_t size, std::string& endpoint, uint64_t mode, + std::string owner, bool wait, uint64_t timeout, callback_t<> callback); + + Status VineyardActivateRemoteFixedStream( + ObjectID stream_id, int conn_id, bool create_buffer, + std::vector& blob_list, + void_callback_t>&> + callback); + + Status VineyardCloseRemoteFixedStream(ObjectID stream_id, + callback_t<> callback); + + Status VineyardAbortRemoteStream(ObjectID stream_id, + callback_t callback); + + Status VineyardGetMetasByNames(std::vector& names, + std::string rpc_endpoint, + ClientAttributes attr, + callback_t&> callback); + + Status VineyardGetRemoteBlobs( + std::vector> local_id_vec, + std::vector> remote_id_vec, + std::string rpc_endpoint, ClientAttributes attr, + callback_t callback); + + Status VineyardGetRemoteBlobsWithOffset( + std::vector> local_buffer_vec, + std::vector> remote_id_vec, + std::vector> sizes_vec, std::string rpc_endpoint, + ClientAttributes attr, callback_t callback); + + virtual inline SessionID session_id() const { return session_id_; } inline InstanceID instance_id() { return instance_id_; } inline std::string instance_name() { return instance_name_; } inline void set_instance_id(InstanceID id) { @@ -238,6 +296,8 @@ class VineyardServer : public std::enable_shared_from_this { const std::string RDMAEndpoint(); + int GetTraceLogLevel() { return trace_log_level_; } + void LockTransmissionObjects(std::vector const& ids) { std::lock_guard lock(transmission_objects_mutex_); for (auto const& id : ids) { @@ -259,6 +319,8 @@ class VineyardServer : public 
std::enable_shared_from_this { } } } + VLOG(3) << "locked transmission objects size: " + << transmission_objects_.size(); } DeletePendingObjects(); } @@ -328,9 +390,35 @@ class VineyardServer : public std::enable_shared_from_this { } } + Status GetBulkStoreBasePointer(void*& pointer) { + if (bulk_store_ == nullptr) { + return Status::ObjectNotExists("bulk store is not ready"); + } + pointer = bulk_store_->GetBasePointer(); + return Status::OK(); + } + + virtual Status GetBulkStoreMmapAddr(void*& addr) { + if (bulk_store_ == nullptr) { + return Status::ObjectNotExists("bulk store is not ready"); + } + addr = reinterpret_cast( + reinterpret_cast(bulk_store_->GetBasePointer()) - + bulk_store_->GetBaseOffset()); + return Status::OK(); + } + + virtual Status GetBulkStoreBaseSize(size_t& size) { + if (bulk_store_ == nullptr) { + return Status::ObjectNotExists("bulk store is not ready"); + } + size = bulk_store_->GetBaseSize(); + return Status::OK(); + } + ~VineyardServer(); - private: + protected: json spec_; SessionID session_id_; @@ -381,6 +469,9 @@ class VineyardServer : public std::enable_shared_from_this { // It must be blob. 
std::unordered_set pendding_to_delete_objects_; std::mutex pendding_to_delete_objects_mutex_; + std::shared_ptr remote_client_pool_; + + int trace_log_level_ = 0; }; } // namespace vineyard diff --git a/src/server/services/meta_service.cc b/src/server/services/meta_service.cc index bf02e76a4..b3a6ec95c 100644 --- a/src/server/services/meta_service.cc +++ b/src/server/services/meta_service.cc @@ -96,6 +96,7 @@ Status IMetaService::Start(bool create_new_instance) { std::this_thread::sleep_for(std::chrono::seconds(1)); } RETURN_ON_ERROR(s); +#if defined(BUILD_VINEYARDD_ETCD) auto self(shared_from_this()); requestValues( "", [self](const Status& status, const json& meta, unsigned rev) { @@ -119,6 +120,9 @@ Status IMetaService::Start(bool create_new_instance) { } return status; }); +#else + Ready(); +#endif return Status::OK(); } @@ -517,6 +521,7 @@ void IMetaService::CloneRef(ObjectID const target, ObjectID const mirror) { } } +#if defined(BUILD_VINEYARDD_ETCD) void IMetaService::registerToEtcd() { auto self(shared_from_this()); RequestToPersist( @@ -597,7 +602,9 @@ void IMetaService::registerToEtcd() { return status; }); } +#endif // defined(BUILD_VINEYARDD_ETCD) +#if defined(BUILD_VINEYARDD_ETCD) void IMetaService::checkInstanceStatus( std::shared_ptr const& self, callback_t<> callback_after_finish) { @@ -720,6 +727,7 @@ Status IMetaService::startHeartbeat(std::shared_ptr const& self, }); return Status::OK(); } +#endif // defined(BUILD_VINEYARDD_ETCD) void IMetaService::requestValues(const std::string& prefix, callback_t callback) { @@ -1064,10 +1072,12 @@ void IMetaService::metaUpdate(const RangeT& ops, const bool from_remote, continue; } - // update instance status +// update instance status +#if defined(BUILD_VINEYARDD_ETCD) if (boost::algorithm::starts_with(op.kv.key, "/instances/")) { instanceUpdate(op, from_remote); } +#endif #ifndef NDEBUG if (from_remote) { @@ -1182,6 +1192,7 @@ void IMetaService::metaUpdate(const RangeT& ops, const bool from_remote, 
VINEYARD_SUPPRESS(server_ptr_->ProcessDeferred(meta_)); } +#if defined(BUILD_VINEYARDD_ETCD) void IMetaService::instanceUpdate(const op_t& op, const bool from_remote) { std::vector key_segments; boost::split(key_segments, op.kv.key, boost::is_any_of("/")); @@ -1290,5 +1301,6 @@ Status IMetaService::UpdateEtcdEndpoint() { }, Status::OK()); } +#endif // defined(BUILD_VINEYARDD_ETCD) } // namespace vineyard diff --git a/src/server/services/meta_service.h b/src/server/services/meta_service.h index e8c8fb5b5..dbd6b3a10 100644 --- a/src/server/services/meta_service.h +++ b/src/server/services/meta_service.h @@ -65,7 +65,9 @@ class ILock { const unsigned rev_; }; +#if defined(BUILD_VINEYARDD_ETCD) class EtcdMetaService; +#endif // BUILD_VINEYARDD_ETCD /** * @brief IMetaService is the base class of EtcdMetaService @@ -118,10 +120,24 @@ class IMetaService : public std::enable_shared_from_this { void RequestToDirectUpdate(std::vector const& ops, const bool from_remote = false); - void RequestToPersist( + virtual void RequestToPersist( callback_t&> callback_after_ready, callback_t<> callback_after_finish); + virtual void DirectGetFromMetaService(const std::vector& keys, + callback_t callback) { + callback(Status::NotImplemented("DirectGetFromMetaService is not " + "implemented in this meta service."), + json()); + } + + virtual void DirectPutToMetaService(const std::vector& keys, + const std::vector& values, + int ttl_seconds, callback_t<> callback) { + callback(Status::NotImplemented( + "DirectPutToMetaService is not implemented in this meta service.")); + } + void RequestToGetData(const bool sync_remote, callback_t callback); @@ -149,14 +165,18 @@ class IMetaService : public std::enable_shared_from_this { virtual void TryReleaseLock(std::string key, callback_t callback) = 0; +#if defined(BUILD_VINEYARDD_ETCD) Status RemoveEtcdMember(const uint64_t& member_id); const uint64_t GetEtcdMemberID(); Status UpdateEtcdEndpoint(); +#endif // BUILD_VINEYARDD_ETCD private: +#if 
defined(BUILD_VINEYARDD_ETCD) void registerToEtcd(); +#endif // BUILD_VINEYARDD_ETCD /** * Watch rules: @@ -203,6 +223,7 @@ class IMetaService : public std::enable_shared_from_this { // validate the liveness of the underlying meta service. virtual Status probe() = 0; +#if defined(BUILD_VINEYARDD_ETCD) template ReturnType callIfEtcdMetaService(Func&& func, ReturnType defaultValue = ReturnType()) { @@ -213,6 +234,7 @@ class IMetaService : public std::enable_shared_from_this { } return defaultValue; } +#endif // BUILD_VINEYARDD_ETCD void printDepsGraph(); @@ -249,16 +271,22 @@ class IMetaService : public std::enable_shared_from_this { void delVal(const kv_t& kv); void delVal(ObjectID const& target, std::set& blobs); + protected: template void metaUpdate(const RangeT& ops, bool const from_remote, const bool memory_trim = false); + private: +#if defined(BUILD_VINEYARDD_ETCD) void instanceUpdate(const op_t& op, const bool from_remote = true); +#endif // defined(BUILD_VINEYARDD_ETCD) +#if defined(BUILD_VINEYARDD_ETCD) static Status daemonWatchHandler(std::shared_ptr self, const Status& status, const std::vector& ops, unsigned rev, callback_t callback_after_update); +#endif // defined(BUILD_VINEYARDD_ETCD) std::unique_ptr heartbeat_timer_; std::set instances_list_; diff --git a/src/server/util/etcd_member.cc b/src/server/util/etcd_member.cc index 46b22f900..021f11018 100644 --- a/src/server/util/etcd_member.cc +++ b/src/server/util/etcd_member.cc @@ -13,12 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "server/util/etcd_member.h" - #include #include #include -#include "etcd/Response.hpp" #if defined(BUILD_VINEYARDD_ETCD) @@ -26,6 +23,9 @@ limitations under the License. 
#include "common/util/logging.h" // IWYU pragma: keep #include "common/util/status.h" +#include "server/util/etcd_member.h" + +#include "etcd/Response.hpp" namespace vineyard { diff --git a/src/server/util/remote.cc b/src/server/util/remote.cc index 8f4999e71..ccd3edbf9 100644 --- a/src/server/util/remote.cc +++ b/src/server/util/remote.cc @@ -55,16 +55,6 @@ Status RemoteClient::StopRDMA() { } rdma_connected_ = false; - void* msg; - RETURN_ON_ERROR(rdma_client_->GetTXFreeMsgBuffer(msg)); - VineyardMsg* vmsg = reinterpret_cast(msg); - vmsg->type = VINEYARD_MSG_CLOSE; - RETURN_ON_ERROR(rdma_client_->Send(msg, sizeof(VineyardMsg), nullptr)); - RETURN_ON_ERROR(rdma_client_->GetTXCompletion(-1, nullptr)); - - RETURN_ON_ERROR(rdma_client_->Stop()); - RETURN_ON_ERROR(rdma_client_->Close()); - RETURN_ON_ERROR(RDMAClientCreator::Release(rdma_endpoint_)); return Status::OK(); } @@ -93,16 +83,6 @@ Status RemoteClient::Connect(const std::string& rpc_endpoint, rdma_port = rdma_endpoint.substr(pos + 1); } - Status status = ConnectRDMAServer(rdma_host, std::atoi(rdma_port.c_str())); - if (status.ok()) { - rdma_endpoint_ = rdma_host + ":" + rdma_port; - VLOG(100) << "Connect to RDMA server successfully. RDMA host:" << rdma_host - << ", port:" << rdma_port; - } else { - VLOG(100) << "Failed to connect to RDMA server. Fall back to TCP. 
Error:" - << status.message(); - } - return Status::OK(); } @@ -169,6 +149,7 @@ Status RemoteClient::ConnectRDMAServer(const std::string& host, VLOG(100) << "Try to connect to RDMA server " << host << ":" << port << "..."; RETURN_ON_ERROR(this->rdma_client_->Connect()); + this->rdma_connected_ = true; return Status::OK(); } @@ -180,9 +161,18 @@ Status RemoteClient::Connect(const std::string& host, const uint32_t port, } asio::ip::tcp::resolver resolver(context_); - int retries = 0, max_connect_retries = 10; + int retries = 0; + std::string retry_times_env_str = + read_env("VINEYARD_REMOTE_CONNECT_RETRIES", "1"); + std::string retry_interval_ms_env_str = + read_env("VINEYARD_REMOTE_CONNECT_INTERVAL_MS", "100"); + int retry_times = std::stoi(retry_times_env_str); + int retry_interval_ms = std::stoi(retry_interval_ms_env_str); + LOG(INFO) << "Connecting to remote server at " << host << ":" << port + << " with " << retry_times << " retries and " << retry_interval_ms + << " ms interval"; boost::system::error_code ec; - while (retries < max_connect_retries) { + while (retries < retry_times) { #if BOOST_VERSION >= 106600 asio::connect(remote_tcp_socket_, resolver.resolve(host, std::to_string(port)), ec); @@ -192,19 +182,22 @@ Status RemoteClient::Connect(const std::string& host, const uint32_t port, host, std::to_string(port))), ec); #endif - if (ec) { - std::this_thread::sleep_for(std::chrono::seconds(1)); + if (ec && retries < retry_times - 1) { + std::this_thread::sleep_for(std::chrono::milliseconds(retry_interval_ms)); retries += 1; } else { break; } } if (ec) { + LOG(ERROR) << "Failed to connect to peer after " + + std::to_string(retry_times) + " retries: " + ec.message(); return Status::IOError("Failed to connect to peer after " + - std::to_string(max_connect_retries) + + std::to_string(retry_times) + " retries: " + ec.message()); } socket_ = std::move(remote_tcp_socket_); + socket_.set_option(asio::ip::tcp::no_delay(true)); std::string message_out; 
WriteRegisterRequest(message_out, StoreType::kDefault, session_id); @@ -219,6 +212,10 @@ Status RemoteClient::Connect(const std::string& host, const uint32_t port, message_in, ipc_socket_value, rpc_endpoint_value, remote_instance_id_, session_id_, server_version_, store_match, support_rpc_compression)); this->connected_ = true; + VLOG(2) << "Connected to remote server at " << host << ":" << port + << ", instance id: " << remote_instance_id_ + << ", session id: " << session_id_ + << ", server version: " << server_version_; return Status::OK(); } @@ -291,6 +288,74 @@ Status RemoteClient::MigrateObject(const ObjectID object_id, const json& meta, return Status::OK(); } +Status RemoteClient::OpenRemoteStream(ObjectID remote_id, + std::string stream_name, ObjectID& ret_id, + uint64_t mode, bool wait, + uint64_t timeout) { + TRY_ACQUIRE_CONNECTION(this); + std::string message_out; + VLOG(100) << "Remote client open stream: " << remote_id + << ", name:" << stream_name << ", mode:" << mode; + WriteOpenStreamRequest(remote_id, stream_name, mode, wait, timeout, + message_out); + RETURN_ON_ERROR(doWrite(message_out)); + + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadOpenStreamReply(message_in, ret_id)); + VLOG(100) << "Get remote stream id: " << ret_id; + return Status::OK(); +} + +Status ActivateRemoteFixedStream(ObjectID remote_id, + std::vector buffers, + size_t buffer_size, + std::vector& local_buffers, + int conn_id, callback_t callback) { + callback(Status::NotImplemented("Without is not implemented yet"), -1); + return Status::OK(); +} + +Status RemoteClient::ActivateRemoteFixedStream( + ObjectID remote_id, std::vector buffers, size_t buffer_size, + std::vector& local_buffers, int conn_id) { + return Status::NotImplemented("Without is not implemented yet"); +} + +Status RemoteClient::GetNextFixedStreamChunk(int& index) { + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadStreamReadyAckReply(message_in, 
index)); + if (index < 0) { + return Status::Invalid("Invalid index: " + std::to_string(index)); + } + return Status::OK(); +} + +Status RemoteClient::CloseRemoteStream(ObjectID stream_id) { + TRY_ACQUIRE_CONNECTION(this); + std::string message_out; + WriteCloseStreamRequest(stream_id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadCloseStreamReply(message_in)); + return Status::OK(); +} + +Status RemoteClient::AbortRemoteStream(ObjectID stream_id, bool& success) { + TRY_ACQUIRE_CONNECTION(this); + std::string message_out; + WriteAbortStreamRequest(stream_id, message_out); + RETURN_ON_ERROR(doWrite(message_out)); + + json message_in; + RETURN_ON_ERROR(doRead(message_in)); + RETURN_ON_ERROR(ReadAbortStreamReply(message_in, success)); + return Status::OK(); +} + Status RemoteClient::collectRemoteBlobs(const json& tree, std::set& blobs) { if (tree.empty()) { @@ -346,6 +411,7 @@ Status RemoteClient::recreateMetadata( Status RemoteClient::migrateBuffers( const std::set blobs, callback_t&> callback) { + TRY_ACQUIRE_CONNECTION(this); std::vector payloads; std::vector fd_sent; bool compress = server_ptr_->GetSpec().value( @@ -511,9 +577,15 @@ Status RemoteClient::doWrite(const std::string& message_out) { boost::system::error_code ec; size_t length = message_out.length(); asio::write(socket_, asio::const_buffer(&length, sizeof(size_t)), ec); + if (ec) { + this->connected_ = false; + } RETURN_ON_ASIO_ERROR(ec); asio::write(socket_, asio::const_buffer(message_out.data(), message_out.length()), ec); + if (ec) { + this->connected_ = false; + } RETURN_ON_ASIO_ERROR(ec); return Status::OK(); } @@ -522,6 +594,9 @@ Status RemoteClient::doRead(std::string& message_in) { boost::system::error_code ec; size_t length = std::numeric_limits::max(); asio::read(socket_, asio::buffer(&length, sizeof(size_t)), ec); + if (ec) { + this->connected_ = false; + } RETURN_ON_ASIO_ERROR(ec); if (length > 64 * 
1024 * 1024) { // 64M bytes return Status::IOError("Invalid message header value: " + @@ -531,6 +606,9 @@ Status RemoteClient::doRead(std::string& message_in) { asio::read(socket_, asio::mutable_buffer(const_cast(message_in.data()), length), ec); + if (ec) { + this->connected_ = false; + } RETURN_ON_ASIO_ERROR(ec); return Status::OK(); } diff --git a/src/server/util/remote.h b/src/server/util/remote.h index 5c0499dc3..7022d7e90 100644 --- a/src/server/util/remote.h +++ b/src/server/util/remote.h @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -23,21 +23,37 @@ limitations under the License. #include #include "common/memory/payload.h" +#include "common/rdma/rdma_client.h" +#include "common/rdma/util.h" #include "common/util/asio.h" // IWYU pragma: keep #include "common/util/callback.h" #include "common/util/status.h" #include "common/util/uuid.h" -#include "common/rdma/rdma_client.h" -#include "common/rdma/util.h" +#include "server/util/utils.h" + +#ifndef TRY_ACQUIRE_CONNECTION +#define TRY_ACQUIRE_CONNECTION(this) \ + if (!this->connected_) { \ + return Status::ConnectionError("Client is not connected"); \ + } \ + std::unique_lock __guard(this->client_mutex_, \ + std::defer_lock); \ + if (!__guard.try_lock()) { \ + return Status::ConnectionError("Client is busy"); \ + } +#endif // TRY_ACQUIRE_CONNECTION namespace vineyard { class VineyardServer; +class RemoteClientPool; + class RemoteClient : public std::enable_shared_from_this { public: explicit RemoteClient(const std::shared_ptr vs_ptr); + ~RemoteClient(); Status Connect(const std::string& rpc_endpoint, const SessionID session_id, @@ -51,6 +67,34 @@ class 
RemoteClient : public std::enable_shared_from_this { Status MigrateObject(const ObjectID object_id, const json& meta, callback_t callback); + Status OpenRemoteStream(ObjectID remote_id, std::string stream_name, + ObjectID& ret_id, uint64_t mode, bool wait, + uint64_t timeout); + + Status ActivateRemoteFixedStream(ObjectID remote_id, + std::vector buffers, + size_t buffer_size, + std::vector& local_buffers, + int conn_id, callback_t callback); + + Status ActivateRemoteFixedStream(ObjectID remote_id, + std::vector buffer, + size_t buffer_size, + std::vector& local_buffers, + int conn_id); + + Status GetNextFixedStreamChunk(int& index); + + Status CloseRemoteStream(ObjectID stream_id); + + Status AbortRemoteStream(ObjectID stream_id, bool& success); + + bool IsConnected() const { return connected_; } + + void AcquireConnection() { client_mutex_.lock(); } + + void ReleaseConnection() { client_mutex_.unlock(); } + private: Status migrateBuffers( const std::set blobs, @@ -85,6 +129,9 @@ class RemoteClient : public std::enable_shared_from_this { std::string rdma_endpoint_; std::shared_ptr rdma_client_; mutable bool rdma_connected_ = false; + mutable std::recursive_mutex client_mutex_; + + friend class RemoteClientPool; }; /** diff --git a/src/server/util/remote_pool.cc b/src/server/util/remote_pool.cc new file mode 100644 index 000000000..f9447239e --- /dev/null +++ b/src/server/util/remote_pool.cc @@ -0,0 +1,96 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include + +#include "common/util/status.h" +#include "server/async/rpc_server.h" +#include "server/server/vineyard_server.h" +#include "server/util/remote.h" +#include "server/util/remote_pool.h" + +namespace vineyard { + +Status RemoteClientPool::BorrowClient(std::string endpoint, + std::shared_ptr& client) { + VLOG(2) << "Borrow client from pool, endpoint: " << endpoint; + std::lock_guard lock(clients_mutex_); + auto iter = clients_.find(endpoint); + if (iter != clients_.end() && !iter->second.empty()) { + VLOG(2) << "Get client from pool"; + client = iter->second.front(); + iter->second.pop(); + return Status::OK(); + } else { + VLOG(2) << "Client is not enough, create a new one"; + client = std::make_shared(server_ptr_); + RETURN_ON_ERROR(client->Connect(endpoint, server_ptr_->session_id(), "")); + total_clients_++; + return Status::OK(); + } +} + +Status RemoteClientPool::ReleaseClient(std::string endpoint, + std::shared_ptr client) { + VLOG(2) << "Release client to pool, endpoint: " << endpoint; + std::lock_guard lock(clients_mutex_); + if (!client->connected_) { + LOG(WARNING) << "Client is not connected, discard..."; + LOG(INFO) << "Pool size of " << endpoint << " is " + << clients_[endpoint].size(); + total_clients_--; + return Status::OK(); + } + auto iter = clients_.find(endpoint); + if (iter != clients_.end()) { + iter->second.push(client); + } else { + std::queue> q; + q.push(client); + clients_[endpoint] = q; + } + VLOG(2) << "Release client to pool. 
Pool size of " << endpoint << " is " + << clients_[endpoint].size(); + static uint64_t last_time = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + uint64_t now = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + if (now - last_time > SECOND_TO_MILLISECOND(3)) { + LOG(INFO) << "Currently remote client pool size is:" << clients_.size(); + for (auto item : clients_) { + LOG(INFO) << "endpoint: " << item.first + << " client num: " << item.second.size(); + } + LOG(INFO) << "Total client:" << total_clients_; + last_time = now; + } + return Status::OK(); +} + +size_t RemoteClientPool::AvailableClientNum(std::string endpoint) { + size_t num = 0; + std::lock_guard lock(clients_mutex_); + auto iter = clients_.find(endpoint); + if (iter != clients_.end()) { + num = iter->second.size(); + } + return num; +} + +} // namespace vineyard diff --git a/src/server/util/remote_pool.h b/src/server/util/remote_pool.h new file mode 100644 index 000000000..e29c9a766 --- /dev/null +++ b/src/server/util/remote_pool.h @@ -0,0 +1,59 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef SRC_SERVER_UTIL_REMOTE_POOL_H_ +#define SRC_SERVER_UTIL_REMOTE_POOL_H_ + +#include +#include +#include +#include + +#include "common/util/status.h" +#include "server/async/rpc_server.h" +#include "server/server/vineyard_server.h" +#include "server/util/remote.h" + +namespace vineyard { + +class RemoteClientPool { + public: + explicit RemoteClientPool(std::shared_ptr server_ptr, + std::shared_ptr rpc_server_ptr) + : server_ptr_(server_ptr), rpc_server_ptr_(rpc_server_ptr) {} + + ~RemoteClientPool() = default; + + Status BorrowClient(std::string endpoint, + std::shared_ptr& client); + + Status ReleaseClient(std::string endpoint, + std::shared_ptr client); + + size_t AvailableClientNum(std::string endpoint); + + private: + std::unordered_map>> + clients_; + uint64_t total_clients_ = 0; + + std::recursive_mutex clients_mutex_; + std::shared_ptr server_ptr_; + std::shared_ptr rpc_server_ptr_; +}; + +} // namespace vineyard + +#endif // SRC_SERVER_UTIL_REMOTE_POOL_H_ diff --git a/src/server/util/spill_file.cc b/src/server/util/spill_file.cc index cc24197fd..4fcb1bf0b 100644 --- a/src/server/util/spill_file.cc +++ b/src/server/util/spill_file.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "server/util/spill_file.h" - #include #include #include @@ -25,6 +23,9 @@ limitations under the License. #include "server/memory/memory.h" #include "server/util/file_io_adaptor.h" +#ifdef BUILD_VINEYARDD_SPILLING +#include "server/util/spill_file.h" + namespace vineyard { namespace io { @@ -112,3 +113,5 @@ Status SpillFileReader::Delete_(const ObjectID id) { } // namespace io } // namespace vineyard + +#endif // BUILD_VINEYARDD_SPILLING diff --git a/src/server/util/utils.h b/src/server/util/utils.h new file mode 100644 index 000000000..2b81ae9f3 --- /dev/null +++ b/src/server/util/utils.h @@ -0,0 +1,54 @@ +/** Copyright 2020-2023 Alibaba Group Holding Limited. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef SRC_SERVER_UTIL_UTILS_H_ +#define SRC_SERVER_UTIL_UTILS_H_ + +#include +#include +#include + +namespace vineyard { + +#ifndef SECOND_TO_MILLISECOND +#define SECOND_TO_MILLISECOND(x) ((x) *1000) +#endif + +static inline bool is_port_available(int port) { + int sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock < 0) { + return false; + } + + int opt = 1; + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = htons(port); + + if (bind(sock, (struct sockaddr*) &addr, sizeof(addr)) == 0) { + close(sock); + return true; + } else { + close(sock); + return false; + } +} + +} // namespace vineyard + +#endif // SRC_SERVER_UTIL_UTILS_H_ diff --git a/thirdparty/etcd-cpp-apiv3 b/thirdparty/etcd-cpp-apiv3 index ea56cee80..7c6e714f1 160000 --- a/thirdparty/etcd-cpp-apiv3 +++ b/thirdparty/etcd-cpp-apiv3 @@ -1 +1 @@ -Subproject commit ea56cee80f441973a0149b57604e7a7874c61b65 +Subproject commit 7c6e714f188f9576e25e0350cac4181139eec23e diff --git a/thirdparty/thread-pool/thread_pool.h b/thirdparty/thread-pool/thread_pool.h new file mode 100644 index 000000000..fdb1ba1d1 --- /dev/null +++ b/thirdparty/thread-pool/thread_pool.h @@ -0,0 +1,117 @@ +#ifndef THREAD_POOL_H +#define THREAD_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ThreadPool { +public: + 
ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + -> std::future::type>; + template + void enqueue_noreturn(F&& f, Args&&... args); + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for(size_t i = 0;i task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this]{ return this->stop || !this->tasks.empty(); }); + if(this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F&& f, Args&&... args) + -> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) + ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if(stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task](){ (*task)(); }); + } + condition.notify_one(); + return res; +} + +template +void ThreadPool::enqueue_noreturn(F&& f, Args&&... 
args) +{ + auto task = std::bind(std::forward(f), std::forward(args)...); + + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if(stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace(task); + } + condition.notify_one(); +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for(std::thread &worker: workers) + worker.join(); +} + +#endif \ No newline at end of file From 640485e589f4a6a0ae0e105dd856ae8912cf5f58 Mon Sep 17 00:00:00 2001 From: vegetableysm Date: Tue, 18 Nov 2025 13:55:54 +0800 Subject: [PATCH 2/2] Format code. Signed-off-by: vegetableysm --- python/vineyard/core/client.py | 78 +++++++--- .../core/tests/fixed_stream_receiver.py | 146 +++++++++--------- .../core/tests/fixed_stream_sender.py | 104 +++++++------ python/vineyard/io/fixed_blob.py | 101 ++++++------ 4 files changed, 248 insertions(+), 181 deletions(-) diff --git a/python/vineyard/core/client.py b/python/vineyard/core/client.py index 0f2207d10..6c4795b36 100644 --- a/python/vineyard/core/client.py +++ b/python/vineyard/core/client.py @@ -18,11 +18,11 @@ import contextlib import os +import threading import warnings +from concurrent.futures import Future from concurrent.futures import ThreadPoolExecutor from concurrent.futures import as_completed -from concurrent.futures import Future -import threading from typing import Any from typing import Dict from typing import List @@ -45,28 +45,28 @@ from vineyard._C import _connect from vineyard.core.builder import BuilderContext from vineyard.core.builder import put -from vineyard.core.resolver import get_current_resolvers from vineyard.core.resolver import ResolverContext from vineyard.core.resolver import get +from vineyard.core.resolver import get_current_resolvers + class AsyncFixedStreamChunk: - + def __init__(self, client, chunk_nums): self.client = client 
self._chunk_nums = chunk_nums - self.future : Optional[Future] = None + self.future: Optional[Future] = None self._reader_index = 0 self._writer_index = 0 self._ready_list = [] - self._exception : Optional[Exception] = None + self._exception: Optional[Exception] = None self._lock = threading.RLock() self._start_fetch() - def _start_fetch(self): self.future = self.client._async_task_thread_pool.submit(self._fetch) self.future.add_done_callback(self._callback) - + def _fetch(self): try: try: @@ -81,7 +81,7 @@ def _fetch(self): self._writer_index += 1 except Exception as e: self._exception = e - + def _callback(self, future): try: future.result() @@ -91,7 +91,7 @@ def _callback(self, future): with self._lock: if self._writer_index < self._chunk_nums: self._start_fetch() - + def get(self) -> int: if self._exception: raise self._exception @@ -107,6 +107,7 @@ def get(self) -> int: pass return -1 + def _apply_docstring(func): def _apply(fn): fn.__doc__ = func.__doc__ @@ -455,13 +456,15 @@ def open_stream(self, id: ObjectID, mode: str) -> None: @_apply_docstring(IPCClient.close_stream) def close_stream(self, id: ObjectID) -> None: return self.default_client().close_stream(id) - + @_apply_docstring(IPCClient.delete_stream) def delete_stream(self, id: ObjectID) -> None: return self.default_client().delete_stream(id) - + @_apply_docstring(IPCClient.create_fixed_stream) - def create_fixed_stream(self, stream_name: str, blob_num: int, size: int) -> ObjectID: + def create_fixed_stream( + self, stream_name: str, blob_num: int, size: int + ) -> ObjectID: return self.default_client().create_fixed_stream(stream_name, blob_num, size) @_apply_docstring(IPCClient.push_chunk) @@ -738,17 +741,45 @@ def next_buffer_chunk(self, stream: ObjectID) -> memoryview: return self.ipc_client.next_buffer_chunk(stream) @_apply_docstring(IPCClient.vineyard_open_remote_fixed_stream_with_id) - def vineyard_open_remote_fixed_stream_with_id(self, remote_id: ObjectID, local_id: ObjectID, blob_nums: int, 
size: int, remote_endpoint: str, mode: str, wait: bool, timeout: int) -> int: - return self.ipc_client.vineyard_open_remote_fixed_stream_with_id(remote_id, local_id, blob_nums, size, remote_endpoint, mode, wait, timeout) - + def vineyard_open_remote_fixed_stream_with_id( + self, + remote_id: ObjectID, + local_id: ObjectID, + blob_nums: int, + size: int, + remote_endpoint: str, + mode: str, + wait: bool, + timeout: int, + ) -> int: + return self.ipc_client.vineyard_open_remote_fixed_stream_with_id( + remote_id, local_id, blob_nums, size, remote_endpoint, mode, wait, timeout + ) + @_apply_docstring(IPCClient.vineyard_open_remote_fixed_stream_with_name) - def vineyard_open_remote_fixed_stream_with_name(self, remote_name: str, local_id: ObjectID, blob_nums: int, size: int, remote_endpoint: str, mode: str, wait: bool, timeout: int) -> int: - return self.ipc_client.vineyard_open_remote_fixed_stream_with_name(remote_name, local_id, blob_nums, size, remote_endpoint, mode, wait, timeout) - + def vineyard_open_remote_fixed_stream_with_name( + self, + remote_name: str, + local_id: ObjectID, + blob_nums: int, + size: int, + remote_endpoint: str, + mode: str, + wait: bool, + timeout: int, + ) -> int: + return self.ipc_client.vineyard_open_remote_fixed_stream_with_name( + remote_name, local_id, blob_nums, size, remote_endpoint, mode, wait, timeout + ) + @_apply_docstring(IPCClient.vineyard_activate_remote_fixed_stream_with_offset) - def vineyard_activate_remote_fixed_stream_with_offset(self, stream_id: ObjectID, offsets: List[int]) -> None: - return self.ipc_client.vineyard_activate_remote_fixed_stream_with_offset(stream_id, offsets) - + def vineyard_activate_remote_fixed_stream_with_offset( + self, stream_id: ObjectID, offsets: List[int] + ) -> None: + return self.ipc_client.vineyard_activate_remote_fixed_stream_with_offset( + stream_id, offsets + ) + # List[0]: fd, List[1]: size, List[2]: offset @_apply_docstring(IPCClient.get_vineyard_mmap_fd) def get_vineyard_mmap_fd(self) 
-> List[int]: @@ -757,7 +788,7 @@ def get_vineyard_mmap_fd(self) -> List[int]: @_apply_docstring(IPCClient.vineyard_get_next_fixed_stream_chunk) def vineyard_get_next_fixed_stream_chunk(self) -> int: return self.ipc_client.vineyard_get_next_fixed_stream_chunk() - + @_apply_docstring(IPCClient.open_fixed_stream) def open_fixed_stream(self, stream_id: ObjectID, mode: str) -> None: return self.ipc_client.open_fixed_stream(stream_id, mode) @@ -1003,4 +1034,5 @@ def with_spread(self, enabled: bool = True): def vineyard_get_next_fixed_stream_chunk_async(self, nums) -> AsyncFixedStreamChunk: return AsyncFixedStreamChunk(self, nums) + __all__ = ['Client'] diff --git a/python/vineyard/core/tests/fixed_stream_receiver.py b/python/vineyard/core/tests/fixed_stream_receiver.py index dc302083e..cabeb87a0 100644 --- a/python/vineyard/core/tests/fixed_stream_receiver.py +++ b/python/vineyard/core/tests/fixed_stream_receiver.py @@ -16,90 +16,96 @@ # limitations under the License. # -from datetime import datetime import mmap import sys import time -import vineyard -from vineyard.io.fixed_blob import FixedBlobStream +from datetime import datetime +import vineyard from vineyard._C import ObjectID +from vineyard.io.fixed_blob import FixedBlobStream blob_num = 10 blob_size = 1024 * 1024 * 2 -def run_receiver(client: vineyard.Client, mm: mmap.mmap, ipc_socket: str, rpc_endpoint: str): - fixed_blob_stream = FixedBlobStream.new(client, "test-stream-5", blob_num, blob_size, True, rpc_endpoint) - stream_reader = fixed_blob_stream.open_reader(client, True, 10000) - offset_list = [] - for i in range(blob_num): - offset_list.append(i * blob_size) - - stream_reader.activate_stream_with_offset(offset_list) - - total_finished = stream_reader.check_block_received(-1) - print("Stream is :", "finished" if total_finished else "not finished") - - for i in range(blob_num): - finished = False - while not finished: - start_time = datetime.now().microsecond - try: - finished = 
stream_reader.check_block_received(i) - except Exception as e: - print(f"Error checking block {i}: {e}") - break - - end_time = datetime.now().microsecond - print(f"Waiting for chunk {i}...") - time.sleep(0.2) - - if finished is not True: - while True: - aborted = stream_reader.abort() - if aborted: - print("Stream aborted, bye...") - return - - for j in range(blob_size): - assert mm.read_byte() == j % 256 - print("Chunk ", i, " received successfully") - - for i in range(blob_num): - finished = False - while not finished: - start_time = datetime.now().microsecond - finished = stream_reader.check_block_received(i) - end_time = datetime.now().microsecond - print(f"check used time: {end_time - start_time} us") - - start_time = datetime.now().microsecond - total_finished = stream_reader.check_block_received(-1) - end_time = datetime.now().microsecond - print("Stream is :", "finished" if total_finished else "not finished") - print("check all use time: ", end_time - start_time, " us") - stream_reader.finish_and_delete() + +def run_receiver( + client: vineyard.Client, mm: mmap.mmap, ipc_socket: str, rpc_endpoint: str +): + fixed_blob_stream = FixedBlobStream.new( + client, "test-stream-5", blob_num, blob_size, True, rpc_endpoint + ) + stream_reader = fixed_blob_stream.open_reader(client, True, 10000) + offset_list = [] + for i in range(blob_num): + offset_list.append(i * blob_size) + + stream_reader.activate_stream_with_offset(offset_list) + + total_finished = stream_reader.check_block_received(-1) + print("Stream is :", "finished" if total_finished else "not finished") + + for i in range(blob_num): + finished = False + while not finished: + start_time = datetime.now().microsecond + try: + finished = stream_reader.check_block_received(i) + except Exception as e: + print(f"Error checking block {i}: {e}") + break + + end_time = datetime.now().microsecond + print(f"Waiting for chunk {i}...") + time.sleep(0.2) + + if finished is not True: + while True: + aborted = 
stream_reader.abort() + if aborted: + print("Stream aborted, bye...") + return + + for j in range(blob_size): + assert mm.read_byte() == j % 256 + print("Chunk ", i, " received successfully") + + for i in range(blob_num): + finished = False + while not finished: + start_time = datetime.now().microsecond + finished = stream_reader.check_block_received(i) + end_time = datetime.now().microsecond + print(f"check used time: {end_time - start_time} us") + + start_time = datetime.now().microsecond + total_finished = stream_reader.check_block_received(-1) + end_time = datetime.now().microsecond + print("Stream is :", "finished" if total_finished else "not finished") + print("check all use time: ", end_time - start_time, " us") + stream_reader.finish_and_delete() def __main__(): - arguments = sys.argv[1:] - if len(arguments) < 2: - print("Usage: fixed_stream_receiver.py ") - return 1 - - ipc_socket = arguments[0] - rpc_endpoint = arguments[1] - client = vineyard.connect(ipc_socket) - client.timeout_seconds = 5 + arguments = sys.argv[1:] + if len(arguments) < 2: + print("Usage: fixed_stream_receiver.py ") + return 1 + + ipc_socket = arguments[0] + rpc_endpoint = arguments[1] + client = vineyard.connect(ipc_socket) + client.timeout_seconds = 5 + + list = client.get_vineyard_mmap_fd() + fd = list[0] + offset = list[2] - list = client.get_vineyard_mmap_fd() - fd = list[0] - offset = list[2] + mm = mmap.mmap(fd, 0) + mm.seek(offset) - mm = mmap.mmap(fd, 0) - mm.seek(offset) + run_receiver(client, mm, ipc_socket, rpc_endpoint) - run_receiver(client, mm, ipc_socket, rpc_endpoint) if __name__ == "__main__": - __main__() \ No newline at end of file + __main__() diff --git a/python/vineyard/core/tests/fixed_stream_sender.py b/python/vineyard/core/tests/fixed_stream_sender.py index eca399813..a206461c2 100644 --- a/python/vineyard/core/tests/fixed_stream_sender.py +++ b/python/vineyard/core/tests/fixed_stream_sender.py @@ -15,68 +15,86 @@ # See the License for the specific language 
governing permissions and # limitations under the License. # -import sys import mmap -import vineyard +import sys import threading from time import sleep + +import vineyard from vineyard.io.fixed_blob import FixedBlobStream blob_num = 10 blob_size = 1024 * 1024 * 2 -def check_received(client:vineyard.Client, stream_id: vineyard.ObjectID, stream_reader: FixedBlobStream.Reader): - finished = False - while not finished: - finished = stream_reader.check_block_received(-1) - print("Waiting for stream to finish...") - sleep(2) - success = False - success = stream_reader.abort() - print("Stream aborted: ", success) - stream_reader.finish_and_delete() +def check_received( + client: vineyard.Client, + stream_id: vineyard.ObjectID, + stream_reader: FixedBlobStream.Reader, +): + finished = False + while not finished: + finished = stream_reader.check_block_received(-1) + print("Waiting for stream to finish...") + sleep(2) + + success = False + success = stream_reader.abort() + print("Stream aborted: ", success) + stream_reader.finish_and_delete() + def run_sender(client: vineyard.Client, mm: mmap.mmap): - fixed_blob_stream = FixedBlobStream.new(client, "test-stream-5", blob_num, blob_size, False, "") - stream_writer = fixed_blob_stream.open_writer(client) + fixed_blob_stream = FixedBlobStream.new( + client, "test-stream-5", blob_num, blob_size, False, "" + ) + stream_writer = fixed_blob_stream.open_writer(client) + + offset_list = [] + for i in range(blob_num): + for j in range(blob_size): + mm.write_byte(j % 256) + + for i in range(blob_num): + offset_list.append(i * blob_size) - offset_list = [] - for i in range(blob_num): - for j in range(blob_size): - mm.write_byte(j % 256) + thread = threading.Thread( + target=check_received, + args=( + client, + id, + stream_writer, + ), + ) - for i in range(blob_num): - offset_list.append(i * blob_size) - - thread = threading.Thread(target=check_received, args=(client, id, stream_writer,)) + thread.start() - thread.start() + for offset 
in offset_list: + stream_writer.append(offset) + sleep(1) + + thread.join() - for offset in offset_list: - stream_writer.append(offset) - sleep(1) - - thread.join() def __main__(): - arguments = sys.argv[1:] - if len(arguments) < 1: - print("Usage: fixed_stream_receiver.py <ipc_socket>") - return 1 - - ipc_socket = arguments[0] - client = vineyard.connect(ipc_socket) - client.timeout_seconds = 5 + arguments = sys.argv[1:] + if len(arguments) < 1: + print("Usage: fixed_stream_receiver.py <ipc_socket>") + return 1 + + ipc_socket = arguments[0] + client = vineyard.connect(ipc_socket) + client.timeout_seconds = 5 + + list = client.get_vineyard_mmap_fd() + fd = list[0] + offset = list[2] - list = client.get_vineyard_mmap_fd() - fd = list[0] - offset = list[2] + mm = mmap.mmap(fd, 0) + mm.seek(offset) - mm = mmap.mmap(fd, 0) - mm.seek(offset) + run_sender(client, mm) - run_sender(client, mm) if __name__ == "__main__": - __main__() \ No newline at end of file + __main__() diff --git a/python/vineyard/io/fixed_blob.py b/python/vineyard/io/fixed_blob.py index 5e4a2a161..17ec54422 100644 --- a/python/vineyard/io/fixed_blob.py +++ b/python/vineyard/io/fixed_blob.py @@ -16,8 +16,7 @@ # limitations under the License. # -''' This module exposes support for FixedBlobStream.
-''' +''' This module exposes support for FixedBlobStream.''' import contextlib import mmap @@ -26,11 +25,12 @@ from typing import Optional from vineyard._C import InvalidException -from vineyard._C import ObjectMeta from vineyard._C import ObjectID +from vineyard._C import ObjectMeta from vineyard.core import context from vineyard.io.stream import BaseStream + class FixedBlobStream(BaseStream): def __init__(self, meta: ObjectMeta): super().__init__(meta) @@ -43,12 +43,14 @@ def __init__(self, meta: ObjectMeta): self.error_msg_len = 256 @staticmethod - def new(client, - stream_name: str, - nums: int, - size: int, - is_remote: bool = False, - rpc_endpoint: Optional[str] = "") -> "FixedBlobStream": + def new( + client, + stream_name: str, + nums: int, + size: int, + is_remote: bool = False, + rpc_endpoint: Optional[str] = "", + ) -> "FixedBlobStream": meta = ObjectMeta() meta['typename'] = 'vineyard::FixedBlobStream' meta['nums'] = nums @@ -63,10 +65,10 @@ def new(client, class Reader(BaseStream.Reader): def __init__(self, stream: "FixedBlobStream"): self.stream_ = stream - + def next(self) -> object: raise NotImplementedError("FixedBlobStream does not support read yet.") - + def next_metadata(self) -> ObjectMeta: raise NotImplementedError("FixedBlobStream does not support read yet.") @@ -78,78 +80,89 @@ def abort(self) -> bool: def finish(self): self.stream_.close() - + def finish_and_delete(self): client_ = self.stream_.client_ self.stream_.close() FixedBlobStream.delete(client_, self.stream_) - - def check_block_received(self, index:int) -> bool: + + def check_block_received(self, index: int) -> bool: return self.stream_.check_block_received(index) - + class Writer(BaseStream.Writer): def __init__(self, stream: "FixedBlobStream"): self.stream_ = stream - + def next(self, size: int) -> memoryview: raise NotImplementedError("FixedBlobStream does not support write yet.") - + def append(self, offset: int): self.stream_.push_offset_block(offset) def fail(self): raise 
NotImplementedError("FixedBlobStream does not support write yet.") - + def abort(self) -> bool: return self.stream_.abort() - + def finish(self): self.stream_.close() - + def finish_and_delete(self): client_ = self.stream_.client_ self.stream_.close() FixedBlobStream.delete(client_, self.stream_) - - def check_block_received(self, index:int) -> bool: + + def check_block_received(self, index: int) -> bool: return self.stream_.check_block_received(index) def open_reader(self, client, wait: bool = False, timeout: int = 0): self.open(client, "r", wait, timeout) return FixedBlobStream.Reader(self) - + def open_writer(self, client): self.open(client, "w") return FixedBlobStream.Writer(self) - def open(self, - client, - mode, - wait: bool = False, - timeout: int = 0): + def open(self, client, mode, wait: bool = False, timeout: int = 0): self.client_ = client - if (self.is_remote_): - self.recv_mem_fd_ = self.client_.vineyard_open_remote_fixed_stream_with_name(self.stream_name_, self.meta.id, self.nums_, self.size_, self.rpc_endpoint_, mode, wait, timeout) + if self.is_remote_: + self.recv_mem_fd_ = ( + self.client_.vineyard_open_remote_fixed_stream_with_name( + self.stream_name_, + self.meta.id, + self.nums_, + self.size_, + self.rpc_endpoint_, + mode, + wait, + timeout, + ) + ) else: self.recv_mem_fd_ = self.client_.open_fixed_stream(self.meta.id, mode) - if (self.recv_mem_fd_ < 0): + if self.recv_mem_fd_ < 0: raise ValueError("Failed to open remote fixed stream") try: - self.recv_mem_ = mmap.mmap(self.recv_mem_fd_, self.mmap_size, access=mmap.ACCESS_READ) + self.recv_mem_ = mmap.mmap( + self.recv_mem_fd_, self.mmap_size, access=mmap.ACCESS_READ + ) except Exception as e: self.close() raise e def activate_stream_with_offset(self, offsets: List[int]): - if (not self.is_remote_): + if not self.is_remote_: raise ValueError("The stream is not remote stream") - self.client_.vineyard_activate_remote_fixed_stream_with_offset(self.meta.id, offsets) + 
self.client_.vineyard_activate_remote_fixed_stream_with_offset( + self.meta.id, offsets + ) def push_offset_block(self, offsets: int): self.client_.push_next_stream_chunk_by_offset(self.meta.id, offsets) - def check_block_received(self, index:int) -> bool: - if (self.recv_mem_[self.mmap_size - 1] != 0): + def check_block_received(self, index: int) -> bool: + if self.recv_mem_[self.mmap_size - 1] != 0: self.recv_mem_.seek(self.mmap_size - self.error_msg_len - 1) error_msg = self.recv_mem_.read(self.error_msg_len) null_byte_index = error_msg.find(b'\0') @@ -159,22 +172,21 @@ def check_block_received(self, index:int) -> bool: error_msg = error_msg raise InvalidException(error_msg.decode('ascii')) - if (index == -1): + if index == -1: ret = True for i in range(self.nums_): if self.recv_mem_[i] == 0: ret = False break return ret - elif (index < 0 or index >= self.nums_): + elif index < 0 or index >= self.nums_: raise ValueError("Invalid index") else: return self.recv_mem_[index] == 1 - def close(self): try: - if (self.is_remote_): + if self.is_remote_: self.client_.vineyard_close_remote_fixed_stream(self.meta.id) else: self.client_.close_stream(self.meta.id) @@ -184,9 +196,9 @@ def close(self): os.close(self.recv_mem_fd_) self.recv_mem_.close() self.client_ = None - + def abort(self) -> bool: - if (self.is_remote_): + if self.is_remote_: return self.client_.vineyard_abort_remote_stream(self.meta.id) else: return self.client_.abort_stream(self.meta.id) @@ -195,6 +207,7 @@ def abort(self) -> bool: def delete(client, fixed_blob_stream: "FixedBlobStream"): client.delete_stream(fixed_blob_stream.meta.id) + def fixed_blob_stream_resolver(obj, resolver): # pylint: disable=unused-argument meta = obj.meta return FixedBlobStream(meta) @@ -202,9 +215,7 @@ def fixed_blob_stream_resolver(obj, resolver): # pylint: disable=unused-argumen def register_fixed_blob_stream_types(_builder_ctx, resolver_ctx): if resolver_ctx is not None: - resolver_ctx.register( - 
'vineyard::FixedBlobStream', fixed_blob_stream_resolver - ) + resolver_ctx.register('vineyard::FixedBlobStream', fixed_blob_stream_resolver) @contextlib.contextmanager