diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml index d3339ac15f..078eb7e4af 100644 --- a/.github/workflows/windows-x64-gpu.yml +++ b/.github/workflows/windows-x64-gpu.yml @@ -50,11 +50,11 @@ jobs: INPUT_CUDA_VERSION: ${{ matrix.cudaver }} - name: Build wheel run: | - $env:BUILD_TEST="ON" + $env:BUILD_TEST="OFF" mkdir build cd build ..\builder\windows\generate.ps1 - cmake --build . --config Release -- /m /v:q + cmake --build . --config Release -- /m /v:n if (-Not $?) { echo "build failed" exit 1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 34d04f7a06..5754f00dbd 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,13 +15,16 @@ cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # for PyTorch extensions, version should be greater than 3.13 project(TurboMind LANGUAGES CXX CUDA) -find_package(CUDA 10.2 REQUIRED) +if (MSVC) + # use standard conformant preprocessor + add_compile_options($<$:/Zc:preprocessor>) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/Zc:preprocessor") +endif () find_package(CUDAToolkit REQUIRED) -if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11") +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11") add_definitions("-DENABLE_BF16") - message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag") endif() set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) @@ -29,8 +32,11 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) option(BUILD_MULTI_GPU "Build multi-gpu support" ON) option(BUILD_PY_FFI "Build python ffi" ON) option(BUILD_TEST "Build tests" OFF) +option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) +option(BUILD_FAST_MATH "Build in fast math mode" ON) include(FetchContent) + if (BUILD_TEST) FetchContent_Declare( repo-cutlass @@ -45,6 +51,14 @@ if (BUILD_TEST) set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include) set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include) + + + FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG v3.8.0 + ) + FetchContent_MakeAvailable(Catch2) endif() FetchContent_Declare( @@ -56,10 +70,6 @@ set(YAML_BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library of yaml-cpp") FetchContent_MakeAvailable(yaml-cpp) -option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) - -option(BUILD_FAST_MATH "Build in fast math mode" ON) - # the environment variable # ASAN_OPTIONS=protect_shadow_gap=0,intercept_tls_get_addr=0 # must be set at runtime @@ -112,13 +122,13 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") # -Xptxas -v # TODO: build for sm_72 & sm_87 on aarch64 platform (Jetson devices) if (NOT CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 70-real 75-real) - if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11") + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11") list(APPEND CMAKE_CUDA_ARCHITECTURES 80-real) endif () - if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.1") + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.1") list(APPEND CMAKE_CUDA_ARCHITECTURES 86-real) endif () - if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8") + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.8") list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-real) endif () if (MSVC) @@ -132,19 +142,23 @@ set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") set(CMAKE_CXX_FLAGS_DEBUG 
"${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") # set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage") -set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall -DCUDA_PTX_FP8_F2FP_ENABLED") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") set(CMAKE_CXX_STANDARD "${CXX_STD}") set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD} -DCUDA_PTX_FP8_F2FP_ENABLED") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") + +string(REPLACE "-O2" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") +string(REPLACE "-O2" "" CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}") +string(REPLACE "-O2" "" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") +string(REPLACE "-O2" "" CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") -# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose") -set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 -DCUDA_PTX_FP8_F2FP_ENABLED") -set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO} -Xcompiler -O3 -DCUDA_PTX_FP8_F2FP_ENABLED") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -O3") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO} -O3") if(BUILD_FAST_MATH) set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} --use_fast_math") @@ -207,13 +221,11 @@ link_directories( ${COMMON_LIB_DIRS} ) -# add_subdirectory(3rdparty) add_subdirectory(src) -# add_subdirectory(examples) -if(BUILD_TEST) - add_subdirectory(tests/csrc) -endif() +# if(BUILD_TEST) +# add_subdirectory(tests/csrc) +# endif() # install python api if (BUILD_PY_FFI) diff --git a/builder/windows/generate.ps1 b/builder/windows/generate.ps1 index 96dbbc70bd..0c133b37d0 100644 --- a/builder/windows/generate.ps1 +++ b/builder/windows/generate.ps1 @@ -3,6 +3,5 @@ cmake .. 
-A x64 -T "v142,cuda=$env:CUDA_PATH" ` -DCMAKE_INSTALL_PREFIX=install ` -DBUILD_PY_FFI=ON ` -DBUILD_MULTI_GPU=OFF ` - -DCMAKE_CUDA_FLAGS="-lineinfo" ` - -DUSE_NVTX=ON ` + -DUSE_NVTX=OFF ` -DBUILD_TEST="$env:BUILD_TEST" diff --git a/builder/windows/setup_cuda.ps1 b/builder/windows/setup_cuda.ps1 index b573198ce2..5615aba84a 100644 --- a/builder/windows/setup_cuda.ps1 +++ b/builder/windows/setup_cuda.ps1 @@ -24,6 +24,8 @@ if ($CUDA_VERSION_FULL -eq "12.1.0") { $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_531.14_windows.exe" } elseif ($CUDA_VERSION_FULL -eq "11.8.0") { $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe" +} elseif ($CUDA_VERSION_FULL -eq "12.5.0") { + $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.85_windows.exe" } else { Write-Output "Unsupported CUDA version specified" exit 1 @@ -84,6 +86,8 @@ $msBuildExtensions = (Get-ChildItem "$src\visual_studio_integration\CUDAVisualS } } +$CUDA_FLAGS="-allow-unsupported-compiler -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH=1" + # Add to Github env Write-Output "Setting environment variables for GitHub Actions..." @@ -97,7 +101,7 @@ Write-Output "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)=$dst" >> $env:GITHUB_ENV Write-Output "CUDA_PATH_VX_Y=CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)" >> $env:GITHUB_ENV Write-Output "CudaToolkitDir=$dst" >> $env:GITHUB_ENV Write-Output "CMAKE_CUDA_COMPILER=$dst\bin\nvcc.exe" >> $env:GITHUB_ENV -Write-Output "NVCC_APPEND_FLAGS=-allow-unsupported-compiler" >> $env:GITHUB_ENV +Write-Output "NVCC_APPEND_FLAGS=$CUDA_FLAGS" >> $env:GITHUB_ENV Write-Output "CUDA_VERSION=$CUDA_VERSION_FULL" >> $env:GITHUB_ENV Write-Output "Setup completed." 
diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 7b2bc5db6f..53e2f6b7e1 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -319,7 +319,8 @@ def pad_weight(tensor: torch.Tensor, tp: int): if output_weight is not None: tp = self.model.attn_tp_size output_weight = pad_weight(output_weight, tp=tp) - self.model.save_split(output_weight, 'output.weight', split_dim=0, split_num=tp) + # transpose + self.model.save_split(output_weight.t(), 'output.weight', split_dim=1, split_num=tp) class Transformer: diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 8d43923109..3ff1dc1436 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -241,7 +241,7 @@ def _from_hf(self, model_source: ModelSource, model_path: str, engine_config: Tu model_comm = _tm.AbstractTransformerModel.create_llama_model(model_dir='', config=yaml.safe_dump(self.config_dict), - data_type=self.config.model_config.weight_type) + weight_type=self.config.model_config.weight_type) # create empty weight self._create_weight(model_comm) @@ -275,7 +275,7 @@ def _from_workspace(self, model_path: str, engine_config: TurbomindEngineConfig) weight_dir = osp.join(model_path, 'triton_models', 'weights') model_comm = _tm.AbstractTransformerModel.create_llama_model(model_dir=weight_dir, config=yaml.safe_dump(self.config_dict), - data_type=self.config.weight_type) + weight_type=self.config.weight_type) # create weight and load params self._create_weight(model_comm) diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index b4f1033e67..df86f40ea6 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -13,6 +13,7 @@ # limitations under the License. 
add_subdirectory(utils) +add_subdirectory(core) add_subdirectory(kernels) add_subdirectory(layers) add_subdirectory(comm) diff --git a/src/turbomind/comm/CMakeLists.txt b/src/turbomind/comm/CMakeLists.txt index 43a2dacf21..6e5c772c46 100644 --- a/src/turbomind/comm/CMakeLists.txt +++ b/src/turbomind/comm/CMakeLists.txt @@ -3,10 +3,11 @@ cmake_minimum_required(VERSION 3.8) add_library(host_comm STATIC host_comm.cc thread_comm.cc) +target_link_libraries(host_comm PRIVATE core logger) set_property(TARGET host_comm PROPERTY POSITION_INDEPENDENT_CODE ON) add_library(device_comm STATIC device_comm.cc) -target_link_libraries(device_comm PRIVATE logger) +target_link_libraries(device_comm PRIVATE core logger) set_property(TARGET device_comm PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET device_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) @@ -21,7 +22,7 @@ if (BUILD_MULTI_GPU) if (BUILD_TEST) add_executable(test_comm test_comm.cu) - target_link_libraries(test_comm PRIVATE device_comm host_comm pthread nvtx_utils) + target_link_libraries(test_comm PRIVATE device_comm host_comm core pthread nvtx_utils) target_compile_options(test_comm PRIVATE -O3 -march=native -mtune=native) endif () endif () diff --git a/src/turbomind/comm/cuda_ipc/CMakeLists.txt b/src/turbomind/comm/cuda_ipc/CMakeLists.txt index 948d75c94e..7cc07c11db 100644 --- a/src/turbomind/comm/cuda_ipc/CMakeLists.txt +++ b/src/turbomind/comm/cuda_ipc/CMakeLists.txt @@ -12,6 +12,8 @@ add_library(cuda_ipc_comm STATIC target_link_libraries(cuda_ipc_comm PRIVATE rms_norm host_comm + core + cuda_utils CUDA::cuda_driver logger) diff --git a/src/turbomind/comm/cuda_ipc/allgather.cu b/src/turbomind/comm/cuda_ipc/allgather.cu index 94d0ebe1f9..f71bae395c 100644 --- a/src/turbomind/comm/cuda_ipc/allgather.cu +++ b/src/turbomind/comm/cuda_ipc/allgather.cu @@ -4,7 +4,6 @@ #include "src/turbomind/comm/cuda_ipc/device_semaphore.h" #include "src/turbomind/kernels/core/meta.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind::comm { @@ -51,7 +50,7 @@ __global__ void __launch_bounds__(1024, 1) Allgather_Simple_Pull(T* void CudaIpcCommImpl::AllGather( const void* sendbuff, void* recvbuff, size_t sendcount, DataType type, int group, cudaStream_t stream) { - const size_t bytesize = get_elem_size(type) * sendcount; + const size_t bytesize = turbomind::byte_size(type) * sendcount; const int peers = this->n_ranks(group) - 1; const int rank = this->rank(group); @@ -165,9 +164,9 @@ void CudaIpcCommImpl::AllGather2D(const void* sendbuff, int group, cudaStream_t stream) { - const size_t byte_width = get_elem_size(type) * width; - const size_t byte_pitch = get_elem_size(type) * pitch; - const size_t byte_stride = get_elem_size(type) * stride; + const size_t byte_width = byte_size(type, width); + const size_t byte_pitch = byte_size(type, pitch); + const size_t byte_stride = byte_size(type, stride); void* base{}; size_t offset{}; diff --git a/src/turbomind/comm/cuda_ipc/allreduce.cu b/src/turbomind/comm/cuda_ipc/allreduce.cu index 8461252a66..631aa1f212 100644 --- a/src/turbomind/comm/cuda_ipc/allreduce.cu +++ b/src/turbomind/comm/cuda_ipc/allreduce.cu @@ -6,9 +6,9 @@ #include "src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h" #include "src/turbomind/comm/cuda_ipc/device_semaphore.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/meta.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" @@ 
-423,14 +423,7 @@ void CudaIpcCommImpl::AllReduceSum( } }; - switch (type) { - case DataType::TYPE_FP16: - return invoke(half{}); - case DataType::TYPE_BF16: - return invoke(nv_bfloat16{}); - default: - throw std::runtime_error("not implemented"); - } + TM_DISPATCH_PRIMARY_DTYPES(type, invoke); } } // namespace turbomind::comm diff --git a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu index 0d229c58f0..7c0dde00af 100644 --- a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu +++ b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu @@ -1,8 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. #include -#include -#include +#include #include #include diff --git a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h index ba820bfc7a..f985f12d25 100644 --- a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h +++ b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h @@ -10,7 +10,6 @@ #include "src/turbomind/kernels/core/array.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind::comm { diff --git a/src/turbomind/comm/cuda_ipc/fused_allreduce.cu b/src/turbomind/comm/cuda_ipc/fused_allreduce.cu index 4948065e04..23e84cfbdf 100644 --- a/src/turbomind/comm/cuda_ipc/fused_allreduce.cu +++ b/src/turbomind/comm/cuda_ipc/fused_allreduce.cu @@ -8,13 +8,13 @@ #include "src/turbomind/comm/cuda_ipc/device_semaphore.h" #include "src/turbomind/comm/cuda_ipc/group_sum.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/meta.h" #include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind::comm { @@ -424,7 +424,7 @@ void CudaIpcCommImpl::AllreduceResidualBiasRMSnorm(void* hidden, cudaStream_t stream) { - const size_t elemsize = get_elem_size(dtype); + const size_t elemsize = byte_size(dtype); const size_t bytesize = elemsize * token_num * dim; const int n_ranks = this->n_ranks(group); @@ -504,19 +504,10 @@ void CudaIpcCommImpl::AllreduceResidualBiasRMSnorm(void* hidden, return false; // > 1024 vdim }; - auto dispatch = [&] { - switch (dtype) { - case DataType::TYPE_FP16: - return dispatch_D(half{}); - case DataType::TYPE_BF16: - return dispatch_D(nv_bfloat16{}); - default: - return false; - } - }; + auto dispatch = [&]() -> bool { TM_DISPATCH_PRIMARY_DTYPES_RET(dtype, dispatch_D); }; if (bytesize > (1 << 19)) { - if (auto success = dispatch()) { + if (dispatch()) { return; } } diff --git a/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu b/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu index 3340000777..a57172e60e 100644 --- a/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu +++ b/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu @@ -5,6 +5,7 @@ #include "src/turbomind/comm/cuda_ipc/group_sum.h" #include "src/turbomind/comm/cuda_ipc/mscclpp.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/meta.h" @@ -279,18 +280,11 @@ void CudaIpcCommImpl::AllreduceResidualBiasRMSnormEx(void* hidden, return false; // > 1024 vdim }; - auto dispatch = [&] { - switch (dtype) { - case DataType::TYPE_FP16: - return dispatch_D(half{}); - case DataType::TYPE_BF16: - return dispatch_D(nv_bfloat16{}); - default: - return false; - } + auto dispatch = [&]() -> bool { // + 
TM_DISPATCH_PRIMARY_DTYPES_RET(dtype, dispatch_D); }; - FT_CHECK(dispatch()); + TM_CHECK(dispatch()); } } // namespace turbomind::comm diff --git a/src/turbomind/comm/device_comm.cc b/src/turbomind/comm/device_comm.cc index 8e35d9d22c..8217d9c298 100644 --- a/src/turbomind/comm/device_comm.cc +++ b/src/turbomind/comm/device_comm.cc @@ -25,7 +25,7 @@ DeviceComm CreateDeviceCommunicator(const std::string& backend, int n_ranks, int } #endif - FT_CHECK_WITH_INFO(0, fmtstr("Unknown communication backend: %s", backend.c_str())); + TM_CHECK(0) << "Unknown communication backend: " << backend; return {}; } diff --git a/src/turbomind/comm/device_comm.h b/src/turbomind/comm/device_comm.h index 52045cbb03..d68ebdc4da 100644 --- a/src/turbomind/comm/device_comm.h +++ b/src/turbomind/comm/device_comm.h @@ -9,7 +9,6 @@ #include #include "src/turbomind/comm/host_comm.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind::comm { diff --git a/src/turbomind/comm/host_comm.h b/src/turbomind/comm/host_comm.h index 5cf35d7b28..b036142264 100644 --- a/src/turbomind/comm/host_comm.h +++ b/src/turbomind/comm/host_comm.h @@ -6,8 +6,9 @@ #include #include #include +#include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/data_type.h" namespace turbomind::comm { @@ -79,12 +80,12 @@ template void Broadcast(HostCommImpl* comm, T* data, int n, int root) { if constexpr (std::is_trivially_copyable_v) { - comm->Broadcast((char*)data, sizeof(T) * n, TYPE_INT8, root, detail::copy_fn); + comm->Broadcast(data, sizeof(T) * n, data_type_v, root, detail::copy_fn); } else { if (comm->is_same_process()) { /// TODO: Constness should be considered - comm->Broadcast(data, n, TYPE_INVALID, root, detail::copy_fn); + comm->Broadcast(data, n, kNull, root, detail::copy_fn); } else { throw std::runtime_error("not implemented"); @@ -96,12 +97,12 @@ template void AllGather(HostCommImpl* comm, T* data, int n) { if constexpr (std::is_trivially_copyable_v) { - comm->AllGather(data, sizeof(T) * n, TYPE_INT8, detail::copy_fn); + comm->AllGather(data, sizeof(T) * n, data_type_v, detail::copy_fn); } else { if (comm->is_same_process()) { /// TODO: Constness should be considered - comm->AllGather(data, n, TYPE_INVALID, detail::copy_fn); + comm->AllGather(data, n, kNull, detail::copy_fn); } else { /// serialize data @@ -113,7 +114,7 @@ void AllGather(HostCommImpl* comm, T* data, int n) template void AllReduce(HostCommImpl* comm, T* data, int n, RedOp red_op) { - comm->AllReduce(data, n, getTensorType(), red_op); + comm->AllReduce(data, n, data_type_v, red_op); } ////////////////////////////////////////////////////////////////////////////////// diff --git a/src/turbomind/comm/nccl/CMakeLists.txt b/src/turbomind/comm/nccl/CMakeLists.txt index 4a6e8d71a7..ceddbfc3d3 100644 --- a/src/turbomind/comm/nccl/CMakeLists.txt +++ b/src/turbomind/comm/nccl/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.8) add_library(nccl_comm STATIC nccl.cu) -target_link_libraries(nccl_comm PRIVATE rms_norm ${NCCL_LIBRARIES} logger) +target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES} logger) target_include_directories(nccl_comm PRIVATE ${NCCL_INCLUDE_DIRS}) set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/src/turbomind/comm/nccl/nccl.cu b/src/turbomind/comm/nccl/nccl.cu index 5a02d5b51e..804dfaaa46 100644 --- a/src/turbomind/comm/nccl/nccl.cu +++ b/src/turbomind/comm/nccl/nccl.cu @@ -10,7 +10,6 @@ #include "src/turbomind/comm/device_comm.h" #include 
"src/turbomind/comm/host_comm.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/string_utils.h" @@ -33,16 +32,16 @@ namespace turbomind::comm { -static inline ncclDataType_t getNcclDataType(DataType type) +static inline ncclDataType_t to_nccl_dtype(DataType type) { switch (type) { - case DataType::TYPE_FP32: + case kFloat32: return ncclFloat; - case DataType::TYPE_FP16: + case kFloat16: return ncclHalf; - case DataType::TYPE_BF16: + case kBfloat16: return ncclBfloat16; - case DataType::TYPE_UINT8: + case kUint8: return ncclUint8; default: throw std::runtime_error("not supported"); @@ -166,7 +165,7 @@ public: const void* sendbuff, void* recvbuff, size_t count, DataType type, int group, cudaStream_t stream) override { NCCLCHECK(ncclGroupStart()); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, getNcclDataType(type), ncclSum, groups_.at(group), stream)); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, to_nccl_dtype(type), ncclSum, groups_.at(group), stream)); NCCLCHECK(ncclGroupEnd()); } @@ -174,7 +173,7 @@ public: const void* sendbuff, void* recvbuff, size_t sendcount, DataType type, int group, cudaStream_t stream) override { NCCLCHECK(ncclGroupStart()); - NCCLCHECK(ncclAllGather(sendbuff, recvbuff, sendcount, getNcclDataType(type), groups_.at(group), stream)); + NCCLCHECK(ncclAllGather(sendbuff, recvbuff, sendcount, to_nccl_dtype(type), groups_.at(group), stream)); NCCLCHECK(ncclGroupEnd()); } @@ -182,8 +181,8 @@ public: const void* sendbuff, void* recvbuff, size_t recvcount, DataType type, int group, cudaStream_t stream) override { NCCLCHECK(ncclGroupStart()); - NCCLCHECK(ncclReduceScatter( - sendbuff, recvbuff, recvcount, getNcclDataType(type), ncclSum, groups_.at(group), stream)); + NCCLCHECK( + ncclReduceScatter(sendbuff, recvbuff, recvcount, to_nccl_dtype(type), ncclSum, groups_.at(group), stream)); NCCLCHECK(ncclGroupEnd()); } @@ -198,7 +197,7 @@ public: int group, cudaStream_t stream) override { - const auto elem_size = get_elem_size(dtype); + const auto elem_size = byte_size(dtype); auto rms_norm = [&](int64_t first, int64_t count) { invokeResidualBiasRMSNorm((char*)hidden + elem_size * first * dim, @@ -241,8 +240,8 @@ public: const int* local_token_nums, cudaStream_t stream) override { - const size_t elem_size = get_elem_size(type); - const ncclDataType_t nccl_type = getNcclDataType(type); + const size_t elem_size = byte_size(type); + const ncclDataType_t nccl_type = to_nccl_dtype(type); FT_CHECK(group0 == 0 || group1 == 0); diff --git a/src/turbomind/comm/test_comm.cu b/src/turbomind/comm/test_comm.cu index 9bbc52d26b..3b2eac954b 100644 --- a/src/turbomind/comm/test_comm.cu +++ b/src/turbomind/comm/test_comm.cu @@ -17,11 +17,10 @@ #include "src/turbomind/comm/device_comm.h" #include "src/turbomind/comm/host_comm.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" using namespace turbomind::comm; -using turbomind::getTensorType; +using turbomind::data_type_v; using turbomind::check; using turbomind::myAssert; using std::vector; @@ -164,7 +163,7 @@ struct TestComm { tp = device_num; } - std::tie(h_comm_, d_comm_, h_split_, d_split_) = Init(device_num, 4, "cudaipc"); + std::tie(h_comm_, d_comm_, h_split_, d_split_) = Init(device_num, 4, "cuda-ipc"); warmup_ = warmup; iters_ = iters; @@ -186,7 +185,7 @@ struct TestComm { template void TestAllReduce(size_t dim, int group = 0) { - const auto dtype = getTensorType(); + const auto dtype 
= data_type_v; const int tp_size = d_comm_[0]->n_ranks(group); const int dp_size = d_comm_.size() / tp_size; @@ -325,7 +324,7 @@ struct TestComm { } } - const auto dtype = getTensorType(); + const auto dtype = data_type_v; const int tp_size = d_comm_[0]->n_ranks(group); const int dp_size = d_comm_.size() / tp_size; @@ -497,7 +496,7 @@ struct TestComm { template void TestAllGather(size_t dim, int group) { - const auto dtype = getTensorType(); + const auto dtype = data_type_v; const int tp_size = d_comm_[0]->n_ranks(group); const int dp_size = d_comm_.size() / tp_size; @@ -621,7 +620,7 @@ struct TestComm { const int inner_tp = std::gcd(tp_size_0, tp_size_1); - const auto dtype = getTensorType(); + const auto dtype = data_type_v; std::mt19937 gen{}; std::uniform_int_distribution dist{0, 31}; // 5 mantissa bits diff --git a/src/turbomind/comm/thread_comm.cc b/src/turbomind/comm/thread_comm.cc index cb8dd66e9c..017d83abb0 100644 --- a/src/turbomind/comm/thread_comm.cc +++ b/src/turbomind/comm/thread_comm.cc @@ -7,12 +7,11 @@ #include #include #include +#include #include "src/turbomind/comm/host_comm.h" - -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_utils.h" - +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/data_type.h" namespace turbomind::comm { struct ThreadCommImpl: public HostCommImpl { @@ -71,8 +70,8 @@ struct ThreadCommImpl: public HostCommImpl { std::shared_ptr Split(int color, int key) override { - FT_CHECK(color >= 0); - FT_CHECK(g2l_[rank_] >= 0); + TM_CHECK(color >= 0); + TM_CHECK(g2l_[rank_] >= 0); // `g2l_[rank_]` imposes proper ordering when keys are equal auto vec = comm::AllGather(this, std::make_tuple(color, key, g2l_[rank_])); @@ -124,7 +123,7 @@ struct ThreadCommImpl: public HostCommImpl { void Broadcast(void* data, int count, DataType dtype, int root, copy_fn copy) override { - FT_CHECK(copy); + TM_CHECK(copy); if (n_ranks() == 1) { return; } @@ -158,7 +157,7 @@ struct ThreadCommImpl: public HostCommImpl { void AllGather(void* data, int count, DataType dtype, copy_fn copy) override { - FT_CHECK(copy); + TM_CHECK(copy); if (n_ranks() == 1) { return; } @@ -226,13 +225,13 @@ struct ThreadCommImpl: public HostCommImpl { }; auto dispatch = [&]() -> reduce_fn { switch (dtype) { - case DataType::TYPE_INT32: + case kInt32: return dispatch_op(int32_t{}); - case DataType::TYPE_INT64: + case kInt64: return dispatch_op(int64_t{}); - case DataType::TYPE_UINT32: + case kUint32: return dispatch_op(uint32_t{}); - case DataType::TYPE_UINT64: + case kUint64: return dispatch_op(uint64_t{}); default: return {}; @@ -250,7 +249,7 @@ struct ThreadCommImpl: public HostCommImpl { void AllReduce(void* data, int count, DataType dtype, RedOp red_op) override { const auto reduce = get_reduce(dtype, red_op); - const auto elem_size = get_elem_size(dtype); + const auto elem_size = byte_size(dtype); if (n_ranks() == 1) { return; } @@ -292,7 +291,7 @@ class ThreadGroupId: public HostGroupId { void Export(std::ostream& os) override { - FT_CHECK((bool)internal_); // `Initialize` must come befor `Export` + TM_CHECK((bool)internal_); // `Initialize` must come befor `Export` const void* ptr = this; os.write((const char*)&ptr, sizeof(ptr)); @@ -304,7 +303,7 @@ class ThreadGroupId: public HostGroupId { is.read((char*)&ptr, sizeof(ptr)); internal_ = reinterpret_cast(ptr)->internal_; - FT_CHECK((bool)internal_); + TM_CHECK((bool)internal_); } HostComm CreateCommunicator(int n_ranks, int rank) override @@ -313,12 +312,12 @@ class ThreadGroupId: public HostGroupId { 
internal_->state = std::make_shared(n_ranks); }; - FT_CHECK((bool)internal_); + TM_CHECK((bool)internal_); // One of the rank initialize the shared state std::call_once(internal_->flag, init_shared_state); - FT_CHECK((bool)internal_->state); + TM_CHECK((bool)internal_->state); auto impl = std::make_shared(n_ranks, internal_->state, rank); diff --git a/src/turbomind/core/CMakeLists.txt b/src/turbomind/core/CMakeLists.txt new file mode 100644 index 0000000000..9a0c9ff5ba --- /dev/null +++ b/src/turbomind/core/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +cmake_minimum_required(VERSION 3.8) + +add_library(core STATIC + check.cc + allocator.cc + stream.cc + context.cc + buffer.cc + layout.cc + tensor.cc + tensor.cu + module.cc) + +target_link_libraries(core PUBLIC cuda_utils CUDA::cudart CUDA::cuda_driver) + +set_property(TARGET core PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET core PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +target_compile_options(core PRIVATE $<$:-Xptxas=-v>) + +if (BUILD_TEST) + add_executable(test_core test_core.cc) + target_link_libraries(test_core PRIVATE core logger Catch2::Catch2WithMain) +endif () diff --git a/src/turbomind/core/allocator.cc b/src/turbomind/core/allocator.cc new file mode 100644 index 0000000000..5471acdf3a --- /dev/null +++ b/src/turbomind/core/allocator.cc @@ -0,0 +1,159 @@ + +#include +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/check.h" + +namespace turbomind::core { + +AllocatorImpl::~AllocatorImpl() = default; + +Stream AllocatorImpl::stream() const noexcept +{ + return Stream{}; +} + +class CudaMemPoolAllocator: public AllocatorImpl { +public: + CudaMemPoolAllocator(Stream stream, bool use_default_pool): + pool_{}, stream_{stream}, device_{kDEVICE}, use_default_pool_{use_default_pool} + { + check_cuda_error(cudaGetDevice(&device_.id)); + if (use_default_pool_) { + check_cuda_error(cudaDeviceGetDefaultMemPool(&pool_, device_.id)); + } + else { + cudaMemPoolProps props{}; + props.allocType = cudaMemAllocationTypePinned; + props.handleTypes = cudaMemHandleTypeNone; + props.location.type = cudaMemLocationTypeDevice; + props.location.id = device_.id; + check_cuda_error(cudaMemPoolCreate(&pool_, &props)); + cuuint64_t thres = (cuuint64_t)-1; + check_cuda_error(cudaMemPoolSetAttribute(pool_, cudaMemPoolAttrReleaseThreshold, &thres)); + } + } + + ~CudaMemPoolAllocator() override + { + if (!use_default_pool_) { + check_cuda_error(cudaMemPoolDestroy(pool_)); + } + pool_ = {}; + } + + void* allocate(ssize_t size) override + { + void* ptr{}; + check_cuda_error(cudaMallocFromPoolAsync(&ptr, size, pool_, stream_.handle())); + return ptr; + } + + void deallocate(void* p, ssize_t) override + { + check_cuda_error(cudaFreeAsync(p, stream_.handle())); + } + + Device device() const noexcept override + { + return device_; + } + + Stream stream() const noexcept override + { + return stream_; + } + + void trim(size_t bytes_to_keep) + { + check_cuda_error(cudaMemPoolTrimTo(pool_, bytes_to_keep)); + } + +private: + cudaMemPool_t pool_; + Stream stream_; + Device device_; + bool use_default_pool_; +}; + +class CudaAllocator: public AllocatorImpl { +public: + void* allocate(ssize_t size) override + { + void* ptr{}; + check_cuda_error(cudaMalloc(&ptr, size)); + return ptr; + } + + void deallocate(void* p, ssize_t) override + { + check_cuda_error(cudaFree(p)); + } + + Device device() const noexcept override + { + return kDEVICE; + } +}; + +class CudaHostAllocator: public 
AllocatorImpl { +public: + void* allocate(ssize_t size) override + { + void* ptr{}; + check_cuda_error(cudaHostAlloc(&ptr, size, cudaHostAllocDefault)); + return ptr; + } + + void deallocate(void* p, ssize_t) override + { + check_cuda_error(cudaFreeHost(p)); + } + + Device device() const noexcept override + { + return kCPUpinned; + } +}; + +class HostAllocator: public AllocatorImpl { +public: + void* allocate(ssize_t size) override + { + return ::operator new(size); + } + + void deallocate(void* p, ssize_t) override + { + ::operator delete(p); + } + + Device device() const noexcept override + { + return kCPU; + } +}; + +Allocator::Allocator(DeviceType type) +{ + impl_ = [&]() -> shared_ptr { + switch (type) { + case kCPU: + return std::make_shared(); + case kDEVICE: + return std::make_shared(); + case kCPUpinned: + return std::make_shared(); + } + return {}; + }(); + TM_CHECK_NOTNULL(impl_); +} + +Allocator::Allocator(Stream stream, bool use_default_pool) +{ + impl_ = std::make_shared(std::move(stream), use_default_pool); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/allocator.h b/src/turbomind/core/allocator.h new file mode 100644 index 0000000000..bbc3ffb2d5 --- /dev/null +++ b/src/turbomind/core/allocator.h @@ -0,0 +1,244 @@ +#pragma once + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" +#include "src/turbomind/core/stream.h" + +#include "src/turbomind/kernels/core/math.h" + +namespace turbomind { + +enum class DeviceType : int +{ + kCPU, + kCPUpinned, + kDEVICE +}; + +inline constexpr DeviceType kCPU = DeviceType::kCPU; +inline constexpr DeviceType kCPUpinned = DeviceType::kCPUpinned; +inline constexpr DeviceType kDEVICE = DeviceType::kDEVICE; + +constexpr const char* to_string(DeviceType device) +{ + switch (device) { + case kCPU: + return "cpu"; + case kCPUpinned: + return "cpu_pinned"; + case kDEVICE: + return "device"; + } + return ""; +} + +inline std::ostream& operator<<(std::ostream& os, DeviceType device) +{ + return os << to_string(device); +} + +} // namespace turbomind + +namespace turbomind::core { + +struct Device { + DeviceType type; + int id; + Device(): Device{kCPU} {} + Device(DeviceType type_): type{type_}, id{-1} {} + Device(DeviceType type_, int device_): type{type_}, id{device_} {} + friend bool operator==(const Device& a, const Device& b) + { + return a.type == b.type && a.id == b.id; + } + friend bool operator!=(const Device& a, const Device& b) + { + return !(a == b); + } +}; + +class AllocatorImpl { +public: + virtual ~AllocatorImpl(); + + virtual void* allocate(ssize_t size) = 0; + + virtual void deallocate(void* p, ssize_t size) = 0; + + // Returns invalid stream by default + virtual Stream stream() const noexcept; + + virtual Device device() const noexcept = 0; +}; + +class Allocator { +public: + Allocator() = default; + + explicit Allocator(DeviceType type); + + Allocator(Stream stream, bool use_default_pool); + + Allocator(shared_ptr impl): impl_{std::move(impl)} {}; + + AllocatorImpl* operator->() const + { + TM_CHECK_NOTNULL(impl_); + return impl_.get(); + } + + explicit operator bool() const noexcept + { + return static_cast(impl_); + } + + friend bool operator==(const Allocator& a, const Allocator& b) + { + return a.impl_ == b.impl_; + } + + friend bool operator!=(const Allocator& a, const Allocator& b) + { + return !(a == b); + } + + template + shared_ptr adapt(Args&&... 
args) const + { + return {std::make_shared(impl_, ((Args &&) args)...)}; + } + +private: + shared_ptr impl_; +}; + +class StackAllocatorImpl: public AllocatorImpl { +public: + static constexpr ssize_t kAlignment = 256; + + explicit StackAllocatorImpl(shared_ptr underlying_impl): underlying_impl_{std::move(underlying_impl)} + { + } + + ~StackAllocatorImpl() override + { + if (cached_beg_) { + underlying_impl_->deallocate(cached_beg_, cached_end_ - cached_beg_); + } + } + + void* allocate(ssize_t size) override + { + size = round_up(size, kAlignment); + + void* p{}; + if (cached_ptr_ + size <= cached_end_) { + p = cached_ptr_; + cached_ptr_ += size; + } + else { + TM_CHECK(!cached_beg_); + p = underlying_impl_->allocate(size); + } + + // TM_LOG_ERROR("allocate %p, %ld", p, size); + + size_ += size; + ++num_; + max_size_ = std::max(size_, max_size_); + num_ = std::max(num_, max_num_); + return p; + } + + void deallocate(void* p, ssize_t size) override + { + size = round_up(size, kAlignment); + + // TM_LOG_ERROR("deallocate %p, %p, %ld", p, cached_ptr_, size); + + if ((char*)p + size == cached_ptr_) { + cached_ptr_ -= size; + } + else { + TM_CHECK(!cached_beg_); + underlying_impl_->deallocate(p, size); + } + size_ -= size; + --num_; + } + + Stream stream() const noexcept override + { + return underlying_impl_->stream(); + } + + Device device() const noexcept override + { + return underlying_impl_->device(); + } + + void iter() + { + TM_CHECK_EQ((void*)cached_beg_, (void*)cached_ptr_); + auto excpected = max_size_ + kAlignment * max_num_; + if (cached_end_ - cached_beg_ < excpected) { + if (cached_beg_) { + underlying_impl_->deallocate(cached_beg_, cached_end_ - cached_beg_); + } + cached_ptr_ = cached_beg_ = (char*)underlying_impl_->allocate(excpected); + cached_end_ = cached_beg_ + excpected; + } + size_ = num_ = max_size_ = max_num_ = 0; + } + +private: + ssize_t size_{}; + ssize_t num_{}; + ssize_t max_size_{}; + ssize_t max_num_{}; + + char* cached_beg_{}; + char* cached_end_{}; + char* cached_ptr_{}; + + std::shared_ptr underlying_impl_; +}; + +class SimpleAllocator: public AllocatorImpl { +public: + template + static Allocator Create(Alloc&& alloc, Dealloc&& dealloc, Device device) + { + return Allocator{std::make_shared((Alloc &&) alloc, (Dealloc &&) dealloc, device)}; + } + + template + SimpleAllocator(Alloc&& alloc, Dealloc&& dealloc, Device device): + alloc_{std::move(alloc)}, dealloc_{std ::move(dealloc)}, device_{device} + { + } + + void* allocate(ssize_t size) override + { + return alloc_(size); + }; + + void deallocate(void* p, ssize_t size) override + { + return dealloc_(p, size); + } + + Device device() const noexcept override + { + return device_; + } + +private: + std::function alloc_; + std::function dealloc_; + Device device_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/buffer.cc b/src/turbomind/core/buffer.cc new file mode 100644 index 0000000000..6971e63482 --- /dev/null +++ b/src/turbomind/core/buffer.cc @@ -0,0 +1,89 @@ + +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/stream.h" +namespace turbomind::core { + +Buffer Buffer::view(DataType dtype) const +{ + auto b = *this; + if (dtype == dtype_) { + return b; + } + b.dtype_ = dtype; + b.size_ = numel(dtype, byte_size()); + if (base_) { + b.base_ = numel(dtype, turbomind::byte_size(dtype_, base_)); + } + return b; +} + +Buffer Buffer::slice(ssize_t base, 
ssize_t size) const +{ + TM_CHECK_LE(base + size, size_); + auto b = *this; + b.base_ += base; + if (size == -1) { + b.size_ -= base; + } + else { + b.size_ = size; + } + return b; +} + +std::ostream& operator<<(std::ostream& os, const Buffer& b) +{ + os << b.dtype() << "[" << b.size() << "]@" << b.data_; + if (b.base_) { + os << "+" << b.base_; + } + return os; +} + +void Copy(const Buffer& a, ssize_t n, Ref b_, const Stream& stream) +{ + auto& b = b_.get(); + TM_CHECK_EQ(a.dtype(), b.dtype()); + TM_CHECK_LE(n, a.size()); + TM_CHECK_LE(n, b.size()); + check_cuda_error( + cudaMemcpyAsync(b.raw_data(), a.raw_data(), byte_size(a.dtype(), n), cudaMemcpyDefault, stream.handle())); +} + +void Copy(const Buffer& a, ssize_t n, Ref b_) +{ + Copy(a, n, b_, Context::stream()); +} + +void Copy(const Buffer& a, Ref b_, const Stream& stream) +{ + TM_CHECK_EQ(a.size(), b_.get().size()); + Copy(a, a.size(), b_, stream); +} + +void Copy(const Buffer& a, Ref b_) +{ + Copy(a, b_, Context::stream()); +} + +void* Copy(const void* a, ssize_t n, void* b, const Stream& stream) +{ + check_cuda_error(cudaMemcpyAsync(b, a, n, cudaMemcpyDefault, stream.handle())); + return (char*)b + n; +} + +void Clear(Ref b_, const Stream& stream) +{ + auto& b = b_.get(); + check_cuda_error(cudaMemsetAsync(b.raw_data(), 0, b.byte_size(), stream.handle())); +} + +void Clear(Ref b_) +{ + Clear(b_, Context::stream()); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/buffer.h b/src/turbomind/core/buffer.h new file mode 100644 index 0000000000..48263facf8 --- /dev/null +++ b/src/turbomind/core/buffer.h @@ -0,0 +1,343 @@ +#pragma once + +#include + +#include +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/data_type.h" + +namespace turbomind::core { + +class Buffer { +public: + Buffer(): data_{}, base_{}, size_{}, device_{}, dtype_{} {} + + // Typed empty buffer + explicit Buffer(DataType dtype): Buffer() + { + dtype_ = dtype; + } + + // Reference into `data` buffer + template + Buffer(T* data, ssize_t size, Device device): + data_{data, [](auto) {}}, base_{}, size_{size}, device_{device}, dtype_{data_type_v} + { + } + + Buffer(void* data, ssize_t size, DataType dtype, Device device): + data_{data, [](auto) {}}, base_{}, size_{size}, device_{device}, dtype_{dtype} + { + } + + // Share ownership of `data` + Buffer(shared_ptr data, ssize_t size, DataType dtype, Device device): + data_{std::move(data)}, base_{}, size_{size}, device_{device}, dtype_{dtype} + { + } + + // Create from the allocator + Buffer(ssize_t size, DataType dtype, Allocator& alloc): + base_{}, size_{size}, device_{alloc->device()}, dtype_{dtype} + { + auto bytes = turbomind::byte_size(dtype, size); + data_ = {alloc->allocate(bytes), [=](auto p) { alloc->deallocate(p, bytes); }}; + } + + Buffer(ssize_t size, DataType dtype, Device device): Buffer{size, dtype, Context::alloc(device)} {} + + template + T* data() + { + TM_CHECK_EQ(data_type_v, dtype_); + return (T*)((char*)TM_CHECK_NOTNULL(data_).get() + turbomind::byte_size(base_)); + } + + template + const T* data() const + { + return const_cast(this)->data(); + } + + void* raw_data(ssize_t offset = 0) + { + return (char*)TM_CHECK_NOTNULL(data_).get() + turbomind::byte_size(dtype_, base_ + offset); + } + + const void* raw_data(ssize_t offset = 0) const + { + return const_cast(this)->raw_data(offset); + } + + template + T* data_or(T* other) noexcept 
+ { + if constexpr (std::is_void_v) { + return data_ ? (T*)raw_data() : other; + } + else { + return data_ ? data() : other; + } + } + + template + const T* data_or(const T* other) const noexcept + { + return const_cast(this)->data_or(other); + } + + DataType dtype() const + { + return dtype_; + } + + Device device() const + { + return device_; + } + + ssize_t size() const + { + return size_; + } + + ssize_t byte_size() const + { + return turbomind::byte_size(dtype_, size_); + } + + explicit operator bool() const noexcept + { + return static_cast(data_); + } + + Buffer view(DataType dtype) const; + + template + Buffer view() const + { + return view(data_type_v); + } + + Buffer slice(ssize_t base, ssize_t size) const; + + Buffer borrow() const + { + return Buffer{const_cast(raw_data()), size_, dtype_, device_}; + } + + friend bool operator==(const Buffer& a, const Buffer& b); + + friend bool operator!=(const Buffer& a, const Buffer& b); + + friend std::ostream& operator<<(std::ostream& os, const Buffer& b); + +protected: + auto as_tuple() const + { + return std::tie(data_, base_, size_, dtype_, device_); + } + + shared_ptr data_; + ssize_t base_; + ssize_t size_; + Device device_; + DataType dtype_; +}; + +inline bool operator==(const Buffer& a, const Buffer& b) +{ + return a.as_tuple() == b.as_tuple(); +} + +inline bool operator!=(const Buffer& a, const Buffer& b) +{ + return !(a == b); +} + +/////////////////////////////////////////////////////////// +// fill + +void Fill(Buffer& b, const void* v); + +void Fill(Buffer&& b, const void* v); + +void Fill(Buffer& b, const void* v, const Stream& stream); + +void Fill(Buffer&& b, const void* v, const Stream& stream); + +template +struct Buffer_: public Buffer { + + Buffer_(): Buffer{data_type_v} {} + + Buffer_(T* data, ssize_t size, Device device): Buffer{data, size, device} {} + + Buffer_(shared_ptr data, ssize_t size, Device device): Buffer{std::move(data), size, data_type_v, device} + { + } + + Buffer_(ssize_t size, Allocator& alloc): Buffer{size, data_type_v, alloc} {} + + Buffer_(ssize_t size, Device device): Buffer{size, data_type_v, device} {} + + Buffer_(const Buffer_&) = default; + Buffer_& operator=(const Buffer_&) = default; + + Buffer_(Buffer_&&) noexcept = default; + Buffer_& operator=(Buffer_&&) noexcept = default; + + Buffer_(const Buffer& b) + { + *static_cast(this) = ensure_dtype(b); + } + Buffer_(Buffer&& b) noexcept + { + *static_cast(this) = ensure_dtype(std::move(b)); + } + + T* data_or(T* other) + { + return data_ ? 
data() : other; + } + + const T* data_or(const T* other) const + { + return const_cast(this)->data_or(other); + } + + void* raw_data(ssize_t offset = 0) + { + return (char*)TM_CHECK_NOTNULL(data_).get() + turbomind::byte_size(base_ + offset); + } + + const void* raw_data(ssize_t offset = 0) const + { + return const_cast(this)->raw_data(offset); + } + + T* data() + { + return static_cast(raw_data()); + } + + const T* data() const + { + return static_cast(raw_data()); + } + + T* begin() + { + return data(); + } + + const T* begin() const + { + return data(); + } + + T* end() + { + return begin() + size(); + } + + const T* end() const + { + return begin() + size(); + } + + T& operator[](ssize_t i) + { + return data()[i]; + } + + const T& operator[](ssize_t i) const + { + return data()[i]; + } + + T& at(ssize_t i) + { + TM_CHECK_LT(i, size()); + return data()[i]; + } + + T& at(ssize_t i) const + { + TM_CHECK_LT(i, size()); + return data()[i]; + } + + constexpr DataType dtype() const noexcept + { + return data_type_v; + } + +private: + template + static decltype(auto) ensure_dtype(U&& u) noexcept + { + TM_CHECK_EQ(u.dtype(), data_type_v); + return (U &&) u; + } +}; + +template +class Ref { +public: + Ref(T& x): ref_{x} {} + Ref(T&& x): ref_{x} {} + + operator T&() + { + return ref_; + } + + T& get() + { + return ref_; + } + +private: + T& ref_; +}; + +void Copy(const Buffer& a, ssize_t n, Ref b_, const Stream& stream); + +void Copy(const Buffer& a, ssize_t n, Ref b_); + +void Copy(const Buffer& a, Ref b_, const Stream& stream); + +void Copy(const Buffer& a, Ref b_); + +// Static type checking +template +inline void Copy_(const Buffer_& a, ssize_t n, Buffer_& b_) +{ + Copy((const Buffer&)a, n, (Buffer&)b_); +} + +void* Copy(const void* a, ssize_t n, void* b, const Stream& stream); + +template +inline T* Copy(const T* a, ssize_t n, T* b, const Stream& stream) +{ + return (T*)Copy((const void*)a, sizeof(T) * n, (void*)b, stream); +} + +template +inline T* Copy(const T* a, ssize_t n, T* b) +{ + return Copy(a, n, b, Context::stream()); +} + +void Clear(Ref b_, const Stream& stream); + +void Clear(Ref b_); + +} // namespace turbomind::core diff --git a/src/turbomind/core/check.cc b/src/turbomind/core/check.cc new file mode 100644 index 0000000000..47ad9a2ec7 --- /dev/null +++ b/src/turbomind/core/check.cc @@ -0,0 +1,90 @@ + +#include +#include +#include +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/utils/logger.h" + +namespace turbomind::core { + +namespace { + +std::string StripSrcPrefix(const char* file) +{ + static const char* flag = std::getenv("TM_SRC_FULL_PATH"); + if (flag) { + return file; + } + + std::filesystem::path path{file}; + std::filesystem::path ret{path}; // return the original path if anchor is not found + + constexpr auto anchor = "turbomind"; + + bool found = false; + + for (const auto& x : path) { + if (x == anchor) { + found = true; + ret.clear(); + } + else if (found) { + ret /= x; + } + } + + return ret.string(); +} + +} // namespace + +CheckOpStringBuilder::CheckOpStringBuilder() +{ + oss_ = new std::ostringstream; +} + +std::ostream* CheckOpStringBuilder::ForVal1() +{ + (*oss_) << "("; + return oss_; +} +std::ostream* CheckOpStringBuilder::ForVal2() +{ + (*oss_) << " vs. 
"; + return oss_; +} +std::string* CheckOpStringBuilder::NewString() +{ + (*oss_) << ")"; + return new std::string{oss_->str()}; +} + +CheckErrorStream::CheckErrorStream(const char* file, int line, const char* expr) +{ + oss_ = new std::ostringstream{}; + *oss_ << StripSrcPrefix(file) << "(" << line << "): Check failed: " << expr << " "; +} + +CheckErrorStream::CheckErrorStream(const char* file, int line, const char* expr, std::string* str): + CheckErrorStream{file, line, expr} +{ + *oss_ << *str << " "; +} + +void CheckErrorStream::Report() +{ + // ! Be aware of `%` in expr + std::cerr << "[TM][FATAL] " << oss_->str() << "\n"; + std::abort(); +} + +void ReportNullError(const char* file, int line, const char* expr) +{ + // ! Be aware of `%` in expr + std::cerr << "[TM][FATAL] " << StripSrcPrefix(file) << "(" << line << "): '" << expr << "' Must be non NULL\n"; + std::abort(); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/check.h b/src/turbomind/core/check.h new file mode 100644 index 0000000000..33b275251f --- /dev/null +++ b/src/turbomind/core/check.h @@ -0,0 +1,143 @@ + +// Inspired by + +#pragma once + +#include + +namespace turbomind::core { + +#if defined(_MSC_VER) && !defined(__clang__) +#define TM_LIKELY(expr) (expr) +#define TM_UNLIKELY(expr) (expr) +#define TM_NOINLINE +#define TM_UNREACHABLE __assume(0) +#else +#define TM_LIKELY(expr) (__builtin_expect(bool(expr), 1)) +#define TM_UNLIKELY(expr) (__builtin_expect(bool(expr), 0)) +#define TM_NOINLINE __attribute__((noinline)) +#define TM_UNREACHABLE __builtin_unreachable() +#endif + +#define TM_DISABLE_CHECK_STREAM 0 +#define TM_DISABLE_CHECK_OP 0 + +class CheckErrorStream { +public: + CheckErrorStream(const char* file, int line, const char* expr); + + CheckErrorStream(const char* file, int line, const char* expr, std::string* str); + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 4722) // MSVC warns dtor never return +#endif + ~CheckErrorStream() + { + Report(); + } +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif + + template + CheckErrorStream& operator<<(const T& msg) + { +#if TM_DISABLE_CHECK_STREAM +#else + *oss_ << msg; +#endif + return *this; + } + +private: + [[noreturn]] void Report(); + + std::ostringstream* oss_; +}; + +class CheckOpStringBuilder { +public: + CheckOpStringBuilder(); + std::ostream* ForVal1(); + std::ostream* ForVal2(); + std::string* NewString(); + +private: + std::ostringstream* oss_; +}; + +template +std::string* MakeCheckOpString(const T1& v1, const T2& v2) TM_NOINLINE; + +template +std::string* MakeCheckOpString(const T1& v1, const T2& v2) +{ + CheckOpStringBuilder builder; + *builder.ForVal1() << v1; + *builder.ForVal2() << v2; + return builder.NewString(); +} + +#define DEFINE_CHECK_OP_IMPL(name, op) \ + template \ + inline std::pair name##Impl(const T1& v1, const T2& v2) \ + { \ + if (TM_LIKELY(v1 op v2)) \ + return {false, nullptr}; \ + else \ + return {true, MakeCheckOpString(v1, v2)}; \ + } + +DEFINE_CHECK_OP_IMPL(Check_EQ, ==); +DEFINE_CHECK_OP_IMPL(Check_NE, !=); +DEFINE_CHECK_OP_IMPL(Check_LE, <=); +DEFINE_CHECK_OP_IMPL(Check_LT, <); +DEFINE_CHECK_OP_IMPL(Check_GE, >=); +DEFINE_CHECK_OP_IMPL(Check_GT, >); + +#undef DEFINE_CHECK_OP_IMPL + +// clang-format off +#define TM_CHECK(e) \ + if (TM_UNLIKELY(!(e))) turbomind::core::CheckErrorStream(__FILE__, __LINE__, #e) + +#define TM_CHECK_OP(name, op, a, b) \ + if (auto&& [__p, __s] = turbomind::core::Check##name##Impl(a, b); __p) \ + 
turbomind::core::CheckErrorStream(__FILE__, __LINE__, #a " " #op " " #b, __s) +// clang-format on + +#if TM_DISABLE_CHECK_OP + +#define TM_CHECK_EQ(a, b) TM_CHECK(a == b) +#define TM_CHECK_NE(a, b) TM_CHECK(a != b) +#define TM_CHECK_LE(a, b) TM_CHECK(a <= b) +#define TM_CHECK_LT(a, b) TM_CHECK(a < b) +#define TM_CHECK_GE(a, b) TM_CHECK(a >= b) +#define TM_CHECK_GT(a, b) TM_CHECK(a > b) + +#else + +#define TM_CHECK_EQ(a, b) TM_CHECK_OP(_EQ, ==, a, b) +#define TM_CHECK_NE(a, b) TM_CHECK_OP(_NE, !=, a, b) +#define TM_CHECK_LE(a, b) TM_CHECK_OP(_LE, <=, a, b) +#define TM_CHECK_LT(a, b) TM_CHECK_OP(_LT, <, a, b) +#define TM_CHECK_GE(a, b) TM_CHECK_OP(_GE, >=, a, b) +#define TM_CHECK_GT(a, b) TM_CHECK_OP(_GT, >, a, b) + +#endif + +[[noreturn]] void ReportNullError(const char* file, int line, const char* expr); + +template +decltype(auto) EnsureNotNull(const char* file, int line, const char* expr, T&& p) +{ + if (TM_UNLIKELY(p == nullptr)) { + ReportNullError(file, line, expr); + } + return (T &&) p; +} + +#define TM_CHECK_NOTNULL(p) ::turbomind::core::EnsureNotNull(__FILE__, __LINE__, #p, (p)) + +} // namespace turbomind::core diff --git a/src/turbomind/core/common.h b/src/turbomind/core/common.h new file mode 100644 index 0000000000..d3d4de6000 --- /dev/null +++ b/src/turbomind/core/common.h @@ -0,0 +1,24 @@ + +#pragma once + +#include +#include +#include + +/// TODO: remove this dependency +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind::core { + +class Allocator; +class Buffer; +class Stream; +class Event; +class Context; + +using std::shared_ptr; +using std::vector; + +using ssize_t = std::ptrdiff_t; + +} // namespace turbomind::core diff --git a/src/turbomind/core/context.cc b/src/turbomind/core/context.cc new file mode 100644 index 0000000000..41589fb9e4 --- /dev/null +++ b/src/turbomind/core/context.cc @@ -0,0 +1,144 @@ + +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/context.h" + +namespace turbomind::core { + +namespace { + +struct ContextStorage { + enum + { + stream_bit = 1, + host_alloc_bit = 2, + device_alloc_bit = 4, + pinned_alloc_bit = 8, + }; + + std::stack stream_; + std::stack host_alloc_; + std::stack device_alloc_; + std::stack pinned_alloc_; + std::stack mask_; + + ContextStorage() + { + push(Allocator{kCPU}); + } + + void push(const Stream& stream) + { + int mask{}; + if (stream) { + stream_.push(stream); + mask = stream_bit; + } + mask_.push(mask); + } + + void push(const Allocator& alloc) + { + int mask{}; + if (alloc) { + const auto type = alloc->device().type; + if (type == kCPU) { + mask = host_alloc_bit; + host_alloc_.push(alloc); + } + else if (type == kDEVICE) { + mask = device_alloc_bit; + device_alloc_.push(alloc); + } + else if (type == kCPUpinned) { + mask = pinned_alloc_bit; + pinned_alloc_.push(alloc); + } + } + mask_.push(mask); + } + + void pop() + { + if (mask_.top() & stream_bit) { + stream_.pop(); + } + if (mask_.top() & host_alloc_bit) { + host_alloc_.pop(); + } + if (mask_.top() & device_alloc_bit) { + device_alloc_.pop(); + } + if (mask_.top() & pinned_alloc_bit) { + pinned_alloc_.pop(); + } + mask_.pop(); + } + + static ContextStorage& instance() + { + thread_local ContextStorage inst{}; + return inst; + } +}; + +} // namespace + +void Context::push(const Stream& stream) +{ + ContextStorage::instance().push(stream); +} + +void Context::push(const Allocator& alloc) +{ + ContextStorage::instance().push(alloc); +} + +void Context::pop() +{ + ContextStorage::instance().pop(); +} + +Stream& 
Context::stream() +{ + auto& stream_ = ContextStorage::instance().stream_; + TM_CHECK(!stream_.empty()) << "No STREAM available in current context"; + return stream_.top(); +} + +Allocator& Context::host_alloc() +{ + auto& host_alloc_ = ContextStorage::instance().host_alloc_; + TM_CHECK(!host_alloc_.empty()) << "No HOST memory allocator available in current context"; + return host_alloc_.top(); +} + +Allocator& Context::device_alloc() +{ + auto& device_alloc_ = ContextStorage::instance().device_alloc_; + TM_CHECK(!device_alloc_.empty()) << "No DEVICE memory allocator available in current context"; + return device_alloc_.top(); +} + +Allocator& Context::pinned_alloc() +{ + auto& pinned_alloc_ = ContextStorage::instance().pinned_alloc_; + TM_CHECK(!pinned_alloc_.empty()) << "No PINNED memory allocator available in current context"; + return pinned_alloc_.top(); +} + +Allocator& Context::alloc(Device device) +{ + switch (device.type) { + case kDEVICE: + return device_alloc(); + case kCPU: + return host_alloc(); + case kCPUpinned: + return pinned_alloc(); + } + TM_UNREACHABLE; +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/context.h b/src/turbomind/core/context.h new file mode 100644 index 0000000000..ec8abe6f1e --- /dev/null +++ b/src/turbomind/core/context.h @@ -0,0 +1,43 @@ +#pragma once + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/common.h" +#include "src/turbomind/core/stream.h" + +namespace turbomind::core { + +class Context { +public: + static Stream& stream(); + static Allocator& host_alloc(); + static Allocator& device_alloc(); + static Allocator& pinned_alloc(); + static Allocator& alloc(Device device); + +private: + friend class ContextGuard; + static void push(const Stream& stream); + static void push(const Allocator& alloc); + static void pop(); +}; + +class ContextGuard { +public: + template + explicit ContextGuard(Args&&... 
args): n_{} + { + (Context::push((Args &&) args), ...); + n_ = sizeof...(Args); + } + ~ContextGuard() + { + for (int i = 0; i < n_; ++i) { + Context::pop(); + } + } + +private: + int n_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/core.h b/src/turbomind/core/core.h new file mode 100644 index 0000000000..a58daba3d6 --- /dev/null +++ b/src/turbomind/core/core.h @@ -0,0 +1,26 @@ +#pragma once + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/layout.h" +#include "src/turbomind/core/stream.h" +#include "src/turbomind/core/tensor.h" + +namespace turbomind { + +using core::ssize_t; +using core::Buffer; +using core::Buffer_; +using core::Tensor; +using core::Tensor_; +using core::TensorMap; +using core::Ref; +using core::Layout; +using core::Allocator; +using core::Stream; +using core::Event; + +} // namespace turbomind diff --git a/src/turbomind/core/cuda_data_type.h b/src/turbomind/core/cuda_data_type.h new file mode 100644 index 0000000000..e3227a9056 --- /dev/null +++ b/src/turbomind/core/cuda_data_type.h @@ -0,0 +1,59 @@ +#include +#include + +#include +#include +#include + +#include "src/turbomind/core/data_type.h" + +namespace turbomind { + +// clang-format off + +constexpr cudaDataType to_cuda_dtype(DataType type) +{ + switch (type) { + case kUint8: return CUDA_R_8U; + case kUint16: return CUDA_R_16U; + case kUint32: return CUDA_R_32U; + case kUint64: return CUDA_R_64U; + case kInt8: return CUDA_R_8I; + case kInt16: return CUDA_R_16I; + case kInt32: return CUDA_R_32I; + case kInt64: return CUDA_R_64I; + case kFloat16: return CUDA_R_16F; + case kFloat32: return CUDA_R_32F; + case kFloat64: return CUDA_R_64F; + case kBfloat16: return CUDA_R_16BF; + case kFloat8_e4m3: return CUDA_R_8F_E4M3; + case kFloat8_e5m2: return CUDA_R_8F_E5M2; + default: + throw std::runtime_error("Not supported " + std::string{to_string(type)}); + } +} + +constexpr DataType from_cuda_dtype(cudaDataType type) { + switch (type) { + case CUDA_R_8U: return kUint8; + case CUDA_R_16U: return kUint16; + case CUDA_R_32U: return kUint32; + case CUDA_R_64U: return kUint64; + case CUDA_R_8I: return kInt8; + case CUDA_R_16I: return kInt16; + case CUDA_R_32I: return kInt32; + case CUDA_R_64I: return kInt64; + case CUDA_R_16F: return kFloat16; + case CUDA_R_32F: return kFloat32; + case CUDA_R_64F: return kFloat64; + case CUDA_R_16BF: return kBfloat16; + case CUDA_R_8F_E4M3: return kFloat8_e4m3; + case CUDA_R_8F_E5M2: return kFloat8_e5m2; + default: + throw std::runtime_error("Not supported " + std::string{std::to_string(type)}); + } +} + +// clang-format on + +} // namespace turbomind diff --git a/src/turbomind/core/data_type.h b/src/turbomind/core/data_type.h new file mode 100644 index 0000000000..a6a42079cf --- /dev/null +++ b/src/turbomind/core/data_type.h @@ -0,0 +1,318 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include "src/turbomind/core/check.h" + +#include +#include +#include + +// forward declarations for CUDA floating point types +struct __half; +struct __nv_bfloat16; +struct __nv_fp8_e4m3; +struct __nv_fp8_e5m2; + +namespace turbomind { + +// clang-format off + +struct uint2_t {}; +struct uint4_t {}; +struct uint6_t {}; + +template +struct int_constant: std::integral_constant {}; + +template +struct bitsof_t: int_constant {}; + +template <> struct bitsof_t: int_constant<2> {}; +template <> struct bitsof_t: int_constant<4> {}; +template <> struct bitsof_t: int_constant<6> {}; + +template +inline constexpr bitsof_t bitsof{}; + +using half_t = __half; +using bfloat16_t = __nv_bfloat16; +using fp8_e4m3_t = __nv_fp8_e4m3; +using fp8_e5m2_t = __nv_fp8_e5m2; + +constexpr int encode_data_type(bool sign, int exponent, int mantissa) { + return ((sign << 16) | (exponent << 8) | mantissa); +} + +enum class DataType: int { + kNull = 0, + kBool = 1, + kUint8 = encode_data_type(0, 0, 8), + kUint16 = encode_data_type(0, 0, 16), + kUint32 = encode_data_type(0, 0, 32), + kUint64 = encode_data_type(0, 0, 64), + kInt8 = encode_data_type(1, 0, 8), + kInt16 = encode_data_type(1, 0, 16), + kInt32 = encode_data_type(1, 0, 32), + kInt64 = encode_data_type(1, 0, 64), + kFloat16 = encode_data_type(1, 5, 10), + kFloat32 = encode_data_type(1, 8, 23), + kFloat64 = encode_data_type(1, 11, 52), + kBfloat16 = encode_data_type(1, 8, 7), + kFloat8_e4m3 = encode_data_type(1, 4, 3), + kFloat8_e5m2 = encode_data_type(1, 5, 2), + kUint2 = encode_data_type(0, 0, 2), + kUint4 = encode_data_type(0, 0, 4), + kUint6 = encode_data_type(0, 0, 6), + kUint = kUint32, + kInt = kInt32, + kFloat = kFloat32, + kHalf = kFloat16, + kDouble = kFloat64, +}; + +inline constexpr DataType kNull = DataType::kNull; +inline constexpr DataType kBool = DataType::kBool; +inline constexpr DataType kUint8 = DataType::kUint8; +inline constexpr DataType kUint16 = DataType::kUint16; +inline constexpr DataType kUint32 = DataType::kUint32; +inline constexpr DataType kUint64 = DataType::kUint64; +inline constexpr DataType kInt8 = DataType::kInt8; +inline constexpr DataType kInt16 = DataType::kInt16; +inline constexpr DataType kInt32 = DataType::kInt32; +inline constexpr DataType kInt64 = DataType::kInt64; +inline constexpr DataType kFloat16 = DataType::kFloat16; +inline constexpr DataType kFloat32 = DataType::kFloat32; +inline constexpr DataType kFloat64 = DataType::kFloat64; +inline constexpr DataType kBfloat16 = DataType::kBfloat16; +inline constexpr DataType kFloat8_e4m3 = DataType::kFloat8_e4m3; +inline constexpr DataType kFloat8_e5m2 = DataType::kFloat8_e5m2; +inline constexpr DataType kUint2 = DataType::kUint2; +inline constexpr DataType kUint4 = DataType::kUint4; +inline constexpr DataType kUint6 = DataType::kUint6; +inline constexpr DataType kUint = DataType::kUint; +inline constexpr DataType kInt = DataType::kInt; +inline constexpr DataType kHalf = DataType::kHalf; +inline constexpr DataType kFloat = DataType::kFloat; +inline constexpr DataType kDouble = DataType::kDouble; + +template +struct to_data_type; + +template +struct from_data_type; + +#define CVT_DATA_TYPE(D, T) \ + template <> struct to_data_type { static constexpr auto value = DataType::D; }; \ + template <> struct from_data_type { using type = T; } + +CVT_DATA_TYPE(kNull, void); + +CVT_DATA_TYPE(kBool, bool); +CVT_DATA_TYPE( kUint8, uint8_t); +CVT_DATA_TYPE(kUint16, uint16_t); +CVT_DATA_TYPE(kUint32, uint32_t); +CVT_DATA_TYPE(kUint64, uint64_t); + +CVT_DATA_TYPE( kInt8, 
int8_t); // NOTE: `int8_t` is `signed char` and is different from `char` +CVT_DATA_TYPE(kInt16, int16_t); +CVT_DATA_TYPE(kInt32, int32_t); +CVT_DATA_TYPE(kInt64, int64_t); + +CVT_DATA_TYPE(kFloat16, half_t); +CVT_DATA_TYPE(kFloat32, float); +CVT_DATA_TYPE(kFloat64, double); +CVT_DATA_TYPE(kBfloat16, bfloat16_t); +CVT_DATA_TYPE(kFloat8_e4m3, fp8_e4m3_t); +CVT_DATA_TYPE(kFloat8_e5m2, fp8_e5m2_t); + +CVT_DATA_TYPE(kUint2, uint2_t); +CVT_DATA_TYPE(kUint4, uint4_t); +CVT_DATA_TYPE(kUint6, uint6_t); + +#undef CVT_DATA_TYPE + +template +inline constexpr auto data_type_v = to_data_type>::value; + +template +using data_type_t = typename from_data_type::type; + +constexpr std::ptrdiff_t byte_size(DataType type, std::ptrdiff_t size = 1) { + switch (type) { + case kNull: return 0; + case kBool: + case kUint8: + case kInt8: + case kFloat8_e4m3: + case kFloat8_e5m2: + return size; + case kUint16: + case kInt16: + case kFloat16: + case kBfloat16: + return size * 2; + case kUint32: + case kInt32: + case kFloat32: + return size * 4; + case kUint64: + case kInt64: + case kFloat64: + return size * 8; + case kUint2: return size * 2 / 8; + case kUint4: return size * 4 / 8; + case kUint6: return size * 6 / 8; + } + return 0; +} + +template +constexpr std::ptrdiff_t byte_size(std::ptrdiff_t size = 1) { return byte_size(data_type_v, size); } + +constexpr std::ptrdiff_t numel(DataType type, std::ptrdiff_t size = 1) { + switch (type) { + case kNull: return 0; + case kBool: + case kUint8: + case kInt8: + case kFloat8_e4m3: + case kFloat8_e5m2: + return size; + case kUint16: + case kInt16: + case kFloat16: + case kBfloat16: + return size / 2; + case kUint32: + case kInt32: + case kFloat32: + return size / 4; + case kUint64: + case kInt64: + case kFloat64: + return size / 8; + case kUint2: return size * 8 / 2; + case kUint4: return size * 8 / 4; + case kUint6: return size * 8 / 6; + } + return 0; +} + +template +constexpr std::ptrdiff_t numel(std::ptrdiff_t size) { return numel(data_type_v, size); } + +constexpr const char* to_string(DataType type) { + switch (type) { + case kNull: return "nil"; + case kBool: return "bool"; + case kUint8: return "u8"; + case kUint16: return "u16"; + case kUint32: return "u32"; + case kUint64: return "u64"; + case kInt8: return "i8"; + case kInt16: return "i16"; + case kInt32: return "i32"; + case kInt64: return "i64"; + case kFloat16: return "f16"; + case kFloat32: return "f32"; + case kFloat64: return "f64"; + case kBfloat16: return "bf16"; + case kFloat8_e4m3: return "f8_e4m3"; + case kFloat8_e5m2: return "f8_e5m2"; + case kUint2: return "u2"; + case kUint4: return "u4"; + case kUint6: return "u8"; + default: + return "unknown"; + } + return ""; +} + +inline std::ostream& operator<<(std::ostream& os, DataType type) { + os << to_string(type); + return os; +} + +/// TODO: mapping with DLPack + +// clang-format on + +#define TM_PP_NARGS(...) TM_PP_NARGS_IMPL(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define TM_PP_NARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N + +#define TM_PP_CAT(a, b) a##b +#define TM_PP_STR(x) #x + +#define TM_PP_DISPATCH_N(macro, ...) 
TM_PP_DISPATCH_N_IMPL(macro, TM_PP_NARGS(__VA_ARGS__)) +#define TM_PP_DISPATCH_N_IMPL(macro, x) TM_PP_CAT(macro, x) + +#define TM_PP_INVOKE_1(macro, f, _0) macro(f, _0) + +#define TM_PP_INVOKE_2(macro, f, _0, _1) \ + macro(f, _0); \ + macro(f, _1) + +#define TM_PP_INVOKE_3(macro, f, _0, _1, _2) \ + macro(f, _0); \ + macro(f, _1); \ + macro(f, _2) + +#define TM_PP_INVOKE_4(macro, f, _0, _1, _2, _3) \ + macro(f, _0); \ + macro(f, _1); \ + macro(f, _2); \ + macro(f, _3) + +#define TM_PP_INVOKE_5(macro, f, _0, _1, _2, _3, _4) \ + macro(f, _0); \ + macro(f, _1); \ + macro(f, _2); \ + macro(f, _3); \ + macro(f, _4) + +#define TM_DISPATCH_DTYPE_RET_CASE(f, t) \ + case ::turbomind::data_type_v: \ + return f(t{}); + +#define TM_DISPATCH_DTYPE_CASE(f, t) \ + case ::turbomind::data_type_v: \ + f(t{}); \ + break + +// clang-format off +#define TM_DISPATCH_DTYPES_RET(var, f, ...) \ + switch (var) { \ + TM_PP_DISPATCH_N(TM_PP_INVOKE_, __VA_ARGS__)(TM_DISPATCH_DTYPE_RET_CASE, f, __VA_ARGS__); \ + default: \ + TM_CHECK(0) << "unsupported type: " << to_string(var); \ + return {}; \ + } + +#define TM_DISPATCH_DTYPES(var, f, ...) \ + switch (var) { \ + TM_PP_DISPATCH_N(TM_PP_INVOKE_, __VA_ARGS__)(TM_DISPATCH_DTYPE_CASE, f, __VA_ARGS__); \ + default: \ + TM_CHECK(0) << "unsupported type: " << to_string(var); \ + } +// clang-format on + +#define TM_PRIMARY_DTYPES_0 ::turbomind::half_t + +#if ENABLE_BF16 +#define TM_PRIMARY_DTYPES_1 TM_PRIMARY_DTYPES_0, ::turbomind::bfloat16_t +#else +#define TM_PRIMARY_DTYPES_1 TM_PRIMARY_DTYPES_0 +#endif + +#if ENABLE_FP32 +#define TM_PRIMARY_DTYPES TM_PRIMARY_DTYPES_1, float +#else +#define TM_PRIMARY_DTYPES TM_PRIMARY_DTYPES_1 +#endif + +#define TM_DISPATCH_PRIMARY_DTYPES(var, func) TM_DISPATCH_DTYPES(var, func, TM_PRIMARY_DTYPES) + +#define TM_DISPATCH_PRIMARY_DTYPES_RET(var, func) TM_DISPATCH_DTYPES_RET(var, func, TM_PRIMARY_DTYPES) + +} // namespace turbomind diff --git a/src/turbomind/core/layout.cc b/src/turbomind/core/layout.cc new file mode 100644 index 0000000000..995f2a1fbf --- /dev/null +++ b/src/turbomind/core/layout.cc @@ -0,0 +1,153 @@ + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/layout.h" + +namespace turbomind::core { + +Layout::Layout(std::vector shape): shape_{std::move(shape)} +{ + TM_CHECK(shape_.size()); + stride_.resize(shape_.size()); + size_ = 1; + for (int i = shape_.size() - 1; i >= 0; --i) { + stride_[i] = size_; + size_ *= shape_[i]; + } +} + +Layout::Layout(vector shape, vector stride): shape_{std::move(shape)}, stride_{std::move(stride)} +{ + TM_CHECK(shape_.size()); + TM_CHECK_EQ(shape_.size(), stride_.size()); + + size_ = std::accumulate(shape_.begin(), shape_.end(), ssize_t{1}, std::multiplies<>{}); + + TM_CHECK_GE(size_, 0); +} + +ssize_t Layout::cosize() const noexcept +{ + if (rank() == 0) { + return 0; + } + ssize_t value{1}; + for (size_t i = 0; i < shape_.size(); ++i) { + value += (shape_[i] - 1) * stride_[i]; + } + return value; +} + +Layout Layout::coalesce() const noexcept +{ + vector shape{shape_.front()}; + vector stride{stride_.front()}; + + for (size_t i = 1; i < shape_.size(); ++i) { + if (shape_[i] == 1) { + continue; + } + else if (shape.back() == 1) { + shape.back() = shape_[i]; + stride.back() = stride_[i]; + } + else if (stride.back() == shape_[i] * stride_[i]) { + stride.back() = stride_[i]; + shape.back() *= shape_[i]; + } + else { + shape.push_back(shape_[i]); + stride.push_back(stride_[i]); + } + } + + return Layout{shape, stride}; +} + +Layout Layout::view(vector shape) const +{ + 
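+    // Behavior sketch (matches the expectations in test_core.cc): a single -1 entry
+    // acts as a wildcard and is resolved from the total size. For a contiguous
+    // (20,50) layout:
+    //   view({25, -1})   -> (25,40):(40,1)
+    //   view({5, -1, 5}) -> (5,40,5):(200,5,1)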
if (shape == shape_) { + return *this; + } + + TM_CHECK(!shape.empty()); + + // size check & wildcard resolution + auto wildcard = std::find(shape.begin(), shape.end(), -1); + if (wildcard != shape.end()) { + TM_CHECK(std::find(wildcard + 1, shape.end(), -1) == shape.end()); + *wildcard = 1; + } + auto new_size = std::accumulate(shape.begin(), shape.end(), ssize_t{1}, std::multiplies<>{}); + if (wildcard != shape.end()) { + TM_CHECK(size_ % new_size == 0) << size_ << " % " << new_size; + *wildcard = size_ / new_size; + } + else { + TM_CHECK_EQ(size_, new_size); + } + + if (is_contiguous()) { + return Layout{shape}; + } + + const Layout c = coalesce(); // merge contiguous dimensions + + ssize_t p = c.rank(); + ssize_t s = 1; + ssize_t d = 0; + + vector stride(shape.size()); + + for (int i = shape.size() - 1; i >= 0; --i) { + if (shape[i] == 1) { + stride[i] = 0; + } + else { + if (s == 1) { + --p; + s = c.shape().at(p); + d = c.stride().at(p); + } + TM_CHECK_EQ(s % shape[i], 0); // crossing non-contiguous dimensions + stride[i] = d; + d *= shape[i]; + s /= shape[i]; + } + } + return Layout{std::move(shape), std::move(stride)}; +} + +std::pair Layout::slice(const vector& base, vector shape) const +{ + TM_CHECK_EQ(base.size(), shape.size()); + TM_CHECK_EQ(shape_.size(), shape.size()); + ssize_t offset = 0; + for (size_t i = 0; i < shape.size(); ++i) { + const auto space = shape_[i] - base[i]; + TM_CHECK_GE(space, 0); + if (shape[i] == -1) { + shape[i] = space; + } + TM_CHECK_LE(shape[i], space); + offset += base[i] * stride_[i]; + } + return {Layout{std::move(shape), stride_}, offset}; +} + +std::ostream& operator<<(std::ostream& os, const Layout& x) +{ + os << "("; + for (int i = 0; i < x.rank(); ++i) { + os << (i ? "," : "") << x.shape_[i]; + } + os << "):("; + for (int i = 0; i < x.rank(); ++i) { + os << (i ? "," : "") << x.stride_[i]; + } + os << ")"; + return os; +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/layout.h b/src/turbomind/core/layout.h new file mode 100644 index 0000000000..2806d87c73 --- /dev/null +++ b/src/turbomind/core/layout.h @@ -0,0 +1,156 @@ + +#pragma once + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" + +namespace turbomind::core { + +class Layout { +public: + Layout(): size_{0} {} + + /* implicit */ Layout(vector shape); + + /* implicit */ Layout(std::initializer_list shape): Layout(vector(shape)) {} + + Layout(vector shape, vector stride); + + ssize_t size() const noexcept + { + return size_; + } + + ssize_t cosize() const noexcept; + + ssize_t rank() const noexcept + { + return shape_.size(); + } + + auto& shape() const noexcept + { + return shape_; + } + + auto shape(int i) const + { + return shape_.at(wrap(i)); + } + + template + auto shapes(Is... is) const + { + return std::make_tuple(shape(is)...); + } + + auto& stride() const noexcept + { + return stride_; + } + + auto stride(int i) const + { + return stride_.at(wrap(i)); + } + + template + auto strides(Is... 
is) + { + return std::make_tuple(stride(is)...); + } + + bool is_contiguous() const noexcept + { + if (stride_.back() != 1) { + return false; + } + if (size() != cosize()) { + return false; + } + for (int i = 0; i < rank() - 1; ++i) { + // TODO: skip when shape == 1 + if (stride_[i] < stride_[i + 1]) { + return false; + } + } + return true; + } + + Layout permute(const vector& dims) + { + TM_CHECK((int)dims.size() == rank()); + auto a = *this; + for (int i = 0; i < rank(); ++i) { + a.shape_[i] = shape_[dims[i]]; + a.stride_[i] = stride_[dims[i]]; + } + return a; + } + + ssize_t offset(const vector& idxs) const + { + TM_CHECK((int)idxs.size() < rank()); + ssize_t val = 0; + for (size_t i = 0; i < idxs.size(); ++i) { + TM_CHECK_LT(idxs[i], shape_[i]); + val += idxs[i] * stride_[i]; + } + return val; + } + + ssize_t offset(ssize_t idx0) const + { + TM_CHECK(rank()); + TM_CHECK_LT(idx0, shape_[0]); + return stride_[0] * idx0; + } + + Layout coalesce() const noexcept; + + Layout view(vector shape) const; + + std::pair slice(const vector& base, vector shape) const; + + Layout squeeze(int dim) const + { + if (rank() == 1 || shape(dim) != 1) { + return *this; + } + Layout a; + a.shape_.reserve(rank() - 1); + a.stride_.reserve(rank() - 1); + for (int i = 0; i < rank(); ++i) { + if (i != dim) { + a.shape_.push_back(shape_[i]); + a.stride_.push_back(stride_[i]); + } + } + a.size_ = size_; + return a; + } + + friend std::ostream& operator<<(std::ostream& os, const Layout& x); + +private: + int wrap(int dim) const noexcept + { + return dim < 0 ? dim + shape_.size() : dim; + } + +private: + vector shape_; + vector stride_; + ssize_t size_; +}; + +inline std::string to_string(const Layout& x) +{ + std::stringstream ss; + ss << x; + return ss.str(); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/module.cc b/src/turbomind/core/module.cc new file mode 100644 index 0000000000..92b08ba24c --- /dev/null +++ b/src/turbomind/core/module.cc @@ -0,0 +1,78 @@ + +#include "src/turbomind/core/module.h" +#include "src/turbomind/core/check.h" +#include + +namespace turbomind::core { + +Module::Module(): parent_{} {} + +Module::~Module() +{ + if (parent_) { + parent_->remove_module(*this); + parent_ = {}; + } +} + +void Module::register_module(std::string name, Module& module, std::optional index) +{ + module.parent_ = this; + if (index) { + name += "."; + name += std::to_string(*index); + } + // std::cout << "register Module " << name << " " << &module << ", parent " << this << "\n"; + modules_.emplace_back(std::move(name), &module); +} + +void Module::register_parameter(std::string name, Tensor& param) +{ + // std::cout << "register Parameter " << name << " " << ¶m << " " << param.layout() << "\n"; + params_.emplace_back(std::move(name), ¶m); +} + +void Module::remove_module(Module& module) +{ + for (auto it = modules_.begin(); it != modules_.end(); ++it) { + if (it->second == &module) { + // std::cout << "erase " << it->first << " " << &module << " from " << this << "\n"; + modules_.erase(it); + return; + } + } + TM_CHECK(0) << "module " << &module << " not found"; +} + +void Module::remove_parameter(Tensor& param) +{ + for (auto it = params_.begin(); it != params_.end(); ++it) { + if (it->second == ¶m) { + params_.erase(it); + return; + } + } + TM_CHECK(0) << "param " << ¶m << " not found"; +} + +TensorMap Module::get_parameters() const +{ + TensorMap m; + get_parameters_impl({}, m); + return m; +} + +void Module::get_parameters_impl(std::string prefix, TensorMap& m) const +{ + if 
(!prefix.empty()) { + prefix += "."; + } + for (const auto& [k, v] : params_) { + m.emplace(prefix + k, *v); + } + for (const auto& [k, v] : modules_) { + v->get_parameters_impl(prefix + k, m); + } +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/module.h b/src/turbomind/core/module.h new file mode 100644 index 0000000000..f48939a84b --- /dev/null +++ b/src/turbomind/core/module.h @@ -0,0 +1,36 @@ + +#include "src/turbomind/core/tensor.h" + +namespace turbomind::core { + +class Module { +public: + virtual ~Module(); + + Module(); + + Module(const Module&) = delete; + Module& operator=(const Module&) = delete; + + Module(Module&&) noexcept = delete; + Module& operator=(Module&&) noexcept = delete; + + void register_module(std::string name, Module& module, std::optional index = {}); + void register_parameter(std::string name, Tensor& param); + + void remove_module(Module& module); + void remove_parameter(Tensor& param); + + TensorMap get_parameters() const; + +private: + void get_parameters_impl(std::string prefix, TensorMap& m) const; + +protected: + Module* parent_; + + std::vector> modules_; + std::vector> params_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/stream.cc b/src/turbomind/core/stream.cc new file mode 100644 index 0000000000..d63326133c --- /dev/null +++ b/src/turbomind/core/stream.cc @@ -0,0 +1,19 @@ + +#include "src/turbomind/core/stream.h" +#include + +namespace turbomind::core { + +Stream Stream::create(int priority) +{ + Stream stream; + stream.impl_ = std::make_shared(priority); + return stream; +} + +void StreamImpl::Wait(const Event& event) +{ + check_cuda_error(cudaStreamWaitEvent(stream_, event)); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/stream.h b/src/turbomind/core/stream.h new file mode 100644 index 0000000000..9727dd5b64 --- /dev/null +++ b/src/turbomind/core/stream.h @@ -0,0 +1,160 @@ +#pragma once + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" + +namespace turbomind::core { + +class StreamImpl { +public: + StreamImpl(int priority): stream_{} + { + check_cuda_error(cudaStreamCreateWithPriority(&stream_, cudaStreamNonBlocking, priority)); + } + + ~StreamImpl() + { + if (auto ec = cudaStreamDestroy(stream_); ec != cudaSuccess) { + TM_LOG_ERROR(cudaGetErrorString(ec)); + } + stream_ = {}; + } + + void Sync() + { + check_cuda_error(cudaStreamSynchronize(stream_)); + } + + void Wait(const Event& event); + + cudaStream_t handle() const + { + return stream_; + } + +public: + cudaStream_t stream_; +}; + +class Stream { +public: + Stream() = default; + + static Stream create(int priority = 0); + + void Sync() + { + impl_->Sync(); + } + + void Wait(const Event& event) + { + impl_->Wait(event); + } + + cudaStream_t handle() const + { + return TM_CHECK_NOTNULL(impl_)->handle(); + } + + explicit operator cudaStream_t() const + { + return handle(); + } + + explicit operator bool() const noexcept + { + return static_cast(impl_); + } + + friend bool operator==(const Stream& a, const Stream& b) + { + return a.impl_ == b.impl_; + } + + friend bool operator!=(const Stream& a, const Stream& b) + { + return !(a == b); + } + + friend std::ostream& operator<<(std::ostream& os, const Stream& s) + { + os << s.impl_; + return os; + } + +private: + shared_ptr impl_; +}; + +class EventImpl { +public: + explicit EventImpl(unsigned flags) + { + check_cuda_error(cudaEventCreateWithFlags(&event_, flags)); + } + + ~EventImpl() + { + if (auto ec = cudaEventDestroy(event_); ec != 
cudaSuccess) { + TM_LOG_ERROR(cudaGetErrorString(ec)); + } + } + + void Record(const Stream& stream) + { + check_cuda_error(cudaEventRecord(event_, stream.handle())); + } + + void Sync() const + { + check_cuda_error(cudaEventSynchronize(event_)); + } + + cudaEvent_t handle() const + { + return event_; + } + +private: + cudaEvent_t event_; +}; + +class Event { +public: + Event() = default; + + static Event create(bool timing = false) + { + Event e{}; + e.impl_ = std::make_shared(timing ? 0 : cudaEventDisableTiming); + return e; + } + + void Record(const Stream& stream) + { + TM_CHECK_NOTNULL(impl_)->Record(stream); + } + + void Sync() const + { + TM_CHECK_NOTNULL(impl_)->Sync(); + } + + operator cudaEvent_t() const + { + return TM_CHECK_NOTNULL(impl_)->handle(); + } + + explicit operator bool() const noexcept + { + return static_cast(impl_); + } + +private: + shared_ptr impl_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/tensor.cc b/src/turbomind/core/tensor.cc new file mode 100644 index 0000000000..959d04ce6c --- /dev/null +++ b/src/turbomind/core/tensor.cc @@ -0,0 +1,142 @@ + +#include "src/turbomind/core/tensor.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/stream.h" + +namespace turbomind::core { + +std::ostream& operator<<(std::ostream& os, const Tensor& t) +{ + os << t.dtype() << "[" << t.layout() << "]@" << t.buffer_.data_or((void*)nullptr); + return os; +} + +Tensor& TensorMap::at(const std::string& key) +{ + auto it = find(key); + TM_CHECK(it != end()) << get_out_of_range_msg(key); + return it->second; +} + +std::string TensorMap::get_out_of_range_msg(const std::string& key) const +{ + std::ostringstream oss; + oss << "Cannot find a tensor of name '" << key << "' in the tensor map (keys: "; + auto sep = ""; + for (const auto& [k, _] : *this) { + oss << std::exchange(sep, ", ") << k; + } + oss << ")"; + return oss.str(); +} + +Tensor* TensorMap::try_(const std::string& key) +{ + auto it = find(key); + if (it != end()) { + return &it->second; + } + return nullptr; +} + +#if 0 + +void Copy(const Tensor& src, Tensor& dst, Stream& stream) +{ + TM_CHECK(src.dtype() == dst.dtype()); + TM_CHECK(src.shape() == dst.shape()); + + const DataType dtype = src.dtype(); + + auto trivial = [&] { + const ssize_t bytesize = get_byte_size(dtype, src.size()); + check_cuda_error(cudaMemcpyAsync(dst.raw_data(), src.raw_data(), bytesize, cudaMemcpyDefault, stream.handle())); + }; + + if (src.layout().is_contiguous() && dst.layout().is_contiguous()) { + return trivial(); + } + + auto a = src.layout(); + auto b = dst.layout(); + + vector idxs(a.rank()); + std::iota(idxs.begin(), idxs.end(), 0); + std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { // + return a.stride()[j] < a.stride()[i]; + }); + + // innermost dim is not contiguous + if (a.stride(idxs.back()) > 1 || b.stride(idxs.back()) > 1) { + return GenericCopy(src, dst, stream); + } + + a = a.reorder(idxs); + b = b.reorder(idxs); + + // trivial after reorder (e.g. 
transposed matrices) + if (a.is_contiguous() && b.is_contiguous()) { + return trivial(); + } + + a = a.coalesce(); + b = b.coalesce(); + + int rank = std::max(a.rank(), b.rank()); + + if (rank > 3) { + return GenericCopy(src, dst, stream); + } + + if (a.rank() < rank) { + a = a.view(b.shape()); + } + else if (b.rank() < rank) { + b = b.view(b.shape()); + } + + if (rank == 2) { + check_cuda_error(cudaMemcpy2DAsync(dst.raw_data(), + get_byte_size(dtype, b.stride(0)), + src.raw_data(), + get_byte_size(dtype, a.stride(0)), + get_byte_size(dtype, a.shape(1)), + a.shape(0), + cudaMemcpyDefault, + stream.handle())); + return; + } + + auto [a0, a1] = a.strides(0, 1); + auto [b0, b1] = b.strides(0, 1); + + // make sure the underlying space is actually a cube [x % (y * z) == 0] + if (rank == 3 && a0 % a1 == 0 && b0 % b1 == 0) { + const auto xsz_a = get_byte_size(dtype, a.stride(1)); + const auto xsz_b = get_byte_size(dtype, b.stride(1)); + const auto ysz_a = a0 / a1; + const auto ysz_b = b0 / b1; + + cudaMemcpy3DParms param{}; + param.srcPtr = make_cudaPitchedPtr((void*)src.raw_data(), xsz_a, xsz_a, ysz_a); + param.dstPtr = make_cudaPitchedPtr((void*)dst.raw_data(), xsz_b, xsz_b, ysz_b); + param.extent = make_cudaExtent(get_byte_size(dtype, a.shape(2)), a.shape(1), a.shape(0)); + param.kind = cudaMemcpyDefault; + + if (auto ec = cudaMemcpy3DAsync(¶m, stream.handle()); ec == cudaSuccess) { + TM_LOG_WARNING(cudaGetErrorString(ec)); + return; + } + } + + return GenericCopy(src, dst, stream); +} + +void Copy(const Tensor& src, Tensor&& dst, Stream& stream) +{ + return Copy(src, dst, stream); +} + +#endif + +} // namespace turbomind::core diff --git a/src/turbomind/core/tensor.cu b/src/turbomind/core/tensor.cu new file mode 100644 index 0000000000..8b6c0f724c --- /dev/null +++ b/src/turbomind/core/tensor.cu @@ -0,0 +1,201 @@ + + +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/tensor.h" +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/meta.h" + +namespace turbomind::core { + +#if 0 + +namespace kernel { + +// This is going to be slow for transposing the innermost dim +template +__global__ void GenericCopy(const T* a, + T* b, + Array stride_a, + Array stride_b, + Array shape, + int ndim, + int64_t size) +{ + Index idx = threadIdx.x + (Index)blockIdx.x * blockDim.x; + + if (idx >= size) { + return; + } + + Array coord; + PRAGMA_UNROLL + for (int i = 0; i < D; ++i) { + if (i < ndim) { + auto div = idx / shape[i]; + auto mod = idx % shape[i]; + coord[i] = mod; + idx = div; + } + } + + int64_t idx_a = 0; + int64_t idx_b = 0; + + PRAGMA_UNROLL + for (int i = 0; i < D; ++i) { + if (i < ndim) { + idx_a += coord[i] * stride_a[i]; + idx_b += coord[i] * stride_b[i]; + } + } + + b[idx_b] = a[idx_a]; +} + +} // namespace kernel + +void GenericCopy(const Tensor& src, Tensor& dst, Stream& stream) +{ + auto a = src.layout(); + auto b = dst.layout(); + + // Sort strides ascending + vector idxs(a.rank()); + std::iota(idxs.begin(), idxs.end(), 0); + std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { // + return a.stride()[i] < a.stride()[j]; + }); + + a = a.permute(idxs); + b = b.permute(idxs); + + a = a.coalesce(); + b = b.coalesce(); + + int rank = std::max(a.rank(), b.rank()); + + if (a.rank() < rank) { + a = a.view(b.shape()); + } + else if (b.rank() < rank) { + b = b.view(b.shape()); + } + + const DataType dtype = src.dtype(); + + int64_t alignment = 16; + + auto align = [&](auto v) { alignment = 
std::gcd(alignment, v); }; + + if (a.stride(0) > 1 || b.stride(0) > 1) { + alignment = get_byte_size(dtype); + } + + align(get_byte_size(dtype, a.shape(0))); + + auto data_a = src.raw_data(); + auto data_b = dst.raw_data(); + + align(reinterpret_cast(data_a)); + align(reinterpret_cast(data_b)); + + for (int i = 1; i < rank; ++i) { + align(get_byte_size(dtype, a.stride(i))); + align(get_byte_size(dtype, b.stride(i))); + } + + const auto vec_size = get_elem_num(alignment, dtype); + + const auto size = a.size() / vec_size; + + int device{}; + check_cuda_error(cudaGetDevice(&device)); + int sm_num{}; + check_cuda_error(cudaDeviceGetAttribute(&sm_num, cudaDevAttrMultiProcessorCount, device)); + + auto invoke = [&](auto vec_t, auto index_t, auto d) { + using T = decltype(vec_t); + using Index = decltype(index_t); + constexpr int D = d.value; + + Array shape; + std::fill(shape.begin() + rank, shape.end(), 1); + std::copy_n(a.shape().data(), rank, shape.data()); + + Array stride_a{}; + Array stride_b{}; + std::copy_n(a.stride().data(), rank, stride_a.data()); + std::copy_n(b.stride().data(), rank, stride_b.data()); + + if (vec_size > 1) { + shape[0] /= vec_size; + for (int i = 0; i < rank; ++i) { + stride_a[i] /= vec_size; + stride_b[i] /= vec_size; + } + } + + auto func = kernel::GenericCopy; + + int min_waves = INT_MAX; + int block_size = 0; + int grid_size = 0; + + for (int threads = 256; threads <= 1024; threads *= 2) { + int blocks = cdiv(size, block_size); + int n_active{}; + check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_active, func, block_size, 0)); + int waves = cdiv(blocks, n_active * sm_num); + if (waves < min_waves) { + min_waves = waves; + block_size = threads; + grid_size = blocks; + } + } + + func<<>>( + (const T*)data_a, (T*)data_b, stride_a, stride_b, shape, rank, a.size()); + }; + + auto invoke_d = [&](auto vec_t, auto idx_t) { + if (rank <= 2) { + invoke(vec_t, idx_t, constant<2>{}); + } + else if (rank <= 4) { + invoke(vec_t, idx_t, constant<4>{}); + } + else if (rank <= 8) { + invoke(vec_t, idx_t, constant<8>{}); + } + else { + throw std::runtime_error("not implemented"); + } + }; + + auto invoke_i = [&](auto vec_t) { + if (size < INT_MAX) { + invoke_d(vec_t, int{}); + } + else { + invoke_d(vec_t, int64_t{}); + } + }; + + switch (alignment) { + case 16: + return invoke_i(uint4{}); + case 8: + return invoke_i(uint2{}); + case 4: + return invoke_i(uint{}); + case 2: + return invoke_i(ushort{}); + default: + return invoke_i(char{}); + } +} + +#endif + +} // namespace turbomind::core diff --git a/src/turbomind/core/tensor.h b/src/turbomind/core/tensor.h new file mode 100644 index 0000000000..3721327748 --- /dev/null +++ b/src/turbomind/core/tensor.h @@ -0,0 +1,316 @@ +#pragma once + +#include +#include +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/layout.h" + +namespace turbomind::core { + +class Tensor { +public: + Tensor() = default; + + Tensor(Layout layout, DataType dtype, Device device): Tensor{layout, dtype, Context::alloc(device)} {} + + Tensor(Layout layout, DataType dtype, Allocator& alloc): layout_{std::move(layout)} + { + buffer_ = Buffer(layout_.cosize(), dtype, alloc); + } + + Tensor(Buffer buffer, Layout layout): layout_{std::move(layout)}, buffer_{std::move(buffer)} + { + TM_CHECK_LE(layout_.cosize(), buffer_.size()); + } + + Tensor(Buffer buffer): layout_{buffer.size()}, buffer_{buffer} {} + + Tensor(void* data, Layout layout, 
DataType dtype, Device device): + Tensor{Buffer{data, layout.cosize(), dtype, device}, layout} + { + } + + Tensor(std::shared_ptr data, Layout layout, DataType dtype, Device device): + Tensor{Buffer{data, layout.cosize(), dtype, device}, layout} + { + } + + template + Tensor(T* data, Layout layout, Device device): Tensor{Buffer{data, layout.cosize(), device}, layout} + { + } + + static Tensor empty_like(const Tensor& tensor, std::optional device = {}) + { + return Tensor{tensor.layout_, tensor.dtype(), device ? *device : tensor.device()}; + } + + Buffer& buffer() noexcept + { + return buffer_; + } + + const Buffer& buffer() const noexcept + { + return buffer_; + } + + DataType dtype() const + { + return buffer_.dtype(); + } + + Device device() const + { + return buffer_.device(); + } + + ssize_t size() const noexcept + { + return layout_.size(); + } + + ssize_t byte_size() const noexcept + { + return turbomind::byte_size(dtype(), size()); + } + + explicit operator bool() const noexcept + { + return static_cast(buffer_); + } + + template + T* data() + { + return buffer_.data(); + } + + template + const T* data() const + { + return const_cast(this)->data(); + } + + void* raw_data() + { + return buffer_.raw_data(); + } + + const void* raw_data() const + { + return const_cast(this)->raw_data(); + } + + template + T* data_or(T* other) + { + return buffer_.data_or(other); + } + + template + const T* data_or(T* other) const + { + return buffer_.data_or(other); + } + + Tensor view(std::vector shape) const + { + return Tensor{buffer_, layout_.view(std::move(shape))}; + } + + auto& layout() const noexcept + { + return layout_; + } + + auto& shape() const noexcept + { + return layout_.shape(); + } + + auto shape(int i) const + { + return layout_.shape(i); + } + + template + auto shapes(Is&&... 
is) const + { + return layout_.shapes(((Is &&) is)...); + } + + auto& stride() const noexcept + { + return layout_.stride(); + } + + auto stride(int i) const + { + return layout_.stride(i); + } + + bool is_contiguous() const noexcept + { + return layout().is_contiguous(); + } + + Tensor slice(std::vector base, std::vector shape) const + { + auto&& [layout, offset] = layout_.slice(base, std::move(shape)); + const auto cosize = layout.cosize(); + return Tensor{buffer_.slice(offset, cosize), std::move(layout)}; + } + + // The outermost dimension + Tensor slice(ssize_t base, ssize_t size = 1) const + { + vector bases(shape().size()); + bases.front() = base; + vector sizes{this->shape()}; + sizes.front() = size; + return slice(bases, sizes); + } + + Tensor borrow() const + { + return Tensor{buffer_.borrow(), layout_}; + } + + Tensor squeeze(int dim) const + { + return Tensor{buffer_, layout_.squeeze(dim)}; + } + + int ndim() const noexcept + { + return layout_.rank(); + } + + friend std::ostream& operator<<(std::ostream& os, const Tensor& t); + +private: + Layout layout_; + Buffer buffer_; +}; + +#if 0 +void Copy(const Tensor& src, Tensor& dst, Stream& stream); + +void Copy(const Tensor& src, Tensor&& dst, Stream& stream); + +// Launch a kernel to perform the complicated copying +void GenericCopy(const Tensor& src, Tensor& dst, Stream& stream); + +Tensor Reshape(const Tensor& t, vector shape); + +Tensor Transpoe(const Tensor& t, int dim0, int dim1); + +Tensor Permute(const Tensor& t, vector dims); + +Tensor Contiguous(const Tensor& t); +#endif + +template +struct Tensor_: public Tensor { + Tensor_() = default; + + Tensor_(Layout layout, Device device): Tensor{std::move(layout), data_type_v, device} {} + + Tensor_(Layout layout, Allocator& alloc): Tensor{std::move(layout), data_type_v, alloc} {} + + Tensor_(Buffer buffer, Layout layout): Tensor{ensure_dtype(std::move(buffer)), std::move(layout)} {} + + Tensor_(T* data, Layout layout, Device device): Tensor{data, std::move(layout), device} {} + + Tensor_(shared_ptr data, Layout layout, Device device): + Tensor{Buffer{std::move(data), layout.cosize(), data_type_v, device}, layout} + { + } + + Tensor_(const Tensor_&) = default; + Tensor_& operator=(const Tensor_&) = default; + + Tensor_(Tensor_&&) noexcept = default; + Tensor_& operator=(Tensor_&&) noexcept = default; + + Tensor_(const Tensor& other) + { + *static_cast(this) = ensure_dtype(other); + } + Tensor_(Tensor&& other) noexcept + { + *static_cast(this) = ensure_dtype(std::move(other)); + } + + ssize_t offset(const vector& idxs) + { + return layout().offset(idxs); + } + + T* data() noexcept + { + return Tensor::data(); + } + + const T* data() const noexcept + { + return Tensor::data(); + } + + T* data_or(T* other) + { + return Tensor::data_or(other); + } + + const T* data_or(T* other) const + { + return Tensor::data_or(other); + } + + constexpr DataType dtype() const noexcept + { + return data_type_v; + } + +private: + template + static decltype(auto) ensure_dtype(U&& u) + { + TM_CHECK_EQ(u.dtype(), data_type_v); + return (U &&) u; + } +}; + +class TensorMap: public std::unordered_map { +public: + using std::unordered_map::unordered_map; + + Tensor& at(const std::string& key); + + const Tensor& at(const std::string& key) const + { + return const_cast(this)->at(key); + } + + Tensor* try_(const std::string& key); + + const Tensor* try_(const std::string& key) const + { + return const_cast(this)->try_(key); + } + + bool contains(const std::string& key) const + { + return find(key) != end(); + } 
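+    // Usage sketch (hypothetical values; mirrors how ModelRequest fills its maps):
+    //   TensorMap m;
+    //   m.emplace("output_ids", Tensor{{max_seq_len}, kInt32, kCPU});
+    //   Tensor& ids = m.at("output_ids");          // fails the TM_CHECK if the key is absent
+    //   if (Tensor* logits = m.try_("logits")) { /* optional output present */ }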
+ +private: + std::string get_out_of_range_msg(const std::string& key) const; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/test_core.cc b/src/turbomind/core/test_core.cc new file mode 100644 index 0000000000..f0abac9b44 --- /dev/null +++ b/src/turbomind/core/test_core.cc @@ -0,0 +1,282 @@ + +#include + +#include "src/turbomind/core/core.h" + +#include "catch2/catch_test_macros.hpp" + +using namespace turbomind; + +TEST_CASE("test check", "[check]") +{ + int zero = 0; + + TM_CHECK(!zero); + + TM_CHECK_EQ(42, 42) << "Ok"; + TM_CHECK_NE(42, 24) << "Ok"; + TM_CHECK_GE(50, 42) << "Ok"; + TM_CHECK_GT(50, 42) << "Ok"; + TM_CHECK_LE(42, 50) << "Ok"; + TM_CHECK_LT(42, 50) << "Ok"; + + if (0) { + TM_CHECK(zero); + TM_CHECK_EQ(42, 43) << "Not " + << "Ok"; + } + + int x = 42; + auto p = TM_CHECK_NOTNULL(&x); + REQUIRE(p == &x); + + if (0) { + int* y{}; + TM_CHECK_NOTNULL(y); + TM_CHECK_NOTNULL(std::shared_ptr{}); + } + + auto y = TM_CHECK_NOTNULL(std::make_shared(42)); + REQUIRE(*y == 42); + + TM_CHECK(y); +} + +TEST_CASE("test allocator", "[allocator]") +{ + + using core::Allocator; + using core::Stream; + + Allocator a; + REQUIRE(!a); + + Allocator b{kCPU}; + REQUIRE(b); + REQUIRE(a != b); + REQUIRE(b->device() == kCPU); + Stream s{}; + REQUIRE(!b->stream()); + + // std::vector v(1 << 20); + // std::iota(v.begin(), v.end(), 0); + + // auto p = (int*)b->allocate(sizeof(int) * v.size()); + // std::iota(p, p + v.size(), 0); + + // REQUIRE(v == std::vector(p, p + v.size())); +} + +TEST_CASE("test context", "[context]") +{ + using core::Context; + using core::ContextGuard; + using core::Stream; + using core::Allocator; + + Stream s0 = Stream::create(); + + ContextGuard g0{s0, Allocator{kCPU}}; + + REQUIRE(Context::stream()); + REQUIRE(Context::stream() == s0); + + auto a0 = Context::host_alloc(); + + { + Allocator a1(Context::stream(), false); // device allocator + REQUIRE(a1->device().type == kDEVICE); + + ContextGuard g1{a1}; + + REQUIRE(Context::stream() == s0); + REQUIRE(Context::device_alloc() == a1); + REQUIRE(Context::host_alloc() == a0); + + { + ContextGuard g2{Stream::create(), Allocator(kDEVICE)}; + REQUIRE(Context::device_alloc() != a1); + REQUIRE(Context::stream() != s0); + } + + REQUIRE(Context::stream() == s0); + REQUIRE(Context::device_alloc() == a1); + } + + REQUIRE(Context::stream() == s0); +} + +TEST_CASE("test basic buffer", "[buffer]") +{ + using core::Buffer; + using core::Buffer_; + using core::Allocator; + + Buffer a; + REQUIRE(!a); + + Buffer b; + REQUIRE(!b); + REQUIRE(a == b); + + std::vector v{0, 1, 2, 3, 4, 5, 6, 7}; + + SECTION("reference into v") + { + b = Buffer(v.data(), v.size(), kCPU); + REQUIRE(b.data() == v.data()); + REQUIRE(b.raw_data() == v.data()); + } + SECTION("shared ownership") + { + auto x = std::shared_ptr(new int[v.size()]); + std::copy(v.begin(), v.end(), x.get()); + b = Buffer(x, v.size(), data_type_v, kCPU); + REQUIRE(b.data() == x.get()); + REQUIRE(b.raw_data() == x.get()); + } + SECTION("allocation") + { + Allocator alloc{kCPU}; + b = Buffer(v.size(), data_type_v, alloc); + std::copy(v.begin(), v.end(), b.data()); + } + + REQUIRE(b); + REQUIRE(b.size() == v.size()); + REQUIRE(b.dtype() == data_type_v); + REQUIRE(b.byte_size() == sizeof(int) * v.size()); + auto c = b; + REQUIRE(c == b); + REQUIRE(b == c); + REQUIRE(a != b); + REQUIRE(b != a); + REQUIRE(std::vector(b.data(), b.data() + b.size()) == v); + + auto s = b.slice(3, 2); + REQUIRE(s.size() == 2); + REQUIRE(s.raw_data() == b.data() + 3); + + Buffer_ x; + Buffer_ y = 
Buffer{data_type_v}; + + Buffer z = Buffer_(1024, kCPU); + + x = z; + + for (int i = 0; i < z.size(); ++i) { + x[i] = i; + } + + std::vector ref(1024); + std::iota(ref.begin(), ref.end(), 0); + REQUIRE(std::vector(x.begin(), x.end()) == ref); + + Buffer e; + REQUIRE(!e.data_or((void*)0)); + REQUIRE(!e.data_or(nullptr)); +} + +TEST_CASE("test buffer view", "[buffer]") +{ + using core::Buffer; + + std::vector v{0, 1, 2, 3, 4, 5, 6, 7}; + + Buffer b(v.data(), v.size(), kCPU); + + auto c = b.slice(2, 4); + REQUIRE(c.size() == 4); + REQUIRE(c.raw_data() == b.data() + 2); + + std::cout << c << std::endl; + + auto d = c.view(); + + REQUIRE(d.size() == c.size() * 2); + REQUIRE(d.raw_data() == c.raw_data()); +} + +TEST_CASE("test layout", "[layout]") +{ + using core::Layout; + + Layout a; // default ctor + REQUIRE(a.size() == 0); + REQUIRE(a.cosize() == 0); + + Layout b({20, 50}); + REQUIRE(b.size() == 1000); + REQUIRE(b.cosize() == b.size()); + REQUIRE(to_string(b) == "(20,50):(50,1)"); + + Layout c = b.coalesce(); + REQUIRE(c.size() == b.size()); + REQUIRE(c.cosize() == b.cosize()); + REQUIRE(to_string(c) == "(1000):(1)"); + + Layout v = b.view({50, 20}); + REQUIRE(v.size() == b.size()); + REQUIRE(v.cosize() == b.cosize()); + REQUIRE(to_string(v) == "(50,20):(20,1)"); + + v = b.view({25, -1}); + REQUIRE(to_string(v) == "(25,40):(40,1)"); + + v = b.view({5, -1, 5}); + REQUIRE(to_string(v) == "(5,40,5):(200,5,1)"); + + v = b.view({-1, 20, 10, 1}); + REQUIRE(to_string(v) == "(5,20,10,1):(200,10,1,1)"); + + REQUIRE(to_string(v.coalesce()) == "(1000):(1)"); + + auto [s, offset] = b.slice({10, 20}, {-1, -1}); + REQUIRE(to_string(s) == "(10,30):(50,1)"); + REQUIRE(offset == 520); + + v = s.view({2, -1, 3, 10}); + std::cout << v << std::endl; + + std::cout << v.coalesce() << std::endl; + + // v = s.view({30, 10}); + // std::cout << v << std::endl; +} + +TEST_CASE("test tensor", "[tensor]") +{ + using core::Tensor; + using core::Tensor_; + using core::Allocator; + + Tensor a; + REQUIRE(!a); + + Tensor_ b{{10, 20}, kCPU}; + Tensor_ c = b.slice(0, 5); + + std::cout << b << std::endl; + + REQUIRE(c.shape() == std::vector{5, 20}); + REQUIRE(c.data() == b.data()); + + auto d = b.view({2, -1, 10}); + REQUIRE(d.shape() == std::vector{2, 10, 10}); + + // this is typed + Tensor_ x = Tensor_{}; + // while being empty + REQUIRE(!x); + + if (0) { + // empty Tensor has invalid type + Tensor_ x = Tensor{}; + } + a = {}; + x = {}; + + Tensor y = core::Buffer{100, kInt32, kCPU}; + REQUIRE(y.ndim() == 1); + REQUIRE(y.shape(0) == 100); +} diff --git a/src/turbomind/engine/CMakeLists.txt b/src/turbomind/engine/CMakeLists.txt index 1d68116cf6..6836d98155 100644 --- a/src/turbomind/engine/CMakeLists.txt +++ b/src/turbomind/engine/CMakeLists.txt @@ -3,5 +3,6 @@ cmake_minimum_required(VERSION 3.8) add_library(engine STATIC gateway.cc request_queue.cc model_request.cc) +target_link_libraries(engine PRIVATE core) set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index dc27305139..29986405d6 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -3,57 +3,15 @@ #include #include #include -#include #include -#include #include -#include #include "src/turbomind/engine/model_request.h" #include "src/turbomind/engine/request.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/constant.h" -#include 
"src/turbomind/utils/cuda_utils.h" namespace turbomind { -static ManagedTensor create(DataType dtype, MemoryType where, const std::vector& size, int64_t& byte_size) -{ - byte_size = std::accumulate(size.begin(), size.end(), Tensor::getTypeSize(dtype), std::multiplies<>{}); - void* data{}; - - if (where == MEMORY_GPU) { - check_cuda_error(cudaMallocAsync(&data, byte_size, nullptr)); - } - else { - data = std::malloc(byte_size); - } - - ManagedTensor ret; - ret.tensor = Tensor{where, dtype, std::vector(size.begin(), size.end()), data}; - ret.data_holder.reset((void*)nullptr, [data, where](auto) { - // std::cerr << "turbomind tensor deallocate" << std::endl; - if (where == MEMORY_GPU) { - /// TODO: guard device id - check_cuda_error(cudaFreeAsync(data, nullptr)); - } - else { - std::free(data); - } - }); - return ret; -} - -template -static T get(const std::unordered_map& m, const std::string& key, T fallback = {}) -{ - auto it = m.find(key); - if (it != m.end()) { - return it->second->getVal(); - } - return fallback; -} - ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim): gateway_{gateway}, data_type_{data_type}, @@ -85,27 +43,25 @@ void ModelRequest::End(std::function cb, uint64_t session_id) auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam { - inputs_ = std::make_shared(); - outputs_ = std::make_shared(); + inputs_ = std::make_shared(); + outputs_ = std::make_shared(); auto add = [](auto& dest, auto key, auto dtype, auto where, auto shape, auto&&... dims) { - std::vector shape_; + Layout shape_; if constexpr (std::is_integral_v) { shape_ = {shape, dims...}; } else { shape_ = {shape.cbegin(), shape.cend()}; } - int64_t byte_size{}; - auto it = dest->emplace(key, create(dtype, where, shape_, byte_size)).first; - return std::make_pair(it->second->data, byte_size); + dest->emplace(key, Tensor{shape_, dtype, where}); }; auto& inputs = *param.tensors; - FT_CHECK(inputs.at("input_ids")->shape.size() == 1); + TM_CHECK_EQ(inputs.at("input_ids").ndim(), 1); - const int input_len = inputs.at("input_ids")->shape[0]; + const int input_len = inputs.at("input_ids").shape(0); const int output_len = param.gen_cfg.max_new_tokens; // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` @@ -119,32 +75,32 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output inputs_->emplace(k, v); } - add(outputs_, "output_ids", TYPE_INT32, MEMORY_CPU, max_seq_len); - add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); + add(outputs_, "output_ids", data_type_v, kCPU, max_seq_len); + add(outputs_, "sequence_length", data_type_v, kCPU, 1); if (param.gen_cfg.output_logits) { const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? max_in_out_len : max_out_len; - add(outputs_, "logits", data_type_, MEMORY_CPU, len, vocab_size_); + add(outputs_, "logits", data_type_, kCPU, len, vocab_size_); } if (param.gen_cfg.output_last_hidden_state) { const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? 
max_in_out_len : max_out_len; - add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, len, hidden_dim_); + add(outputs_, "last_hidden_state", data_type_, kCPU, len, hidden_dim_); } if (param.gen_cfg.output_logprobs) { - add(outputs_, "logprob_vals", data_type_, MEMORY_CPU, max_out_len, kMaxLogProb); - add(outputs_, "logprob_indexes", TYPE_INT32, MEMORY_CPU, max_out_len, kMaxLogProb); - add(outputs_, "logprob_nums", TYPE_INT32, MEMORY_CPU, max_out_len); + add(outputs_, "logprob_vals", data_type_, kCPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_indexes", data_type_v, kCPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_nums", data_type_v, kCPU, max_out_len); } auto r = std::make_shared(); for (const auto& [k, v] : *inputs_) { - r->inputs.insert(k, *v); + r->inputs.emplace(k, v); } for (const auto& [k, v] : *outputs_) { - r->outputs.insert(k, *v); + r->outputs.emplace(k, v); } auto state = std::make_shared(); @@ -160,8 +116,8 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output r->forward_cb = std::move(cb); r->state = state; - r->output_ids = *outputs_->at("output_ids"); - r->sequence_length = *outputs_->at("sequence_length"); + r->output_ids = outputs_->at("output_ids"); + r->sequence_length = outputs_->at("sequence_length"); // Keep a weak reference for canceling the request request_ = r; diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h index aea889e856..b788c0434f 100644 --- a/src/turbomind/engine/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -4,8 +4,8 @@ #include +#include "src/turbomind/core/core.h" #include "src/turbomind/engine/gateway.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { @@ -21,10 +21,8 @@ class ModelRequest { // Reset the channel to uninitailized state, calls `notify` when done void End(std::function cb, uint64_t session_id); - using TensorMap_ = std::unordered_map; - struct InputParam { - std::shared_ptr tensors; + std::shared_ptr tensors; SessionParam session; GenerationConfig gen_cfg; @@ -33,7 +31,7 @@ class ModelRequest { }; struct OutputParam { - std::shared_ptr tensors; + std::shared_ptr tensors; std::shared_ptr state; }; @@ -52,8 +50,8 @@ class ModelRequest { std::weak_ptr request_; - std::shared_ptr inputs_; // owned by caller - std::shared_ptr outputs_; // owned by `this` + std::shared_ptr inputs_; + std::shared_ptr outputs_; }; } // namespace turbomind diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index 28f2943b54..31276c004a 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -10,7 +10,7 @@ #include #include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/core.h" namespace turbomind { @@ -122,8 +122,8 @@ struct Request { TensorMap inputs; TensorMap outputs; // fast path for accessing common output buffers - Tensor output_ids; - Tensor sequence_length; + Tensor_ output_ids; + Tensor_ sequence_length; std::function end_cb; diff --git a/src/turbomind/kernels/activation_kernels.cu b/src/turbomind/kernels/activation_kernels.cu index ec5292976f..77373a090c 100644 --- a/src/turbomind/kernels/activation_kernels.cu +++ b/src/turbomind/kernels/activation_kernels.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/activation_kernels.h" #include "src/turbomind/kernels/core/array.h" #include "src/turbomind/kernels/core/array_ops.h" @@ -171,157 +173,6 @@ struct IdentityActivation { } }; -// clang-format off -template class Activation, typename T, typename BT> -__global__ void generic_activation(T* out, - const BT* __restrict bias, - const T* __restrict gated_weights, - const BT* __restrict gated_bias, - const int* __restrict ia3_tasks, - const T* __restrict ia3_weights, - const int int8_mode, - const float* __restrict activation_in, - const float* __restrict activation_out, - const int* __restrict padding_offset, - const int seq_len, - int m, - int n) -{ - constexpr size_t packed_elems = num_elems::value; - - const bool with_bias = bias != nullptr; - const bool with_gate = gated_weights != nullptr; - // const bool with_ia3 = ia3_tasks != nullptr; - - using Act_T = typename Activation::return_type; - using Float_T = typename packed_as::type; - using Packed_Int8_t = typename packed_as::type; - - for (int64_t id = blockIdx.x * blockDim.x + threadIdx.x; id < 1LL * m * n; id += blockDim.x * gridDim.x) { - T val; - if (int8_mode == 2) { - // val = cuda_cast(cuda_cast(reinterpret_cast(out)[id]) * activation_in[0]); - } - else { - val = out[id]; - } - - T gated_val; - if (with_gate) { - gated_val = gated_weights[id]; - } - - // if (with_bias) { - // const T reg_bias = static_cast(bias[id % n]); - // val = val + reg_bias; - - // if (with_gate) { - // const T reg_gated_bias = static_cast(gated_bias[id % n]); - // gated_val = gated_val + reg_gated_bias; - // } - // } - - if (with_gate) { - val = cuda_cast(Activation::apply(val) * cuda_cast(gated_val)); - } - else { - // val = cuda_cast(Activation::apply(val)); - } - - // if (with_ia3) { - // const int word_id = id / n; - // const int offset = padding_offset == nullptr ? 
0 : padding_offset[word_id]; - // const int batch_id = (word_id + offset) / seq_len; - // const int task = ia3_tasks[batch_id]; - // val = val * ia3_weights[task * n + (id % n)]; - // } - - if (int8_mode != 2) { - out[id] = val; - } - else { - // reinterpret_cast(out)[id] = - // cuda_cast(cuda_cast(val) * activation_out[0]); - } - } -} -// clang-format on - -template class Activation, typename T, typename BT> -void invokeGenericActivation(T* out, - const BT* bias, - const T* gated_weights, - const BT* gated_bias, - const int* ia3_tasks, - const T* ia3_weights, - const int m, - const int n, - const int int8_mode, - const float* activation_in, - const float* activation_out, - const int* padding_offset, - const int seq_len, - cudaStream_t stream) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - TM_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len); - using PT = typename packed_type::type; - constexpr int packed_elems = num_elems::value; - using PBT = typename packed_as::type; - - const int n_threads = 512; - - dim3 block, grid; - if (n / 4 / packed_elems <= n_threads) { - block.x = n / 4 / packed_elems; - grid.x = m; - } - else { - block.x = n_threads; - grid.x = ceil(1LL * m * n / double(n_threads)); - } - TM_LOG_DEBUG("%d %d", grid.x, block.x); - sync_check_cuda_error(); - generic_activation<<>>(reinterpret_cast(out), - reinterpret_cast(bias), - reinterpret_cast(gated_weights), - reinterpret_cast(gated_bias), - ia3_tasks, - reinterpret_cast(ia3_weights), - int8_mode, - activation_in, - activation_out, - padding_offset, - seq_len, - m, - n / packed_elems); - sync_check_cuda_error(); -} - -#define INSTANTIATE_GENERIC_ACTIVATION(Activation, T, BT) \ - template void invokeGenericActivation(T * out, \ - const BT* bias, \ - const T* gated_weights, \ - const BT* gated_bias, \ - const int* ia3_tasks, \ - const T* ia3_weights, \ - const int m, \ - const int n, \ - const int int8_mode, \ - const float* activation_in, \ - const float* activation_out, \ - const int* padding_offset, \ - const int seq_len, \ - cudaStream_t stream); - -INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half); -#ifdef ENABLE_FP32 -INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, float, float); -#endif -#ifdef ENABLE_BF16 -INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, __nv_bfloat16, __nv_bfloat16); -#endif - // `output` may be an alias of `inter_buf` template class Activation, typename T> __global__ void activation_kernel(T* inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims) @@ -367,16 +218,33 @@ void invokeGenericActivation_v2( <<>>(inter_buf, gate_buf, stride, token_num, dims); } -#define INSTANTIATE_ACTIVATION(Activation, T) \ - template void invokeGenericActivation_v2( \ - T * inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims, cudaStream_t stream) +template class Activation> +void invokeGenericActivation_v3(Ref inter_, const Tensor& gate, cudaStream_t stream) +{ + auto& inter = inter_.get(); + TM_CHECK_EQ(inter.ndim(), 2); + TM_CHECK_EQ(gate.ndim(), 2); + TM_CHECK_EQ(inter.stride(0), gate.stride(0)); -INSTANTIATE_ACTIVATION(SiluActivation, half); -#ifdef ENABLE_FP32 -INSTANTIATE_ACTIVATION(SiluActivation, float); -#endif -#ifdef ENABLE_BF16 -INSTANTIATE_ACTIVATION(SiluActivation, __nv_bfloat16); -#endif + TM_CHECK(inter.shape() == gate.shape()); + + auto invoke = [&](auto t) { + using T = decltype(t); + + const auto [num, dim] = inter.shapes(0, 1); + + constexpr int kVecSize = 4; + constexpr int block = 512; + + const dim3 grid(num, cdiv((int)dim, 
block * kVecSize)); + + activation_kernel + <<>>(inter.data(), gate.data(), inter.stride(0), num, dim); + }; + + TM_DISPATCH_PRIMARY_DTYPES(inter.dtype(), invoke); +} + +template void invokeGenericActivation_v3(Ref inter_, const Tensor& gate, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/kernels/activation_kernels.h b/src/turbomind/kernels/activation_kernels.h index 1197ee4806..935203cf1e 100644 --- a/src/turbomind/kernels/activation_kernels.h +++ b/src/turbomind/kernels/activation_kernels.h @@ -16,10 +16,9 @@ #pragma once -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include #include -#include + +#include "src/turbomind/core/core.h" namespace turbomind { @@ -30,85 +29,7 @@ template struct SiluActivation; template struct IdentityActivation; // clang-format on -template class Activation, typename T, typename BT> -void invokeGenericActivation(T* out, - const BT* bias, - const T* gated_weights, - const BT* gated_bias, - const int* ia3_tasks, - const T* ia3_weights, - const int m, - const int n, - const int int8_mode, - const float* activation_in, - const float* activation_out, - const int* padding_offset, - const int seq_len, - cudaStream_t stream); - -template class Activation, typename T, typename BT> -void invokeGenericActivation(T* out, - const BT* bias, - const T* gated_weights, - const BT* gated_bias, - const int* ia3_tasks, - const T* ia3_weights, - const int m, - const int n, - const int int8_mode, - const float* activation_in, - const float* activation_out, - cudaStream_t stream) -{ - invokeGenericActivation(out, - bias, - gated_weights, - gated_bias, - ia3_tasks, - ia3_weights, - m, - n, - int8_mode, - activation_in, - activation_out, - (const int*)nullptr, - 0, - stream); -} - -template class Activation, typename T> -void invokeGenericActivation_v2( - T* inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims, cudaStream_t stream); - -template -void invokeAddBiasGeluV2(T* out, - const T* bias, - const int* ia3_tasks, - const T* ia3_weights, - const int* padding_offset, - const int seq_len, - const int m, - const int n, - cudaStream_t stream); - -template -void invokeAddBias(T* out, T const* bias, const int m, const int n, cudaStream_t stream) -{ - invokeGenericActivation( - out, bias, nullptr, nullptr, nullptr, nullptr, m, n, 0, nullptr, nullptr, stream); -} - -template -void invokeAddBiasGeluV2( - T* out, const T* bias, const int* ia3_tasks, const T* ia3_weights, const int m, const int n, cudaStream_t stream) -{ - invokeAddBiasGeluV2(out, bias, ia3_tasks, ia3_weights, nullptr, 0, m, n, stream); -} - -template -void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream); - -template -void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream); +template class Activation> +void invokeGenericActivation_v3(Ref inter_, const Tensor& gate, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index 32de38981a..e1c92cf83c 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -63,7 +63,6 @@ if (BUILD_TEST) Llama unfused_attention_kernels logger - tensor cublas) add_executable(test_quant test_quant.cu test_utils.cu) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index e7642584c2..8dcd409474 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ 
b/src/turbomind/kernels/attention/attention.cu @@ -4,6 +4,7 @@ #include "attention_config.h" #include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 67bd81e45b..d7b0821b5d 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -1,11 +1,13 @@ // Copyright (c) OpenMMLab. All rights reserved. +#include +#include + #include "decoding.h" #include "decoding_config.h" #include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -#include -#include +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index f4b7fd4296..adb697e8c4 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -1,5 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. +#include + #include "src/turbomind/kernels/attention/block.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" #include "src/turbomind/kernels/attention/quantization.h" @@ -7,7 +9,7 @@ #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/thread_map.h" #include "src/turbomind/models/llama/llama_utils.h" -#include +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.h b/src/turbomind/kernels/attention/kv_cache_utils_v2.h index 8a34f58759..01525f5596 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.h +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.h @@ -2,8 +2,8 @@ #pragma once +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/attention/attention_params.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/quantization.h b/src/turbomind/kernels/attention/quantization.h index 02f49d0089..8f8dd4a92f 100644 --- a/src/turbomind/kernels/attention/quantization.h +++ b/src/turbomind/kernels/attention/quantization.h @@ -694,6 +694,7 @@ struct ConvertKvCache { } }; +#if 0 inline __device__ Array cvt_bf16x4_e4m3(const Array& v) { #if TURBOMIND_ARCH_SM80 @@ -743,6 +744,7 @@ struct ConvertKvCache { } } }; +#endif template inline __device__ void StoreQuantParam(T* dst, Array src) diff --git a/src/turbomind/kernels/attention/reference.h b/src/turbomind/kernels/attention/reference.h index 9958ddd3ad..7c55c6d9df 100644 --- a/src/turbomind/kernels/attention/reference.h +++ b/src/turbomind/kernels/attention/reference.h @@ -2,12 +2,14 @@ #pragma once -#include "src/turbomind/kernels/flash_attention/flash_attention.h" -#include "src/turbomind/kernels/unfused_attention_kernels.h" -#include "src/turbomind/utils/cublasMMWrapper.h" +#include #include + #include +#include "src/turbomind/kernels/flash_attention/flash_attention.h" +#include "src/turbomind/kernels/unfused_attention_kernels.h" + namespace turbomind { template diff --git a/src/turbomind/kernels/ban_bad_words.cu b/src/turbomind/kernels/ban_bad_words.cu index 376432116f..3cc133c688 100644 --- a/src/turbomind/kernels/ban_bad_words.cu +++ b/src/turbomind/kernels/ban_bad_words.cu @@ -15,11 +15,40 @@ */ #include "src/turbomind/kernels/ban_bad_words.h" -#include 
"src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/utils/cuda_utils.h" +#include +// #include "src/turbomind/kernels/reduce_kernel_utils.cuh" +// #include "src/turbomind/utils/cuda_utils.h" +#include +#include namespace turbomind { +template +__device__ inline T getMaxValue(); + +template<> +__device__ inline float getMaxValue() +{ + return FLT_MAX; +} + +template<> +__device__ inline half getMaxValue() +{ + return __ushort_as_half((unsigned short)0x7BFFU); +} + +#ifdef ENABLE_BF16 +template<> +__device__ inline __nv_bfloat16 getMaxValue<__nv_bfloat16>() +{ +#if __CUDA_ARCH__ >= 800 + return __ushort_as_bfloat16((unsigned short)0x7F7FU); +#endif + return {}; +} +#endif + template __global__ void ban_bad_words(T* logits, const int* output_ids_buf, @@ -117,7 +146,6 @@ void invokeBanBadWords(T* logits, id_offset, vocab_size_padded, step); - sync_check_cuda_error(); } #define INSTANTIATE_INVOKE_BAN_BAD_WORDS(T) \ diff --git a/src/turbomind/kernels/ban_bad_words.h b/src/turbomind/kernels/ban_bad_words.h index 05bdc00849..af2c21158a 100644 --- a/src/turbomind/kernels/ban_bad_words.h +++ b/src/turbomind/kernels/ban_bad_words.h @@ -16,7 +16,6 @@ #pragma once -#include #include namespace turbomind { diff --git a/src/turbomind/kernels/core/data_type.h b/src/turbomind/kernels/core/data_type.h index f57d1a2714..0c438bade0 100644 --- a/src/turbomind/kernels/core/data_type.h +++ b/src/turbomind/kernels/core/data_type.h @@ -2,67 +2,16 @@ #pragma once -#include -#include - #include #if ENABLE_BF16 #include #endif -namespace turbomind { - -struct uint1_t { -}; -struct uint2_t { -}; -struct uint3_t { -}; -struct uint4_t { -}; -struct uint5_t { -}; -struct uint6_t { -}; - -template -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; // 2 + 1 - -template<> -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; // 4 + 1 - -template<> -struct bitsof_t: std::integral_constant { -}; // 4 + 2 +#include -template -inline constexpr bitsof_t bitsof{}; +#include "src/turbomind/core/data_type.h" -struct fp8 { - char v; -}; -struct fp8_e4m3: fp8 { -}; -struct fp8_e5m2: fp8 { -}; +namespace turbomind { namespace detail { diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt index 4e398e9e25..a9ff849e9e 100644 --- a/src/turbomind/kernels/gemm/CMakeLists.txt +++ b/src/turbomind/kernels/gemm/CMakeLists.txt @@ -47,10 +47,10 @@ if (BUILD_TEST) # test/test_utils.cu test/quantization.cu test/reference.cu) - target_link_libraries(gemm_test PRIVATE gemm2 cublas) + target_link_libraries(gemm_test PRIVATE gemm2 core cublas) add_executable(test_moe_utils test/test_moe_utils.cu test/test_utils.cu) - target_link_libraries(test_moe_utils PRIVATE gemm2 cublas) + target_link_libraries(test_moe_utils PRIVATE gemm2 core cublas) if (NOT MSVC) FetchContent_Declare( @@ -60,6 +60,7 @@ if (BUILD_TEST) ) set(NVBench_ENABLE_EXAMPLES OFF) + set(NVBench_ENABLE_TESTING OFF) set(BUILD_SHARED_LIBS OFF) FetchContent_MakeAvailable(repo-nvbench) @@ -69,6 +70,6 @@ if (BUILD_TEST) # test/test_utils.cu test/quantization.cu test/reference.cu) - target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas) + target_link_libraries(gemm_bench PRIVATE gemm2 core nvbench::nvbench cublas) endif () endif () diff --git a/src/turbomind/kernels/gemm/context.cu 
b/src/turbomind/kernels/gemm/context.cu index 1b1ea1a2c3..4aca585673 100644 --- a/src/turbomind/kernels/gemm/context.cu +++ b/src/turbomind/kernels/gemm/context.cu @@ -188,10 +188,10 @@ std::vector StaticGemmContext::Populate(const Kernel& kernel, const const int64_t mma_cost = wave_mma_cost * waves; // IO has less severe quantization effect - const int64_t mio_cost_a = get_size(desc.type_a, tiled_shape_n * m * split_ceil_k) * splits; - const int64_t mio_cost_b = get_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * splits; + const int64_t mio_cost_a = byte_size(desc.type_a, tiled_shape_n * m * split_ceil_k) * splits; + const int64_t mio_cost_b = byte_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * splits; /// TODO: read type from `desc_.accum` when added - const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)m * n) * (splits - 1) * 2; + const int64_t mio_cost_c = byte_size(desc.type_c, (int64_t)m * n) * (splits - 1) * 2; const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; // std::cout << name() << " " << splits << " " << waves << " " << (float)mio_cost << " " << (float)mma_cost @@ -435,10 +435,10 @@ std::vector MoeGemmContext::Populate(const Kernel& kernel, const Pop const int64_t mma_cost = wave_mma_cost * waves; // IO has less severe quantization effect - const int64_t mio_cost_a = get_size(desc.type_a, tiled_shape_n * batch_size * split_ceil_k) * num * splits; - const int64_t mio_cost_b = get_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * num * splits; + const int64_t mio_cost_a = byte_size(desc.type_a, tiled_shape_n * batch_size * split_ceil_k) * num * splits; + const int64_t mio_cost_b = byte_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * num * splits; /// TODO: read type from `desc_.accum` when added - const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)batch_size * n) * num * (splits - 1) * 2; + const int64_t mio_cost_c = byte_size(desc.type_c, (int64_t)batch_size * n) * num * (splits - 1) * 2; const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; LaunchSpec spec{}; diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index e58bfc9b95..a718c50410 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -157,12 +157,12 @@ int Convert(const void* S, // auto dispatch_3 = [&](auto mma, auto operand, auto order) -> bool { if constexpr (is_AB(operand)) { switch (Ddesc.type) { - case DataType::F16: - case DataType::BF16: + case kFloat16: + case kBfloat16: return dispatch_4(mma, operand, order, type_c, type_c); - case DataType::U8: + case kUint8: return dispatch_4(mma, operand, order, type_c, type_c); - case DataType::U4: + case kUint4: return dispatch_4(mma, operand, order, type_c, type_c); default: return false; @@ -170,7 +170,7 @@ int Convert(const void* S, // } else { // UV: U16, U32 switch (Ddesc.type) { - case DataType::U32: + case kUint32: return dispatch_4(mma, operand, order, type_c, type_c); default: return false; @@ -228,11 +228,11 @@ std::tuple get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool force_simt) { if (is_fused_moe) { - if (dtype == DataType::BF16 && sm >= 80) { + if (dtype == kBfloat16 && sm >= 80) { return {kColMajor, HMMA_16816 | OPERAND_B | 1, {}, {}}; } - if (dtype == DataType::F16) { + if (dtype == kFloat16) { if (sm >= 80) { return {kColMajor, HMMA_16816 | OPERAND_B | 1, {}, {}}; } @@ -243,7 +243,7 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for return 
{kColMajor, HMMA_884 | OPERAND_B | 1, {}, {}}; } } - else if (dtype == DataType::U4) { + else if (dtype == kUint4) { if (sm >= 80) { return {kColMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; } @@ -256,7 +256,7 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for } } else { - if (dtype == DataType::U4) { + if (dtype == kUint4) { if (force_simt) { return {kColMajor, HMMA_SIMT | OPERAND_B | 1, kRowMajor, HMMA_SIMT | OPERAND_V | 1}; } diff --git a/src/turbomind/kernels/gemm/kernel_impl.h b/src/turbomind/kernels/gemm/kernel_impl.h index 3980e1d222..760f29fc55 100644 --- a/src/turbomind/kernels/gemm/kernel_impl.h +++ b/src/turbomind/kernels/gemm/kernel_impl.h @@ -39,9 +39,9 @@ class KernelImpl: public Kernel { desc_.order_b = transpose(OpB::kOrder); desc_.order_c = Gemm::kOrderC; - desc_.type_a = get_data_type_v; - desc_.type_b = get_data_type_v; - desc_.type_c = get_data_type_v; + desc_.type_a = data_type_v; + desc_.type_b = data_type_v; + desc_.type_c = data_type_v; using IterA = typename OpA::GmemIter; using IterB = typename OpB::GmemIter; @@ -127,9 +127,9 @@ class KernelImpl: public Kernel { MatrixLayout Adesc = _Adesc; - const int m = Ddesc.rows; - const int n = Ddesc.cols; - const int k = Adesc.cols; + [[maybe_unused]] const int m = Ddesc.rows; + [[maybe_unused]] const int n = Ddesc.cols; + [[maybe_unused]] const int k = Adesc.cols; auto transpose = [](MatrixLayout x) { std::swap(x.rows, x.cols); diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 3309933dbf..4d3f87b3f6 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -11,6 +11,7 @@ #include #include +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/math.h" @@ -690,20 +691,21 @@ __global__ void MoeGatherKernel(T* dst, // [e*n, d] } } -template -void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +void invokeMoeDispatch(Ref out_, const Tensor& src, const int* f2n, int expert_per_token, cudaStream_t st) { + using T = uint16_t; + TM_CHECK_EQ(byte_size(src.dtype()), byte_size()); + auto& out = out_.get(); + auto [num, dim] = src.shapes(0, 1); constexpr int threads = 256; constexpr int vec_size = 16 / sizeof(T); - MoeGatherKernel<<>>( // - dst, - src, + MoeGatherKernel<<>>( // + (T*)out.raw_data(), + (const T*)src.raw_data(), f2n, - dims / vec_size); + dim / vec_size); } -template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); - template __global__ void MoeReduceKernel(T* dst, // [ n, d] const T* src, // [e*n, d] @@ -819,12 +821,36 @@ void invokeMoeReduce(T* dst, } } -template void -invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); -#ifdef ENABLE_BF16 -template void invokeMoeReduce( - nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); -#endif +void invokeMoeCombine(Ref out_, + const Tensor& src, + const float* scales, + const int* en2f, + const float* dst_scales, + int experts_per_token, + float dst_scale, + cudaStream_t st) +{ + auto& out = out_.get(); + + const int tokens = out.shape(0); + TM_CHECK_EQ(src.shape(0), tokens * experts_per_token); + + auto invoke = [&](auto t) { + using T = decltype(t); + return 
invokeMoeReduce(out.data(), + src.data(), + scales, + en2f, + dst_scales, + tokens, + experts_per_token, + src.shape(1), + dst_scale, + st); + }; + + TM_DISPATCH_PRIMARY_DTYPES(src.dtype(), invoke); +} std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) { diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 4a603a07b3..618d097d11 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -5,6 +5,8 @@ #include #include +#include "src/turbomind/core/core.h" + namespace turbomind { constexpr int kMoeGateMaxTiles = 16; @@ -26,38 +28,20 @@ void invokeMoeGate_V2(int* f2n, float routed_scale, cudaStream_t st); -template -void invokeMoeGather( - T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st); - -template -inline void -dispatchMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) -{ - const auto invoke = [&](auto type) { - using V = decltype(type); - invokeMoeGather((V*)dst, (const V*)src, f2n, tokens, experts_per_token, dims, st); - }; - - if constexpr (sizeof(T) == 2) { - invoke(uint16_t{}); - } - else { /// TODO: dispatch for more types - static_assert(sizeof(T) != sizeof(T), "Not implemented"); - } -} - -template -void invokeMoeReduce(T* dst, - const T* src, - const float* scales, - const int* en2f, - const float* dst_scales, - int tokens, - int experts_per_token, - int dims, - float dst_scale, - cudaStream_t st); +void invokeMoeDispatch(Ref out_, // + const Tensor& src, + const int* f2n, + int expert_per_token, + cudaStream_t st); + +void invokeMoeCombine(Ref out_, + const Tensor& src, + const float* scales, + const int* en2f, + const float* dst_scales, + int experts_per_token, + float dst_scale, + cudaStream_t st); void invokeMoeSoftmaxMaskTopKGroups( float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); diff --git a/src/turbomind/kernels/gemm/test/reference.cu b/src/turbomind/kernels/gemm/test/reference.cu index d1f7f34f64..ab7a1951bd 100644 --- a/src/turbomind/kernels/gemm/test/reference.cu +++ b/src/turbomind/kernels/gemm/test/reference.cu @@ -25,9 +25,9 @@ MatrixLayout transpose(MatrixLayout x) cudaDataType to_cuda_dtype(DataType dtype) { switch (dtype) { - case DataType::F16: + case DataType::kFloat16: return CUDA_R_16F; - case DataType::BF16: + case DataType::kBfloat16: return CUDA_R_16BF; default: CHECK("unsupported data type" && 0); diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 4747644f9a..c296ae95c1 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -2,6 +2,20 @@ #pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "src/turbomind/core/core.h" + #include "src/turbomind/kernels/core/array.h" #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/core/math.h" @@ -16,16 +30,6 @@ #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace turbomind::gemm { @@ -110,9 +114,9 @@ class Testbed { b_.resize(n * k * E); c_.resize(m * n); - a_desc_ = MatrixLayout{get_data_type_v, order_a, m, k, mk2cs(m, k).x, 0}; 
- b_desc_ = MatrixLayout{get_data_type_v, order_b, k, n, _kn2cs(k, n).x, 0}; - c_desc_ = MatrixLayout{get_data_type_v, order_c, m, n, mk2cs(m, n).x, 0}; + a_desc_ = MatrixLayout{data_type_v, order_a, m, k, mk2cs(m, k).x, 0}; + b_desc_ = MatrixLayout{data_type_v, order_b, k, n, _kn2cs(k, n).x, 0}; + c_desc_ = MatrixLayout{data_type_v, order_c, m, n, mk2cs(m, n).x, 0}; c_f_.resize(c_.size()); c_ref_.resize(c_.size()); @@ -151,7 +155,7 @@ class Testbed { if constexpr (is_quant_a) { static_assert(pack_a && pack_u); Quantize(a_, m, k, order_a, g, a_f_, a_q_, u_, stream); - u_pack_desc_ = u_desc_ = {DataType::U32, kColMajor, m, ceil_div(k, g), m}; + u_pack_desc_ = u_desc_ = {kUint32, kColMajor, m, ceil_div(k, g), m}; u_pack_desc_.pack = pack_u; u_pack_.resize(u_.size()); CHECK(!Convert(u_.data().get(), u_desc_, u_pack_.data().get(), u_pack_desc_, stream_)); @@ -172,7 +176,7 @@ class Testbed { Quantize(b_, n * E, k, _order_b, g, b_f_, b_q_, v_, stream); quant_b_ = {QuantType::kDefault, g}; - v_pack_desc_ = v_desc_ = {DataType::U32, kRowMajor, ceil_div(k, g), n, int(n * E)}; + v_pack_desc_ = v_desc_ = {kUint32, kRowMajor, ceil_div(k, g), n, int(n * E)}; v_pack_desc_.pack = pack_v; v_pack_.resize(v_.size()); auto v_src_data = (uint32_t*)v_.data().get(); @@ -194,7 +198,7 @@ class Testbed { } if constexpr (pack_a) { - a_pack_desc_.type = get_data_type_v; + a_pack_desc_.type = data_type_v; a_pack_desc_.pack = pack_a; const auto a_data = is_quant_a ? (void*)a_q_.data().get() : (void*)a_.data().get(); CHECK(!Convert(a_data, a_desc_, a_pack_.data().get(), a_pack_desc_, stream_)); @@ -206,7 +210,7 @@ class Testbed { if constexpr (pack_b) { // CHECK(experts == 0); - b_pack_desc_.type = get_data_type_v; + b_pack_desc_.type = data_type_v; b_pack_desc_.pack = pack_b; // clang-format off auto b_src_data = [&] { @@ -367,8 +371,11 @@ class Testbed { c_e_ref_.resize(c_e_.size()); for (int i = 0; i < 10; ++i) { - dispatchMoeGather( - a_e_.data().get(), a_f_.data().get(), moe_f2n_.data().get(), batch_size_, top_e, input_dims_, stream_); + invokeMoeDispatch(Tensor{a_e_.data().get(), {top_e * batch_size_, input_dims_}, kDEVICE}, + Tensor{a_f_.data().get(), {batch_size_, input_dims_}, kDEVICE}, + moe_f2n_.data().get(), + top_e, + stream_); } a_pack_desc_.num = b_pack_desc_.num = c_desc_.num = experts_; @@ -510,27 +517,23 @@ class Testbed { Compare(c_.data().get(), c_ref_.data().get(), dims, dims, bsz, 0); } else { - invokeMoeReduce(c_.data().get(), - c_e_.data().get(), - moe_scales_.data().get(), - moe_en2f_.data().get(), - nullptr, - batch_size_, - expert_ids_.size() / batch_size_, - output_dims_, - 0.f, - stream_); - - invokeMoeReduce(c_ref_.data().get(), - c_e_ref_.data().get(), - moe_scales_.data().get(), - moe_en2f_.data().get(), - nullptr, - batch_size_, - expert_ids_.size() / batch_size_, - output_dims_, - 0.f, - stream_); + invokeMoeCombine(Tensor{c_.data().get(), {batch_size_, output_dims_}, kDEVICE}, + Tensor{c_e_.data().get(), {(int)expert_ids_.size(), output_dims_}, kDEVICE}, + moe_scales_.data().get(), + moe_en2f_.data().get(), + nullptr, + expert_ids_.size() / batch_size_, + 0.f, + stream_); + + invokeMoeCombine(Tensor{c_ref_.data().get(), {batch_size_, output_dims_}, kDEVICE}, + Tensor{c_e_ref_.data().get(), {(int)expert_ids_.size(), output_dims_}, kDEVICE}, + moe_scales_.data().get(), + moe_en2f_.data().get(), + nullptr, + expert_ids_.size() / batch_size_, + 0.f, + stream_); cudaDeviceSynchronize(); @@ -586,13 +589,14 @@ class Testbed { int64_t get_global_memory_reads() { if (experts_ == 0) { - return 
get_size(a_pack_desc_) + get_size(b_pack_desc_) + get_size(u_pack_desc_) + get_size(v_pack_desc_); + return byte_size(a_pack_desc_) + byte_size(b_pack_desc_) + byte_size(u_pack_desc_) + + byte_size(v_pack_desc_); } else { - size_t size = get_size(a_pack_desc_) + get_size(u_pack_desc_); + size_t size = byte_size(a_pack_desc_) + byte_size(u_pack_desc_); const int nnz = std::accumulate(moe_cnt_.begin(), moe_cnt_.end(), 0, [](auto a, auto x) { return a + (x > 0); }); - size += nnz * (get_size(b_pack_desc_) + get_size(v_pack_desc_)); + size += nnz * (byte_size(b_pack_desc_) + byte_size(v_pack_desc_)); return size; } } @@ -600,13 +604,13 @@ class Testbed { int64_t get_ref_global_memory_reads() { if (experts_ == 0) { - return get_size(a_desc_) + get_size(b_desc_); + return byte_size(a_desc_) + byte_size(b_desc_); } else { - size_t size = get_size(a_desc_); + size_t size = byte_size(a_desc_); const int nnz = std::accumulate(moe_cnt_.begin(), moe_cnt_.end(), 0, [](auto a, auto x) { return a + (x > 0); }); - size += nnz * get_size(b_desc_); + size += nnz * byte_size(b_desc_); return size; } } diff --git a/src/turbomind/kernels/gemm/types.h b/src/turbomind/kernels/gemm/types.h index 94a31e9452..00c4c87efd 100644 --- a/src/turbomind/kernels/gemm/types.h +++ b/src/turbomind/kernels/gemm/types.h @@ -98,126 +98,6 @@ enum class Epilogue : int kGatedSilu = 0x2, }; -enum class DataType : int -{ - U4, - U8, - U16, - U32, - U64, - F8_E4M3, - F8_E5M2, - F16, - F32, - BF16, - TF32, -}; - -inline const char* to_string(DataType data_type) -{ - switch (data_type) { - case DataType::U4: - return "u4"; - case DataType::U8: - return "u8"; - case DataType::F16: - return "f16"; - case DataType::F32: - return "f32"; - case DataType::BF16: - return "bf16"; - case DataType::TF32: - return "tf32"; - default: - return "unknown"; - } -} - -inline int64_t get_size(DataType type, int64_t size) -{ - if (!size) { - return 0; - } - switch (type) { - case DataType::U64: - return size * 8; - case DataType::F32: - case DataType::U32: - return size * 4; - case DataType::BF16: - case DataType::F16: - case DataType::U16: - return size * 2; - case DataType::U8: - case DataType::F8_E4M3: - case DataType::F8_E5M2: - return size; - case DataType::U4: - return size / 2; - default: - // std::cerr << to_string(type) << "\n"; - return -1; - } -} - -template -struct get_data_type { -}; - -template<> -struct get_data_type { - static constexpr auto value = DataType::F16; -}; - -#if ENABLE_BF16 -template<> -struct get_data_type { - static constexpr auto value = DataType::BF16; -}; -#endif - -template<> -struct get_data_type { - static constexpr auto value = DataType::U4; -}; - -template<> -struct get_data_type { - static constexpr auto value = DataType::U8; -}; - -template -inline constexpr auto get_data_type_v = get_data_type::value; - -template -struct get_dtype { -}; - -template<> -struct get_dtype { - using type = half; -}; - -template<> -struct get_dtype { - using type = uint4_t; -}; - -template<> -struct get_dtype { - using type = uint8_t; -}; - -template<> -struct get_dtype { - using type = uint16_t; -}; - -template<> -struct get_dtype { - using type = uint32_t; -}; - struct QuantDesc { QuantType type; int group_size; @@ -273,9 +153,9 @@ struct MatrixLayout { int* idxs; }; -inline int64_t get_size(const MatrixLayout& m) +inline int64_t byte_size(const MatrixLayout& m) { - return get_size(m.type, (int64_t)m.rows * m.cols); + return byte_size(m.type, (int64_t)m.rows * m.cols); } inline Striding get_mode(const MatrixLayout& m) diff --git 
a/src/turbomind/kernels/gpt_kernels.cu b/src/turbomind/kernels/gpt_kernels.cu index 1d22d21b15..ed465bf078 100644 --- a/src/turbomind/kernels/gpt_kernels.cu +++ b/src/turbomind/kernels/gpt_kernels.cu @@ -14,194 +14,66 @@ * limitations under the License. */ -#include "src/turbomind/utils/cuda_fp8_utils.h" -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#elif (CUDART_VERSION >= 11000) #include -#else -#include "3rdparty/cub/cub.cuh" -#endif + +#include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/utils/memory_utils.h" namespace turbomind { -// PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts -template -__global__ void start_id_embedding_position_lookups_kernel(T* from_tensor, - int* output_ids, - const T* embedding_table, - const T* pos_table, - pPromptTuningParam prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int64_t hidden_units) +template +__global__ void +embeddingLookupKernel(T* dst, int dst_stride, const T* src, int src_stride, const int* ids, int num, int dim) { - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units; - index += blockDim.x * gridDim.x) { - // transpose the input_ids [batch, length] (part of [batch, max_length]) to output_ids [length, batch] - if (OUTPUT_ID && index < batch_size * max_length) { - // for p/prompt_tuning (have prompt templates like [input1, prompt1, input2, prompt2]) - // we have to process it to like [input1, input2, prompt1, prompt2], and then remove the prompts during post - // processing - if (PROMPT_SRC > 0) { - if (index < batch_size) { - int no_prompt_output_seq_id = 0; -#pragma unroll 1 - for (int seq_id = 0; seq_id < max_length; seq_id++) { - int current_input_id = input_ids[index * max_length + seq_id]; - if (current_input_id < prompt_param.p_prompt_tuning_id_start) { - output_ids[no_prompt_output_seq_id * batch_size + index] = current_input_id; - no_prompt_output_seq_id++; - } - } - } - } - else { - const int seq_id = index % max_length; - const int batch_id = index / max_length; - if (seq_id < length) { - output_ids[seq_id * batch_size + batch_id] = input_ids[index]; - } - } - } + const int ti = blockIdx.x; - // embedding lookup from word ids [batch, length] (part of [batch, max_length]) and [vocab, hidden] to generate - // embedding [batch, length, hidden] - const int word_index = index / hidden_units; - const int word_index_row = word_index / length; // batch_id - const int word_index_col = word_index % length; - const int real_word_index = word_index_row * max_length + word_index_col; - const int step = start_step + word_index % length; - const int col_index = index % hidden_units; - const int input_id = input_ids == nullptr ? 
real_word_index : input_ids[real_word_index]; - const int prompt_id = input_id - prompt_param.p_prompt_tuning_id_start; - T embedding = (T)0.0f; - if (PROMPT_SRC > 0 && prompt_id >= 0) { - if (PROMPT_SRC == 1) { - // from loaded prompt embedding tables - embedding = - prompt_param.p_prompt_tuning_batch_weights[word_index_row][prompt_id * hidden_units + col_index]; - } - else { - // from request prompt embedding - embedding = - prompt_param - .request_prompt_embedding[word_index_row * prompt_param.request_prompt_max_length * hidden_units - + prompt_id * hidden_units + col_index]; - } - } - else { - embedding = embedding_table[input_id * hidden_units + col_index]; - } - T pos_embed = pos_table == nullptr ? (T)0.f : pos_table[(step - 1) * hidden_units + col_index]; - from_tensor[index] = embedding + pos_embed; + const int64_t idx = ids[ti]; + + src += idx * src_stride; + dst += ti * dst_stride; + + for (int di = threadIdx.x * vec_size; di < dim; di += blockDim.x * vec_size) { + Array vec; + Ldg(vec, &src[di]); + Store(&dst[di], vec); } } -#define WORD_POS_EMBEDDING_LOOPUP_KERNEL(OUTPUT_ID, PROMPT_SRC) \ - start_id_embedding_position_lookups_kernel<<>>(from_tensor, \ - output_ids, \ - embedding_table, \ - pos_table, \ - prompt_param, \ - input_ids, \ - start_step, \ - length, \ - max_length, \ - batch_size, \ - hidden_units); - -template -void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, - int* output_ids, - const T* embedding_table, // can also be inputs_embeds - const T* pos_table, - pPromptTuningParam prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream) +void invokeEmbeddingLookup(Ref out_, + const Buffer_& token_ids, + const Tensor& embedding_table, + cudaStream_t st) { - dim3 grid(min(batch_size * length, 65536)); - dim3 block(min(hidden_units, 512)); - const bool has_output_ids = output_ids != nullptr; - FT_CHECK(!(has_output_ids && input_ids == nullptr)); - - if (has_output_ids) { - if (prompt_param.use_request_p_prompt_embedding) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(true, 2); - } - else if (prompt_param.p_prompt_tuning_batch_weights != nullptr) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(true, 1); - } - else { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(true, 0); - } - } - else { - if (prompt_param.use_request_p_prompt_embedding) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(false, 2); - } - else if (prompt_param.p_prompt_tuning_batch_weights != nullptr) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(false, 1); - } - else { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(false, 0); - } + auto& out = out_.get(); + + TM_CHECK_EQ(out.shape(0), token_ids.size()); + TM_CHECK_EQ(out.shape(1), embedding_table.shape(1)); + + int num, dim; + std::tie(num, dim) = out.shapes(0, 1); + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int vec_size = sizeof(uint4) / sizeof(T); + TM_CHECK(dim % vec_size == 0) << dim << " " << vec_size; + const int threads = std::min(dim / vec_size, 1024); + const int blocks = num; + embeddingLookupKernel<<>>((T*)out.raw_data(), + out.stride(0), + (const T*)embedding_table.raw_data(), + embedding_table.stride(0), + token_ids.data(), + num, + dim); + }; + + if (byte_size(out.dtype()) == byte_size()) { + return invoke(uint16_t{}); } + TM_CHECK(0) << "not implemented"; } -#ifdef ENABLE_FP32 -template void invokeInputIdsEmbeddingLookupPosEncoding(float* from_tensor, - int* output_ids, - const float* embedding_table, - const float* pos_table, - pPromptTuningParam 
prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream); -#endif - -template void invokeInputIdsEmbeddingLookupPosEncoding(half* from_tensor, - int* output_ids, - const half* embedding_table, - const half* pos_table, - pPromptTuningParam prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream); - -#ifdef ENABLE_BF16 -template void invokeInputIdsEmbeddingLookupPosEncoding(__nv_bfloat16* from_tensor, - int* output_ids, - const __nv_bfloat16* embedding_table, - const __nv_bfloat16* pos_table, - pPromptTuningParam<__nv_bfloat16> prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream); -#endif - // TODO Add half2 implementation template __global__ void transposeAxis01(T* out, T* in, const int dim0, const int dim1, const int dim2) diff --git a/src/turbomind/kernels/gpt_kernels.h b/src/turbomind/kernels/gpt_kernels.h index a351473332..f2ce314ba0 100644 --- a/src/turbomind/kernels/gpt_kernels.h +++ b/src/turbomind/kernels/gpt_kernels.h @@ -20,7 +20,7 @@ #include #include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/core.h" #include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -130,20 +130,6 @@ void invokeFindContextDups(int* shared_contexts, const size_t input_seq_len, cudaStream_t stream = 0); -template -void handleOptArg(TensorMap* input_tensors, const std::string& arg_name, T* d_ptr, T default_value, size_t size) -{ - if (input_tensors->isExist(arg_name)) { - FT_CHECK(input_tensors->at(arg_name).size() == size); - cudaH2Dcpy(d_ptr, input_tensors->at(arg_name).getPtr(), size); - } - else { - deviceFill(d_ptr, size, default_value); - } -} - -void setSeqLimitLen(uint32_t* seq_len_d, Tensor seq_len, int limit_len_offset, int batch_size); - template void invokeCompactInputs(T* compact_input, T* compact_attention_mask, @@ -253,4 +239,9 @@ void invokeTranspose2D(T* dst, const T* src, int rows, int cols, cudaStream_t st } } +void invokeEmbeddingLookup(Ref out_, + const Buffer_& token_ids, + const Tensor& embedding_table, + cudaStream_t st); + } // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu index 428725b62d..ee826c4105 100644 --- a/src/turbomind/kernels/norm/rms_norm.cu +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -4,26 +4,28 @@ #include "cub/block/block_reduce.cuh" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/core/meta.h" #include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { +namespace kernel { + template -__global__ void RMSNormKernel(T* dst, - int dst_ld, - const T* src, - int src_ld, - const T* __restrict__ weights, - int dims, - int num, - float eps, - float inv_dims) +__global__ void RMSNorm(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) { const int ti = blockIdx.x; const int di = threadIdx.x * vec_size; @@ -80,60 +82,54 @@ __global__ void RMSNormKernel(T* dst, } } -template -void invokeRMSNorm( - T* dst, 
int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +} // namespace kernel + +void invokeRMSNorm(Tensor& out, const Tensor& x, const Tensor& w, float eps, cudaStream_t st) { - if (num == 0) { + TM_CHECK(x.ndim() == 2); + TM_CHECK(out.shape() == x.shape()); + TM_CHECK(out.dtype() == x.dtype()); + TM_CHECK(w.dtype() == x.dtype() && w.shape(-1) == x.shape(-1)); + + if (x.size() == 0) { return; } - constexpr int vec_size = 16 / sizeof(T); + auto invoke = [&](auto t) { + using T = decltype(t); + + const auto [num, dim] = x.shapes(0, 1); + + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; + const int blocks = num; - constexpr int threads = 512; - const int blocks = num; - - RMSNormKernel<<>>(dst, // - dst_ld, - src, - src_ld, - weights, - dims, - num, - eps, - 1.f / dims); + kernel::RMSNorm<<>>((T*)out.raw_data(), // + out.stride(0), + (const T*)x.raw_data(), + x.stride(0), + (const T*)w.raw_data(), + dim, + num, + eps, + 1.f / dim); + }; + + TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke); } -template void invokeRMSNorm(half* dst, - int dst_ld, - const half* src, - int src_ld, - const half* weights, - int dims, - int num, - float eps, - cudaStream_t st); -#if ENABLE_BF16 -template void invokeRMSNorm(nv_bfloat16* dst, - int dst_ld, - const nv_bfloat16* src, - int src_ld, - const nv_bfloat16* weights, - int dims, - int num, - float eps, - cudaStream_t st); -#endif +namespace kernel { template -__global__ void QkRMSNormKernel(T* data, // - int ld, - const T* weight, - int dim, - int n, - int token_num, - float eps, - float inv_dim) +__global__ void RMSNormQK(T* data, // + int ld, + const T* weight, + int dim, + int n, + int token_num, + float eps, + float inv_dim) { static_assert((max_dim & (max_dim - 1)) == 0); @@ -183,6 +179,8 @@ __global__ void QkRMSNormKernel(T* data, // } } +} // namespace kernel + void invokeQkRMSNorm(void* data, int ld, const void* weight, @@ -193,12 +191,16 @@ void invokeQkRMSNorm(void* data, float eps, cudaStream_t stream) { - auto invoke = [&](auto t, auto max_dim_t) { + + constexpr constant<128> max_dim{}; + TM_CHECK_LE(head_dim, max_dim); + + auto invoke = [&](auto t) { using T = decltype(t); - constexpr int vec_size = sizeof(uint4) / sizeof(T); - constexpr int max_dim = max_dim_t.value; - constexpr int thr_per_qk = max_dim / vec_size; + constexpr int vec_size = sizeof(uint4) / sizeof(T); + // Captured constexpr may not be constant to MSVC + constexpr int thr_per_qk = max_dim.value / vec_size; FT_CHECK(head_dim % vec_size == 0); @@ -206,21 +208,45 @@ void invokeQkRMSNorm(void* data, const int block_dim = 512; const int grid_dim = cdiv(threads, block_dim); - QkRMSNormKernel<<>>( + kernel::RMSNormQK<<>>( (T*)data, ld, (const T*)weight, head_dim, n, token_num, eps, 1.f / head_dim); }; + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); +} + +void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st) +{ + TM_CHECK(x.ndim() == 3); + + int token_num, head_num, head_dim; + std::tie(token_num, head_num, head_dim) = x.shapes(0, 1, 2); + + TM_CHECK(x.stride(1) == head_dim); + + auto data = x.raw_data(); + auto stride = x.stride(0); + constexpr constant<128> max_dim{}; - FT_CHECK(head_dim <= max_dim); - - switch (dtype) { - case TYPE_FP16: - return invoke(half{}, max_dim); - case TYPE_BF16: - return invoke(nv_bfloat16{}, max_dim); - default: - throw std::runtime_error("not implemented"); - } + TM_CHECK_LE(head_dim, max_dim); + + auto invoke = [&](auto t) { + using T = decltype(t); + + constexpr 
int vec_size = sizeof(uint4) / sizeof(T); + constexpr int thr_per_qk = max_dim.value / vec_size; + + TM_CHECK(head_dim % vec_size == 0); + + const int threads = token_num * head_num * thr_per_qk; + const int block_dim = 512; + const int grid_dim = cdiv(threads, block_dim); + + kernel::RMSNormQK<<>>( + (T*)data, stride, (const T*)w.raw_data(), head_dim, head_num, token_num, eps, 1.f / head_dim); + }; + + TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke); } // r' <- r + (h + b) @@ -368,14 +394,8 @@ void invokeResidualBiasRMSNorm(void* hidden_states, eps, 1.f / dims); }; - switch (dtype) { - case DataType::TYPE_FP16: - return invoke(half{}); - case DataType::TYPE_BF16: - return invoke(nv_bfloat16{}); - default: - FT_CHECK(0); - } + + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); } } // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h index 562be1aea6..4027d83260 100644 --- a/src/turbomind/kernels/norm/rms_norm.h +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -2,29 +2,13 @@ #include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/core.h" namespace turbomind { -template -void invokeRMSNorm( - T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); +void invokeRMSNorm(Tensor& out, const Tensor& x, const Tensor& w, float eps, cudaStream_t st); -template -void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) -{ - invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); -} - -void invokeQkRMSNorm(void* data, - int ld, - const void* weight, - DataType dtype, - int head_dim, - int n, - int token_num, - float eps, - cudaStream_t stream); +void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st); template void invokeBiasResidualRMSNorm( diff --git a/src/turbomind/kernels/sampling_topk_kernels.cu b/src/turbomind/kernels/sampling_topk_kernels.cu index d52d112765..a3834ebce3 100644 --- a/src/turbomind/kernels/sampling_topk_kernels.cu +++ b/src/turbomind/kernels/sampling_topk_kernels.cu @@ -55,14 +55,15 @@ __global__ void curandBatchInitialize(curandState_t* states, const int size, con } } -void invokeCurandBatchInitialize(curandState_t* states, - const size_t batch_size, - const unsigned long long* random_seeds, - cudaStream_t stream) +void invokeCurandBatchInitialize(curandState_t* states, + const size_t batch_size, + const uint64_t* random_seeds, + cudaStream_t stream) { dim3 block(256); dim3 grid((int)(ceil(batch_size * 1.0 / 256))); - curandBatchInitialize<<>>(states, batch_size, random_seeds); + static_assert(sizeof(uint64_t) == sizeof(unsigned long long)); + curandBatchInitialize<<>>(states, batch_size, (unsigned long long*)random_seeds); } template diff --git a/src/turbomind/kernels/sampling_topk_kernels.h b/src/turbomind/kernels/sampling_topk_kernels.h index cb357bc1c9..c0c60b4f82 100644 --- a/src/turbomind/kernels/sampling_topk_kernels.h +++ b/src/turbomind/kernels/sampling_topk_kernels.h @@ -48,10 +48,10 @@ void invokeCurandInitialize(curandState_t* state, unsigned long long random_seed, cudaStream_t stream); -void invokeCurandBatchInitialize(curandState_t* states, - const size_t batch_size, - const unsigned long long* random_seeds, - cudaStream_t stream); +void invokeCurandBatchInitialize(curandState_t* states, + const size_t batch_size, + const uint64_t* random_seeds, + cudaStream_t stream); struct TopKSortFilterParams { void* workspace; diff --git 
a/src/turbomind/kernels/stop_criteria_kernels.cu b/src/turbomind/kernels/stop_criteria_kernels.cu index 06452535b4..b31dd9216d 100644 --- a/src/turbomind/kernels/stop_criteria_kernels.cu +++ b/src/turbomind/kernels/stop_criteria_kernels.cu @@ -104,58 +104,32 @@ void invokeStopWordsCriterion(const int* output_ids, sync_check_cuda_error(); } -__global__ void length_criterion(bool* finished, - bool* should_stop, - int* finished_sum, - const uint32_t* sequence_limit_length, - int batch_size, - int beam_width, - int step) +__global__ void length_criterion(bool* finished, // + const int* sequence_limit_length, + int batch_size, + int beam_width, + int step) { - int thread_finished_count = 0; for (int index = threadIdx.x; index < batch_size * beam_width; index += blockDim.x) { const int batch_idx = index / beam_width; - finished[index] |= step >= sequence_limit_length[batch_idx]; - thread_finished_count += finished[index] ? 1 : 0; - } - int block_finished_count = 0; - if (blockDim.x <= 32) { - block_finished_count = warpReduceSum(thread_finished_count); - } - else { - block_finished_count = blockReduceSum(thread_finished_count); - } - __syncthreads(); - - if (threadIdx.x == 0 && should_stop) { - finished_sum[0] = block_finished_count; } } -void invokeLengthCriterion(bool* finished, - bool* should_stop, - int* h_pinned_finished_sum_, - const uint32_t* sequence_limit_length, - int batch_size, - int beam_width, - int step, - cudaStream_t stream) +void invokeLengthCriterion(bool* finished, // + const int* sequence_limit_length, + int batch_size, + int beam_width, + int step, + cudaStream_t stream) { // Check if we have attained the sequence length limit. If so, stop the sequence. // In addition, check if all sequences are stopped and return the result in should_stop TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); dim3 block(std::min(512, batch_size * beam_width)); dim3 grid{1}; - h_pinned_finished_sum_[0] = -1; - - length_criterion<<>>( - finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step); - if (should_stop) { - check_cuda_error(cudaStreamSynchronize(stream)); - *should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width; - } + length_criterion<<>>(finished, sequence_limit_length, batch_size, beam_width, step); } } // namespace turbomind diff --git a/src/turbomind/kernels/stop_criteria_kernels.h b/src/turbomind/kernels/stop_criteria_kernels.h index e403c947cb..2a83fbb6fd 100644 --- a/src/turbomind/kernels/stop_criteria_kernels.h +++ b/src/turbomind/kernels/stop_criteria_kernels.h @@ -30,13 +30,11 @@ void invokeStopWordsCriterion(const int* output_ids, int step, cudaStream_t stream); -void invokeLengthCriterion(bool* finished, - bool* should_stop, - int* finished_sum, - const uint32_t* sequence_limit_length, - int batch_size, - int beam_width, - int step, - cudaStream_t stream); +void invokeLengthCriterion(bool* finished, // + const int* sequence_limit_length, + int batch_size, + int beam_width, + int step, + cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/kernels/unfused_attention_kernels.cu b/src/turbomind/kernels/unfused_attention_kernels.cu index 7f733a6dfc..a5c36dd148 100644 --- a/src/turbomind/kernels/unfused_attention_kernels.cu +++ b/src/turbomind/kernels/unfused_attention_kernels.cu @@ -531,6 +531,45 @@ void invokeMaskedSoftmax(MaskedSoftmaxParam<__nv_bfloat16, __nv_bfloat16>& param #undef LAUNCH_MAKSED_SOFTMAX #undef LAUNCH_MAKSED_SOFTMAX_ +// clang-format off +template struct packed_type; +template <> struct 
packed_type<float> { using type = float; }; // we don't need to pack float by default +template <> struct packed_type<half> { using type = half2; }; + +#ifdef ENABLE_BF16 +template<> +struct packed_type<__nv_bfloat16> { + using type = __nv_bfloat162; +}; +#endif + +template <typename T> struct num_elems; +template <> struct num_elems<float> { static constexpr int value = 1; }; +template <> struct num_elems<float2> { static constexpr int value = 2; }; +template <> struct num_elems<float4> { static constexpr int value = 4; }; +template <> struct num_elems<half> { static constexpr int value = 1; }; +template <> struct num_elems<half2> { static constexpr int value = 2; }; +#ifdef ENABLE_BF16 +template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; +template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; +#endif + +template <typename T, int num> struct packed_as; +template <typename T> struct packed_as<T, 1> { using type = T; }; +template<> struct packed_as<half, 2> { using type = half2; }; +template<> struct packed_as<float, 2> { using type = float2; }; +template<> struct packed_as<int8_t, 2> { using type = int16_t; }; +template<> struct packed_as<int32_t, 2> { using type = int2; }; +template<> struct packed_as<half2, 1> { using type = half; }; +#ifdef ENABLE_BF16 +template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; +template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; +#endif + +inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } +inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } +// clang-format on + template<typename T> __global__ void transpose_remove_padding(const T* src, T* dst, diff --git a/src/turbomind/kernels/unfused_attention_kernels.h b/src/turbomind/kernels/unfused_attention_kernels.h index 758fe7fba0..7df6a421e5 100644 --- a/src/turbomind/kernels/unfused_attention_kernels.h +++ b/src/turbomind/kernels/unfused_attention_kernels.h @@ -15,8 +15,6 @@ */ #pragma once -#include "src/turbomind/utils/Tensor.h" - namespace turbomind { template<typename T> @@ -142,7 +140,4 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf, const float qk_scale, cudaStream_t stream); -template<typename T> -void invokeTransposeAttentions(Tensor& attentions_out, const Tensor& attentions_in, cudaStream_t stream = 0); - } // namespace turbomind diff --git a/src/turbomind/layers/BaseDynamicDecodeLayer.h b/src/turbomind/layers/BaseDynamicDecodeLayer.h new file mode 100644 index 0000000000..a3e14407ff --- /dev/null +++ b/src/turbomind/layers/BaseDynamicDecodeLayer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include + +#include "src/turbomind/core/core.h" +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +class BaseDynamicDecodeLayer { +public: + struct BaseParam { + int max_batch_size; + int vocab_size; + int vocab_size_padded; + cudaStream_t stream; + const cudaDeviceProp* device_prop; + }; + + virtual ~BaseDynamicDecodeLayer() = default; + + explicit BaseDynamicDecodeLayer(const BaseParam& param) + { + max_batch_size_ = param.max_batch_size; + vocab_size_ = param.vocab_size; + vocab_size_padded_ = param.vocab_size_padded; + stream_ = param.stream; + device_prop_ = param.device_prop; + }; + + virtual void Setup(const std::vector& rs, const TensorMap& args) = 0; + + virtual void Forward(TensorMap& args) = 0; + +protected: + int max_batch_size_; + int vocab_size_; + int vocab_size_padded_; + cudaStream_t stream_; + const cudaDeviceProp* device_prop_; +}; + +} // namespace turbomind diff --git a/src/turbomind/layers/BaseLayer.h b/src/turbomind/layers/BaseLayer.h deleted file mode 100644 index fcb0ef37cc..0000000000 --- a/src/turbomind/layers/BaseLayer.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" - -namespace turbomind { - -class BaseLayer { -public: - BaseLayer(cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - bool sparse = false): - stream_(stream), - cublas_wrapper_(cublas_wrapper), - allocator_(allocator), - cuda_device_prop_(cuda_device_prop), - is_free_buffer_after_forward_(is_free_buffer_after_forward), - sparse_(sparse){}; - virtual ~BaseLayer() = default; - - virtual cudaStream_t getStream() - { - return stream_; - } - - virtual void setStream(cudaStream_t stream) - { - stream_ = stream; - } - -protected: - virtual void allocateBuffer() = 0; - virtual void freeBuffer() = 0; - - // device environments - cudaStream_t stream_; - cublasMMWrapper* cublas_wrapper_; - IAllocator* allocator_; - cudaDeviceProp* cuda_device_prop_ = nullptr; - - bool is_free_buffer_after_forward_; - bool is_allocate_buffer_ = false; // TODO (bhsueh) to be deprecated - bool sparse_; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/CMakeLists.txt b/src/turbomind/layers/CMakeLists.txt index ae308d0fd8..975ee77ec7 100644 --- a/src/turbomind/layers/CMakeLists.txt +++ b/src/turbomind/layers/CMakeLists.txt @@ -22,4 +22,4 @@ set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart LogitsProcessorLayer SamplingLayer StopCriteriaLayer - gpt_kernels tensor nvtx_utils) + gpt_kernels nvtx_utils) diff --git a/src/turbomind/layers/DenseWeight.h b/src/turbomind/layers/DenseWeight.h deleted file mode 100644 index ba27764d38..0000000000 --- a/src/turbomind/layers/DenseWeight.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "stdlib.h" -#include - -namespace turbomind { - -// Note that the int8 mode of BERT and GPT are different. -// For int8 mode = 2 on GPT: -// scale (gemm input scale): quantize input of GEMM (float/half) in the int8 range. Namely, int8_x = scale * x -// scale_inter: (gemm output scale) / (gemm input scale * gemm weight scale) -// scale_out: 1 / (gemm output scale), dequantize activation from int8 range to float/half. 
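// [Editorial sketch, not part of the original DenseWeight.h] A minimal worked example of
// how the three scales documented above compose for int8_mode == 2. Only the formulas for
// scale_inter and scale_out come from the comment; the concrete values, names and the
// helper function are made-up assumptions for illustration. Assumes <cmath> and <cstdint>.
inline float int8_mode2_scale_example()
{
    const float x = 0.5f, w = -1.25f;       // fp activation and weight
    const float scale_a = 127.f / 4.f;      // gemm input scale  (assumed amax = 4)
    const float scale_w = 127.f / 2.f;      // gemm weight scale (assumed amax = 2)
    const float scale_o = 127.f / 8.f;      // gemm output scale (assumed amax = 8)

    const int8_t  qx  = (int8_t)std::roundf(scale_a * x);  // int8_x = scale * x
    const int8_t  qw  = (int8_t)std::roundf(scale_w * w);
    const int32_t acc = (int32_t)qx * (int32_t)qw;          // int8 GEMM accumulates in int32

    const float scale_inter = scale_o / (scale_a * scale_w);  // (out scale) / (in scale * weight scale)
    const float scale_out   = 1.f / scale_o;                   // 1 / (gemm output scale)

    const int8_t qy = (int8_t)std::roundf(acc * scale_inter);  // requantized into the int8 range
    return qy * scale_out;                                      // dequantized, approximately x * w = -0.625
}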
-template -struct DenseWeight { - const T1* kernel = nullptr; - const T2* bias = nullptr; - const T1* fp8_bias = nullptr; - const T1* sp_kernel = nullptr; - // for int8 kernel - const int8_t* int8_kernel = nullptr; - const float* scale = nullptr; - const T2* weight_only_quant_scale = nullptr; - const T2* moe_scale = nullptr; - const float* scale_inter = nullptr; - const float* scale_out = nullptr; - - // FP8 scales - // scale = AMAX(tensor) / FP8_MAX - // During GEMM, A (original) = A_scaled (fp8) * "scale of A" - const float* input_scale = nullptr; // a scalar - const float* input_scale_inv = nullptr; // a scalar - const float* weight_scale = nullptr; // a scalar or a vector - const float* weight_scale_inv = nullptr; // a scalar or a vector - const float* output_scale = nullptr; // a scalar - const float* output_scale_inv = nullptr; // a scalar - // host pointer of scales, all are scalars - const float* input_h_scale = nullptr; - const float* input_h_scale_inv = nullptr; - const float* weight_h_scale = nullptr; - const float* weight_h_scale_inv = nullptr; - const float* output_h_scale = nullptr; - const float* output_h_scale_inv = nullptr; - - // TODO(bhsueh) check do we need this param - const float* per_channel_scale_min = - nullptr; // = min(weight_scale), used to adjust the scaling of per channel scaling - - bool fuse_gemm_bias = false; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/DynamicDecodeBaseLayer.h b/src/turbomind/layers/DynamicDecodeBaseLayer.h deleted file mode 100644 index 132197269a..0000000000 --- a/src/turbomind/layers/DynamicDecodeBaseLayer.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include "src/turbomind/layers/BaseLayer.h" - -namespace turbomind { - -struct DynamicDecodeCommonArgs { - size_t vocab_size; - size_t vocab_size_padded; -}; - -class DynamicDecodeBaseLayer: public BaseLayer { -protected: - DynamicDecodeCommonArgs args_; - - virtual void allocateBuffer() = 0; - virtual void freeBuffer() = 0; - -public: - DynamicDecodeBaseLayer(cudaStream_t stream, - IAllocator* allocator, - bool is_free_buffer_after_forward, - DynamicDecodeCommonArgs args): - BaseLayer(stream, nullptr, allocator, is_free_buffer_after_forward, nullptr), args_(args){}; - ~DynamicDecodeBaseLayer() = default; - - virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0; - - virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/DynamicDecodeLayer.cc b/src/turbomind/layers/DynamicDecodeLayer.cc index 7d1f1b5ed3..748c0e7184 100644 --- a/src/turbomind/layers/DynamicDecodeLayer.cc +++ b/src/turbomind/layers/DynamicDecodeLayer.cc @@ -15,103 +15,44 @@ */ #include "src/turbomind/layers/DynamicDecodeLayer.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" #include "src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h" #include "src/turbomind/layers/sampling_layers/SamplingLayer.h" #include "src/turbomind/layers/sampling_layers/StopCriteriaLayer.h" #include "src/turbomind/macro.h" -#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template -void DynamicDecodeLayer::allocateBuffer() -{ -} - -template -void DynamicDecodeLayer::freeBuffer() -{ -} - -template -void DynamicDecodeLayer::initialize() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - DynamicDecodeCommonArgs args{vocab_size_, vocab_size_padded_}; - layers_.emplace_back(new LogitsProcessorLayer(stream_, allocator_, is_free_buffer_after_forward_, args)); - layers_.emplace_back(new SamplingLayer(stream_, allocator_, is_free_buffer_after_forward_, args)); - layers_.emplace_back(new StopCriteriaLayer(stream_, allocator_, is_free_buffer_after_forward_, args)); -} - -template -DynamicDecodeLayer::DynamicDecodeLayer(size_t vocab_size, - size_t vocab_size_padded, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size_padded), - cuda_device_prop_(cuda_device_prop) +DynamicDecodeLayer::DynamicDecodeLayer(DataType dtype, + int max_batch_size, + int vocab_size, + int vocab_size_padded, + cudaStream_t stream, + const cudaDeviceProp* device_prop) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - initialize(); -} - -template -DynamicDecodeLayer::~DynamicDecodeLayer() -{ + auto dispatch = [&](auto t) { + using T = decltype(t); + BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop}; + layers_.emplace_back(new LogitsProcessorLayer{param}); + layers_.emplace_back(new SamplingLayer{param}); + layers_.emplace_back(new StopCriteriaLayer{param}); + }; + TM_DISPATCH_PRIMARY_DTYPES(dtype, dispatch); } -template -DynamicDecodeLayer::DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer): - BaseLayer(dynamic_decode_layer), - vocab_size_(dynamic_decode_layer.vocab_size_), - vocab_size_padded_(dynamic_decode_layer.vocab_size_padded_), - 
cuda_device_prop_(dynamic_decode_layer.cuda_device_prop_) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - initialize(); -} +DynamicDecodeLayer::~DynamicDecodeLayer() {} -template -void DynamicDecodeLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) +void DynamicDecodeLayer::Setup(const std::vector& rs, const TensorMap& args) { - /** - * @brief Set up the dynamic decode layer for given input runtime arguments. - * - * runtime_args: - * \param runtime_top_k [batch_size] on cpu, optional. - * \param runtime_top_p [batch_size] on cpu, optional - * \param temperature [batch_size] on cpu, optional - * \param repetition_penalty [batch_size] on cpu, optional - * \param min_length [batch_size], optional - * \param context_length [batch_size], optional - * \param prompt_length [batch_size], optional - */ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - FT_CHECK_WITH_INFO(beam_width == 1, "only support beam_width=1"); for (const auto& layer : layers_) { - layer->setup(batch_size, beam_width, runtime_args); + layer->Setup(rs, args); } } -template -void DynamicDecodeLayer::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - TensorMap input_map(*input_tensors); - TensorMap output_map(*output_tensors); - forward(&output_map, &input_map); -} - -template -void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void DynamicDecodeLayer::Forward(TensorMap& args) { /** * @brief @@ -140,25 +81,9 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ * \param sampled_nums [batch_size, 1], optional */ - const int ite = (int)input_tensors->at("ite").getVal(); - const size_t batch_size = input_tensors->at("logits").shape[0]; - const size_t local_batch_size = (size_t)input_tensors->at("local_batch_size").getVal(); - - FT_CHECK(ite == 0); - FT_CHECK(local_batch_size == batch_size); - FT_CHECK(input_tensors->at("logits").shape.size() == 3); - for (const auto& layer : layers_) { - layer->forward(output_tensors, input_tensors); + layer->Forward(args); } } -#ifdef ENABLE_FP32 -template class DynamicDecodeLayer; -#endif -template class DynamicDecodeLayer; -#ifdef ENABLE_BF16 -template class DynamicDecodeLayer<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/layers/DynamicDecodeLayer.h b/src/turbomind/layers/DynamicDecodeLayer.h index 152a5c30a5..c527ff8e0f 100644 --- a/src/turbomind/layers/DynamicDecodeLayer.h +++ b/src/turbomind/layers/DynamicDecodeLayer.h @@ -16,43 +16,33 @@ #pragma once -#include -#include +#include +#include -#include "src/turbomind/layers/BaseLayer.h" -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" +#include "src/turbomind/engine/request.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" -namespace turbomind { - -template -class DynamicDecodeLayer: public BaseLayer { -protected: - void allocateBuffer() override; - void freeBuffer() override; - void initialize(); - - size_t vocab_size_; - size_t vocab_size_padded_; - cudaDeviceProp* cuda_device_prop_; +#include "src/turbomind/core/tensor.h" - std::vector> layers_; +namespace turbomind { +class DynamicDecodeLayer { public: - DynamicDecodeLayer(size_t vocab_size, - size_t vocab_size_padded, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop); + DynamicDecodeLayer(DataType data_type, + int max_batch_size, + int vocab_size, + int vocab_size_padded, + 
cudaStream_t stream, + const cudaDeviceProp* device_prop); ~DynamicDecodeLayer(); - DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer); - void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args); - void forward(TensorMap* output_tensors, TensorMap* input_tensors); - void forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors); + void Setup(const std::vector& rs, const TensorMap& args); + + void Forward(TensorMap& args); + +private: + std::vector> layers_; }; } // namespace turbomind diff --git a/src/turbomind/layers/attention_layers/AttentionWeight.h b/src/turbomind/layers/attention_layers/AttentionWeight.h deleted file mode 100644 index 46d7bf3e89..0000000000 --- a/src/turbomind/layers/attention_layers/AttentionWeight.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/layers/DenseWeight.h" - -namespace turbomind { - -template -struct AttentionWeight { - DenseWeight query_weight; - DenseWeight key_weight; - DenseWeight value_weight; - DenseWeight attention_output_weight; - DenseWeight ia3_key_weight; - DenseWeight ia3_value_weight; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/attention_layers/BaseAttentionLayer.h b/src/turbomind/layers/attention_layers/BaseAttentionLayer.h deleted file mode 100644 index db9972ab65..0000000000 --- a/src/turbomind/layers/attention_layers/BaseAttentionLayer.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -// #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" -#include "src/turbomind/layers/BaseLayer.h" -#include "src/turbomind/layers/attention_layers/AttentionWeight.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "src/turbomind/utils/memory_utils.h" - -namespace turbomind { - -enum class AttentionType -{ - UNFUSED_MHA, - UNFUSED_PADDED_MHA, - FUSED_MHA, - FUSED_PADDED_MHA -}; - -/* NOTE: -1. only swin-style relative position bias is supported currently -2. 
gpt-style (causal-mask) models support any-sequence-length fmha, so we don't need to call isValidSeqLen at run-time -3. bert/vit can also support any-seq-length fmha -*/ -template -AttentionType getAttentionType(size_t size_per_head, - const int sm, - const bool remove_padding, - const int max_seq_len, - const bool is_fuse = true, - const bool with_swin_relative_position_bias = false, - const bool causal_mask = false) -{ - - if (std::is_same::value && is_fuse) { - // Bert/Vit - if (!causal_mask) { - if (!with_swin_relative_position_bias - && (((sm == kSM_70 || sm == kSM_72) && size_per_head == 64) - || ((sm == kSM_75 || sm == kSM_80 || sm == kSM_86) - && (size_per_head == 64 || size_per_head == 32)))) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - else if (with_swin_relative_position_bias && (sm == kSM_75 || sm == kSM_80 || sm == kSM_86) - && max_seq_len <= 256 && size_per_head == 32) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - } - // GPT and its variants - else { - // FMHA_ENABLE only affects gpt-style models (causal-mask) - char* fused_qkv = std::getenv("FMHA_ENABLE"); - if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") { - if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89) - && (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80 - || size_per_head == 128 || size_per_head == 144 || size_per_head == 160 - || size_per_head == 256)) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; - } - } - } - } -#ifdef ENABLE_FP8 - else if (std::is_same::value && is_fuse) { - if (!causal_mask) { - if ((sm == kSM_89 || sm == kSM_90) && max_seq_len < 512 && is_fuse && size_per_head == 64) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - else { - return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; - } - } - } -#endif - - return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; -} - -template -AttentionType getAttentionTypeINT8( - size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len, const int int8_mode) -{ - if ((int8_mode == 1 || int8_mode == 2) - && (((sm == kSM_80 || sm == kSM_86) && (size_per_head == 64 || size_per_head == 32) && max_seq_len <= 512) - || (sm == kSM_75 - && ((size_per_head == 64 && max_seq_len <= 384) || (size_per_head == 32 && max_seq_len <= 512))))) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - else { - return remove_padding ? 
AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; - } -} - -inline bool isFusedMHA(AttentionType attention_type) -{ - return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::FUSED_PADDED_MHA; -} - -inline bool isUnPaddedMHA(AttentionType attention_type) -{ - return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::UNFUSED_MHA; -} - -inline bool isPaddedMHA(AttentionType attention_type) -{ - return attention_type == AttentionType::FUSED_PADDED_MHA || attention_type == AttentionType::UNFUSED_PADDED_MHA; -} - -inline AttentionType getUnfusedAttentionType(AttentionType attention_type) -{ - if (attention_type == AttentionType::FUSED_MHA) { - return AttentionType::UNFUSED_MHA; - } - else if (attention_type == AttentionType::FUSED_PADDED_MHA) { - return AttentionType::UNFUSED_PADDED_MHA; - } - return attention_type; -} - -template -class BaseAttentionLayer: public BaseLayer { - -public: - virtual void - forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight* attention_weights) = 0; - - BaseAttentionLayer(cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool sparse = false): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) - { - } - virtual ~BaseAttentionLayer() = default; - virtual bool isValidSeqLen(const size_t seq_len) - { - return true; - } -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/attention_layers/CMakeLists.txt b/src/turbomind/layers/attention_layers/CMakeLists.txt deleted file mode 100644 index 0d1a96fef3..0000000000 --- a/src/turbomind/layers/attention_layers/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -cmake_minimum_required(VERSION 3.8) diff --git a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc index 1194ad16f1..1839284f03 100644 --- a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc +++ b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc @@ -14,140 +14,108 @@ * limitations under the License. 
*/ -#include "src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h" +#include +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/engine/request.h" #include "src/turbomind/kernels/ban_bad_words.h" +#include "src/turbomind/kernels/penalty_types.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h" -#include "src/turbomind/utils/memory_utils.h" +#include "src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h" +#include "src/turbomind/layers/sampling_layers/utils.h" namespace turbomind { #define ALL_OF(p_, sz_, dt_, v_) (std::all_of(p_, p_ + sz_, [&](dt_ b) { return b == v_; })) +namespace { + template -void init_host_buffer(TensorMap* runtime_args, const std::string& key, size_t size, T* dst, T default_value) +void init_host_buffer(const TensorMap& map, const std::string& key, size_t size, T* dst, T default_value) { - const Tensor src = runtime_args->isExist(key) ? runtime_args->at(key) : Tensor(); - const size_t src_size = src.size(); - if (src_size > size) { - TM_LOG_ERROR("runtime_args %s has invalid size %ld vs batch_size %ld", key.c_str(), src_size, size); - } - if (src_size > 0) { - std::copy_n(src.getPtr(), size, dst); + Tensor empty{}; + const Tensor& src = map.contains(key) ? map.at(key) : empty; + + if (src) { + if (size_t sz = src.size(); sz > size) { + TM_LOG_ERROR("runtime_args %s has invalid size %ld vs batch_size %ld", key.c_str(), sz, size); + } + std::copy_n(src.data(), size, dst); } else { std::fill_n(dst, size, default_value); } } -template -void LogitsProcessorLayer::allocateBuffer() -{ - FT_CHECK(false); -} - -template -void LogitsProcessorLayer::allocateBuffer(const size_t batch_size) -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - repetition_penalty_buf_ = - reinterpret_cast(allocator_->reMalloc(repetition_penalty_buf_, sizeof(float) * batch_size, false)); - min_lengths_buf_ = reinterpret_cast(allocator_->reMalloc(min_lengths_buf_, sizeof(int) * batch_size, false)); - temperature_buf_ = - reinterpret_cast(allocator_->reMalloc(temperature_buf_, sizeof(float) * batch_size, false)); - - repetition_penalty_.resize(batch_size); - min_lengths_.resize(batch_size); - context_length_.resize(batch_size); - prompt_length_.resize(batch_size); - temperature_.resize(batch_size); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} +} // namespace template -void LogitsProcessorLayer::freeBuffer() +LogitsProcessorLayer::LogitsProcessorLayer(const BaseParam& param): BaseDynamicDecodeLayer{param} { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - repetition_penalty_ = {}; - min_lengths_ = {}; - context_length_ = {}; - prompt_length_ = {}; - temperature_ = {}; - allocator_->free((void**)&repetition_penalty_workspace_); - allocator_->free((void**)&repetition_penalty_buf_); - allocator_->free((void**)&min_lengths_buf_); - allocator_->free((void**)&temperature_buf_); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + repetition_penalty_ = {max_batch_size_, kCPUpinned}; + min_lengths_ = {max_batch_size_, kCPUpinned}; + temperature_ = {max_batch_size_, kCPUpinned}; + bad_words_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kCPUpinned}; + end_ids_ = {max_batch_size_ * kMaxEndIdsSize, kCPUpinned}; + + repetition_penalty_buf_ = {max_batch_size_, kDEVICE}; + min_lengths_buf_ = {max_batch_size_, kDEVICE}; + temperature_buf_ = {max_batch_size_, kDEVICE}; + bad_words_buf_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kDEVICE}; + end_ids_buf_ = {max_batch_size_ * kMaxEndIdsSize, kDEVICE}; } template 
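// ---- Illustrative sketch (not part of the patch) ----------------------------
// init_host_buffer() above implements "copy the per-request value if the
// caller supplied one, otherwise fill a default".  A plain-STL restatement of
// the same idea with hypothetical types (no TurboMind TensorMap involved):
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

namespace sketch {

using ArgMap = std::unordered_map<std::string, std::vector<float>>;

inline void fill_or_copy(const ArgMap& args, const std::string& key,
                         size_t batch_size, float* dst, float default_value)
{
    auto it = args.find(key);
    if (it != args.end() && !it->second.empty()) {
        // copy at most batch_size entries from the provided per-request values
        std::copy_n(it->second.data(), std::min(it->second.size(), batch_size), dst);
    }
    else {
        std::fill_n(dst, batch_size, default_value);  // fall back to the default
    }
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------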
-LogitsProcessorLayer::~LogitsProcessorLayer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - freeBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -void LogitsProcessorLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void LogitsProcessorLayer::Forward(TensorMap& args) { // apply repetition penalty -> ban bad words -> min length penalty -> temperature penalty // the order is same with transformers TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - FT_CHECK(input_tensors->at("logits").shape.size() == 3); + Tensor_ output_ids = args.at("output_ids"); + Tensor_ logits = args.at("logits"); + + const auto bsz = logits.shape(0); - const int batch_size = output_tensors->at("output_ids").shape[1]; - const int step = input_tensors->at("step").getVal(); - const int max_input_length = input_tensors->at("max_input_length").getVal(); - T* logits = input_tensors->at("logits").getPtr(); + const int step = *args.at("step").data(); + const int max_input_length = *args.at("max_input_length").data(); // repetition penalty if (step > 1 && repetition_penalty_type_ != RepetitionPenaltyType::None) { - float default_value = getDefaultPenaltyValue(repetition_penalty_type_); - if (!ALL_OF(repetition_penalty_.begin(), batch_size, float, default_value)) { - repetition_penalty_workspace_ = reinterpret_cast(allocator_->reMalloc( - repetition_penalty_workspace_, batch_size * step * (sizeof(int) + sizeof(float)), false)); - invokeBatchApplyRepetitionPenalty( - logits, - repetition_penalty_buf_, - repetition_penalty_workspace_, - output_tensors->at("output_ids").getPtr(), - batch_size, - batch_size, - args_.vocab_size_padded, - input_tensors->at("input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {}, nullptr}).getPtr(), - max_input_length, - step, - repetition_penalty_type_, - stream_); - sync_check_cuda_error(); - } + Buffer_ workspace(bsz * step * (sizeof(int) + sizeof(float)), kDEVICE); + invokeBatchApplyRepetitionPenalty(logits.data(), + repetition_penalty_buf_.data(), + (int*)workspace.data(), + output_ids.data(), + bsz, + bsz, + vocab_size_padded_, + args.at("init_context_length").data(), + max_input_length, + step, + repetition_penalty_type_, + stream_); + sync_check_cuda_error(); } // ban bad words - if (input_tensors->isExist("bad_words_list")) { - const Tensor bad_words = input_tensors->at("bad_words_list"); - FT_CHECK(bad_words.shape.size() == 3); - const size_t bad_words_len = bad_words.shape[2]; - invokeBanBadWords(logits, - output_tensors->at("output_ids").getPtr(), + if (auto& bad_words = bad_words_ten_) { + TM_CHECK_EQ(bad_words.ndim(), 3); + const auto bad_words_len = bad_words.shape(2); + invokeBanBadWords(logits.data(), + output_ids.data(), nullptr, - batch_size, - batch_size, + bsz, + bsz, 1, - bad_words.getPtr(), + bad_words.data(), false, bad_words_len, 0, - args_.vocab_size_padded, + vocab_size_padded_, step, stream_); @@ -155,72 +123,113 @@ void LogitsProcessorLayer::forward(TensorMap* output_tensors, TensorMap* inpu } // min length - { - const int num_generated_tokens = step - max_input_length; - const int* min_lengths = min_lengths_.data(); - std::vector index(batch_size); - std::iota(index.begin(), index.end(), 0); - const bool invoke_min_length_penalty = std::any_of(index.begin(), index.end(), [&](int i) { - return min_lengths[i] > context_length_[i] + num_generated_tokens; - }); - if (invoke_min_length_penalty && input_tensors->isExist("end_ids")) { - const Tensor end_ids = input_tensors->at("end_ids"); - FT_CHECK(end_ids.shape.size() == 2); - 
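// ---- Illustrative sketch (not part of the patch) ----------------------------
// Reference semantics of the multiplicative repetition penalty applied above
// by invokeBatchApplyRepetitionPenalty, shown for a single sequence on the
// CPU.  This is the standard CTRL-style rule, not the actual kernel.
#include <cstddef>
#include <unordered_set>
#include <vector>

namespace sketch {

inline void apply_repetition_penalty(std::vector<float>&     logits,
                                     const std::vector<int>& generated_ids,
                                     float                   penalty)  // > 1.0 discourages repeats
{
    std::unordered_set<int> seen(generated_ids.begin(), generated_ids.end());
    for (int id : seen) {
        if (id < 0 || static_cast<size_t>(id) >= logits.size()) {
            continue;  // ignore ids outside the (padded) vocabulary
        }
        float& x = logits[id];
        // Positive logits are shrunk, negative logits are pushed further down,
        // so a previously generated token always becomes less likely.
        x = x > 0.f ? x / penalty : x * penalty;
    }
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------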
invokeMinLengthPenalty(logits, - min_lengths_buf_, - output_tensors->getPtr("sequence_length"), - args_.vocab_size_padded, - batch_size, - input_tensors->getPtr("end_ids"), - end_ids.shape[1], + if (end_ids_ten_) { + TM_CHECK_EQ(end_ids_ten_.ndim(), 2); + auto enable = [&] { + const int num_generated_tokens = step - max_input_length; + auto context_len = args.at("context_length").data(); + for (int i = 0; i < bsz; ++i) { + if (min_lengths_[i] > context_len[i] + num_generated_tokens) { + return true; + } + } + return false; + }(); + if (enable) { + invokeMinLengthPenalty(logits.data(), + min_lengths_buf_.data(), + args.at("sequence_length").data(), + vocab_size_padded_, + bsz, + end_ids_ten_.data(), + end_ids_ten_.shape(1), stream_); sync_check_cuda_error(); } } // temperature - { - if (!ALL_OF(temperature_.begin(), batch_size, float, 1.f)) { - invokeBatchApplyTemperaturePenalty_v2( - logits, (T*)nullptr, temperature_buf_, batch_size, args_.vocab_size, args_.vocab_size_padded, stream_); - sync_check_cuda_error(); - } + if (!ALL_OF(temperature_.begin(), bsz, float, 1.f)) { + invokeBatchApplyTemperaturePenalty_v2(logits.data(), // + (T*)nullptr, + temperature_buf_.data(), + bsz, + vocab_size_, + vocab_size_padded_, + stream_); + sync_check_cuda_error(); } TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } template -void LogitsProcessorLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) +void LogitsProcessorLayer::Setup(const std::vector& rs, const TensorMap& args) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - allocateBuffer(batch_size); - - // repetition_penalty - if (runtime_args->isExist("repetition_penalty")) { - init_host_buffer(runtime_args, "repetition_penalty", batch_size, repetition_penalty_.data(), 1.f); - repetition_penalty_type_ = RepetitionPenaltyType::Multiplicative; - } + const int bsz = rs.size(); - // temperature - init_host_buffer(runtime_args, "temperature", batch_size, temperature_.data(), 1.f); + const auto prompt_length = args.at("prompt_length").data(); - // min_length - init_host_buffer(runtime_args, "min_length", batch_size, min_lengths_.data(), 0); - init_host_buffer(runtime_args, "context_length", batch_size, context_length_.data(), 0); - init_host_buffer(runtime_args, "prompt_length", batch_size, prompt_length_.data(), 0); + repetition_penalty_type_ = RepetitionPenaltyType::None; - // invokeMinLengthPenalty if min_length > context_length - prompt_length + num_generated_tokens - std::transform( - min_lengths_.begin(), min_lengths_.end(), prompt_length_.begin(), min_lengths_.begin(), std::plus()); + for (int i = 0; i < bsz; ++i) { + auto& c = rs[i]->gen_cfg; + // repetition_penalty + repetition_penalty_[i] = c.repetition_penalty; + if (repetition_penalty_[i] != 1.f) { + repetition_penalty_type_ = RepetitionPenaltyType::Multiplicative; + } + // temperature + temperature_[i] = c.temperature; + // min_length + min_lengths_[i] = c.min_new_tokens + prompt_length[i]; + } - cudaAutoCpy(temperature_buf_, temperature_.data(), batch_size, stream_); - cudaAutoCpy(repetition_penalty_buf_, repetition_penalty_.data(), batch_size, stream_); - cudaAutoCpy(min_lengths_buf_, min_lengths_.data(), batch_size, stream_); + Copy_(temperature_, bsz, temperature_buf_); + Copy_(repetition_penalty_, bsz, repetition_penalty_buf_); + Copy_(min_lengths_, bsz, min_lengths_buf_); sync_check_cuda_error(); + init_stop_bad_words(&GenerationConfig::bad_ids, // + "bad_words", + rs, + bad_words_.data(), + bad_words_buf_.data(), + bad_words_ten_); + + { // end ids for 
min length + end_ids_ten_ = {}; + int max_length = 0; + for (int i = 0; i < bsz; ++i) { + max_length = std::max(max_length, (int)rs[i]->gen_cfg.eos_ids.size()); + } + if (max_length) { + max_length = std::min(max_length, kMaxEndIdsSize); + int* h_end_ids = end_ids_.data(); + std::fill(h_end_ids, h_end_ids + std::min(kMaxEndIdsSize, max_length) * bsz, -1); + for (int i = 0; i < bsz; ++i) { + const auto& eos_ids = rs[i]->gen_cfg.eos_ids; + if (eos_ids.size() == 0) { + continue; + } + if (TM_UNLIKELY(eos_ids.size() > kMaxEndIdsSize)) { + TM_LOG_WARNING("[InitializeSampling] [%ld] eos length (%d) exceeds %d, truncated to %d", + (long)rs[i]->id, + (int)eos_ids.size(), + kMaxEndIdsSize, + kMaxEndIdsSize); + } + std::copy_n(eos_ids.begin(), std::min((int)eos_ids.size(), kMaxEndIdsSize), h_end_ids); + h_end_ids += max_length; + } + Copy(end_ids_, bsz * max_length, end_ids_buf_); + end_ids_ten_ = {end_ids_buf_.data(), {bsz, max_length}, kDEVICE}; + } + } + TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } diff --git a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h index 23f108b829..1e56dabd64 100644 --- a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h +++ b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h @@ -16,47 +16,45 @@ #pragma once +#include + #include "src/turbomind/kernels/penalty_types.h" -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" #include "src/turbomind/macro.h" -#include + +#include "src/turbomind/engine/request.h" namespace turbomind { template -class LogitsProcessorLayer: public DynamicDecodeBaseLayer { +class LogitsProcessorLayer: public BaseDynamicDecodeLayer { public: - using DynamicDecodeBaseLayer::DynamicDecodeBaseLayer; - using DynamicDecodeBaseLayer::args_; + explicit LogitsProcessorLayer(const BaseParam& param); - void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override; + void Setup(const std::vector& rs, const TensorMap& args) override; - void forward(TensorMap* output_tensors, TensorMap* input_tensors) override; - - ~LogitsProcessorLayer(); + void Forward(TensorMap& args) override; private: - void allocateBuffer() override; - - void allocateBuffer(const size_t batch_size); - - void freeBuffer() override; - // repetition penalty type RepetitionPenaltyType repetition_penalty_type_ = RepetitionPenaltyType::None; // host buffer - std::vector repetition_penalty_; - std::vector min_lengths_; - std::vector temperature_; - std::vector context_length_; - std::vector prompt_length_; + Buffer_ repetition_penalty_; + Buffer_ min_lengths_; + Buffer_ temperature_; + Buffer_ bad_words_; + Buffer_ end_ids_; // device buffer - int* repetition_penalty_workspace_ = nullptr; - float* repetition_penalty_buf_ = nullptr; - int* min_lengths_buf_ = nullptr; - float* temperature_buf_ = nullptr; + Buffer_ repetition_penalty_buf_; + Buffer_ min_lengths_buf_; + Buffer_ temperature_buf_; + Buffer_ bad_words_buf_; + Buffer_ end_ids_buf_; + + Tensor_ bad_words_ten_; + Tensor_ end_ids_ten_; }; } // namespace turbomind diff --git a/src/turbomind/layers/sampling_layers/SamplingLayer.cc b/src/turbomind/layers/sampling_layers/SamplingLayer.cc index 315226f7e3..04d051d10d 100644 --- a/src/turbomind/layers/sampling_layers/SamplingLayer.cc +++ b/src/turbomind/layers/sampling_layers/SamplingLayer.cc @@ -15,135 +15,34 @@ */ #include "src/turbomind/layers/sampling_layers/SamplingLayer.h" +#include 
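// ---- Illustrative sketch (not part of the patch) ----------------------------
// Reference semantics of the min-length penalty invoked earlier in Forward():
// while a request has produced fewer than its required number of new tokens,
// the logits of its EOS ids are forced to -inf so sampling cannot terminate
// the sequence.  CPU illustration only; the real work is done on the GPU by
// invokeMinLengthPenalty.
#include <limits>
#include <vector>

namespace sketch {

inline void apply_min_length_penalty(std::vector<float>&     logits,
                                     const std::vector<int>& eos_ids,
                                     int                     generated_len,  // tokens produced so far
                                     int                     min_new_tokens)
{
    if (generated_len >= min_new_tokens) {
        return;  // long enough, EOS is allowed again
    }
    for (int eos : eos_ids) {
        if (eos >= 0 && static_cast<size_t>(eos) < logits.size()) {
            logits[eos] = -std::numeric_limits<float>::infinity();
        }
    }
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------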
"src/turbomind/core/check.h" +#include "src/turbomind/core/tensor.h" #include "src/turbomind/kernels/sampling_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topp_kernels.h" -#include "src/turbomind/utils/memory_utils.h" +#include "src/turbomind/utils/logger.h" namespace turbomind { -void set_runtime_args(int batch_size, - int top_k, - int* top_ks, - int top_ks_size, - int* runtime_top_k, - float top_p, - float* top_ps, - int top_ps_size, - float* runtime_top_p, - float min_p, - float* min_ps, - int min_ps_size, - float* runtime_min_p) -{ - for (int i = 0; i < batch_size; i++) { - int topk = top_ks_size > 1 ? top_ks[i] : top_k; - float topp = top_ps_size > 1 ? top_ps[i] : top_p; - float minp = min_ps_size > 1 ? min_ps[i] : min_p; - - if (topk == 0 && topp == 0.f) { - topk = 1; - } - - if (topk < 0 || topk > 1024) { - TM_LOG_WARNING("topk (%d) is out of range [0, 1024]", topk); - topk = std::max(0, std::min(topk, 1024)); - } - if (topp < 0.f || topp > 1.f) { - TM_LOG_WARNING("topp (%f) is out of range [0.0, 1.0f]", topp); - topp = std::max(0.f, std::min(topp, 1.f)); - } - if (minp < 0.f || minp > 1.f) { - TM_LOG_WARNING("minp (%f) is out of range [0.0, 1.0f]", minp); - minp = std::max(0.f, std::min(minp, 1.f)); - } - runtime_top_k[i] = topk; - runtime_top_p[i] = topp; - runtime_min_p[i] = minp; - } -} - template -void SamplingLayer::allocateBuffer() +SamplingLayer::SamplingLayer(const BaseParam& param): BaseDynamicDecodeLayer{param} { - FT_CHECK(false); + top_k_ = {max_batch_size_, kCPUpinned}; + top_p_ = {max_batch_size_, kCPUpinned}; + min_p_ = {max_batch_size_, kCPUpinned}; + kept_ = {max_batch_size_, kCPUpinned}; + + // constant array + std::fill_n(kept_.data(), max_batch_size_, vocab_size_); + + top_k_buf_ = {max_batch_size_, kDEVICE}; + top_p_buf_ = {max_batch_size_, kDEVICE}; + min_p_buf_ = {max_batch_size_, kDEVICE}; + kept_buf_ = {max_batch_size_, kDEVICE}; } template -void SamplingLayer::allocateBuffer(const size_t batch_size) -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - runtime_top_k_buf_ = - reinterpret_cast(allocator_->reMalloc(runtime_top_k_buf_, sizeof(int) * batch_size, false)); - runtime_top_p_buf_ = - reinterpret_cast(allocator_->reMalloc(runtime_top_p_buf_, sizeof(float) * batch_size, false)); - runtime_min_p_buf_ = - reinterpret_cast(allocator_->reMalloc(runtime_min_p_buf_, sizeof(float) * batch_size, false)); - - indices_ = reinterpret_cast( - allocator_->reMalloc(indices_, batch_size * sizeof(int) * args_.vocab_size_padded, false)); - kept_ = reinterpret_cast(allocator_->reMalloc(kept_, batch_size * sizeof(int), false)); - - { - // topk buffer - TopKSortFilterParams params{}; - params.batch_size = batch_size; - params.max_top_k = max_topk_; - invokeTopKSortFilter(params, stream_); - topk_ws_size_ = params.workspace_size; - topk_ws_ = allocator_->reMalloc(topk_ws_, topk_ws_size_, false); - } - - { - // topp buffer - TopPSortParams params{}; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; - invokeTopPSort(params, stream_); - topp_ws_size_ = params.workspace_size; - topp_ws_ = allocator_->reMalloc(topp_ws_, topp_ws_size_, false); - } - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -void SamplingLayer::freeBuffer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - kept_n_ = {}; - runtime_top_k_ = {}; - runtime_top_p_ = {}; - runtime_min_p_ = {}; - - allocator_->free((void**)&runtime_top_k_buf_); - 
allocator_->free((void**)&runtime_top_p_buf_); - allocator_->free((void**)&runtime_min_p_buf_); - allocator_->free((void**)&topk_ws_); - allocator_->free((void**)&topp_ws_); - - allocator_->free((void**)&indices_); - allocator_->free((void**)&kept_); - logits_ = nullptr; - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -SamplingLayer::~SamplingLayer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - freeBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -void SamplingLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void SamplingLayer::Forward(TensorMap& args) { // step1: // - use topk / topp_minp kernel to sort and filter the scores @@ -153,82 +52,82 @@ void SamplingLayer::forward(TensorMap* output_tensors, TensorMap* input_tenso TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - Tensor logits = input_tensors->at("logits"); - const int batch_size = logits.shape[0]; - const int step = input_tensors->at("step").getVal(); - logits_ = logits.getPtr(); + Tensor_ logits = args.at("logits"); + + const auto bsz = logits.shape(0); + + const int step = *args.at("step").data(); - cudaAutoCpy(kept_, kept_n_.data(), batch_size, stream_); + core::Copy(kept_.data(), bsz, kept_buf_.data()); // use topk sort if some request use topk filter if (max_topk_ > 0) { // TODO: top_k >= 64 is much slower than torch.topk() TopKSortFilterParams params{}; - params.workspace = topk_ws_; - params.workspace_size = topk_ws_size_; - params.logits = logits_; - params.sorted_logits = logits_; - params.sorted_indices = indices_; - params.kept = kept_; - params.top_ks = runtime_top_k_buf_; + params.workspace = topk_ws_.data(); + params.workspace_size = topk_ws_.size(); + params.logits = logits.data(); + params.sorted_logits = logits.data(); + params.sorted_indices = indices_.data(); + params.kept = kept_buf_.data(); + params.top_ks = top_k_buf_.data(); params.max_top_k = max_topk_; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; invokeTopKSortFilter(params, stream_); } // use topp sort if some request skip topk filter if (min_topk_ == 0) { - invokeSoftmax(logits_, args_.vocab_size_padded, args_.vocab_size, batch_size, kept_, stream_); + invokeSoftmax(logits.data(), vocab_size_padded_, vocab_size_, bsz, kept_buf_.data(), stream_); TopPSortParams params{}; - params.workspace = topp_ws_; - params.workspace_size = topp_ws_size_; - params.logits = logits_; - params.sorted_logits = logits_; - params.sorted_indices = indices_; - params.kept = kept_; - params.top_ks = runtime_top_k_buf_; - params.top_ps = runtime_top_p_buf_; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; + params.workspace = topp_ws_.data(); + params.workspace_size = topp_ws_.size(); + params.logits = logits.data(); + params.sorted_logits = logits.data(); + params.sorted_indices = indices_.data(); + params.kept = kept_buf_.data(); + params.top_ks = top_k_buf_.data(); + params.top_ps = top_p_buf_.data(); + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; invokeTopPSort(params, stream_); } // apply topp minp filter if (max_minp_ != 0.f || min_topp_ != 1.f) { TopPMinPFilterParams params{}; - params.sorted_logits = logits_; - params.sorted_indices = indices_; - params.kept = kept_; 
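// ---- Illustrative sketch (not part of the patch) ----------------------------
// What the chain of kernels in Forward() computes for one request, in plain
// C++: sort the distribution, keep the top-k entries, then shrink the kept
// prefix further with top-p (nucleus) and min-p.  The returned count plays
// the same role as the kept_ buffer: how many sorted candidates survive for
// the final sampling step.  Host reference only, not the CUDA kernels.
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

namespace sketch {

// probs: already-normalized probabilities over the vocabulary.
// Returns indices of the surviving candidates, most probable first.
inline std::vector<int>
filter_top_k_top_p_min_p(const std::vector<float>& probs, int top_k, float top_p, float min_p)
{
    std::vector<int> order(probs.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&](int a, int b) { return probs[a] > probs[b]; });

    size_t kept = order.size();
    if (top_k > 0) {
        kept = std::min<size_t>(kept, top_k);  // top-k: keep the k best
    }
    if (top_p > 0.f && top_p < 1.f) {
        float  cum = 0.f;
        size_t i   = 0;
        while (i < kept) {  // top-p: smallest prefix whose mass reaches top_p
            cum += probs[order[i]];
            ++i;
            if (cum >= top_p) {
                break;
            }
        }
        kept = i;
    }
    if (min_p > 0.f && kept > 0) {
        const float threshold = min_p * probs[order[0]];  // min-p: relative to the best candidate
        while (kept > 1 && probs[order[kept - 1]] < threshold) {
            --kept;
        }
    }
    order.resize(kept);
    return order;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------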
- params.top_ps = runtime_top_p_buf_; - params.min_ps = runtime_min_p_buf_; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; + params.sorted_logits = logits.data(); + params.sorted_indices = indices_.data(); + params.kept = kept_buf_.data(); + params.top_ps = top_p_buf_.data(); + params.min_ps = min_p_buf_.data(); + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; invokeTopPMinPFilter(params, stream_); } // sample { SamplingParams params{}; - params.logits = logits.getPtr(); - params.stride = args_.vocab_size_padded; - params.indices = indices_; - params.kept = kept_; - params.curandstate = output_tensors->at("curand_state").getPtr(); - params.batch_size = batch_size; - params.output_ids = output_tensors->at("output_ids").getPtrWithOffset(step * batch_size); - params.sequence_length = - output_tensors->at("sequence_length", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - params.sampled_logprobs = - output_tensors->at("sampled_logprobs", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - params.sampled_indexes = - output_tensors->at("sampled_indexes", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - params.sampled_nums = - output_tensors->at("sampled_nums", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + params.logits = logits.data(); + params.stride = vocab_size_padded_; + params.indices = indices_.data(); + params.kept = kept_buf_.data(); + params.curandstate = (curandState_t*)args.at("curand_state").raw_data(); + params.batch_size = bsz; + params.output_ids = args.at("output_ids").data() + step * bsz; + params.sequence_length = args.at("sequence_length").data(); + + if (auto sampled_logprobs = args.try_("sampled_logprobs")) { + params.sampled_logprobs = sampled_logprobs->data(); + params.sampled_indexes = args.at("sampled_indexes").data(); + params.sampled_nums = args.at("sampled_nums").data(); + } invokeSampling(params, stream_); sync_check_cuda_error(); @@ -238,51 +137,45 @@ void SamplingLayer::forward(TensorMap* output_tensors, TensorMap* input_tenso } template -void SamplingLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) +void SamplingLayer::Setup(const std::vector& rs, const TensorMap&) { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - const Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ? runtime_args->at("runtime_top_k") : Tensor(); - const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor(); - const Tensor runtime_min_p = runtime_args->isExist("runtime_min_p") ? runtime_args->at("runtime_min_p") : Tensor(); + const auto bsz = rs.size(); - kept_n_.resize(batch_size); - runtime_top_k_.resize(batch_size); - runtime_top_p_.resize(batch_size); - runtime_min_p_.resize(batch_size); - - int top_k = runtime_top_k.size() > 0 ? runtime_top_k.getVal() : 0; - float top_p = runtime_top_p.size() > 0 ? runtime_top_p.getVal() : 0.0f; - float min_p = runtime_min_p.size() > 0 ? 
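// ---- Illustrative sketch (not part of the patch) ----------------------------
// The final sampling step, conceptually: renormalize the surviving candidates
// and draw one of them.  The real kernel additionally writes the token into
// output_ids at offset step * batch_size, bumps sequence_length and can emit
// per-token logprobs; this CPU sketch only shows the draw itself (names are
// illustrative, the curand state is replaced by a host RNG).
#include <random>
#include <vector>

namespace sketch {

inline int sample_from_kept(const std::vector<float>& probs,     // full distribution
                            const std::vector<int>&   kept_ids,  // surviving candidates
                            std::mt19937&             rng)
{
    std::vector<float> weights;
    weights.reserve(kept_ids.size());
    for (int id : kept_ids) {
        weights.push_back(probs[id]);  // discrete_distribution renormalizes for us
    }
    std::discrete_distribution<int> dist(weights.begin(), weights.end());
    return kept_ids[dist(rng)];
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------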
runtime_min_p.getVal() : 0.0f; - set_runtime_args(batch_size, - top_k, - runtime_top_k.getPtr(), - runtime_top_k.size(), - runtime_top_k_.data(), - top_p, - runtime_top_p.getPtr(), - runtime_top_p.size(), - runtime_top_p_.data(), - min_p, - runtime_min_p.getPtr(), - runtime_min_p.size(), - runtime_min_p_.data()); + for (int i = 0; i < bsz; ++i) { + top_k_[i] = rs[i]->gen_cfg.top_k; + top_p_[i] = rs[i]->gen_cfg.top_p; + min_p_[i] = rs[i]->gen_cfg.min_p; + } - max_topk_ = *std::max_element(runtime_top_k_.begin(), runtime_top_k_.end()); - min_topk_ = *std::min_element(runtime_top_k_.begin(), runtime_top_k_.end()); - min_topp_ = *std::min_element(runtime_top_p_.begin(), runtime_top_p_.end()); - max_minp_ = *std::max_element(runtime_min_p_.begin(), runtime_min_p_.end()); + max_topk_ = *std::max_element(top_k_.begin(), top_k_.end()); + min_topk_ = *std::min_element(top_k_.begin(), top_k_.end()); + min_topp_ = *std::min_element(top_p_.begin(), top_p_.end()); + max_minp_ = *std::max_element(min_p_.begin(), min_p_.end()); - allocateBuffer(batch_size); + indices_ = Buffer_(bsz * vocab_size_padded_, kDEVICE); - // kept - std::fill_n(kept_n_.data(), batch_size, args_.vocab_size); + { + // topk buffer + TopKSortFilterParams params{}; + params.batch_size = bsz; + params.max_top_k = max_topk_; + invokeTopKSortFilter(params, stream_); + topk_ws_ = {(ssize_t)params.workspace_size, kDEVICE}; + } - cudaAutoCpy(runtime_top_k_buf_, runtime_top_k_.data(), batch_size, stream_); - cudaAutoCpy(runtime_top_p_buf_, runtime_top_p_.data(), batch_size, stream_); - cudaAutoCpy(runtime_min_p_buf_, runtime_min_p_.data(), batch_size, stream_); + { + // topp buffer + TopPSortParams params{}; + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; + invokeTopPSort(params, stream_); + topp_ws_ = {(ssize_t)params.workspace_size, kDEVICE}; + } - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + core::Copy(top_k_.data(), bsz, top_k_buf_.data()); + core::Copy(top_p_.data(), bsz, top_p_buf_.data()); + core::Copy(min_p_.data(), bsz, min_p_buf_.data()); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/layers/sampling_layers/SamplingLayer.h b/src/turbomind/layers/sampling_layers/SamplingLayer.h index 0de3088248..55696767fb 100644 --- a/src/turbomind/layers/sampling_layers/SamplingLayer.h +++ b/src/turbomind/layers/sampling_layers/SamplingLayer.h @@ -15,55 +15,47 @@ */ #pragma once -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" -#include "src/turbomind/macro.h" #include +#include "src/turbomind/core/tensor.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" +#include "src/turbomind/macro.h" + +#include "src/turbomind/engine/request.h" + namespace turbomind { template -class SamplingLayer: public DynamicDecodeBaseLayer { +class SamplingLayer: public BaseDynamicDecodeLayer { public: - using DynamicDecodeBaseLayer::DynamicDecodeBaseLayer; - using DynamicDecodeBaseLayer::args_; - - void setup(const size_t batch_size, const size_t beam_width, TensorMap* params) override; + explicit SamplingLayer(const BaseParam& param); - void forward(TensorMap* output_tensors, TensorMap* input_tensors) override; + void Setup(const std::vector& rs, const TensorMap&) override; - ~SamplingLayer(); + void Forward(TensorMap& args) override; private: - void allocateBuffer() override; - - void freeBuffer() override; - - void allocateBuffer(const size_t batch_size); - // host buffer - std::vector kept_n_; - std::vector runtime_top_k_; - std::vector runtime_top_p_; - std::vector 
runtime_min_p_; - int max_topk_; - int min_topk_; - float min_topp_; - float max_minp_; + Buffer_ kept_; + Buffer_ top_k_; + Buffer_ top_p_; + Buffer_ min_p_; - // device buffer - int* runtime_top_k_buf_{}; - float* runtime_top_p_buf_{}; - float* runtime_min_p_buf_{}; + int max_topk_; + int min_topk_; + float min_topp_; + float max_minp_; - void* topk_ws_{}; - size_t topk_ws_size_; + // device buffer + Buffer_ top_k_buf_; + Buffer_ top_p_buf_; + Buffer_ min_p_buf_; - void* topp_ws_{}; - size_t topp_ws_size_; + Buffer_ topk_ws_; + Buffer_ topp_ws_; - T* logits_{}; // sorted logits - int* indices_{}; // sorted indices - int* kept_{}; // kept sample + Buffer_ indices_; // sorted indices + Buffer_ kept_buf_; // kept sample }; } // namespace turbomind diff --git a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc index 5d40d85dce..b4e49cc5a6 100644 --- a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc +++ b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc @@ -16,92 +16,66 @@ #include "src/turbomind/layers/sampling_layers/StopCriteriaLayer.h" #include "src/turbomind/kernels/stop_criteria_kernels.h" -#include "src/turbomind/utils/memory_utils.h" +#include "src/turbomind/layers/sampling_layers/utils.h" +#include "src/turbomind/macro.h" namespace turbomind { template -void StopCriteriaLayer::allocateBuffer() +StopCriteriaLayer::StopCriteriaLayer(const BaseParam& param): BaseDynamicDecodeLayer{param} { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - h_pinned_finished_sum_ = (int*)allocator_->reMalloc(h_pinned_finished_sum_, sizeof(int), true, true); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + stop_words_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kCPUpinned}; + stop_words_buf_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kDEVICE}; } template -void StopCriteriaLayer::freeBuffer() +void StopCriteriaLayer::Setup(const std::vector& rs, const TensorMap&) { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - allocator_->free((void**)(&h_pinned_finished_sum_), true); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -StopCriteriaLayer::~StopCriteriaLayer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - freeBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + init_stop_bad_words(&GenerationConfig::stop_ids, // + "stop_words", + rs, + stop_words_.data(), + stop_words_buf_.data(), + stop_words_ten_); } template -void StopCriteriaLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void StopCriteriaLayer::Forward(TensorMap& args) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - const size_t batch_size = input_tensors->at("logits").shape[0]; - const int step = input_tensors->at("step").getVal(); + const int batch_size = args.at("logits").shape(0); + const int step = *args.at("step").data(); - if (input_tensors->isExist("stop_words_list")) { - const Tensor stop_words_list = input_tensors->at("stop_words_list"); - FT_CHECK(stop_words_list.shape.size() == 3); // [batch, 2, len] - size_t stop_words_len = stop_words_list.shape[2]; - invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), + if (auto& stop_words = stop_words_ten_) { + TM_CHECK_EQ(stop_words.ndim(), 3); // [batch, 2, len] + size_t stop_words_len = stop_words.shape(2); + invokeStopWordsCriterion(args.at("output_ids").data(), nullptr, - stop_words_list.getPtr(), - output_tensors->at("finished").getPtr(), + stop_words.data(), + args.at("finished").data(), 0, stop_words_len, batch_size, 1, 
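// ---- Illustrative sketch (not part of the patch) ----------------------------
// invokeStopWordsCriterion above marks a request as finished when the tail of
// its generated ids matches one of its stop sequences.  A single-sequence CPU
// restatement of that check (illustrative names only):
#include <algorithm>
#include <cstddef>
#include <vector>

namespace sketch {

inline bool hit_stop_word(const std::vector<int>&              output_ids,  // tokens so far
                          const std::vector<std::vector<int>>& stop_words)
{
    for (const auto& stop : stop_words) {
        if (stop.empty() || stop.size() > output_ids.size()) {
            continue;
        }
        // compare the last stop.size() tokens with the stop sequence
        if (std::equal(stop.begin(), stop.end(),
                       output_ids.end() - static_cast<std::ptrdiff_t>(stop.size()))) {
            return true;
        }
    }
    return false;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------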
step, stream_); - sync_check_cuda_error(); } - if (input_tensors->isExist("sequence_limit_length")) { - invokeLengthCriterion(output_tensors->at("finished").getPtr(), - output_tensors->getPtr("should_stop", nullptr), - h_pinned_finished_sum_, - input_tensors->at("sequence_limit_length").getPtr(), + if (auto seq_lim_len = args.try_("sequence_limit_length")) { + invokeLengthCriterion(args.at("finished").data(), // + seq_lim_len->data(), batch_size, 1, step, stream_); - sync_check_cuda_error(); } TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -template -void StopCriteriaLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - allocateBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - #ifdef ENABLE_FP32 template class StopCriteriaLayer; #endif diff --git a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h index b70ed2e69e..b9f6ee1b9a 100644 --- a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h +++ b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h @@ -16,29 +16,25 @@ #pragma once -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" -#include "src/turbomind/macro.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" + +#include "src/turbomind/engine/request.h" namespace turbomind { template -class StopCriteriaLayer: public DynamicDecodeBaseLayer { +class StopCriteriaLayer: public BaseDynamicDecodeLayer { public: - using DynamicDecodeBaseLayer::DynamicDecodeBaseLayer; - - void setup(const size_t batch_size, const size_t beam_width, TensorMap* params) override; + explicit StopCriteriaLayer(const BaseParam& param); - void forward(TensorMap* output_tensors, TensorMap* input_tensors) override; + void Setup(const std::vector& rs, const TensorMap&) override; - ~StopCriteriaLayer(); + void Forward(TensorMap& args) override; private: - void allocateBuffer() override; - - void freeBuffer() override; - - // host buffer - int* h_pinned_finished_sum_{}; + Buffer_ stop_words_; + Buffer_ stop_words_buf_; + Tensor_ stop_words_ten_; }; } // namespace turbomind diff --git a/src/turbomind/layers/sampling_layers/utils.h b/src/turbomind/layers/sampling_layers/utils.h new file mode 100644 index 0000000000..c36c910dac --- /dev/null +++ b/src/turbomind/layers/sampling_layers/utils.h @@ -0,0 +1,72 @@ + +#include +#include + +#include "src/turbomind/core/core.h" + +namespace turbomind { + +constexpr int kMaxStopBadWordsLen = 32; +constexpr int kMaxEndIdsSize = 32; + +namespace { + +template +void init_stop_bad_words(G getter, const char* key, const Rs& rs, T* h_buf, T* d_buf, Tensor_& out) +{ + const int bsz = rs.size(); + int max_length = 0; + + std::vector> copy_tokens(bsz); + std::vector> copy_offsets(bsz); + for (int i = 0; i < bsz; ++i) { + const auto& [token_ids, offsets] = std::invoke(getter, rs[i]->gen_cfg); + if (offsets.size() == 0 || token_ids.size() == 0) { + continue; + } + FT_CHECK(offsets.back() == token_ids.size()); + if (offsets.back() <= kMaxStopBadWordsLen) { + copy_tokens[i] = std::make_pair(token_ids.data(), (int)token_ids.size()); + copy_offsets[i] = std::make_pair(offsets.data(), (int)offsets.size()); + max_length = std::max(max_length, (int)token_ids.size()); + } + else { + auto trunc_offset_size = + std::upper_bound(offsets.begin(), + offsets.begin() + std::min(kMaxStopBadWordsLen, (int)offsets.size()), + kMaxStopBadWordsLen) + - offsets.begin(); + TM_LOG_WARNING("[InitializeSampling] [%ld] %s length 
(%d) exceeds %d, truncated to %d", + rs[i]->id, + key, + offsets.back(), + kMaxStopBadWordsLen, + trunc_offset_size); + if (trunc_offset_size > 0) { + int trunc_token_size = offsets[trunc_offset_size - 1]; + copy_tokens[i] = std::make_pair(token_ids.data(), trunc_token_size); + copy_offsets[i] = std::make_pair(offsets.data(), trunc_offset_size); + max_length = std::max(max_length, trunc_token_size); + } + } + } + if (!max_length) { + return; + } + std::fill_n(h_buf, bsz * 2 * max_length, -1); + for (int i = 0; i < bsz; ++i) { + if (copy_tokens[i].first != nullptr) { + std::copy_n(copy_tokens[i].first, copy_tokens[i].second, h_buf + i * 2 * max_length); + } + if (copy_offsets[i].first != nullptr) { + std::copy_n(copy_offsets[i].first, copy_offsets[i].second, h_buf + i * 2 * max_length + max_length); + } + } + core::Copy(h_buf, bsz * 2 * max_length, d_buf); + // Construct a tensor from the device buffer + out = {d_buf, {bsz, 2, max_length}, kDEVICE}; +}; + +} // namespace + +} // namespace turbomind diff --git a/src/turbomind/models/llama/BlockManager.cc b/src/turbomind/models/llama/BlockManager.cc index 2744b71b55..d04634a287 100644 --- a/src/turbomind/models/llama/BlockManager.cc +++ b/src/turbomind/models/llama/BlockManager.cc @@ -28,7 +28,7 @@ size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic& value) } BlockManager::BlockManager( - size_t block_size, double block_count, int chunk_size, IAllocator* allocator, GetFreeMemSize get_free_size): + size_t block_size, double block_count, int chunk_size, core::Allocator allocator, GetFreeMemSize get_free_size): block_size_(block_size), allocator_(allocator) { if (block_count < 1.) { @@ -66,7 +66,7 @@ BlockManager::BlockManager( BlockManager::~BlockManager() { for (auto& chunk : chunks_) { - allocator_->free(&chunk); + allocator_->deallocate(chunk, block_size_); } } @@ -78,7 +78,7 @@ bool BlockManager::Malloc() return false; } - auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size); + auto ptr = (std::byte*)allocator_->allocate(block_size_ * chunk_size); if (!ptr) { return false; } @@ -285,8 +285,7 @@ std::ostream& operator<<(std::ostream& os, const BlockManager& manager) os << "free_ids: " << manager.free_ids_.size() << ", "; os << "blocks: " << manager.blocks_.size() << ", "; os << "unique_id: " << manager.unique_id_ << ", "; - os << "timestamp: " << manager.timestamp_ << ", "; - os << "allocator: " << manager.allocator_; + os << "timestamp: " << manager.timestamp_; return os; } diff --git a/src/turbomind/models/llama/BlockManager.h b/src/turbomind/models/llama/BlockManager.h index 70ca74475d..df1f0d3833 100644 --- a/src/turbomind/models/llama/BlockManager.h +++ b/src/turbomind/models/llama/BlockManager.h @@ -2,8 +2,8 @@ #pragma once +#include "src/turbomind/core/allocator.h" #include "src/turbomind/models/llama/Barrier.h" -#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include @@ -73,7 +73,7 @@ size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic& value); class BlockManager { public: explicit BlockManager( - size_t block_size, double block_count, int chunk_size, IAllocator* allocator, GetFreeMemSize get_free_size); + size_t block_size, double block_count, int chunk_size, core::Allocator allocator, GetFreeMemSize get_free_size); ~BlockManager(); @@ -141,10 +141,11 @@ class BlockManager { bool Malloc(); private: - size_t block_size_; - int max_block_count_{}; - int chunk_size_{}; - IAllocator* allocator_; + size_t block_size_; + int 
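// ---- Illustrative sketch (not part of the patch) ----------------------------
// Layout produced by init_stop_bad_words() earlier in this patch: for every
// request the stop / bad words are packed into a [batch, 2, max_len] tensor --
// row 0 holds the concatenated token ids, row 1 the cumulative end offsets,
// and unused slots are padded with -1 (the real helper also truncates to
// kMaxStopBadWordsLen).  Host-only illustration of that packing:
#include <algorithm>
#include <cstddef>
#include <vector>

namespace sketch {

// words[i] is the list of token sequences for request i.
inline std::vector<int> pack_words(const std::vector<std::vector<std::vector<int>>>& words,
                                   int& max_len)  // out: inner extent of [bsz, 2, max_len]
{
    const int bsz = static_cast<int>(words.size());
    max_len       = 0;
    for (const auto& per_req : words) {
        int total = 0;
        for (const auto& w : per_req) {
            total += static_cast<int>(w.size());
        }
        max_len = std::max(max_len, total);
    }
    std::vector<int> packed(static_cast<size_t>(bsz) * 2 * max_len, -1);
    for (int i = 0; i < bsz; ++i) {
        int* tokens  = packed.data() + static_cast<size_t>(i) * 2 * max_len;  // row 0
        int* offsets = tokens + max_len;                                      // row 1
        int  pos = 0, k = 0;
        for (const auto& w : words[i]) {
            std::copy(w.begin(), w.end(), tokens + pos);
            pos += static_cast<int>(w.size());
            offsets[k++] = pos;  // cumulative end offset of the k-th word
        }
    }
    return packed;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------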
max_block_count_{}; + int chunk_size_{}; + + core::Allocator allocator_; std::vector chunks_; diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 3b79254970..90c1b239fe 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -13,42 +13,30 @@ add_library(Llama STATIC BlockTrie.cc SequenceManager.cc LlamaWeight.cc + LlamaDenseWeight.cc LlamaDecoderLayerWeight.cc LlamaFfnLayer.cc moe_ffn_layer.cc unified_decoder.cc unified_attention_layer.cc llama_kernels.cu - llama_decoder_kernels.cu llama_utils.cu mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart engine + core gemm2 + CUDA::cublas rms_norm - cublasMMWrapper DynamicDecodeLayer activation_kernels attention decoding_kernels unfused_attention_kernels gpt_kernels - tensor memory_utils cuda_utils logger anomaly_handler) - - -add_executable(llama_gemm llama_gemm.cc) -target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger) - -install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin) - -# find_package(Catch2 3 QUIET) -# if (Catch2_FOUND) -# add_executable(test_cache_manager test_cache_manager.cc) -# target_link_libraries(test_cache_manager PRIVATE Llama Catch2::Catch2WithMain) -# endif () diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index deb8a49da9..065200acb8 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -21,12 +21,17 @@ #include "src/turbomind/comm/device_comm.h" #include "src/turbomind/comm/host_comm.h" + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/tensor.h" + #include "src/turbomind/macro.h" #include "src/turbomind/engine/gateway.h" #include "src/turbomind/engine/request.h" -#include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" @@ -39,7 +44,6 @@ #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/constant.h" #include "src/turbomind/utils/cuda_utils.h" @@ -93,8 +97,7 @@ void DropEmbeddings(const Sequence& seq) seq.input_embedding_ranges.resize(sz); } -template -void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_reqs) +void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_reqs) { NvtxScope _("disable invalid"); @@ -137,8 +140,7 @@ void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_ } } -template -void LlamaBatch::FindCanceledIndices(std::vector& indices) +void LlamaBatch::FindCanceledIndices(std::vector& indices) { for (int i = 0; i < state_->size; ++i) { // current batch const auto& r = state_->requests[i]; @@ -148,8 +150,7 @@ void LlamaBatch::FindCanceledIndices(std::vector& indices) } } -template -void LlamaBatch::ProcessCancelRequests(std::vector& indices, std::vector& signals) +void LlamaBatch::ProcessCancelRequests(std::vector& indices, std::vector& signals) { int count = 0; @@ -168,8 +169,7 @@ void 
LlamaBatch::ProcessCancelRequests(std::vector& indices, std::vector } } -template -void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) +void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) { for (auto& r : kill_reqs) { if (r) { @@ -188,8 +188,7 @@ void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector -void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) +void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) { NvtxScope scope("infer_request"); auto& state = *incoming_; @@ -211,7 +210,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorinputs.at("input_ids").shape[0]; + const int input_length = r->inputs.at("input_ids").shape(0); if (input_length > session_len_) { signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); @@ -257,22 +256,22 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorinputs.getPtr("input_ids"); + const int* input_ids = r->inputs.at("input_ids").data(); { // `output_ids` contains all token ids of the sequences - const auto output_ids_base = state.output_ids + session_len_ * idx; + const auto output_ids_base = state.output_ids.data() + session_len_ * idx; auto d_output_ids = output_ids_base; - auto h_output_ids = r->output_ids.getPtr(); + auto h_output_ids = r->output_ids.data(); // copy history tokens if (!seq.tokens.empty()) { - d_output_ids = Copy(seq.tokens.data(), seq.tokens.size(), d_output_ids); + d_output_ids = core::Copy(seq.tokens.data(), seq.tokens.size(), d_output_ids); h_output_ids = std::copy_n(seq.tokens.data(), seq.tokens.size(), h_output_ids); } // copy input tokens if (input_length) { - d_output_ids = Copy(input_ids, input_length, d_output_ids); + d_output_ids = core::Copy(input_ids, input_length, d_output_ids); h_output_ids = std::copy_n(input_ids, input_length, h_output_ids); } @@ -283,23 +282,25 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag && !r->inputs.isExist("input_embedding_ranges")) { + if (input_length && r->session.start_flag && !r->inputs.contains("input_embedding_ranges")) { // TODO: truncate prompt to enable prefix caching for VLM seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); } + const int elem_size = byte_size(data_type_); + // copy input embeddings - if (r->inputs.isExist("input_embedding_ranges")) { - const auto range_tensor = r->inputs.at("input_embedding_ranges"); - const auto emb_tensor = r->inputs.at("input_embeddings"); - const int* ranges = range_tensor.getPtr(); + if (r->inputs.contains("input_embedding_ranges")) { + const auto& range_tensor = r->inputs.at("input_embedding_ranges"); + const auto& emb_tensor = r->inputs.at("input_embeddings"); + const int* ranges = range_tensor.data(); auto check_embeddings = [&](int& num_valid_embeddings) { - if (range_tensor.shape.size() != 3 || range_tensor.shape[2] % 2 != 0) { + if (range_tensor.ndim() != 3 || range_tensor.shape(2) % 2 != 0) { return false; } - int embedding_count = range_tensor.shape[1]; + int embedding_count = range_tensor.shape(1); int embedding_length = 0; int pre_end = -1; @@ -311,7 +312,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector= end || end > input_length || begin < pre_end - || embedding_length * model_->hidden_units_ * sizeof(T) > emb_tensor.shape[1]) { + || embedding_length * model_->hidden_units_ * elem_size > emb_tensor.shape(1)) { return false; 
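// ---- Illustrative sketch (not part of the patch) ----------------------------
// The check_embeddings lambda above validates the (begin, end) pairs supplied
// with "input_embedding_ranges": every range must lie inside the prompt, the
// ranges must not overlap, and the accumulated embedding bytes must fit in
// the provided buffer.  Standalone restatement with a hypothetical signature:
#include <cstddef>
#include <utility>
#include <vector>

namespace sketch {

inline bool valid_embedding_ranges(const std::vector<std::pair<int, int>>& ranges,
                                   int    input_length,
                                   size_t bytes_per_token,   // hidden_units * element size
                                   size_t embedding_bytes)   // size of the embedding buffer
{
    int    prev_end    = -1;
    size_t total_bytes = 0;
    for (const auto& [begin, end] : ranges) {
        if (begin < 0 || begin >= end || end > input_length || begin < prev_end) {
            return false;  // out of bounds or overlapping
        }
        total_bytes += static_cast<size_t>(end - begin) * bytes_per_token;
        prev_end = end;
    }
    return total_bytes <= embedding_bytes;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------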
} pre_end = end; @@ -322,20 +323,17 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector(); + const std::byte* emb_tensor_ptr = (const std::byte*)emb_tensor.raw_data(); for (size_t i = 0; i < num_valid_embeddings; i++) { int begin = ranges[i * 2]; int end = ranges[i * 2 + 1]; - size_t count = (end - begin) * model_->hidden_units_ * sizeof(T); - seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count)); + size_t count = (end - begin) * model_->hidden_units_ * elem_size; + seq.input_embeddings.emplace_back(emb_tensor_ptr, emb_tensor_ptr + count); seq.input_embedding_ranges.emplace_back(begin + seq.tokens.size(), end + seq.tokens.size()); emb_tensor_ptr += count; } @@ -388,7 +386,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector::ProcessInferRequests(const Requests& reqs, std::vector -int LlamaBatch::AdjustMaxInputCount(GenerationState& g, - const std::vector& sequences, - const std::vector& context_length) +int LlamaBatch::AdjustMaxInputCount(GenerationState& g, + const std::vector& sequences, + const std::vector& context_length) { int input_count = 0; for (int i = 0; i < sequences.size(); ++i) { @@ -448,8 +448,7 @@ int LlamaBatch::AdjustMaxInputCount(GenerationState& g, return input_count; } -template -void LlamaBatch::Initialize(GenerationState& g) +void LlamaBatch::Initialize(GenerationState& g) { NvtxScope scope("initialize"); std::vector sequences; @@ -558,7 +557,7 @@ void LlamaBatch::Initialize(GenerationState& g) // Prepare intermediate buffers h_cu_block_counts_[0] = 0; - auto block_ptrs = h_block_ptrs_; + auto block_ptrs = h_block_ptrs_.data(); const int batch_size = state_->active_size; @@ -577,8 +576,6 @@ void LlamaBatch::Initialize(GenerationState& g) Copy(h_cu_block_counts_, batch_size + 1, cu_block_counts_); Copy(h_block_ptrs_, h_cu_block_counts_[batch_size], block_ptrs_); - // Copy(h_k_block_ptrs_, h_cu_block_counts_[batch_size], k_block_ptrs_); - // Copy(h_v_block_ptrs_, h_cu_block_counts_[batch_size], v_block_ptrs_); } const int batch_size = state_->active_size; @@ -597,7 +594,8 @@ void LlamaBatch::Initialize(GenerationState& g) } } - const int max_context_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size); + const int max_context_len = + *std::max_element(state_->h_context_length.data(), state_->h_context_length.data() + batch_size); std::vector unique_ids(batch_size); for (int i = 0; i < batch_size; ++i) { @@ -605,9 +603,9 @@ void LlamaBatch::Initialize(GenerationState& g) } // Real-time context length that will change during generation - Copy(state_->h_context_length, batch_size, context_length_buf_); - Copy(state_->h_finished, batch_size, finished_buf_); - Copy(state_->h_rope_theta, batch_size, rope_theta_); + Copy_(state_->h_context_length, batch_size, context_length_buf_); + Copy_(state_->h_finished, batch_size, finished_buf_); + Copy_(state_->h_rope_theta, batch_size, rope_theta_); bool skip_init_sampling = std::equal(g.unique_ids.begin(), // g.unique_ids.end() - g.partial, @@ -628,8 +626,7 @@ void LlamaBatch::Initialize(GenerationState& g) } } -template -void LlamaBatch::CopyState(const std::vector>& desc) +void LlamaBatch::CopyState(const std::vector>& desc) { if (desc.empty()) { return; @@ -674,8 +671,8 @@ void LlamaBatch::CopyState(const std::vectoroutput_ids, d->output_ids, session_len_}, - std::tuple{s->curand_state, d->curand_state, 1}); + std::tuple{s->output_ids.data(), d->output_ids.data(), session_len_}, + 
std::tuple{(curandState_t*)s->curand_state.data(), (curandState_t*)d->curand_state.data(), 1}); } for (const auto& [s, d, si, di] : desc) { @@ -689,258 +686,103 @@ void LlamaBatch::CopyState(const std::vector -void LlamaBatch::AllocateBuffer(size_t batch_size, size_t session_len, int cache_block_seq_len) +void LlamaBatch::AllocateBuffer(ssize_t batch_size, ssize_t session_len, int cache_block_seq_len) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t batchxbeam = batch_size; + const ssize_t batchxbeam = batch_size; - const size_t hidden_units = model_->hidden_units_; - const size_t vocab_size = model_->vocab_size_padded_; - const size_t head_dim = model_->size_per_head_; - const size_t local_kv_head_num = model_->local_kv_head_num_; + const ssize_t hidden_units = model_->hidden_units_; + const ssize_t vocab_size = model_->vocab_size_padded_; + const ssize_t head_dim = model_->size_per_head_; + const ssize_t local_kv_head_num = model_->local_kv_head_num_; // +1 padding, BlockIterator does not use predicate - const size_t max_batch_block_count = + const ssize_t max_batch_block_count = batch_size * ((session_len + cache_block_seq_len - 1) / cache_block_seq_len) + 1; - context_decoder_input_buf_ = - (T*)allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * max_forward_token_num_ * hidden_units, false); - context_decoder_ids_buf_ = - (int*)allocator_->reMalloc(context_decoder_ids_buf_, sizeof(int) * max_forward_token_num_, false); - - decoder_input_buf_ = (T*)allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units, false); - decoder_output_buf_ = (T*)allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units, false); - - input_ids_buf_ = (int*)allocator_->reMalloc(input_ids_buf_, sizeof(int) * batchxbeam * session_len, true); - input_length_buf_ = (int*)allocator_->reMalloc(input_length_buf_, sizeof(int) * batchxbeam); - context_length_buf_ = (int*)allocator_->reMalloc(context_length_buf_, sizeof(int) * batchxbeam); - init_context_length_ = (int*)allocator_->reMalloc(init_context_length_, sizeof(int) * batchxbeam); + input_ids_buf_ = {max_forward_token_num_, kDEVICE}; - sequence_lengths_ = (int*)allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false); + decoder_output_buf_ = {{batchxbeam, hidden_units}, data_type_, kDEVICE}; - cu_block_counts_ = (int*)allocator_->reMalloc(cu_block_counts_, sizeof(int) * (batch_size + 1)); - block_ptrs_ = (uintptr_t*)allocator_->reMalloc(block_ptrs_, sizeof(uintptr_t) * max_batch_block_count); + input_length_buf_ = {batchxbeam, kDEVICE}; + context_length_buf_ = {batchxbeam, kDEVICE}; + init_context_length_ = {batchxbeam, kDEVICE}; - if (!logits_buf_) { // may be alias of local_logits_buf_ - logits_buf_ = (T*)allocator_->reMalloc(logits_buf_, sizeof(T) * batchxbeam * vocab_size, false); - } - - sampled_logprobs_ = (T*)allocator_->reMalloc(sampled_logprobs_, sizeof(T) * batchxbeam * kMaxLogProb, false); - sampled_indexes_ = - (uint32_t*)allocator_->reMalloc(sampled_indexes_, sizeof(uint32_t) * batchxbeam * kMaxLogProb, false); - sampled_nums_ = (uint32_t*)allocator_->reMalloc(sampled_nums_, sizeof(uint32_t) * batchxbeam, false); + sequence_lengths_ = {batchxbeam, kDEVICE}; - token_ids_buf_ = (int*)allocator_->reMalloc(token_ids_buf_, sizeof(int) * batchxbeam * session_len * 2, true); + cu_block_counts_ = {batch_size + 1, kDEVICE}; + block_ptrs_ = {max_batch_block_count, kDEVICE}; - finished_buf_ = (bool*)allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false); - seq_limit_len_ = 
(uint32_t*)allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false); + sampled_logprobs_ = {batchxbeam * kMaxLogProb, kDEVICE}; + sampled_indexes_ = {batchxbeam * kMaxLogProb, kDEVICE}; + sampled_nums_ = {batchxbeam, kDEVICE}; - rope_theta_ = (float*)allocator_->reMalloc(rope_theta_, sizeof(float) * batch_size, false); + token_ids_buf_ = {ssize_t(session_len * 2 * batchxbeam), kDEVICE}; - is_allocate_buffer_ = true; -} - -template -void LlamaBatch::AllocatePersistantBuffer(size_t max_batch_size, int cache_block_seq_len) -{ - d_stop_words_ = - (int*)allocator_->reMalloc(d_stop_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true); - d_bad_words_ = - (int*)allocator_->reMalloc(d_bad_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true); - h_stop_words_ = - (int*)allocator_->reMalloc(h_stop_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true, true); - h_bad_words_ = - (int*)allocator_->reMalloc(h_bad_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true, true); - - h_min_length_ = (int*)allocator_->reMalloc(h_min_length_, sizeof(int) * max_batch_size, true, true); - h_runtime_top_k_ = (int*)allocator_->reMalloc(h_runtime_top_k_, sizeof(int) * max_batch_size, true, true); - h_runtime_top_p_ = (float*)allocator_->reMalloc(h_runtime_top_p_, sizeof(float) * max_batch_size, true, true); - h_runtime_min_p_ = (float*)allocator_->reMalloc(h_runtime_min_p_, sizeof(float) * max_batch_size, true, true); - h_temperature_ = (float*)allocator_->reMalloc(h_temperature_, sizeof(float) * max_batch_size, true, true); - h_repetition_penalty_ = - (float*)allocator_->reMalloc(h_repetition_penalty_, sizeof(float) * max_batch_size, true, true); - - h_random_seed_ = (unsigned long long*)allocator_->reMalloc( - h_random_seed_, sizeof(unsigned long long) * max_batch_size, true, true); - d_random_seed_ = (unsigned long long*)allocator_->reMalloc( - d_random_seed_, sizeof(unsigned long long) * max_batch_size, true, false); - - h_curand_state_ = - (curandState_t*)allocator_->reMalloc(h_curand_state_, sizeof(curandState_t) * max_batch_size, true, true); - d_curand_state_ = - (curandState_t*)allocator_->reMalloc(d_curand_state_, sizeof(curandState_t) * max_batch_size, true, false); - - d_end_ids_buf_ = (int*)allocator_->reMalloc(d_end_ids_buf_, sizeof(int) * max_batch_size * kMaxEndIdsSize, false); - h_end_ids_buf_ = - (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size * kMaxEndIdsSize, false, true); + finished_buf_ = {(int)batchxbeam, kDEVICE}; + seq_limit_len_ = {batch_size, kDEVICE}; - for (auto& s : states_) { - s.output_ids = (int*)allocator_->reMalloc(s.output_ids, sizeof(int) * max_batch_size * session_len_, true); - s.curand_state = - (curandState_t*)allocator_->reMalloc(s.curand_state, sizeof(curandState_t) * max_batch_size, true); - } + rope_theta_ = {batch_size, kDEVICE}; - const size_t max_batch_block_count = - max_batch_size * ((session_len_ + cache_block_seq_len - 1) / cache_block_seq_len); + h_random_seed_ = {batch_size, kCPUpinned}; + Clear(h_random_seed_); - { - h_input_ids_buf_ = - (int*)allocator_->reMalloc(h_input_ids_buf_, sizeof(int) * max_batch_size * session_len_, false, true); - h_input_length_buf_ = - (int*)allocator_->reMalloc(h_input_length_buf_, sizeof(int) * max_batch_size, false, true); + d_random_seed_ = {batch_size, kDEVICE}; + Clear(d_random_seed_); - h_cu_block_counts_ = - (int*)allocator_->reMalloc(h_cu_block_counts_, sizeof(int) * (max_batch_size + 1), false, true); - h_block_ptrs_ = - 
(uintptr_t*)allocator_->reMalloc(h_block_ptrs_, sizeof(uintptr_t) * max_batch_block_count, false, true); + h_curand_state_ = {{batch_size, sizeof(curandState_t)}, kCPUpinned}; + Clear(h_curand_state_.buffer()); - for (auto& s : states_) { - s.h_prompt_length = - (int*)allocator_->reMalloc(s.h_prompt_length, sizeof(int) * max_batch_size, false, true); - s.h_context_length = - (int*)allocator_->reMalloc(s.h_context_length, sizeof(int) * max_batch_size, false, true); - s.h_finished = (bool*)allocator_->reMalloc(s.h_finished, sizeof(bool) * max_batch_size * 2, false, true); - s.h_rope_theta = (float*)allocator_->reMalloc(s.h_rope_theta, sizeof(float) * max_batch_size, false, true); - } + d_curand_state_ = {{batch_size, sizeof(curandState_t)}, kDEVICE}; + Clear(d_curand_state_.buffer()); - h_seq_limit_len_ = - (uint32_t*)allocator_->reMalloc(h_seq_limit_len_, sizeof(uint32_t) * max_batch_size, false, true); + for (auto& s : states_) { + s.output_ids = {{batch_size, session_len_}, kDEVICE}; + Clear(s.output_ids.buffer()); - h_output_ids_ = - (int*)allocator_->reMalloc(h_output_ids_, sizeof(int) * max_batch_size * session_len_, false, true); + s.curand_state = {{batch_size, sizeof(curandState_t)}, kDEVICE}; + Clear(s.curand_state.buffer()); } - h_sampled_logprobs_ = - (T*)allocator_->reMalloc(h_sampled_logprobs_, sizeof(T) * max_batch_size * kMaxLogProb, false, true); - h_sampled_indexes_ = (uint32_t*)allocator_->reMalloc( - h_sampled_indexes_, sizeof(uint32_t) * max_batch_size * kMaxLogProb, false, true); - h_sampled_nums_ = (uint32_t*)allocator_->reMalloc(h_sampled_nums_, sizeof(uint32_t) * max_batch_size, false, true); + h_input_length_buf_ = {batch_size, kCPUpinned}; + h_cu_block_counts_ = {batch_size + 1, kCPUpinned}; + h_block_ptrs_ = {(ssize_t)max_batch_block_count, kCPUpinned}; - is_allocate_persistant_buffer_ = true; -} + for (auto& s : states_) { + s.h_prompt_length = {batch_size, kCPUpinned}; + s.h_context_length = {batch_size, kCPUpinned}; + s.h_finished = {batch_size * 2, kCPUpinned}; + s.h_rope_theta = {batch_size, kCPUpinned}; + } -template -void LlamaBatch::AllocCommBuffers() -{ - const size_t hidden_units = model_->hidden_units_; - const size_t vocab_size_padded = model_->vocab_size_padded_; + h_seq_limit_len_ = {batch_size, kCPUpinned}; + std::fill_n(h_seq_limit_len_.data(), batch_size, 0); - // Native comm fuses allreduce & rmsnorm in token granularity - const size_t max_fwd_token_num = ((size_t)max_forward_token_num_ + tp_size_ - 1) / tp_size_ * tp_size_; + h_output_ids_ = {batch_size * session_len_, kCPUpinned}; - // TODO: rename this to hidden_states - context_decoder_output_buf_ = - (T*)CommBufAlloc(sizeof(T) * param_.attn_dp_size * max_fwd_token_num * hidden_units, true); - - local_logits_buf_ = (T*)CommBufAlloc(sizeof(T) * max_batch_size_ * vocab_size_padded, true); - if (model_->use_allgather_2d_) { - logits_buf_ = local_logits_buf_; - } + h_sampled_logprobs_ = {batch_size * kMaxLogProb, kCPUpinned}; + h_sampled_indexes_ = {batch_size * kMaxLogProb, kCPUpinned}; + h_sampled_nums_ = {batch_size, kCPUpinned}; } -template -void LlamaBatch::FreeCommBuffers() +void LlamaBatch::AllocSymmBuffers() { - CommBufFree((void**)&context_decoder_output_buf_, true); + const ssize_t hidden_units = model_->hidden_units_; + const ssize_t vocab_size_padded = model_->vocab_size_padded_; - if (local_logits_buf_) { - if (logits_buf_ == local_logits_buf_) { - logits_buf_ = {}; - } - CommBufFree((void**)&local_logits_buf_, true); - } + // Native comm fuses allreduce & rmsnorm in token 
granularity + TM_CHECK(max_forward_token_num_ % tp_size_ == 0); - if (local_context_logits_buf_) { - if (context_logits_buf_ == local_context_logits_buf_) { - context_logits_buf_ = {}; - } - CommBufFree((void**)&local_context_logits_buf_, true); - } + symm_hidden_states_buf_ = {{max_forward_token_num_ * param_.attn_dp_size, hidden_units}, data_type_, symm_alloc_}; + symm_logits_buf_ = {{max_batch_size_, vocab_size_padded}, data_type_, symm_alloc_}; } -template -void LlamaBatch::FreeBuffer() +void LlamaBatch::FreeSymmBuffers() { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (is_allocate_buffer_) { - allocator_->free((void**)&context_decoder_input_buf_); - - allocator_->free((void**)&context_decoder_ids_buf_); - allocator_->free((void**)&lora_mask_buf_); - - allocator_->free((void**)&decoder_input_buf_); - allocator_->free((void**)&decoder_output_buf_); - - allocator_->free((void**)&input_ids_buf_); - allocator_->free((void**)&input_length_buf_); - allocator_->free((void**)&context_length_buf_); - allocator_->free((void**)&init_context_length_); - - allocator_->free((void**)&sequence_lengths_); - - allocator_->free((void**)&cu_block_counts_); - allocator_->free((void**)&block_ptrs_); - - if (logits_buf_) { - allocator_->free((void**)&logits_buf_); - } - if (context_logits_buf_) { - allocator_->free((void**)&context_logits_buf_); - } - - allocator_->free((void**)&token_ids_buf_); - - allocator_->free((void**)&d_end_ids_buf_); - allocator_->free((void**)&h_end_ids_buf_, true); - - allocator_->free((void**)&finished_buf_); - allocator_->free((void**)&seq_limit_len_); - - allocator_->free((void**)&rope_theta_); - - allocator_->free((void**)&sampled_logprobs_); - allocator_->free((void**)&sampled_indexes_); - allocator_->free((void**)&sampled_nums_); - - is_allocate_buffer_ = false; - } - - if (is_allocate_persistant_buffer_) { - - allocator_->free((void**)&d_stop_words_); - allocator_->free((void**)&h_stop_words_, true); - allocator_->free((void**)&d_bad_words_); - allocator_->free((void**)&h_bad_words_, true); - allocator_->free((void**)&d_random_seed_); - allocator_->free((void**)&h_random_seed_, true); - allocator_->free((void**)&d_curand_state_); - allocator_->free((void**)&h_curand_state_, true); - - for (auto& s : states_) { - allocator_->free((void**)&s.h_context_length, true); - allocator_->free((void**)&s.h_finished, true); - allocator_->free((void**)&s.h_rope_theta, true); - allocator_->free((void**)&s.output_ids); - allocator_->free((void**)&s.curand_state); - } - allocator_->free((void**)&h_cu_block_counts_, true); - allocator_->free((void**)&h_block_ptrs_, true); - allocator_->free((void**)&h_input_ids_buf_, true); - allocator_->free((void**)&h_input_length_buf_, true); - allocator_->free((void**)&h_seq_limit_len_, true); - - allocator_->free((void**)&h_output_ids_, true); - - allocator_->free((void**)&h_sampled_logprobs_); - allocator_->free((void**)&h_sampled_indexes_); - allocator_->free((void**)&h_sampled_nums_); - - is_allocate_persistant_buffer_ = false; - } + symm_hidden_states_buf_ = {}; + symm_logits_buf_ = {}; } -template -LlamaBatch::~LlamaBatch() +LlamaBatch::~LlamaBatch() { TM_LOG_DEBUG("~LlamaBatch()"); @@ -950,24 +792,22 @@ LlamaBatch::~LlamaBatch() cudaSetDevice(device_id_); cudaStreamSynchronize(stream_); - FreeBuffer(); - model_.reset(); sequence_manager_.reset(); context_.reset(); // This destroy all objects in context except for `stream` } -template -LlamaBatch::LlamaBatch(const EngineParam& param, - std::unique_ptr> model, // ! 
This is moved - std::unique_ptr> ctx, // ! This is moved - std::shared_ptr gateway, - int device_id, - int dp_rank): +LlamaBatch::LlamaBatch(DataType data_type, + const EngineParam& param, + std::unique_ptr model, // ! This is moved + std::unique_ptr ctx, // ! This is moved + std::shared_ptr gateway, + int device_id, + int dp_rank): param_(param), gateway_(gateway), max_batch_size_(param.max_batch_size), - max_forward_token_num_(param.max_prefill_token_num + param.max_batch_size), + max_forward_token_num_(param.max_forward_token_num), max_context_token_num_(param.max_context_token_num), num_tokens_per_iter_(param.num_tokens_per_iter), max_prefill_iters_(param.max_prefill_iters), @@ -975,11 +815,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, dp_rank_(dp_rank), tp_size_(model->tp_size_), tp_rank_(model->tp_rank_), - data_type_(getTensorType()), + data_type_(data_type), debug_(isDebug()), stream_(ctx->stream), - allocator_(ctx->allocator.get()), - cublas_wrapper_(ctx->cublas_wrapper.get()), context_(std::move(ctx)), model_(std::move(model)), comm_(context_->comm), @@ -987,14 +825,16 @@ LlamaBatch::LlamaBatch(const EngineParam& param, { const auto cache_block_seq_len = model_->attn_param_.cache_block_seq_len; + const int dbits = byte_size(data_type, 8); + const auto quant_policy = model_->param_.quant_policy; - const int elem_bits = quant_policy ? quant_policy : bitsof; + const int elem_bits = quant_policy ? quant_policy : dbits; SequenceManager::BlockConfig block_config{ (int)model_->size_per_head_, (int)model_->local_kv_head_num_, cache_block_seq_len, - elem_bits == bitsof ? 0 : bitsof, + elem_bits == dbits ? 0 : dbits, elem_bits, }; @@ -1010,7 +850,7 @@ LlamaBatch::LlamaBatch(const EngineParam& param, param.cache_chunk_size, param.enable_prefix_caching, tp_rank_, - allocator_, + core::Context::alloc(kDEVICE), get_free_size}); const size_t max_session_len = sequence_manager_->max_block_count() * cache_block_seq_len; @@ -1037,20 +877,24 @@ LlamaBatch::LlamaBatch(const EngineParam& param, back_ = &states_[1]; incoming_ = &states_[2]; - AllocCommBuffers(); + symm_alloc_ = core::SimpleAllocator::Create([this](ssize_t size) { return SymmAlloc(size, true); }, + [this](void* p, ssize_t size) { return SymmFree(p, size, true); }, + kDEVICE); + + AllocSymmBuffers(); AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); - AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); // Wait for allocations check_cuda_error(cudaStreamSynchronize(stream_)); } -template -void LlamaBatch::InitializeSampling(const GenerationState& g) +void LlamaBatch::InitializeSampling(const GenerationState& g) { NvtxScope _("InitSampling"); + const int batch_size = state_->active_size - g.partial; + if (batch_size == 0) { return; } @@ -1063,11 +907,11 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) // note that in decoder and in output "sequence length" has different semantic // - in decoder it means length of sequence that has kv cache already computed // - in output it means length of all tokens (the last generated token does not have k/v cache computed yet) - invokePlusScalar(sequence_lengths_, -1, batch_size, stream_); + invokePlusScalar(sequence_lengths_.data(), -1, batch_size, stream_); sync_check_cuda_error(); - Clear(token_ids_buf_, batch_size * session_len_); - invokeTranspose2D(token_ids_buf_, state_->output_ids, batch_size, session_len_, stream_); + Clear(token_ids_buf_.slice(0, batch_size * session_len_)); + invokeTranspose2D(token_ids_buf_.data(), 
state_->output_ids.data(), batch_size, session_len_, stream_); sync_check_cuda_error(); // token_ids_buf_[s, b] @@ -1076,7 +920,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) // ABCDEFGHi -> ABCDEFGHi i // ABCDEFGh ABCDEFGh h // ABCd ABCd d - invokePadLastTokenIds(token_ids_buf_, init_context_length_, g.max_init_ctx_len, batch_size, stream_); + invokePadLastTokenIds(token_ids_buf_.data(), init_context_length_.data(), g.max_init_ctx_len, batch_size, stream_); sync_check_cuda_error(); // seq_limit_len_, will be compared to `step` instead of `sequence_length`, so padding len should be accounted for @@ -1085,213 +929,74 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) } Copy(h_seq_limit_len_, batch_size, seq_limit_len_); - TensorMap inputs; - - auto member_to_tensor = [&](auto getter, auto key, auto dest, auto init) { - int count = 0; - for (int i = 0; i < batch_size; ++i) { - // `std::invoke` - dest[i] = state_->requests[i]->gen_cfg.*getter; - count += dest[i] != init; - } - if (count) { - inputs.insert(key, {MEMORY_CPU, getTensorType(), {(size_t)batch_size}, dest}); - } - }; - - using G = GenerationConfig; - member_to_tensor(&G::top_k, "runtime_top_k", h_runtime_top_k_, 0); - member_to_tensor(&G::top_p, "runtime_top_p", h_runtime_top_p_, 0); - member_to_tensor(&G::min_p, "runtime_min_p", h_runtime_min_p_, 0); - member_to_tensor(&G::temperature, "temperature", h_temperature_, 1.f); - member_to_tensor(&G::repetition_penalty, "repetition_penalty", h_repetition_penalty_, 1.f); - member_to_tensor(&G::min_new_tokens, "min_length", h_min_length_, 0); - - auto init_stop_bad_words = [&](auto getter, auto key, auto h_buf, auto d_buf) { - int max_length = 0; - std::vector> copy_tokens(batch_size); - std::vector> copy_offsets(batch_size); - for (int i = 0; i < batch_size; ++i) { - const auto& [token_ids, offsets] = std::invoke(getter, state_->requests[i]->gen_cfg); - if (offsets.size() == 0 || token_ids.size() == 0) { - continue; - } - FT_CHECK(offsets.back() == token_ids.size()); - if (offsets.back() <= kMaxStopBadWordsLen) { - copy_tokens[i] = std::make_pair(token_ids.data(), (int)token_ids.size()); - copy_offsets[i] = std::make_pair(offsets.data(), (int)offsets.size()); - max_length = std::max(max_length, (int)token_ids.size()); - } - else { - auto trunc_offset_size = - std::upper_bound(offsets.begin(), - offsets.begin() + std::min(kMaxStopBadWordsLen, (int)offsets.size()), - kMaxStopBadWordsLen) - - offsets.begin(); - TM_LOG_WARNING("[InitializeSampling] [%ld] %s length (%d) exceeds %d, truncated to %d", - state_->requests[i]->id, - key, - offsets.back(), - kMaxStopBadWordsLen, - trunc_offset_size); - if (trunc_offset_size > 0) { - int trunc_token_size = offsets[trunc_token_size - 1]; - copy_tokens[i] = std::make_pair(token_ids.data(), trunc_token_size); - copy_offsets[i] = std::make_pair(offsets.data(), trunc_offset_size); - max_length = std::max(max_length, trunc_token_size); - } - } - } - if (!max_length) { - return; - } - std::fill_n(h_buf, batch_size * 2 * max_length, -1); - for (int i = 0; i < batch_size; ++i) { - if (copy_tokens[i].first != nullptr) { - std::copy_n(copy_tokens[i].first, copy_tokens[i].second, h_buf + i * 2 * max_length); - } - if (copy_offsets[i].first != nullptr) { - std::copy_n(copy_offsets[i].first, copy_offsets[i].second, h_buf + i * 2 * max_length + max_length); - } - } - Copy(h_buf, batch_size * 2 * max_length, d_buf); - inputs.insert(key, {MEMORY_GPU, TYPE_INT32, {(size_t)batch_size, (size_t)2, (size_t)max_length}, d_buf}); - }; - 
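A note on the stop/bad-words setup being removed above: when the cumulative offsets exceed kMaxStopBadWordsLen, the list is cut at the first offset past the limit via std::upper_bound, and the surviving token count is re-read from the truncated offset array. A simplified, self-contained sketch of that truncation (names are illustrative):

    #include <algorithm>
    #include <utility>
    #include <vector>

    // `offsets` are cumulative word boundaries (offsets.back() == total token count).
    // Returns {token_count, offset_count} that fit within max_len.
    inline std::pair<int, int> truncate_words(const std::vector<int>& offsets, int max_len)
    {
        if (offsets.empty()) {
            return {0, 0};
        }
        if (offsets.back() <= max_len) {
            return {offsets.back(), (int)offsets.size()};
        }
        // First offset exceeding max_len marks the cut point.
        const auto cut          = std::upper_bound(offsets.begin(), offsets.end(), max_len);
        const int  offset_count = (int)(cut - offsets.begin());
        const int  token_count  = offset_count > 0 ? offsets[offset_count - 1] : 0;
        return {token_count, offset_count};
    }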
init_stop_bad_words(&G::stop_ids, "stop_words_list", h_stop_words_, d_stop_words_); - init_stop_bad_words(&G::bad_ids, "bad_words_list", h_bad_words_, d_bad_words_); - - // MinLengthPenalty - if (inputs.isExist("min_length")) { - inputs.insert({"prompt_length", {MEMORY_CPU, TYPE_INT32, {(size_t)batch_size}, state_->h_prompt_length}}); - inputs.insert({"context_length", {MEMORY_CPU, TYPE_INT32, {(size_t)batch_size}, state_->h_context_length}}); - } - - // init for eos - auto init_for_eos = [&] { - int max_length = 0; - for (int i = 0; i < batch_size; ++i) { - max_length = std::max(max_length, (int)state_->requests[i]->gen_cfg.eos_ids.size()); - } - if (max_length) { - max_length = std::min(max_length, kMaxEndIdsSize); - int* h_end_ids = h_end_ids_buf_; - std::fill(h_end_ids, h_end_ids + std::min(kMaxEndIdsSize, max_length) * batch_size, -1); - for (int i = 0; i < batch_size; ++i) { - const auto& eos_ids = state_->requests[i]->gen_cfg.eos_ids; - if (eos_ids.size() == 0) { - continue; - } - if (eos_ids.size() > kMaxEndIdsSize) { - TM_LOG_WARNING("[InitializeSampling] [%ld] eos length (%d) exceeds %d, truncated to %d", - (long)state_->requests[i]->id, - (int)eos_ids.size(), - kMaxEndIdsSize, - kMaxEndIdsSize); - } - std::copy_n(eos_ids.begin(), std::min((int)eos_ids.size(), kMaxEndIdsSize), h_end_ids); - h_end_ids += max_length; - } - Copy(h_end_ids_buf_, batch_size * max_length, d_end_ids_buf_); - inputs.insert("end_ids", - {MEMORY_GPU, TYPE_INT32, {(size_t)batch_size, (size_t)max_length}, d_end_ids_buf_}); - } - }; - init_for_eos(); - - inputs_ = std::move(inputs); - - { - NvtxScope setup("DynamicDecodeLayer.setup"); - model_->dynamic_decode_layer_->setup(batch_size, 1, &inputs_); + std::vector rs; + rs.reserve(batch_size); + for (int i = 0; i < batch_size; ++i) { + rs.push_back(state_->requests[i].get()); } - TensorMap outputs; - for (int i = 0; i < batch_size; i++) { - if (state_->requests[i]->gen_cfg.output_logprobs) { - outputs.insert({"sampled_logprobs", - {MEMORY_GPU, getTensorType(), {(size_t)batch_size, 1, kMaxLogProb}, sampled_logprobs_}}); - outputs.insert( - {"sampled_indexes", {MEMORY_GPU, TYPE_UINT32, {(size_t)batch_size, 1, kMaxLogProb}, sampled_indexes_}}); - outputs.insert({"sampled_nums", {MEMORY_GPU, TYPE_UINT32, {(size_t)batch_size, 1}, sampled_nums_}}); + model_->dynamic_decode_->Setup(rs, {{"prompt_length", {state_->h_prompt_length, {batch_size}}}}); - break; - } - } - outputs_ = std::move(outputs); sync_check_cuda_error(); } -template -void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last) +void LlamaBatch::ComputeAndOutputLogits(const Tensor& hidden_states, int first, int last) { - int token_num = 0; - bool found = false; - for (int i = first; i < last; ++i) { - if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { - const auto& s = *state_->sequences[i]; - // Skip when the seq is filling missed cache only - if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { - found = true; + auto enable = [&] { + for (int i = first; i < last; ++i) { + if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { + const auto& s = *state_->sequences[i]; + // Skip when the seq is filling missed cache only + if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { + return true; + } } } - token_num += h_input_length_buf_[i]; - } + return false; + }(); - if (!found) { + if (!enable) { return; } - if (tp_size_ > 1) { - FT_CHECK(model_->vocab_size_padded_ % tp_size_ == 0); - const size_t byte_size = sizeof(T) * 
model_->vocab_size_padded_ * token_num; + const int vocab_size_padded = model_->vocab_size_padded_; + const int token_num = hidden_states.shape(0); - if (local_context_logits_buf_size_ < byte_size) { + if (symm_logits_buf_.shape(0) < token_num) { + if (tp_size_ > 1) { check_cuda_error(cudaStreamSynchronize(stream_)); comm_.h_tp_group->Sync(); - - CommBufFree((void**)&local_context_logits_buf_, true); - local_context_logits_buf_ = (T*)CommBufAlloc(byte_size, true); - local_context_logits_buf_size_ = byte_size; - + } + symm_logits_buf_ = {{token_num, vocab_size_padded}, data_type_, symm_alloc_}; + if (tp_size_ > 1) { check_cuda_error(cudaStreamSynchronize(stream_)); comm_.h_tp_group->Sync(); } } - if (model_->use_allgather_2d_) { - // No intermediate transpose needed - context_logits_buf_ = local_context_logits_buf_; - } - else { - context_logits_buf_ = - (T*)allocator_->reMalloc(context_logits_buf_, sizeof(T) * model_->vocab_size_padded_ * token_num, false); - } - - model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, hidden_states, token_num); + auto logits = model_->postDecodeEmbedding(hidden_states, symm_logits_buf_.buffer()); - if (tp_rank_ != 0) { - return; + if (tp_rank_ == 0) { + OutputLogits(logits, first, last, GenerationConfig::kAll); } - - OutputLogits(context_logits_buf_, first, last, GenerationConfig::kAll); } -template -void LlamaBatch::OutputLogits(const T* logits, int first, int last, GenerationConfig::OutType out_type) +void LlamaBatch::OutputLogits(const Tensor& logits, int first, int last, GenerationConfig::OutType out_type) { + const auto& src_buf = logits.buffer(); + const auto elem_size = byte_size(logits.dtype(), 1); // when `is_all` is true, logits only contains last token of the sequences const bool is_all = out_type == GenerationConfig::kAll; + int base = 0; + for (int i = first; i < last; ++i) { const int input_len = h_input_length_buf_[i]; // input lenght for this iter - const T* src_ptr = logits; - - logits += (is_all ? input_len : 1) * model_->vocab_size_padded_; if (state_->requests[i]->gen_cfg.output_logits == out_type) { - auto dst_ptr = state_->requests[i]->outputs.getPtr("logits"); + auto& dst_buf = state_->requests[i]->outputs.at("logits").buffer(); const int cache_len = state_->sequences[i]->cache_len; const int history_len = state_->sequences[i]->tokens.size(); @@ -1300,7 +1005,7 @@ void LlamaBatch::OutputLogits(const T* logits, int first, int last, Generatio // C C C C // offset to the last token prompt - const int offset = is_all ? 0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape(0) - 1; int diff = (history_len + offset) - cache_len; @@ -1319,67 +1024,72 @@ void LlamaBatch::OutputLogits(const T* logits, int first, int last, Generatio continue; } + int src_base = base; + if (is_all) { // Skip invalid tokens caused by cache miss - src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->vocab_size_padded_; + src_base += std::max(0, (history_len + offset) - cache_len); } // Skip previous chunks - dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->vocab_size_; + int dst_base = std::max(0, cache_len - (history_len + offset)); - check_cuda_error(cudaMemcpy2DAsync(dst_ptr, - sizeof(T) * model_->vocab_size_, - src_ptr, - sizeof(T) * model_->vocab_size_padded_, - sizeof(T) * model_->vocab_size_, + check_cuda_error(cudaMemcpy2DAsync(dst_buf.raw_data(dst_base * model_->vocab_size_), + elem_size * model_->vocab_size_, + src_buf.raw_data(src_base * model_->vocab_size_padded_), + elem_size * model_->vocab_size_padded_, + elem_size * model_->vocab_size_, valid_len, cudaMemcpyDefault, stream_)); } + + base += is_all ? input_len : 1; } } -template -void LlamaBatch::OutputLastHiddenState(const T* hidden_states, int first, int last) +void LlamaBatch::OutputLastHiddenState(const Tensor& hidden_states, int first, int last) { - for (int i = first; i < last; ++i) { + const auto& src_buf = hidden_states.buffer(); + const auto data_type = src_buf.dtype(); + int base = 0; + for (int i = first; i < last; ++i) { const int input_len = h_input_length_buf_[i]; // input lenght for this iter - const T* src_ptr = hidden_states; - - hidden_states += input_len * model_->hidden_units_; if (auto out_type = state_->requests[i]->gen_cfg.output_last_hidden_state) { const bool is_all = out_type == GenerationConfig::kAll; - T* dst_ptr = state_->requests[i]->outputs.getPtr("last_hidden_state"); + auto& dst_buf = state_->requests[i]->outputs.at("last_hidden_state").buffer(); const int cache_len = state_->sequences[i]->cache_len; const int history_len = state_->sequences[i]->tokens.size(); // offset to the last prompt token - const int offset = is_all ? 0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape(0) - 1; const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); // TM_LOG_ERROR("%d %d %d %d %d", history_len, offset, cache_len, input_len, valid_len); - if (valid_len <= 0) { - continue; - } - - // Skip invalid tokens caused by cache miss - src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->hidden_units_; - // Skip previous chunks - dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->hidden_units_; + if (valid_len > 0) { + // Skip invalid tokens caused by cache miss + int src_base = std::max(0, (history_len + offset) - cache_len) + base; + // Skip previous chunks + int dst_base = std::max(0, cache_len - (history_len + offset)); - Copy(src_ptr, valid_len * model_->hidden_units_, dst_ptr); + core::Copy(src_buf.raw_data(src_base * model_->hidden_units_), + byte_size(data_type, valid_len * model_->hidden_units_), + dst_buf.raw_data(dst_base * model_->hidden_units_)); + } } + + // hidden_states += input_len * model_->hidden_units_; + base += input_len; } } -template -void LlamaBatch::Finish(GenerationState& g, std::vector& signals) +void LlamaBatch::Finish(GenerationState& g, std::vector& signals) { NvtxScope scope("Finish"); const int batch_size = state_->active_size; @@ -1390,9 +1100,9 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) FT_CHECK(g.step >= 0); // [s,b] -> [b,s] and skip padding in [context_len, max_context_len) - invokeGatherOutput(state_->output_ids, - token_ids_buf_, - init_context_length_, + invokeGatherOutput(state_->output_ids.data(), + token_ids_buf_.data(), + init_context_length_.data(), g.max_init_ctx_len, g.step, session_len_, @@ -1401,7 +1111,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) sync_check_cuda_error(); } - Copy(token_ids_buf_ + (g.step - 1) * (batch_size - g.partial), batch_size - g.partial, h_output_ids_); + Copy(token_ids_buf_.slice((g.step - 1) * (batch_size - g.partial), -1), batch_size - g.partial, h_output_ids_); Copy(finished_buf_, batch_size, state_->h_finished); Copy(sequence_lengths_, batch_size, state_->h_context_length); @@ -1430,14 +1140,14 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) if (tp_rank_ == 0 && output_logprobs) { NvtxScope scope("logprobs"); // output logprobs, should be set before sequence_length - T* sampled_logprobs_ptr = h_sampled_logprobs_; - uint32_t* sampled_indexes_ptr = h_sampled_indexes_; - uint32_t* sampled_nums_ptr = h_sampled_nums_; + float* sampled_logprobs_ptr = h_sampled_logprobs_.data(); + uint32_t* sampled_indexes_ptr = h_sampled_indexes_.data(); + uint32_t* sampled_nums_ptr = h_sampled_nums_.data(); for (int i = 0; i < batch_size - g.partial; ++i) { if (state_->requests[i] && state_->requests[i]->gen_cfg.output_logprobs) { - auto logprob_vals = state_->requests[i]->outputs.getPtr("logprob_vals"); - auto logprob_indexes = state_->requests[i]->outputs.getPtr("logprob_indexes"); - auto logprob_nums = state_->requests[i]->outputs.getPtr("logprob_nums"); + auto logprob_vals = state_->requests[i]->outputs.at("logprob_vals").data(); + auto logprob_indexes = state_->requests[i]->outputs.at("logprob_indexes").data(); + auto logprob_nums = state_->requests[i]->outputs.at("logprob_nums").data(); int offset = state_->h_context_length[i] - state_->h_prompt_length[i] - 1; std::copy(sampled_logprobs_ptr, @@ -1457,35 +1167,13 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) // ! 
Only rank-0 writes to output if (tp_rank_ == 0) { NvtxScope scope("output_ids"); - if constexpr (0) { - // set output tokens ids and sequence length - int* output_ptr = h_output_ids_; - for (int i = 0; i < batch_size - g.partial; ++i) { - if (auto& r = state_->requests[i]) { - auto output_ids = static_cast(r->output_ids.data); - auto output_len = static_cast(r->sequence_length.data); - const int count = state_->h_context_length[i]; - if (r->stream_output) { - output_ids[count - 1] = output_ptr[count - 1]; - *output_len = count; - } - else if (state_->h_finished[i]) { - std::copy(output_ptr, output_ptr + count, output_ids); - *output_len = count; - } - } - output_ptr += session_len_; - } - } - else { - for (int i = 0; i < batch_size - g.partial; ++i) { - if (auto& r = state_->requests[i]) { - auto output_ids = static_cast(r->output_ids.data); - auto output_len = static_cast(r->sequence_length.data); - const int count = state_->h_context_length[i]; - output_ids[count - 1] = h_output_ids_[i]; - *output_len = count; - } + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = r->output_ids.data(); + auto output_len = r->sequence_length.data(); + const int count = state_->h_context_length[i]; + output_ids[count - 1] = h_output_ids_[i]; + *output_len = count; } } } @@ -1497,7 +1185,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) for (int i = 0; i < batch_size; ++i) { // ss << (i ? ", " : "") << "(" << state_->h_context_length[i] << "," << state_->h_finished[i] << ")"; std::vector tokens(state_->h_context_length[i]); - Copy(state_->output_ids + i * session_len_, tokens.size(), tokens.data()); + core::Copy(state_->output_ids.data() + i * session_len_, tokens.size(), tokens.data()); cudaStreamSynchronize(stream_); std::stringstream ss; for (const auto& t : tokens) { @@ -1535,7 +1223,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) FT_CHECK(!r); } else if (r->stream_output && tp_rank_ == 0) { - const auto seq_len = r->sequence_length.getVal(); + const auto seq_len = *r->sequence_length.data(); // Create signals by copying the request handles for non-finished streaming requests signals.push_back([this, r, seq_len] { // UpdateState(*r, Request::kOk, seq_len); @@ -1556,8 +1244,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) } } -template -auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Signal +auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Signal { if (tp_rank_ == 0) { TM_LOG_INFO("[Interrupt] slot %d, request %lu, stop %d, end %d", @@ -1569,7 +1256,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig if (debug_ && tp_rank_ == 0) { std::vector tokens(state_->h_context_length[index]); - Copy(state_->output_ids + index * session_len_, tokens.size(), tokens.data()); + core::Copy(state_->output_ids.data() + index * session_len_, tokens.size(), tokens.data()); cudaStreamSynchronize(stream_); std::stringstream ss; for (const auto& t : tokens) { @@ -1590,13 +1277,13 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig seq.tokens.resize(output_len); // output_ids is updated & synced in `Finish` - const auto output_ids = state_->requests[index]->output_ids.getPtr(); + const auto output_ids = state_->requests[index]->output_ids.data(); std::copy_n(output_ids, output_len, seq.tokens.data()); // Save random state in host memory seq.random_state.resize(sizeof(curandState_t)); // This 
async copy must be synchronized by the caller - Copy(state_->curand_state + index, 1, (curandState_t*)seq.random_state.data()); + core::Copy((curandState_t*)state_->curand_state.data() + index, 1, (curandState_t*)seq.random_state.data()); // Set unlock flag for corresponding blocks, will be unlocked in the next `Materialize()` sequence_manager_->UpdateAndSetUnlock(seq); @@ -1606,7 +1293,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig auto ec = std::exchange(state_->errors[index], Request::kOk); - const auto len = state_->requests[index]->sequence_length.getVal(); + const auto len = *state_->requests[index]->sequence_length.data(); // move the request handle into the signal return [this, len, force_stop, r = std::move(state_->requests[index])] { // UpdateState(*r, force_stop ? Request::kCancel : Request::kFinish, len); @@ -1625,12 +1312,13 @@ struct RequestData { } // namespace -template -void LlamaBatch::InternalThreadEntry() +void LlamaBatch::InternalThreadEntry() { // TM_LOG_INFO("[InternalThreadEntry] %d", (int)rank_); check_cuda_error(cudaSetDevice(device_id_)); + core::ContextGuard guard{context_->core_stream, context_->allocator}; + // Initialize `AnomalyHandler` AnomalyHandler::instance().Init(tp_rank_, model_->vocab_size_padded_, 0, max_batch_size_, stream_); @@ -1712,8 +1400,7 @@ void LlamaBatch::InternalThreadEntry() DestroyCommunicators(); } -template -void LlamaBatch::Start() +void LlamaBatch::Start() { TM_LOG_INFO("LlamaBatch::Start()"); internal_thread_ = std::thread([this] { @@ -1727,8 +1414,7 @@ void LlamaBatch::Start() }); } -template -bool LlamaBatch::Forward(GenerationState& g) +bool LlamaBatch::Forward(GenerationState& g) { NvtxScope _("Forward"); @@ -1749,7 +1435,7 @@ bool LlamaBatch::Forward(GenerationState& g) // const int missing = state_->h_context_length[i] - seq.cache_len; FT_CHECK(seq.input_length >= 1); h_input_length_buf_[i] = seq.input_length; - input_d_ptrs[i] = state_->output_ids + i * session_len_ + seq.cache_len; + input_d_ptrs[i] = state_->output_ids.data() + i * session_len_ + seq.cache_len; if (seq.input_length > 1 && pf_offset < 0) { pf_offset = i; } @@ -1800,7 +1486,7 @@ bool LlamaBatch::Forward(GenerationState& g) const int first = offsets[p]; const int last = offsets[p + 1]; const int mini_batch_size = last - first; - int* input_ids = context_decoder_ids_buf_; + int* input_ids = input_ids_buf_.data(); BatchedCopy batched_copy; int sum_k = 0; @@ -1810,7 +1496,7 @@ bool LlamaBatch::Forward(GenerationState& g) sum_k += state_->h_context_length[i]; } } - int sum_q = input_ids - context_decoder_ids_buf_; + int sum_q = input_ids - input_ids_buf_.data(); batched_copy.Submit(stream_); @@ -1819,8 +1505,10 @@ bool LlamaBatch::Forward(GenerationState& g) if (tp_rank_ == 0) { if (pf_batch_size) { - const auto max_q = *std::max_element(h_input_length_buf_ + first, h_input_length_buf_ + last); - const auto max_k = *std::max_element(state_->h_context_length + first, state_->h_context_length + last); + const auto max_q = + *std::max_element(h_input_length_buf_.data() + first, h_input_length_buf_.data() + last); + const auto max_k = + *std::max_element(state_->h_context_length.data() + first, state_->h_context_length.data() + last); TM_LOG_INFO("[Forward] [%d, %d), dc=%d, pf=%d, sum_q=%d, sum_k=%d, max_q=%d, max_k=%d", first, last, @@ -1835,68 +1523,71 @@ bool LlamaBatch::Forward(GenerationState& g) // Synchronize batch token num with sync DP ranks auto local_token_nums = AllGather(comm_.h_dp_group, sum_q); + auto global_token_num 
= std::accumulate(local_token_nums.begin(), local_token_nums.end(), 0); - // if (comm_.h_comm->rank() == 0) { - // std::stringstream ss; - // for (auto x : local_token_nums) { - // ss << x << " "; - // } - // TM_LOG_ERROR("%s", ss.str().c_str()); - // } - - model_->forwardUnified(decoder_output_buf_ + first * model_->hidden_units_, - context_decoder_output_buf_, // temp - context_decoder_input_buf_, // temp - (void**)block_ptrs_, - cu_block_counts_ + first, - context_decoder_ids_buf_, // temp - h_input_length_buf_ + first, - state_->h_context_length + first, - rope_theta_ + first, - finished_buf_ + first, - sum_q, - local_token_nums.data(), - dc_batch_size, - pf_batch_size, - lora_mask_buf_, - state_->sequences.data() + first); - - ComputeAndOutputLogits(context_decoder_output_buf_, first, last); - OutputLastHiddenState(context_decoder_output_buf_, first, last); - } - - if (active_size > g.partial) { - model_->postDecodeEmbedding(logits_buf_, local_logits_buf_, decoder_output_buf_, active_size - g.partial); - - AnomalyHandler::instance().FixLogits(logits_buf_, active_size - g.partial, 1); - - OutputLogits(logits_buf_, 0, active_size - g.partial, GenerationConfig::kGeneration); + auto hidden_states = symm_hidden_states_buf_.slice(0, global_token_num); - FT_CHECK(g.step >= 0); + model_->Forward(input_ids_buf_.slice(0, sum_q), // temp + hidden_states, // temp + decoder_output_buf_.slice(first, mini_batch_size), + block_ptrs_, + cu_block_counts_.slice(first, mini_batch_size + 1), + h_input_length_buf_.slice(first, mini_batch_size), + state_->h_context_length.slice(first, mini_batch_size), + rope_theta_.slice(first, mini_batch_size), + finished_buf_.slice(first, mini_batch_size), + Buffer(local_token_nums.data(), local_token_nums.size(), kCPU), + lora_mask_buf_, + dc_batch_size, + pf_batch_size, + state_->sequences.data() + first); + + ComputeAndOutputLogits(hidden_states, first, last); + OutputLastHiddenState(hidden_states, first, last); + } + + if (const auto bsz = active_size - g.partial; bsz > 0) { + + auto logits = model_->postDecodeEmbedding(decoder_output_buf_.slice(0, bsz), symm_logits_buf_.buffer()); + + // AnomalyHandler::instance().FixLogits(logits.data(), bsz, 1); + + OutputLogits(logits, 0, bsz, GenerationConfig::kGeneration); + + TM_CHECK_GE(g.step, 0); if (!g.skip_init_sampling) { InitializeSampling(g); } + + bool output_logprobs = [&] { + for (int i = 0; i < bsz; ++i) { + if (state_->requests[i]->gen_cfg.output_logprobs) { + return true; + } + } + return false; + }(); + // stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is - // not supported yet. + // not supported. model_->dynamicDecode(token_ids_buf_, finished_buf_, sequence_lengths_, - nullptr, state_->curand_state, - &inputs_, - &outputs_, - logits_buf_, + logits, // <- batch size indicator seq_limit_len_, init_context_length_, + state_->h_context_length, + state_->h_prompt_length, + output_logprobs ? 
sampled_indexes_ : Buffer{}, // <- indicator + sampled_logprobs_, + sampled_nums_, g.step, - 0, - g.max_init_ctx_len, - session_len_ * 2, - active_size - g.partial); + g.max_init_ctx_len); } - std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); + std::fill(h_input_length_buf_.data(), h_input_length_buf_.data() + active_size, 0); // `SequenceManager` needs real-time value of cache length for (int i = 0; i < active_size; ++i) { @@ -1918,7 +1609,7 @@ bool LlamaBatch::Forward(GenerationState& g) if (debug_ && tp_rank_ == 0) { std::vector curr(active_size); - Copy(token_ids_buf_ + g.step * active_size, active_size, curr.data()); + core::Copy(token_ids_buf_.data() + g.step * active_size, active_size, curr.data()); cudaStreamSynchronize(stream_); std::stringstream scurr; for (int k = 0; k < curr.size(); ++k) { @@ -1927,14 +1618,10 @@ bool LlamaBatch::Forward(GenerationState& g) TM_LOG_INFO("[Forward] step = %d, [%s]", g.step - 1, scurr.str().c_str()); } - // check_cuda_error(cudaStreamSynchronize(stream_)); - //////////////////////////////////////////////// /// ! increase the counters g.step += 1; - // PrintDecodeTokens(token_ids_buf_, g.step, active_size, stream_, "Forward"); - return true; } @@ -1954,11 +1641,10 @@ std::string Join(First first, Last last, const std::string& delim) return oss.str(); } -template struct TuningContext { - LlamaLinear& linear_; - cudaStream_t stream_; - TuningContext(LlamaLinear& linear, cudaStream_t stream): linear_{linear}, stream_{stream} + LlamaLinear& linear_; + cudaStream_t stream_; + TuningContext(LlamaLinear& linear, cudaStream_t stream): linear_{linear}, stream_{stream} { isTuning() = true; linear_.set_measure(true); @@ -1972,8 +1658,7 @@ struct TuningContext { } // namespace -template -void LlamaBatch::Warmup() +void LlamaBatch::Warmup() { auto& linear = *context_->linear; if (auto str = std::getenv("TM_GEMM_IMPORT")) { @@ -2006,7 +1691,7 @@ void LlamaBatch::Warmup() for (auto& x : input_ids) { x = d(g); } - Copy(input_ids.data(), max_bs, context_decoder_ids_buf_); + core::Copy(input_ids.data(), max_bs, input_ids_buf_.data()); check_cuda_error(cudaStreamSynchronize(stream_)); TuningContext context{linear, stream_}; @@ -2014,29 +1699,31 @@ void LlamaBatch::Warmup() auto tick = std::chrono::steady_clock::now(); /// NOTE: No explicit barrier can be used here as internal threads are waiting on it now - for (auto bs : bss) { + for (auto token_num : bss) { if (tp_rank_ == 0) { - TM_LOG_INFO("[Gemm2] %d", bs); + TM_LOG_INFO("[Gemm2] %d", token_num); } - const int input_length = bs; - auto local_token_nums = AllGather(comm_.h_dp_group, bs); - - model_->forwardUnified(decoder_output_buf_, - context_decoder_output_buf_, - context_decoder_input_buf_, - (void**)block_ptrs_, // invalid data - cu_block_counts_, // invalid data - context_decoder_ids_buf_, - &input_length, - &input_length, - rope_theta_, // invalid data - finished_buf_, // invalid data - bs, - local_token_nums.data(), - 0, - 1, - nullptr, - nullptr); + + int input_length = token_num; + auto local_token_nums = AllGather(comm_.h_dp_group, token_num); + + const auto bsz = 1; + + // A single sequence containing `token_num` prefill tokens + model_->Forward(input_ids_buf_.slice(0, token_num), + symm_hidden_states_buf_.slice(0, token_num * param_.attn_dp_size), + decoder_output_buf_.slice(0, bsz), + block_ptrs_, + cu_block_counts_.slice(0, bsz + 1), + Buffer{&input_length, 1, kCPU}, + Buffer{&input_length, 1, kCPU}, + rope_theta_.slice(0, bsz), + finished_buf_.slice(0, bsz), + 
Buffer{local_token_nums.data(), (int)local_token_nums.size(), kCPU}, + Buffer{}, + 0, + bsz, + nullptr); } auto tock = std::chrono::steady_clock::now(); @@ -2060,8 +1747,7 @@ void LlamaBatch::Warmup() } } -template -void* LlamaBatch::CommBufAlloc(size_t size, bool register_) +void* LlamaBatch::SymmAlloc(size_t size, bool register_) { if (auto& comm = model_->comm_->d_comm) { auto ptr = comm->Allocate(size); @@ -2071,52 +1757,39 @@ void* LlamaBatch::CommBufAlloc(size_t size, bool register_) return ptr; } else { - return allocator_->malloc(size); + return context_->allocator->allocate(size); } } -template -void LlamaBatch::CommBufFree(void** ptr, bool deregister) +void LlamaBatch::SymmFree(void* ptr, size_t size, bool deregister) { if (!ptr) { return; } - if (auto& comm = model_->comm_->d_comm) { + if (auto& comm = comm_.d_comm) { if (deregister) { - comm->Deregister(*ptr); + comm->Deregister(ptr); } - comm->Free(*ptr); - *ptr = {}; + comm->Free(ptr); } else { - return allocator_->free(ptr); + context_->allocator->deallocate(ptr, size); } } -template -void LlamaBatch::DestroyCommunicators() +void LlamaBatch::DestroyCommunicators() { - if (comm_.d_comm) { - cudaStreamSynchronize(stream_); - comm_.h_comm->Sync(); + cudaStreamSynchronize(stream_); + comm_.h_comm->Sync(); - FreeCommBuffers(); - comm_.h_comm->Sync(); + FreeSymmBuffers(); + comm_.h_comm->Sync(); - // Destroy device communicator - comm_.d_comm = {}; + // Destroy device communicator + comm_.d_comm = {}; - cudaStreamSynchronize(stream_); - comm_.h_comm->Sync(); - } + cudaStreamSynchronize(stream_); + comm_.h_comm->Sync(); } -template class LlamaBatch; -#ifdef ENABLE_FP32 -template class LlamaBatch; -#endif -#ifdef ENABLE_BF16 -template class LlamaBatch<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 4fc5cee93a..837f841188 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -4,30 +4,31 @@ #include +#include "src/turbomind/core/core.h" + #include "src/turbomind/engine/gateway.h" #include "src/turbomind/engine/request.h" -#include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { struct BatchState { - int* h_prompt_length; // history + input, ignore generated - int* h_context_length; - bool* h_finished; - curandState_t* curand_state; - int* output_ids; // output ids in [B, S] + Buffer_ h_prompt_length; // history + input, ignore generated + Buffer_ h_context_length; + Buffer_ h_finished; + + Tensor_ curand_state; // [n, sizeof(curandState_t)] + + Tensor_ output_ids; // output ids in [B, S] - float* h_rope_theta; + Buffer_ h_rope_theta; std::vector seq_len_limit; @@ -42,7 +43,6 @@ struct BatchState { int size; }; -template class LlamaV2; struct GenerationState { @@ -62,14 +62,12 @@ struct GenerationState { int finished_count; }; -template class LlamaBatch { public: - void AllocateBuffer(size_t batch_size, size_t session_len, int cache_block_seq_len); - void AllocatePersistantBuffer(size_t max_batch_size, int cache_block_seq_len); + void AllocateBuffer(ssize_t batch_size, ssize_t session_len, int cache_block_seq_len); - void 
AllocCommBuffers(); - void FreeCommBuffers(); + void AllocSymmBuffers(); + void FreeSymmBuffers(); void FreeBuffer(); @@ -96,24 +94,25 @@ class LlamaBatch { [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); - void ComputeAndOutputLogits(T* hidden_states, int first, int last); + void ComputeAndOutputLogits(const Tensor& hidden_states, int first, int last); - void OutputLogits(const T* logits, int first, int last, GenerationConfig::OutType out_type); + void OutputLogits(const Tensor& logits, int first, int last, GenerationConfig::OutType out_type); - void OutputLastHiddenState(const T* hidden_states, int first, int last); + void OutputLastHiddenState(const Tensor& hidden_states, int first, int last); - explicit LlamaBatch(const EngineParam& param, - std::unique_ptr> model, - std::unique_ptr> ctx, - std::shared_ptr gateway, - int device_id, - int dp_rank); + explicit LlamaBatch(DataType data_type, + const EngineParam& param, + std::unique_ptr model, + std::unique_ptr ctx, + std::shared_ptr gateway, + int device_id, + int dp_rank); ~LlamaBatch(); void Start(); - LlamaV2& model() noexcept + LlamaV2& model() noexcept { return *model_; } @@ -136,21 +135,6 @@ class LlamaBatch { void CopyState(const std::vector>& desc); - // analogs to `std::copy_n` - template - U* Copy(const U* src, size_t count, U* dst) - { - check_cuda_error(cudaMemcpyAsync(dst, src, sizeof(U) * count, cudaMemcpyDefault, stream_)); - return dst += count; - } - - template - U* Clear(U* data, size_t count) - { - check_cuda_error(cudaMemsetAsync(data, 0, sizeof(U) * count, stream_)); - return data += count; - } - template void IndexedCopyImpl(const int* src_idx, const int* dst_idx, int count, const std::tuple&... cpys) { @@ -192,9 +176,9 @@ class LlamaBatch { IndexedCopyImpl(nullptr, nullptr, count, cpys...); } - void* CommBufAlloc(size_t size, bool register_); + void* SymmAlloc(size_t size, bool register_); - void CommBufFree(void** ptr, bool deregister); + void SymmFree(void* ptr, size_t size, bool deregister); void DestroyCommunicators(); @@ -216,86 +200,68 @@ class LlamaBatch { const bool debug_; // Refs into `Context` - cudaStream_t const stream_{}; - cublasMMWrapper* const cublas_wrapper_{}; - IAllocator* const allocator_{}; + cudaStream_t const stream_{}; int session_len_; // May be truncated in ctor - std::unique_ptr> context_; - std::unique_ptr> model_; + std::unique_ptr context_; + std::unique_ptr model_; std::unique_ptr sequence_manager_; Communicators& comm_; + Allocator symm_alloc_; + /////////////////////////////////////////////////////////////////// // k/v cache block buffers - int* cu_block_counts_{}; - uintptr_t* block_ptrs_{}; + Buffer_ cu_block_counts_; + Buffer_ block_ptrs_; //////////////////////////////////////////////////////////////////// // context decoding temp buffers - T* context_decoder_input_buf_{}; - T* context_decoder_output_buf_{}; - int* context_decoder_ids_buf_{}; - int* input_ids_buf_{}; - // lengths - int* input_length_buf_{}; // input + cache missed length - int* context_length_buf_{}; // history length + input_length - int* init_context_length_{}; + Tensor symm_hidden_states_buf_; + Tensor symm_logits_buf_; + + Tensor decoder_output_buf_; - T* decoder_input_buf_{}; - T* decoder_output_buf_{}; - int* sequence_lengths_{}; // current sequence length - int* init_ctx_lens_{}; - int* lora_mask_buf_{}; // lora + Buffer_ input_ids_buf_; - T* logits_buf_{}; // combined logits - T* local_logits_buf_{}; // tensor parallel local logits - T* context_logits_buf_{}; 
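For context on these header changes: the raw pointers that required explicit reMalloc/free bookkeeping (and the is_allocate_* flags) are replaced by owning buffer/tensor members, so destruction releases memory and assigning `{}` drops a buffer. A minimal sketch of that ownership idea, assuming plain cudaMalloc/cudaFree; the real core types additionally carry dtype, shape, a memory-location tag (kDEVICE/kCPUpinned) and slicing:

    #include <cstddef>
    #include <cuda_runtime.h>

    // Move-only RAII device buffer: construction allocates, destruction frees.
    template<class T>
    class DeviceBuffer {
    public:
        DeviceBuffer() = default;
        explicit DeviceBuffer(std::size_t count): count_{count}
        {
            if (count_) {
                cudaMalloc(reinterpret_cast<void**>(&data_), sizeof(T) * count_);
            }
        }
        ~DeviceBuffer()
        {
            if (data_) {
                cudaFree(data_);
            }
        }
        DeviceBuffer(DeviceBuffer&& o) noexcept: data_{o.data_}, count_{o.count_}
        {
            o.data_  = nullptr;
            o.count_ = 0;
        }
        DeviceBuffer& operator=(DeviceBuffer&& o) noexcept
        {
            if (this != &o) {
                if (data_) {
                    cudaFree(data_);
                }
                data_    = o.data_;
                count_   = o.count_;
                o.data_  = nullptr;
                o.count_ = 0;
            }
            return *this;
        }
        T*          data() const noexcept { return data_; }
        std::size_t size() const noexcept { return count_; }

    private:
        T*          data_{};
        std::size_t count_{};
    };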
- T* local_context_logits_buf_{}; + // lengths + Buffer_ input_length_buf_; // input + cache missed length + Buffer_ context_length_buf_; // history length + input_length + Buffer_ init_context_length_; - size_t local_context_logits_buf_size_{}; + Buffer_ sequence_lengths_; // current sequence length + Buffer_ init_ctx_lens_; + Buffer_ lora_mask_buf_; // lora - T* sampled_logprobs_{}; - uint32_t* sampled_indexes_{}; - uint32_t* sampled_nums_{}; - T* h_sampled_logprobs_{}; - uint32_t* h_sampled_indexes_{}; - uint32_t* h_sampled_nums_{}; + Buffer_ sampled_logprobs_; + Buffer_ sampled_indexes_; + Buffer_ sampled_nums_; + Buffer_ h_sampled_logprobs_; + Buffer_ h_sampled_indexes_; + Buffer_ h_sampled_nums_; - float* rope_theta_{}; + Buffer_ rope_theta_; // used by dynamic decoder - int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step` - bool* finished_buf_{}; - uint32_t* seq_limit_len_{}; - int* h_end_ids_buf_{}; - int* d_end_ids_buf_{}; + Buffer_ token_ids_buf_; // all token IDs in [S, B], indexed using `step` + Buffer_ finished_buf_; + Buffer_ seq_limit_len_; // pinned buffers - int* h_input_ids_buf_{}; - int* h_input_length_buf_{}; - uint32_t* h_seq_limit_len_{}; - int* h_cu_block_counts_{}; - uintptr_t* h_block_ptrs_{}; - - int* h_min_length_{}; - int* h_runtime_top_k_{}; - float* h_runtime_top_p_{}; - float* h_runtime_min_p_{}; - float* h_temperature_{}; - float* h_repetition_penalty_{}; - int* h_stop_words_{}; // [batch_size, 2, kMaxStopWordsLen] - int* h_bad_words_{}; - int* d_stop_words_{}; // [batch_size, 2, kMaxStopWordsLen] - int* d_bad_words_{}; - - unsigned long long* h_random_seed_{}; - unsigned long long* d_random_seed_{}; - - curandState_t* h_curand_state_{}; - curandState_t* d_curand_state_{}; + Buffer_ h_output_ids_; + Buffer_ h_input_length_buf_; + Buffer_ h_seq_limit_len_; + + Buffer_ h_cu_block_counts_; + Buffer_ h_block_ptrs_; + + Buffer_ h_random_seed_; + Buffer_ d_random_seed_; + + Tensor_ h_curand_state_; // [n, sizeof(curandState_t)] + Tensor_ d_curand_state_; std::array states_{}; @@ -307,18 +273,9 @@ class LlamaBatch { static constexpr int kMaxStopBadWordsLen = 32; static constexpr int kMaxEndIdsSize = 32; - bool is_allocate_persistant_buffer_ = false; - bool is_allocate_buffer_ = false; - - TensorMap inputs_; - TensorMap outputs_; - std::thread internal_thread_; - - int* h_output_ids_{}; }; -template -using Engine = LlamaBatch; +using Engine = LlamaBatch; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 9e1e2eb4dc..5fc7040c99 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -18,19 +18,18 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc +#include + +#include +#include + #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" -#include "src/turbomind/kernels/gemm/cast.h" -#include "src/turbomind/kernels/gemm/gemm.h" -#include "src/turbomind/kernels/gemm/types.h" -#include "src/turbomind/kernels/gpt_kernels.h" + #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" -#include -#include -#include + namespace turbomind { static bool is_fuse_silu_act() @@ -52,17 +51,18 @@ static 
bool is_fuse_silu_act() return value; } -template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, - const ModelParam& model, - const EngineParam& engine, - const LoraParam& lora_param, - const MoeParam& moe_param): +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(DataType data_type, + int layer_id, + const ModelParam& model, + const EngineParam& engine, + const LoraParam& lora_param, + const MoeParam& moe_param): head_num_(model.head_num), kv_head_num_(model.kv_head_num), size_per_head_(model.head_dim), hidden_units_(model.hidden_units), inter_size_(model.inter_size.at(layer_id)), + data_type_{data_type}, weight_type_(model.weight_type), attn_bias_(model.attn_bias), attn_tp_size_(engine.attn_tp_size), @@ -70,663 +70,68 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, mlp_tp_size_(engine.mlp_tp_size), mlp_tp_rank_(engine.mlp_tp_rank) { - self_attn_weights = LlamaAttentionWeight{hidden_units_, - size_per_head_, - head_num_, - kv_head_num_, - model.mla, - attn_bias_, - model.qk_norm, - attn_tp_size_, - weight_type_, - model.group_size}; - - ffn_weights = LlamaFfnWeight{ - hidden_units_, - inter_size_, - mlp_tp_size_, - weight_type_, - model.group_size, - weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), - }; - - moe_weights = MoeFfnWeight{ - layer_id, moe_param, hidden_units_, weight_type_, model.group_size, mlp_tp_size_, is_fuse_silu_act()}; - - if (lora_param.policy == LoraPolicy::kPlora) { - std::vector keys = { - "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; - std::vector*> weights = {&self_attn_weights.qkv, - &self_attn_weights.output, - &ffn_weights.gating, - &ffn_weights.output, - &ffn_weights.intermediate}; - for (int i = 0; i < keys.size(); i++) { - const auto& name = keys[i]; - auto& weight = *weights[i]; - int rank = lora_param.r; - float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_id) + "." 
+ name; - - for (const auto& [re, pr] : lora_param.rank_pattern) { - if (std::regex_search(full_name, pr.first)) { - rank = pr.second; - TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank); - break; - } - } - for (const auto& [re, pr] : lora_param.scale_pattern) { - if (std::regex_search(full_name, pr.first)) { - scale = pr.second; - TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale); - break; - } - } - if (rank) { - weight.lora.r = rank; - weight.lora.scale = scale; - weight.lora.policy = lora_param.policy; - } - } - } - - fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; -} - -template -void LlamaDecoderLayerWeight::malloc(cudaStream_t st) -{ - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); - - self_attn_weights.malloc(st); - - if (inter_size_) { - ffn_weights.malloc(st); - } - - if (!moe_weights.experts.empty()) { - moe_weights.malloc(st); - } -} - -template -size_t LlamaDecoderLayerWeight::workspace_size() const noexcept -{ - // Space to hold the largest weight in full precision - - auto get_size = [](const auto& w) { return (size_t)w.input_dims * w.output_dims; }; - - size_t size = 0; - - size = std::max(size, get_size(self_attn_weights.qkv)); - size = std::max(size, get_size(self_attn_weights.output)); - size = std::max(size, get_size(ffn_weights.gating)); - size = std::max(size, get_size(ffn_weights.fused_gating_intermediate)); - - for (const auto& e : moe_weights.experts) { - size = std::max(size, get_size(e.gating)); - size = std::max(size, get_size(e.fused_gating_intermediate)); - } - - return size * sizeof(uint16_t); -} - -template -std::string concat(FirstArg&& first, Args&&... args) -{ - std::stringstream stream; - stream << first; - ((stream << "." 
<< args), ...); - return stream.str(); -} - -template -void getWeightTensor(LlamaDenseWeight& weights, bool bias, const std::string& prefix, TensorMap& output) -{ - auto get_name = [=](const std::string& name) { return concat(prefix, name); }; - - if (bias) { - output.insert(get_name("bias"), Tensor{MEMORY_GPU, getTensorType(), {weights.bias_size()}, weights.bias}); - } - - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { - output.insert(get_name("weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.kernel_size()}, weights.kernel}); - } - else { - output.insert(get_name("qweight"), Tensor{MEMORY_GPU, TYPE_INT32, {weights.kernel_size()}, weights.kernel}); - output.insert(get_name("scales"), - Tensor{MEMORY_GPU, getTensorType(), {weights.scales_size()}, weights.scales}); - output.insert(get_name("zeros"), - Tensor{MEMORY_GPU, getTensorType(), {weights.scales_size()}, weights.zeros}); - } - - if (weights.lora.r) { - auto n = prefix.rfind("."); - - std::string _prefix = prefix.substr(0, n); - std::string _num = prefix.substr(n + 1); - - output.insert(concat(_prefix, "lora_a", _num, "weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.lora_size().first}, weights.lora.a}); - output.insert(concat(_prefix, "lora_b", _num, "weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.lora_size().second}, weights.lora.b}); - - TM_LOG_DEBUG("allocate lora weight, layer_name=%s input_dims=%d, output_dims=%d, lora_r=%d", - get_name("weight").c_str(), - weights.input_dims, - weights.output_dims, - weights.lora.r); - } -} - -template -void loadWeights( - LlamaDenseWeight& w, std::string prefix, int rank, FtCudaDataType model_file_type, size_t tensor_para_size) -{ - auto weight_file = prefix + "." + std::to_string(tensor_para_size - 1) + ".weight"; - auto qweight_file = prefix + "." + std::to_string(tensor_para_size - 1) + ".qweight"; - - if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) { - TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str()); - FT_CHECK(false); - } - - prefix += "." + std::to_string(rank); - - size_t dim0 = w.input_dims; - size_t dim1 = w.output_dims; - const auto type = model_file_type; - - if (w.bias) { - loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type); - } - const size_t bit_size = getBitSize(w.type); - if (bit_size >= 16) { // fp16, fp32 - loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - - FT_CHECK(dim1 % factor == 0); - - std::vector w_shape{dim0, dim1 / factor * sizeof(uint32_t)}; - loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8); - - const size_t group_count = w.group_size > 0 ? 
dim0 / w.group_size : 1; - - loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type); - loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type); - } -} - -template -void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType model_file_type) -{ - auto weight_file = prefix + ".weight"; - auto qweight_file = prefix + ".qweight"; - - if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) { - TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str()); - FT_CHECK(false); - } - - size_t dim0 = w.input_dims; - size_t dim1 = w.output_dims; - const auto type = model_file_type; - - if (w.bias) { - loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type); - } - const size_t bit_size = getBitSize(w.type); - if (bit_size >= 16) { // fp16, fp32 - loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - - FT_CHECK(dim1 % factor == 0); - - std::vector w_shape{dim0, dim1 / factor * sizeof(uint32_t)}; - loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8); - - const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1; - - loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type); - loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type); - } -} - -template -void LlamaDecoderLayerWeight::free(cudaStream_t st) -{ - deviceFree(self_attn_norm_weights, st); - deviceFree(ffn_norm_weights, st); - - self_attn_weights.free(st); - - if (inter_size_) { - ffn_weights.free(st); - } - - if (!moe_weights.experts.empty()) { - moe_weights.free(st); - } -} - -template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; - -template -void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) -{ - const auto type = model_file_type; - - loadWeightFromBin( - (T*)self_attn_norm_weights, {hidden_units_}, dir_path + ".attention_norm.weight", model_file_type); - loadWeightFromBin((T*)ffn_norm_weights, {hidden_units_}, dir_path + ".ffn_norm.weight", model_file_type); - - loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", attn_tp_rank_, type, attn_tp_size_); - - loadWeights(self_attn_weights.output, dir_path + ".attention.wo", attn_tp_rank_, type, attn_tp_size_); - if (moe_weights.experts.empty()) { - loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", mlp_tp_rank_, type, mlp_tp_size_); - } - else { - loadWeights(moe_weights.gate, dir_path + ".moe_ffn.gate", type); - for (size_t i = 0; i < moe_weights.experts.size(); ++i) { - std::string weight_name = dir_path + ".moe_ffn.experts." 
+ std::to_string(i); - loadWeights(moe_weights.experts[i].gating, weight_name + ".w1", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(moe_weights.experts[i].intermediate, weight_name + ".w3", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(moe_weights.experts[i].output, weight_name + ".w2", mlp_tp_rank_, type, mlp_tp_size_); - } - } -} - -template -void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) -{ - if (w.q_proj.output_dims) { - getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); - } - else { - getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); - getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); - m.insert(concat(p, "attention.q_a_layernorm"), - Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); - } - getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); - getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); - m.insert(concat(p, "attention.kv_a_layernorm"), - Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); -} - -template -TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) -{ - TensorMap output; - - output.insert(concat(prefix, "attention_norm.weight"), - Tensor{MEMORY_GPU, getTensorType(), {hidden_units_ * sizeof(T)}, self_attn_norm_weights}); - - output.insert(concat(prefix, "ffn_norm.weight"), - Tensor{MEMORY_GPU, getTensorType(), {hidden_units_ * sizeof(T)}, ffn_norm_weights}); - - auto get_attn = [=](std::string_view name) { return concat(prefix, name, attn_tp_rank_); }; - - if (self_attn_weights.qkv.output_dims) { - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_attn("attention.w_qkv"), output); - - if (self_attn_weights.qk_norm) { - output.insert(concat(prefix, "attention.q_norm"), - Tensor{MEMORY_GPU, - getTensorType(), - {sizeof(T) * self_attn_weights.head_dim}, - self_attn_weights.q_a_layernorm}); - output.insert(concat(prefix, "attention.k_norm"), - Tensor{MEMORY_GPU, - getTensorType(), - {sizeof(T) * self_attn_weights.head_dim}, - self_attn_weights.kv_a_layernorm}); - } - } - else { - getMLATensor(self_attn_weights, prefix, output, attn_tp_rank_); - } - getWeightTensor(self_attn_weights.output, attn_bias_, get_attn("attention.wo"), output); - - auto get_mlp = [=](std::string_view name) { return concat(prefix, name, mlp_tp_rank_); }; + self_attn_weights.reset(new LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + model.qk_norm, + attn_tp_size_, + attn_tp_rank_, + data_type_, + weight_type_, + model.group_size}); + register_module("attention", *self_attn_weights); if (inter_size_) { - getWeightTensor(ffn_weights.gating, false, get_mlp("feed_forward.w1"), output); - getWeightTensor(ffn_weights.intermediate, false, get_mlp("feed_forward.w3"), output); - getWeightTensor(ffn_weights.output, false, get_mlp("feed_forward.w2"), output); - } - - if (!moe_weights.experts.empty()) { - output.insert( - concat(prefix, "moe_ffn.gate.weight"), - Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); - auto& experts = moe_weights.experts; - for (size_t i = 0; i < experts.size(); ++i) { - const std::string name = "moe_ffn.experts." 
+ std::to_string(i); - getWeightTensor(experts[i].gating, false, get_mlp(concat(name, "w1")), output); - getWeightTensor(experts[i].intermediate, false, get_mlp(concat(name, "w3")), output); - getWeightTensor(experts[i].output, false, get_mlp(concat(name, "w2")), output); - } - if (moe_weights.shared_gate.kernel) { - output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), - Tensor{MEMORY_GPU, - getTensorType(), - {moe_weights.shared_gate.kernel_size()}, - moe_weights.shared_gate.kernel}); - } - } - - return output; -} - -// template -static void convert_u4( - LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) -{ - FT_CHECK(weight.type == WeightType::kINT4); - - using namespace gemm; - - auto [order_b, pack_b, order_v, pack_v] = - get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); - - if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); - cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); - } - - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); - sync_check_cuda_error(); - - MatrixLayout w_desc{ - gemm::DataType::F16, - order_b, - (int)weight.input_dims, // k - (int)weight.output_dims, // n - order_b == kRowMajor ? (int)weight.output_dims : (int)weight.input_dims, - }; - - MatrixLayout k_desc = w_desc; - k_desc.type = gemm::DataType::U4; - k_desc.pack = pack_b; - - cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st); - - FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0); - sync_check_cuda_error(); - - const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; - - // std::cout << "fuse_scales_and_zeros\n"; - fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st); - // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); - sync_check_cuda_error(); - - deviceFree(weight.scales, st); - deviceFree(weight.zeros, st); - - deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st); - - MatrixLayout s_desc{ - gemm::DataType::U32, - order_v, - (int)weight.input_dims / weight.group_size, // k - (int)weight.output_dims, // n - (int)weight.output_dims, - }; - - MatrixLayout q_desc = s_desc; - q_desc.pack = pack_v; - - FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0); - sync_check_cuda_error(); - - weight.k_desc = k_desc; - weight.q_desc = q_desc; - - // FT_CHECK(0); -} - -template -static void -convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) -{ - using namespace gemm; - - if (!is_fused_moe) { - return; - } - - const auto [order_b, pack_b, order_v, pack_v] = - get_weight_and_scales_layout(get_data_type_v, is_fused_moe, getSMVersion(), use_simt); - - const int input_dim = weight.input_dims; - const int output_dim = weight.output_dims; - - if (order_b == kColMajor) { - invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st); - sync_check_cuda_error(); - // FT_CHECK(0); - } - else { - check_cuda_error( - cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); - } - - MatrixLayout src{ - get_data_type_v, - order_b, - input_dim, // k - output_dim, // n - order_b == kRowMajor ? 
output_dim : input_dim, - }; - - MatrixLayout dst = src; - dst.pack = pack_b; - - if (pack_b) { - FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0); - sync_check_cuda_error(); - // FT_CHECK(0); - } - else { - check_cuda_error( - cudaMemcpyAsync(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); + ffn_weights.reset(new LlamaFfnWeight{ + hidden_units_, + inter_size_, + mlp_tp_size_, + mlp_tp_rank_, + data_type_, + weight_type_, + model.group_size, + weight_type_ == data_type_v && is_fuse_silu_act(), + }); + register_module("feed_forward", *ffn_weights); } - weight.k_desc = dst; -} - -template -static void -convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) -{ - if (weight.type == WeightType::kINT4) { - if constexpr (std::is_same_v) { - convert_u4(weight, is_fused_moe, workspace, size, use_simt, st); - } - else { - FT_CHECK(0); - } - } - else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); - } -} - -template -void interleave(LlamaDenseWeight& c, - LlamaDenseWeight& a, - LlamaDenseWeight& b, - void* workspace, - size_t size, - cudaStream_t st) -{ - FT_CHECK(c.input_dims == a.input_dims); - FT_CHECK(c.input_dims == b.input_dims); - FT_CHECK(c.output_dims == a.output_dims * 2); - FT_CHECK(c.output_dims == b.output_dims * 2); - FT_CHECK(c.group_size == a.group_size); - FT_CHECK(c.group_size == b.group_size); - - if (a.type == WeightType::kINT4) { - uint8_t* tmp_a = (uint8_t*)workspace; - uint8_t* tmp_b = tmp_a + a.output_dims * a.input_dims; - uint8_t* tmp_c = tmp_b + b.output_dims * b.input_dims; - - const auto sentinel = tmp_c + c.output_dims * c.input_dims; - FT_CHECK(sentinel <= (uint8_t*)workspace + size); - - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); - } - else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); + if (layer_id < moe_param.expert_num.size() && moe_param.expert_num[layer_id]) { + moe_weights.reset(new MoeFfnWeight{layer_id, + moe_param, + hidden_units_, + data_type_, + weight_type_, + model.group_size, + mlp_tp_size_, + mlp_tp_rank_, + is_fuse_silu_act()}); + register_module("moe_ffn", *moe_weights); } - // Check at function level - sync_check_cuda_error(); + self_attn_norm = Tensor{{hidden_units_}, data_type_, kDEVICE}; + ffn_norm = Tensor{{hidden_units_}, data_type_, kDEVICE}; + register_parameter("attention_norm.weight", self_attn_norm); + register_parameter("ffn_norm.weight", ffn_norm); } -template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) -{ - FT_CHECK(c.input_dims == a.input_dims); - FT_CHECK(c.input_dims == b.input_dims); - FT_CHECK(c.output_dims == a.output_dims * 2); - FT_CHECK(c.output_dims == b.output_dims * 2); - FT_CHECK(c.group_size == a.group_size); - FT_CHECK(c.group_size == b.group_size); - - auto _chunks = [&](auto c, auto a, auto b, int height, int width) { - check_cuda_error( - cudaMemcpy2DAsync((char*)c + 0x000, 
width * 2, a, width, width, height, cudaMemcpyDefault, st)); - check_cuda_error( - cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); - }; - - if (c.type == WeightType::kINT4) { - _chunks(c.kernel, a.kernel, b.kernel, a.input_dims, 4 * a.output_dims / 8); - _chunks(c.scales, a.scales, b.scales, a.input_dims / a.group_size, sizeof(T) * a.output_dims); - _chunks(c.zeros, a.zeros, b.zeros, a.input_dims / a.group_size, sizeof(T) * a.output_dims); - } - else { - _chunks(c.kernel, a.kernel, b.kernel, a.input_dims, sizeof(T) * a.output_dims); - } - - // Check at function level - sync_check_cuda_error(); -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; -template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) +void LlamaDecoderLayerWeight::prepare(const cudaDeviceProp& prop, cudaStream_t st) { - const bool is_16xx = is_16xx_series(prop.name); + const bool use_simt = is_16xx_series(prop.name); - convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); - convert(self_attn_weights.output, false, workspace, size, is_16xx, st); + self_attn_weights->prepare(use_simt); - auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { - if (fused_up_and_gate_) { - auto& fused_up_and_gate = ffn.fused_gating_intermediate; - - fused_up_and_gate.malloc(st); - - if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); - } - else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); - } - - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - - ffn.gating.free(st); - ffn.intermediate.free(st); - } - else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); - } - - convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); - }; - - if (inter_size_) { - // std::cerr << "process FFN\n"; - process_ffn(ffn_weights, false); + if (ffn_weights) { + ffn_weights->prepare(false, use_simt); } - if (!moe_weights.experts.empty()) { - // std::cerr << "process MoE\n"; - std::vector> fused_ptrs; - std::vector> output_ptrs; - std::vector> fused_param_ptrs; - std::vector> output_param_ptrs; - - for (auto& e : moe_weights.experts) { - - process_ffn(e, moe_weights.method == MoeParam::kFused); - - const auto& fused = e.fused_gating_intermediate; - const auto& output = e.output; - - fused_ptrs.push_back({fused.kernel, fused.k_desc.ld}); - output_ptrs.push_back({output.kernel, output.k_desc.ld}); - - if (e.fused_gating_intermediate.scales_zeros) { - fused_param_ptrs.emplace_back(fused.scales_zeros, fused.q_desc.ld); - output_param_ptrs.emplace_back(output.scales_zeros, output.q_desc.ld); - } - } - - // Note: This assumes all experts has the same shape - moe_weights.block = moe_weights.experts.at(0); - - auto& fused = moe_weights.block.fused_gating_intermediate; - auto& output = moe_weights.block.output; - - // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); - - if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); - } - - fused.k_desc.ld = output.k_desc.ld = 0; - fused.k_desc.num = output.k_desc.num = moe_weights.experts.size(); - - fused.q_desc.ld = output.q_desc.ld = 0; - 
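Both the old `chunk()` above and its Tensor-based replacement in the new LlamaDenseWeight.cc later in this patch fuse the gate (w1) and up (w3) projections by copying two row-major [K, N] weights into the left and right halves of a single [K, 2N] buffer using strided 2-D copies. A minimal standalone sketch of that layout trick, assuming plain CUDA runtime calls and unquantized half-precision weights (function and parameter names here are illustrative only):

    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    // Row k of `fused` becomes [ gate row k | up row k ].
    void fuse_gate_up(half* fused, const half* gate, const half* up,
                      int k_dim, int n_dim, cudaStream_t stream)
    {
        const size_t row_bytes = sizeof(half) * n_dim;  // bytes per source row
        // Left half: destination rows are spaced 2 * n_dim elements apart.
        cudaMemcpy2DAsync(fused, 2 * row_bytes,
                          gate, row_bytes, row_bytes, k_dim,
                          cudaMemcpyDefault, stream);
        // Right half: shift each destination row by n_dim elements.
        cudaMemcpy2DAsync(fused + n_dim, 2 * row_bytes,
                          up, row_bytes, row_bytes, k_dim,
                          cudaMemcpyDefault, stream);
    }
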
fused.q_desc.num = output.q_desc.num = moe_weights.experts.size(); + if (moe_weights) { + moe_weights->prepare(use_simt); } } -#ifdef ENABLE_FP32 -template struct LlamaDecoderLayerWeight; -#endif -template struct LlamaDecoderLayerWeight; -#ifdef ENABLE_BF16 -template struct LlamaDecoderLayerWeight<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 44838a747d..0df8077341 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -20,18 +20,19 @@ #pragma once +#include "src/turbomind/core/core.h" + #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { -template -struct LlamaDecoderLayerWeight { +struct LlamaDecoderLayerWeight: core::Module { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_id, + LlamaDecoderLayerWeight(DataType data_type, + int layer_id, const ModelParam& model, const EngineParam& engine, const LoraParam& lora_param, @@ -41,41 +42,32 @@ struct LlamaDecoderLayerWeight { LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight&) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight&) = delete; - void loadModel(std::string dir_path, FtCudaDataType model_file_type); - - TensorMap getParams(std::string prefix); - - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); - - size_t workspace_size() const noexcept; - - void malloc(cudaStream_t st); - - void free(cudaStream_t st); + void prepare(const cudaDeviceProp& prop, cudaStream_t st); - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; + Tensor self_attn_norm; + Tensor ffn_norm; - LlamaAttentionWeight self_attn_weights{}; + std::unique_ptr self_attn_weights; - LlamaFfnWeight ffn_weights{}; - MoeFfnWeight moe_weights{}; + std::unique_ptr ffn_weights; + std::unique_ptr moe_weights; private: - size_t head_num_; - size_t kv_head_num_; - size_t size_per_head_; - size_t hidden_units_; - size_t inter_size_; - WeightType weight_type_; - size_t bit_size_; - bool attn_bias_; - size_t attn_tp_size_; - size_t attn_tp_rank_; - size_t mlp_tp_size_; - size_t mlp_tp_rank_; - bool is_maintain_buffer_ = false; - bool fused_up_and_gate_; + int head_num_; + int kv_head_num_; + int size_per_head_; + int hidden_units_; + int inter_size_; + + DataType data_type_; + DataType weight_type_; + + int bit_size_; + bool attn_bias_; + int attn_tp_size_; + int attn_tp_rank_; + int mlp_tp_size_; + int mlp_tp_rank_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.cc b/src/turbomind/models/llama/LlamaDenseWeight.cc new file mode 100644 index 0000000000..24afe3122e --- /dev/null +++ b/src/turbomind/models/llama/LlamaDenseWeight.cc @@ -0,0 +1,502 @@ +#include "src/turbomind/models/llama/LlamaDenseWeight.h" + +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/kernels/gemm/cast.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gpt_kernels.h" + +#include "src/turbomind/utils/memory_utils.h" + +namespace turbomind { + +void LlamaDenseWeight::emplace( + int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size) +{ + this->data_type = data_type; + this->weight_type = weight_type; + this->input_dim = 
input_dim; + this->output_dim = output_dim; + this->group_size = group_size; + + const auto wbits = byte_size(weight_type, 8); + + weight = Tensor({input_dim, output_dim}, weight_type, kDEVICE); + register_parameter(wbits < 16 ? "qweight" : "weight", weight); + + if (bias) { + this->bias = Tensor{{output_dim}, data_type, kDEVICE}; + register_parameter("bias", this->bias); + } + + if (wbits < 16) { + TM_CHECK(input_dim % group_size == 0) << input_dim << " " << group_size; + scales = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE}; + zeros = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE}; + register_parameter("scales", scales); + register_parameter("zeros", zeros); + } +} + +static void convert_u4(LlamaDenseWeight& dense, bool is_fused_moe, bool use_simt, cudaStream_t st) +{ + TM_CHECK_EQ(dense.weight_type, data_type_v); + + using namespace gemm; + + auto [order_b, pack_b, order_v, pack_v] = + get_weight_and_scales_layout(data_type_v, is_fused_moe, getSMVersion(), use_simt); + + if (order_b == kColMajor) { + Buffer trans{dense.input_dim * dense.output_dim, data_type_v, kDEVICE}; + transpose_u4( + (uint4_t*)trans.raw_data(), (const uint4_t*)dense.weight.raw_data(), dense.input_dim, dense.output_dim, st); + cudaMemcpyAsync( + dense.weight.raw_data(), trans.raw_data(), dense.input_dim * dense.output_dim / 2, cudaMemcpyDefault, st); + } + + Buffer_ tmp_w{dense.input_dim * dense.output_dim, kDEVICE}; + extend_to_u16(tmp_w.data(), (const uint4_t*)dense.weight.raw_data(), dense.input_dim * dense.output_dim, st); + sync_check_cuda_error(); + + MatrixLayout w_desc{ + data_type_v, + order_b, + (int)dense.input_dim, // k + (int)dense.output_dim, // n + order_b == kRowMajor ? (int)dense.output_dim : (int)dense.input_dim, + }; + + MatrixLayout k_desc = w_desc; + k_desc.type = data_type_v; + k_desc.pack = pack_b; + + cudaMemsetAsync(dense.weight.raw_data(), 0, dense.input_dim * dense.output_dim / 2, st); + + FT_CHECK(Convert(tmp_w.data(), w_desc, dense.weight.raw_data(), k_desc, st) == 0); + sync_check_cuda_error(); + + const int scale_count = (dense.input_dim / dense.group_size) * dense.output_dim; + + Buffer_ tmp_q{scale_count * 2, kDEVICE}; + fuse_scales_and_zeros(tmp_q.data(), dense.scales.data(), dense.zeros.data(), scale_count, st); + sync_check_cuda_error(); + + dense.scales = {}; + dense.zeros = {}; + + dense.scales_zeros = Tensor_{{scale_count, 2}, kDEVICE}; + + MatrixLayout s_desc{ + data_type_v, + order_v, + (int)dense.input_dim / dense.group_size, // k + (int)dense.output_dim, // n + (int)dense.output_dim, + }; + + MatrixLayout q_desc = s_desc; + q_desc.pack = pack_v; + + FT_CHECK(Convert(tmp_q.data(), s_desc, dense.scales_zeros.raw_data(), q_desc, st) == 0); + sync_check_cuda_error(); + + dense.k_desc = k_desc; + dense.q_desc = q_desc; +} + +static void convert_fp(LlamaDenseWeight& dense, bool is_fused_moe, bool use_simt, cudaStream_t st) +{ + using namespace gemm; + + if (!is_fused_moe) { + return; + } + + /// TODO: unify data types + auto data_type = dense.data_type; + + const auto [order_b, pack_b, order_v, pack_v] = + get_weight_and_scales_layout(data_type, is_fused_moe, getSMVersion(), use_simt); + + const int input_dim = dense.input_dim; + const int output_dim = dense.output_dim; + + TM_CHECK(dense.weight.is_contiguous()); + + Buffer_ tmp{input_dim * output_dim, kDEVICE}; + + if (order_b == kColMajor) { + invokeTransposeAxis01(tmp.data(), (uint16_t*)dense.weight.raw_data(), input_dim, output_dim, 1, st); + sync_check_cuda_error(); + } + else { + 
check_cuda_error( + cudaMemcpyAsync(tmp.data(), dense.weight.raw_data(), dense.weight.byte_size(), cudaMemcpyDefault, st)); + } + + MatrixLayout src{ + data_type, + order_b, + input_dim, // k + output_dim, // n + order_b == kRowMajor ? output_dim : input_dim, + }; + + MatrixLayout dst = src; + dst.pack = pack_b; + + if (pack_b) { + FT_CHECK(Convert(tmp.data(), src, dense.weight.raw_data(), dst, st) == 0); + sync_check_cuda_error(); + } + else { + check_cuda_error( + cudaMemcpyAsync(dense.weight.raw_data(), tmp.data(), dense.weight.byte_size(), cudaMemcpyDefault, st)); + } + + dense.k_desc = dst; +} + +static void convert(LlamaDenseWeight& dense, bool is_fused_moe, DataType data_type, bool use_simt, cudaStream_t st) {} + +void LlamaDenseWeight::prepare(bool fused_moe, bool use_simt) +{ + if (!weight) { + return; + } + + auto stream = core::Context::stream().handle(); + + if (weight_type == data_type_v) { + TM_CHECK_EQ(data_type, data_type_v); + convert_u4(*this, fused_moe, use_simt, stream); + } + else { + convert_fp(*this, fused_moe, use_simt, stream); + } +} + +LlamaAttentionWeight::LlamaAttentionWeight(int hidden_dim, + int head_dim, + int head_num, + int kv_head_num, + MLAParam mla, + bool bias, + bool qk_norm, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size) +{ + if (mla.kv_lora_rank == 0) { + qkv.emplace( + hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp_size, data_type, bias, weight_type, group_size); + register_module("w_qkv", qkv, tp_rank); + if (qk_norm) { + q_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; + kv_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; + register_parameter("q_norm", q_a_layernorm); + register_parameter("k_norm", kv_a_layernorm); + } + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj.emplace(hidden_dim, mla.q_lora_rank, data_type, false, weight_type, group_size); + q_b_proj.emplace(mla.q_lora_rank, head_num * head_dim / tp_size, data_type, false, weight_type, group_size); + q_a_layernorm = Tensor{{q_b_proj.input_dim}, data_type, kDEVICE}; + register_module("q_a_proj", q_a_proj); + register_module("q_b_proj", q_b_proj, tp_rank); + register_parameter("q_a_layernorm", q_a_layernorm); + } + else { + q_proj.emplace(hidden_dim, head_num * head_dim / tp_size, data_type, false, weight_type, group_size); + register_module("q_proj", q_proj, tp_rank); + } + kv_a_proj.emplace(hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, data_type, false, weight_type, group_size); + kv_b_proj.emplace(mla.kv_lora_rank, + head_num * (qk_nope_dim + mla.v_head_dim) / tp_size, + data_type, + false, + weight_type, + group_size); + + kv_a_layernorm = Tensor{{kv_b_proj.input_dim}, data_type, kDEVICE}; + register_module("kv_a_proj", kv_a_proj); + register_module("kv_b_proj", kv_b_proj, tp_rank); + register_parameter("kv_a_layernorm", kv_a_layernorm); + } + output.emplace((head_num * head_dim) / tp_size, hidden_dim, data_type, bias, weight_type, group_size); + register_module("wo", output, tp_rank); +} + +void LlamaAttentionWeight::prepare(bool use_simt) +{ + std::vector weights{ + &qkv, + &output, + &q_a_proj, + &q_a_proj, + &q_b_proj, + &kv_a_proj, + &kv_b_proj, + }; + for (auto& w : weights) { + w->prepare(false, use_simt); + } +} + +LlamaFfnWeight::LlamaFfnWeight(int hidden_dim, + int inter_size, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size, + bool fuse_silu_act) +{ + TM_CHECK(inter_size % tp_size == 0) << inter_size << " " << 
tp_size; + + inter_size /= tp_size; + + this->inter_size = inter_size; + + gating.emplace(hidden_dim, inter_size, data_type, false, weight_type, group_size); + + intermediate.emplace(hidden_dim, inter_size, data_type, false, weight_type, group_size); + + // fused_gating_intermediate = {hidden_dim, inter_size * 2, data_type, weight_type, group_size}; + is_fused_silu = fuse_silu_act; + + output.emplace(inter_size, hidden_dim, data_type, false, weight_type, group_size); + + register_module("w1", gating, tp_rank); + register_module("w3", intermediate, tp_rank); + register_module("w2", output, tp_rank); +} + +void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st) +{ + FT_CHECK(c.input_dim == a.input_dim); + FT_CHECK(c.input_dim == b.input_dim); + FT_CHECK(c.output_dim == a.output_dim * 2); + FT_CHECK(c.output_dim == b.output_dim * 2); + FT_CHECK(c.group_size == a.group_size); + FT_CHECK(c.group_size == b.group_size); + + auto invoke = [&](auto t) { + using T = decltype(t); + if (a.weight_type == data_type_v) { + Buffer_ tmp_a{a.weight.size(), kDEVICE}; + Buffer_ tmp_b{b.weight.size(), kDEVICE}; + Buffer_ tmp_c{c.weight.size(), kDEVICE}; + + extend_to_u8(tmp_a.data(), (const uint4_t*)a.weight.raw_data(), a.output_dim * a.input_dim, st); + extend_to_u8(tmp_b.data(), (const uint4_t*)b.weight.raw_data(), b.output_dim * b.input_dim, st); + + interleave_output_dims(tmp_c.data(), tmp_a.data(), tmp_b.data(), a.output_dim, a.input_dim, st); + + compact_to_u4((uint4_t*)c.weight.raw_data(), tmp_c.data(), c.output_dim * c.input_dim, st); + + interleave_output_dims(c.scales.data(), + a.scales.data(), + b.scales.data(), + a.output_dim, + a.input_dim / a.group_size, + st); + interleave_output_dims(c.zeros.data(), // + a.zeros.data(), + b.zeros.data(), + a.output_dim, + a.input_dim / a.group_size, + st); + } + else { + interleave_output_dims( + c.weight.data(), a.weight.data(), b.weight.data(), a.output_dim, a.input_dim, st); + } + // Check at function level + sync_check_cuda_error(); + }; + + TM_DISPATCH_DTYPES(data_type, invoke, half_t, bfloat16_t); +} + +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st) +{ + FT_CHECK(c.input_dim == a.input_dim); + FT_CHECK(c.input_dim == b.input_dim); + FT_CHECK(c.output_dim == a.output_dim * 2); + FT_CHECK(c.output_dim == b.output_dim * 2); + FT_CHECK(c.group_size == a.group_size); + FT_CHECK(c.group_size == b.group_size); + + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); + }; + + auto invoke = [&](auto t) { + using T = decltype(t); + if (c.weight_type == data_type_v) { + _chunks(c.weight.raw_data(), a.weight.raw_data(), b.weight.raw_data(), a.input_dim, 4 * a.output_dim / 8); + _chunks(c.scales.data(), + a.scales.data(), + b.scales.data(), + a.input_dim / a.group_size, + sizeof(T) * a.output_dim); + _chunks(c.zeros.data(), + a.zeros.data(), + b.zeros.data(), + a.input_dim / a.group_size, + sizeof(T) * a.output_dim); + } + else { + _chunks(c.weight.data(), a.weight.data(), b.weight.data(), a.input_dim, sizeof(T) * a.output_dim); + } + // Check at function level + sync_check_cuda_error(); + }; + + TM_DISPATCH_DTYPES(data_type, invoke, half_t, bfloat16_t); +} + +void LlamaFfnWeight::prepare(bool 
fused_moe, bool use_simt) +{ + const auto data_type = gating.data_type; + + auto stream = core::Context().stream().handle(); + + if (fuse_up_and_gate) { + auto& fused_up_and_gate = fused_gating_intermediate; + + fused_up_and_gate.emplace(gating.input_dim, // + gating.output_dim * 2, + gating.data_type, + false, + gating.weight_type, + gating.group_size); + + if (is_fused_silu) { + interleave(fused_up_and_gate, gating, intermediate, data_type, stream); + } + else { + chunk(fused_up_and_gate, gating, intermediate, data_type, stream); + } + + fused_gating_intermediate.prepare(fused_moe, use_simt); + + gating = {}; + intermediate = {}; + } + else { + gating.prepare(fused_moe, use_simt); + intermediate.prepare(fused_moe, use_simt); + } + + output.prepare(fused_moe, use_simt); +} + +MoeFfnWeight::MoeFfnWeight(int layer_id, + const MoeParam& param, + int hidden_dim, + DataType data_type, + DataType weight_type, + int group_size, + int tp_size, + int tp_rank, + bool fuse_silu_act) +{ + if ((int)param.expert_num.size() <= layer_id) { + return; + } + + const int expert_num = param.expert_num[layer_id]; + + if (expert_num == 0) { + return; + } + + gate.emplace(hidden_dim, expert_num, data_type, false, data_type, 1); + register_module("gate", gate); + + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; + + experts.reserve(expert_num); + for (int i = 0; i < expert_num; ++i) { + experts.emplace_back(new LlamaFfnWeight{ + hidden_dim, param.inter_size, tp_size, tp_rank, data_type, weight_type, group_size, fuse_silu_act}); + register_module("experts", *experts.back(), i); + } + + if (param.shared_gate) { + shared_gate.emplace(hidden_dim, 1, data_type, false, data_type, 1); + register_module("shared_gate", shared_gate); + } +} + +void MoeFfnWeight::prepare(bool use_simt) +{ + const auto fused_moe = method == MoeParam::kFused; + + for (auto& e : experts) { + e->prepare(fused_moe, use_simt); + } + const int n_expert = experts.size(); + const auto st = core::Context::stream().handle(); + + auto make_block_ptr = [&](const auto& ptrs) { + return std::shared_ptr{gemm::make_blocked_ptrs(ptrs, st), [](auto p) { cudaFree(p); }}; + }; + + auto process = [&](auto getter) { + std::vector> weight_ptrs; + std::vector> quant_ptrs; + + for (auto& e : experts) { + auto& m = (*e).*getter; + weight_ptrs.push_back({m.weight.raw_data(), m.k_desc.ld}); + if (m.scales_zeros) { + quant_ptrs.emplace_back(m.scales_zeros.raw_data(), m.q_desc.ld); + } + } + + LlamaDenseWeight& m = block.*getter; + + { // Copy properties from exemplar, this assumes all experts has the same shape + LlamaDenseWeight& e = (*experts.at(0)).*getter; + m.input_dim = e.input_dim; + m.output_dim = e.output_dim; + m.group_size = e.group_size; + m.data_type = e.data_type; + m.weight_type = e.weight_type; + m.k_desc = e.k_desc; + m.q_desc = e.q_desc; + } + + // Dummy tensors to hold the blocked ptrs + m.weight = Tensor{make_block_ptr(weight_ptrs), {n_expert}, m.weight_type, kDEVICE}; + if (!quant_ptrs.empty()) { + TM_CHECK_EQ(quant_ptrs.size(), n_expert); + m.scales_zeros = Tensor{make_block_ptr(quant_ptrs), {n_expert}, m.data_type, kDEVICE}; + } + + m.k_desc.num = m.q_desc.num = experts.size(); + m.k_desc.ld = m.q_desc.ld = 0; // `ld` is meaningless in this case + }; + + process(&LlamaFfnWeight::fused_gating_intermediate); + process(&LlamaFfnWeight::output); + + auto& e = *experts.at(0); + // Copy MLP properties + block.inter_size = e.inter_size; + block.is_fused_silu = e.is_fused_silu; +} + +} // namespace turbomind diff 
--git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index b12592c757..794aa10b97 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -19,12 +19,11 @@ #pragma once +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" + #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/models/llama/weight_type.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/memory_utils.h" -#include namespace turbomind { @@ -44,319 +43,128 @@ struct LoraWeight { void* b; }; -template -struct LlamaDenseWeight { - size_t input_dims = 0; - size_t output_dims = 0; - WeightType type; // uninitialized - void* kernel = nullptr; - T* bias = nullptr; - T* scales = nullptr; - T* zeros = nullptr; - T* scales_zeros = nullptr; - int group_size = 1; - - LoraWeight lora; - - gemm::MatrixLayout k_desc; - gemm::MatrixLayout q_desc; +struct LlamaDenseWeight: public core::Module { - LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + LlamaDenseWeight(): data_type{}, weight_type{}, lora{}, k_desc{}, q_desc{} {} - LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} - { - this->input_dims = input_dim; - this->output_dims = output_dim; - this->type = type; - this->group_size = group_size; - } + void emplace(int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size); - size_t kernel_size() const noexcept - { - return getBitSize(type) * input_dims * output_dims / 8; - } - - size_t bias_size() const noexcept - { - return sizeof(T) * output_dims; - } - - size_t scales_size() const noexcept - { - return sizeof(T) * input_dims / group_size * output_dims; - } + void prepare(bool fused_moe, bool use_simt); - std::pair lora_size() const noexcept + LlamaDenseWeight& operator=(std::nullptr_t) { - return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; + this->~LlamaDenseWeight(); + new (this) LlamaDenseWeight{}; + return *this; } - void malloc(cudaStream_t st, bool with_bias = false) + operator bool() const noexcept { - if (with_bias) { - deviceMalloc((T**)&bias, output_dims, st); - } - const size_t bit_size = getBitSize(type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&kernel, input_dims * output_dims, st); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(input_dims % factor == 0); - deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); - deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); - deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); - } - - if (lora.r > 0) { - deviceMalloc((T**)&lora.a, input_dims * lora.r, st); - deviceMalloc((T**)&lora.b, lora.r * output_dims, st); - } + return static_cast(weight); } - void free(cudaStream_t st) - { - deviceFree(kernel, st); - deviceFree(bias, st); - deviceFree(scales, st); - deviceFree(zeros, st); - deviceFree(lora.a, st); - deviceFree(lora.b, st); - } -}; + int input_dim = 0; + int output_dim = 0; + int group_size = 1; -template -struct LlamaAttentionWeight { + DataType data_type; + DataType weight_type; - LlamaAttentionWeight() = default; + Tensor weight; + Tensor bias; - LlamaAttentionWeight(size_t hidden_dim, - size_t head_dim, - size_t head_num, - size_t kv_head_num, - MLAParam mla, - bool bias, - bool qk_norm, - size_t tp, - WeightType 
weight_type, - int group_size) - { - this->bias = bias; - this->head_dim = head_dim; - this->qk_norm = qk_norm; - - if (mla.kv_lora_rank == 0) { - qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; - } - else { - const int qk_nope_dim = head_dim - mla.qk_rope_dim; - if (mla.q_lora_rank) { - q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; - q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; - } - else { - q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; - } - kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; - kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; - } - output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; - } + Tensor scales; + Tensor zeros; - void malloc(cudaStream_t st) - { - if (qkv.output_dims) { - qkv.malloc(st, bias); - if (qk_norm) { - deviceMalloc((T**)&q_a_layernorm, head_dim, st); - deviceMalloc((T**)&kv_a_layernorm, head_dim, st); - } - } - else { // MLA - if (q_proj.output_dims) { - q_proj.malloc(st); - } - else { - q_a_proj.malloc(st); - q_b_proj.malloc(st); - deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); - } - kv_a_proj.malloc(st); - kv_b_proj.malloc(st); - deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); - } - output.malloc(st, bias); - } + Tensor scales_zeros; - void free(cudaStream_t st) - { - qkv.free(st); - q_proj.free(st); - q_a_proj.free(st); - q_b_proj.free(st); - kv_a_proj.free(st); - kv_b_proj.free(st); - output.free(st); - deviceFree(q_a_layernorm, st); - deviceFree(kv_a_layernorm, st); - } + LoraWeight lora; - int head_dim{}; - bool bias{}; - bool qk_norm{}; + gemm::MatrixLayout k_desc; + gemm::MatrixLayout q_desc; +}; - LlamaDenseWeight qkv; - LlamaDenseWeight output; +struct LlamaAttentionWeight: public core::Module { - LlamaDenseWeight q_proj; - LlamaDenseWeight q_a_proj; - LlamaDenseWeight q_b_proj; - LlamaDenseWeight kv_a_proj; - LlamaDenseWeight kv_b_proj; + LlamaAttentionWeight() = default; - T* q_a_layernorm{}; - T* kv_a_layernorm{}; + LlamaAttentionWeight(int hidden_dim, + int head_dim, + int head_num, + int kv_head_num, + MLAParam mla, + bool bias, + bool qk_norm, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size); + + void prepare(bool use_simt); + + LlamaDenseWeight qkv; + LlamaDenseWeight output; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + Tensor q_a_layernorm; + Tensor kv_a_layernorm; }; -template -struct LlamaFfnWeight { +struct LlamaFfnWeight: core::Module { LlamaFfnWeight() = default; - LlamaFfnWeight( - size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) - { - inter_size /= tp; - - this->inter_size = inter_size; - - gating.input_dims = hidden_dim; - gating.output_dims = inter_size; - gating.type = weight_type; - gating.group_size = group_size; + LlamaFfnWeight(int hidden_dim, + int inter_size, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size, + bool fuse_silu_act); - intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size; - intermediate.type = weight_type; - intermediate.group_size = group_size; + static constexpr bool fuse_up_and_gate = true; - fused_gating_intermediate.input_dims = hidden_dim; - 
fused_gating_intermediate.output_dims = inter_size * 2; - fused_gating_intermediate.type = weight_type; - fused_gating_intermediate.group_size = group_size; + void prepare(bool fused_moe, bool use_simt); - is_fused_silu = fuse_silu_act; - - output.input_dims = inter_size; - output.output_dims = hidden_dim; - output.type = weight_type; - output.group_size = group_size; - } - - void malloc(cudaStream_t st) - { - gating.malloc(st); - intermediate.malloc(st); - output.malloc(st); - } - - void free(cudaStream_t st) - { - gating.free(st); - intermediate.free(st); - output.free(st); - fused_gating_intermediate.free(st); - } - - LlamaDenseWeight gating; - LlamaDenseWeight intermediate; - LlamaDenseWeight output; - LlamaDenseWeight fused_gating_intermediate; + LlamaDenseWeight gating; + LlamaDenseWeight intermediate; + LlamaDenseWeight output; + LlamaDenseWeight fused_gating_intermediate; int inter_size{}; bool is_fused_silu{}; }; -template -struct MoeFfnWeight { +struct MoeFfnWeight: core::Module { MoeFfnWeight() = default; MoeFfnWeight(int layer_id, const MoeParam& param, - size_t hidden_dim, - WeightType weight_type, + int hidden_dim, + DataType data_type, + DataType weight_type, int group_size, - size_t tp, - bool fuse_silu_act) - { - - if (param.expert_num.size() <= layer_id) { - return; - } + int tp_size, + int tp_rank, + bool fuse_silu_act); - const int expert_num = param.expert_num[layer_id]; - - if (expert_num == 0) { - return; - } - - // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); - - gate.input_dims = hidden_dim; - gate.output_dims = expert_num; - gate.type = get_default_weight_type(); - gate.group_size = group_size; - - experts.resize(expert_num); - - method = param.method; - fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; - - for (auto& e : experts) { - // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; - } - - if (param.shared_gate) { - shared_gate.input_dims = hidden_dim; - shared_gate.output_dims = 1; - shared_gate.type = get_default_weight_type(); - gate.group_size = group_size; - } - else { - shared_gate = {}; - } - } - - void malloc(cudaStream_t st) - { - gate.malloc(st); - if (shared_gate.output_dims) { - shared_gate.malloc(st); - } - for (auto& e : experts) { - e.malloc(st); - } - } - - void free(cudaStream_t st) - { - gate.free(st); - shared_gate.free(st); - for (auto& e : experts) { - e.free(st); - } - block.free(st); - } + void prepare(bool use_simt); - LlamaDenseWeight gate; - std::vector> experts; + LlamaDenseWeight gate; + LlamaDenseWeight shared_gate; - LlamaDenseWeight shared_gate; + std::vector> experts; // reference into `experts` - LlamaFfnWeight block; + LlamaFfnWeight block; MoeParam::Method method{}; }; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 7fc15dba38..dd3def0518 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -21,153 +21,63 @@ #include "src/turbomind/kernels/activation_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/utils/anomaly_handler.h" -#include "src/turbomind/utils/nvtx_utils.h" namespace turbomind { -template -void LlamaFfnLayer::allocateBuffer( - size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) +void LlamaFfnLayer::activation(Tensor& gating, Tensor& inter, cudaStream_t stream) { - const 
size_t sz = token_num * inter_size; - - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * sz * inter_buf_factor, false); - inter_buf_ = gating_buf_ + sz; - - if (gating_lora_r + inter_lora_r) { - lora_buf_ = (T*)allocator_->reMalloc(lora_buf_, sizeof(T) * token_num * (gating_lora_r + inter_lora_r)); - } - - is_allocate_buffer_ = true; -} - -template -void LlamaFfnLayer::freeBuffer() -{ - if (is_allocate_buffer_) { - allocator_->free((void**)&gating_buf_); - allocator_->free((void**)&lora_buf_); - is_allocate_buffer_ = false; - } -} - -template -void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) -{ - NvtxScope scope("activation"); - if (is_chunked) { - // gate & up are in the SAME buffer - invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); - sync_check_cuda_error(); - } - else { - // gate & up are in separate buffers - invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); - sync_check_cuda_error(); - } + // Code for dispatching activation types + invokeGenericActivation_v3(gating, inter, stream); } -template -void LlamaFfnLayer::forward(TensorMap* output_tensors, - const TensorMap* input_tensors, - const LlamaFfnWeight* weights) +void LlamaFfnLayer::forward(ForwardParam param) { - /** - * input_tensors: - * \param ffn_input [token_num, hidden_dimension] - * - * output_tensors: - * \param ffn_output [token_num, hidden_dimension] - */ - NvtxScope scope("ffn"); - const size_t token_num = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - const int inter_size = weights->inter_size; - - const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + const auto& mlp = *param.weights; - allocateBuffer(token_num, inter_size, is_fused_silu ? 1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); + const int token_num = param.input.shape(0); + const int inter_size = mlp.inter_size; + const int layer_id = param.layer_id; - const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); - T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); - int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const auto stream = core::Context::stream().handle(); - if (weights->fused_gating_intermediate.kernel) { - NvtxScope scope("fused_silu_ffn"); + Tensor gating; + Tensor inter; - const auto type = weights->is_fused_silu ? LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; + if (mlp.fused_gating_intermediate.weight) { + const auto type = mlp.is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; - linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type); + auto mix = linear_.forward(param.input, mlp.fused_gating_intermediate, type); sync_check_cuda_error(); - if (!weights->is_fused_silu) { - activation(token_num, inter_size, true); + gating = mix.slice({0, 0}, {(int)token_num, inter_size}); + if (!mlp.is_fused_silu) { + inter = mix.slice({0, inter_size}, {(ssize_t)token_num, inter_size}); } - - count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); } else { - { // w1(x) - NvtxScope scope("w1"); - linear_->forward(gating_buf_, // - ffn_input_data, - token_num, - weights->gating, - LlamaLinear::kGemm, - lora_buf_, - lora_mask); - sync_check_cuda_error(); - } - count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3); - - { // w3(x) - NvtxScope scope("w3"); - linear_->forward(inter_buf_, - ffn_input_data, - token_num, - weights->intermediate, - LlamaLinear::kGemm, - lora_buf_, - lora_mask); - sync_check_cuda_error(); - } - count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3); + gating = linear_.forward(param.input, mlp.gating, LlamaLinear::kGemm); + sync_check_cuda_error(); + TM_DEBUG_TENSOR(gating, Concat("w1", layer_id), 3); - // silu(w1(x)) * w3(x) - activation(token_num, inter_size, false); + inter = linear_.forward(param.input, mlp.intermediate, LlamaLinear::kGemm); + sync_check_cuda_error(); + TM_DEBUG_TENSOR(inter, Concat("w3", layer_id), 3); + } - count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3); + if (!mlp.is_fused_silu) { + // silu(w1(x)) * w3(x) + activation(gating, inter, stream); + sync_check_cuda_error(); + TM_DEBUG_TENSOR(gating, Concat("act", layer_id), 3); } { // w2(x) NvtxScope scope("w2"); - const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; - linear_->forward(ffn_output_data, - {gating_buf_, pitch}, - token_num, - weights->output, - LlamaLinear::kGemm, - lora_buf_, - lora_mask); + linear_.forward(gating, mlp.output, LlamaLinear::kGemm, param.output); sync_check_cuda_error(); } - - count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - - if (is_free_buffer_after_forward_) { - freeBuffer(); - } } -#ifdef ENABLE_FP32 -template class LlamaFfnLayer; -#endif -template class LlamaFfnLayer; -#ifdef ENABLE_BF16 -template class LlamaFfnLayer<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index c11c5e56fb..6b5e339fbc 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,50 +19,35 @@ #pragma once +#include "src/turbomind/core/core.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { -template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const Context& ctx): - hidden_units_(model.hidden_units), - stream_(ctx.stream), - linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()) + LlamaFfnLayer(const ModelParam& model, const Context& ctx): hidden_units_(model.hidden_units), linear_(*ctx.linear) { } - ~LlamaFfnLayer() - { - freeBuffer(); - } + struct ForwardParam { + Tensor input; + Tensor output; + const LlamaFfnWeight* weights; + int layer_id; + }; - void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); + void forward(ForwardParam param); private: - void allocateBuffer( - size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); - - void freeBuffer(); - - void activation(int token_num, int inter_size, bool is_chunked); + void activation(Tensor& gating, Tensor& inter, cudaStream_t stream); - const size_t hidden_units_; - cudaStream_t const stream_; - LlamaLinear* const linear_; - IAllocator* const allocator_; - bool is_free_buffer_after_forward_{}; - - T* gating_buf_{}; - T* inter_buf_{}; - T* lora_buf_{}; - - bool is_allocate_buffer_{}; +private: + const size_t hidden_units_; + LlamaLinear& linear_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu index 81dcff7a74..1696920d9b 100644 --- a/src/turbomind/models/llama/LlamaLinear.cu +++ b/src/turbomind/models/llama/LlamaLinear.cu @@ -3,176 +3,176 @@ #include "src/turbomind/kernels/gemm/gemm.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/models/llama/LlamaLinear.h" -#include "src/turbomind/models/llama/llama_decoder_kernels.h" -#include +#include "src/turbomind/utils/cuda_utils.h" + +#include "src/turbomind/core/cuda_data_type.h" namespace turbomind { -template -struct LlamaLinear::Impl { +struct LlamaLinear::Impl { - Impl(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): cublas_wrapper_(cublas_wrapper), stream_(stream) + explicit Impl(cudaStream_t stream): stream_(stream) { workspace_ = {}; workspace_.barriers_size = gemm::Gemm::kBarriersSize; workspace_.partials_size = gemm::Gemm::kPartialsSize; - cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); - cudaMallocAsync(&workspace_.partials, 
workspace_.partials_size, stream_); - cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + + check_cuda_error(cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_)); + check_cuda_error(cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_)); + check_cuda_error(cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_)); + + check_cuda_error(cublasCreate(&cublas_)); + check_cuda_error(cublasSetStream(cublas_, stream_)); + check_cuda_error(cublasSetWorkspace(cublas_, workspace_.partials, workspace_.partials_size)); + + if (0) { + check_cuda_error(cublasSetMathMode(cublas_, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION)); + } } ~Impl() { + cublasDestroy(cublas_); cudaFreeAsync(workspace_.barriers, stream_); cudaFreeAsync(workspace_.partials, stream_); workspace_ = {}; } - void forward(T* output_data, - Pitched input_data, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - T* lora_buff, - int* lora_mask) + void forward(Tensor& output, const Tensor& input, const LlamaDenseWeight& dense, Type type) { - if (input_data.pitch == 0) { - input_data.pitch = weight.input_dims; - } - if (lora_mask != nullptr && weight.lora.r > 0) { - FT_CHECK(type == kGemm); - // output = lora(x) * scale - // output = mask(output) - // output = x*W + output - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.lora.r, // m - batch_size, // n - weight.input_dims, // k - (const T*)weight.lora.a, // A - weight.lora.r, // lda - input_data.ptr, // B - input_data.pitch, // ldb - lora_buff, // C - weight.lora.r); // ldc - - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, // m - batch_size, // n - weight.lora.r, // k - (const T*)weight.lora.b, // A - weight.output_dims, // lda - lora_buff, // B - weight.lora.r, // ldb - output_data, // C - weight.output_dims, // ldc - weight.lora.scale, // alpha - 0.0f); // beta - - invokeMask(output_data, lora_mask, batch_size, weight.output_dims, stream_); - sync_check_cuda_error(); - - type = kFusedAdd; - } - switch (weight.type) { - case WeightType::kFP16: - case WeightType::kFP32: - case WeightType::kBF16: - return forwardFp(output_data, input_data, batch_size, weight, type); - case WeightType::kINT4: - return forwardInt4(output_data, input_data, batch_size, weight, type); + switch (dense.weight_type) { + case kFloat16: + case kFloat32: + case kBfloat16: + return forwardFp(output, input, dense.weight); + case kUint4: + return forwardInt4(output, input, dense, type); default: - FT_CHECK(0); + TM_CHECK(0) << "not implemented for weight type: " << dense.weight_type; } } - void forwardFp(T* output_data, Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type) + void forwardFp(Ref output_, const Tensor& input, const Tensor& weight) { - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, - batch_size, - weight.input_dims, - (const T*)weight.kernel, - weight.output_dims, - input_data.ptr, - input_data.pitch, - output_data, - weight.output_dims, - 1.0f, - type == kFusedAdd ? 1.0f : 0.0f); - // sync_check_cuda_error(); + auto& output = output_.get(); + TM_CHECK_EQ(weight.ndim(), 2); + TM_CHECK_EQ(input.ndim(), 2); + TM_CHECK_EQ(output.ndim(), 2); + + int m, n, k; + std::tie(k, m) = weight.shapes(0, 1); + n = input.shape(0); + + TM_CHECK_EQ(input.shape(1), k); + TM_CHECK_EQ(output.shape(0), n); + TM_CHECK_EQ(output.shape(1), m); + + // [k, m] + cublasOperation_t transa = weight.stride(1) == 1 ? 
CUBLAS_OP_N : CUBLAS_OP_T; + // [n, k] + cublasOperation_t transb = input.stride(1) == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; + + const float alpha = 1.f; + const float beta = 0.f; + + check_cuda_error(cublasGemmEx(cublas_, + transa, + transb, + m, + n, + k, + &alpha, + weight.raw_data(), + to_cuda_dtype(weight.dtype()), + weight.stride(0) * weight.stride(1), // one of these is 1 + input.raw_data(), + to_cuda_dtype(input.dtype()), + input.stride(0) * input.stride(1), // one of these is 1 + &beta, + output.raw_data(), + to_cuda_dtype(output.dtype()), + output.stride(0) * output.stride(1), // one of these is 1 + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - void forwardInt4(T* output_data, Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type) + void forwardInt4(Tensor& output, const Tensor& input, const LlamaDenseWeight& dense, Type type) { + TM_CHECK_EQ(output.ndim(), 2); // A [m, k] + TM_CHECK_EQ(input.ndim(), 2); // C [m, n] + + TM_CHECK_EQ(input.stride(1), 1) << "input must be row-major"; + TM_CHECK_EQ(output.stride(1), 1) << "output must be row-major"; + + TM_CHECK_EQ(output.shape(0), input.shape(0)); + TM_CHECK_EQ(input.shape(1), dense.input_dim); + // TM_CHECK_EQ(output.shape(1), dense.output_dim); + using namespace gemm; const Operation operation{dispatch_policy_, type == kFusedSiluFfn ? Epilogue::kGatedSilu : Epilogue::kNone, {QuantType::kNone}, - {QuantType::kDefault, weight.group_size}, + {QuantType::kDefault, dense.group_size}, 0, {}, nullptr}; const MatrixLayout a_desc{ - get_data_type_v, + input.dtype(), kRowMajor, - batch_size, - (int)weight.input_dims, - input_data.pitch, + (int)input.shape(0), + dense.input_dim, + (int)input.stride(0), }; const MatrixLayout c_desc{ - get_data_type_v, + output.dtype(), // kRowMajor, - batch_size, - (int)weight.output_dims, - type == kFusedSiluFfn ? (int)weight.output_dims / 2 : (int)weight.output_dims, + (int)output.shape(0), + dense.output_dim, + (int)output.stride(0), + // type == kFusedSiluFfn ? (int)weight.output_dim / 2 : (int)weight.output_dim, }; auto ec = gemm_.Run(operation, 1.f, - input_data.ptr, + input.raw_data(), a_desc, nullptr, {}, - weight.kernel, - weight.k_desc, - weight.scales_zeros, - weight.q_desc, + dense.weight.raw_data(), + dense.k_desc, + dense.scales_zeros.raw_data(), + dense.q_desc, type == kFusedAdd ? 
1.0f : 0.0f, - output_data, + output.raw_data(), c_desc, - output_data, + output.raw_data(), c_desc, workspace_, stream_); if (ec) { TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); - // std::abort(); } } - void forward_moe(T* output_data, - Pitched input_data, - const int* indexes, - const int* offsets, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - gemm::Context* context) + void forward_moe(Tensor& output, + const Tensor& input, + const int* indexes, + const int* offsets, + const LlamaDenseWeight& dense, + Type type, + gemm::Context* context) { using namespace gemm; QuantDesc quant_b{}; - if (weight.k_desc.type == gemm::DataType::U4) { + if (dense.k_desc.type == kUint4) { quant_b.type = QuantType::kDefault; - quant_b.group_size = weight.group_size; + quant_b.group_size = dense.group_size; } const Operation operation{dispatch_policy_, @@ -184,56 +184,57 @@ struct LlamaLinear::Impl { nullptr}; MatrixLayout a_desc{ - get_data_type_v, + input.dtype(), kRowMajor, - batch_size, // m - (int)weight.input_dims, // k - input_data.pitch, + (int)output.shape(0), // batch size + dense.input_dim, // k + (int)input.stride(0), }; - // std::cout << "m" << batch_size << "n" << weight.output_dims << "k" << weight.input_dims << " " - // << input_data.pitch << "\n"; - a_desc.offsets = (int*)offsets; a_desc.idxs = (int*)indexes; + // std::cout << "m" << batch_size << "n" << weight.output_dims << "k" << weight.input_dims << " " + // << input_data.pitch << "\n"; + MatrixLayout c_desc{ - get_data_type_v, + output.dtype(), // kRowMajor, - batch_size, - (int)weight.output_dims, - type == kFusedSiluFfn ? (int)weight.output_dims / 2 : (int)weight.output_dims, + (int)output.shape(0), // batch size + dense.output_dim, + (int)output.stride(0), + // type == kFusedSiluFfn ? (int)weight.output_dims / 2 : (int)weight.output_dims, }; c_desc.offsets = (int*)offsets; - a_desc.num = c_desc.num = weight.k_desc.num; + a_desc.num = c_desc.num = dense.k_desc.num; auto ec = gemm_.Run(operation, 1.f, - input_data.ptr, + input.raw_data(), a_desc, nullptr, {}, - weight.kernel, - weight.k_desc, - weight.scales_zeros, - weight.q_desc, + dense.weight.raw_data(), + dense.k_desc, + dense.scales_zeros.data_or((void*)nullptr), + dense.q_desc, type == kFusedAdd ? 1.0f : 0.0f, - output_data, + output.raw_data(), c_desc, - output_data, + output.raw_data(), c_desc, workspace_, stream_); if (ec) { TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); - // std::abort(); } } - cublasMMWrapper* cublas_wrapper_; + // cublasMMWrapper* cublas_wrapper_; + cublasHandle_t cublas_; gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; cudaStream_t stream_{}; @@ -241,45 +242,50 @@ struct LlamaLinear::Impl { gemm::Workspace workspace_; }; -template -LlamaLinear::LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): - impl_{std::make_shared(cublas_wrapper, stream)} -{ -} +LlamaLinear::LlamaLinear(cudaStream_t stream): impl_{std::make_shared(stream)} {} -template -void LlamaLinear::forward(T* output_data, - Pitched input_data, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - T* lora_buff, - int* lora_mask) +Tensor LlamaLinear::forward(const Tensor& input, // + const LlamaDenseWeight& dense, + Type type, + std::optional output) { - impl_->forward(output_data, input_data, batch_size, weight, type, lora_buff, lora_mask); + ssize_t output_dim = type == kFusedSiluFfn ? 
dense.output_dim / 2 : dense.output_dim; + + Tensor in = input.view({-1, input.shape(-1)}); + Tensor out; + + if (output) { + out = output->view({in.shape(0), output_dim}); + } + else { + out = Tensor({in.shape(0), output_dim}, input.dtype(), input.device()); + } + + impl_->forward(out, in, dense, type); + + auto shape = input.shape(); + shape.back() = out.shape(-1); + + return out.view(shape); } -template -void LlamaLinear::forward_moe(T* output_data, - Pitched input_data, - const int* indexes, - const int* offsets, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - gemm::Context* context) +void LlamaLinear::forward_moe(Tensor& output, + const Tensor& input, + const int* indexes, + const int* offsets, + const LlamaDenseWeight& dense, + Type type, + gemm::Context* context) { - impl_->forward_moe(output_data, input_data, indexes, offsets, batch_size, weight, type, context); + return impl_->forward_moe(output, input, indexes, offsets, dense, type, context); } -template -void LlamaLinear::set_measure(bool measure) +void LlamaLinear::set_measure(bool measure) { impl_->dispatch_policy_ = measure ? gemm::DispatchPolicy::kMeasure : gemm::DispatchPolicy::kReuse; } -template -int LlamaLinear::Export(std::ostream& os) +int LlamaLinear::Export(std::ostream& os) { if (os) { return impl_->gemm_.Export(os); @@ -287,8 +293,7 @@ int LlamaLinear::Export(std::ostream& os) return 0; } -template -int LlamaLinear::Import(std::istream& is) +int LlamaLinear::Import(std::istream& is) { auto n_records = 0; if (is) { @@ -300,18 +305,9 @@ int LlamaLinear::Import(std::istream& is) return n_records; } -template -std::vector LlamaLinear::GetTuningSeq() const +std::vector LlamaLinear::GetTuningSeq() const { return impl_->gemm_.GetTuningSeq(); } -#ifdef ENABLE_FP32 -template class LlamaLinear; -#endif -template class LlamaLinear; -#ifdef ENABLE_BF16 -template class LlamaLinear<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h index a22eb69ebd..625376aeb7 100644 --- a/src/turbomind/models/llama/LlamaLinear.h +++ b/src/turbomind/models/llama/LlamaLinear.h @@ -2,14 +2,14 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/utils/cublasMMWrapper.h" #include #include +#include "src/turbomind/core/core.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" + namespace turbomind { -template class LlamaLinear { public: enum Type @@ -19,30 +19,20 @@ class LlamaLinear { kFusedAdd }; - struct Pitched { - const T* ptr; - int pitch; - Pitched(const T* ptr, int pitch = 0): ptr{ptr}, pitch{pitch} {} - }; + explicit LlamaLinear(cudaStream_t stream); + + Tensor forward(const Tensor& input, // + const LlamaDenseWeight& weight, + Type type = kGemm, + std::optional output = {}); - LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream); - - void forward(T* output_data, - Pitched input_data, - int batch_size, - const LlamaDenseWeight& weight, - Type type = kGemm, - T* lora_buff = nullptr, - int* lora_mask = nullptr); - - void forward_moe(T* output_data, - Pitched input_data, - const int* indexes, - const int* offsets, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - gemm::Context* context); + void forward_moe(Tensor& output, + const Tensor& input, + const int* indexes, + const int* offsets, + const LlamaDenseWeight& weight, + Type type, + gemm::Context* context); void set_measure(bool measure); diff --git a/src/turbomind/models/llama/LlamaV2.cc 
b/src/turbomind/models/llama/LlamaV2.cc index ad79b91789..6739cce2fa 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -24,9 +24,10 @@ #include #include "src/turbomind/comm/device_comm.h" +#include "src/turbomind/core/core.h" #include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/LlamaBatch.h" +#include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/SequenceManager.h" @@ -36,7 +37,6 @@ #include "src/turbomind/kernels/gpt_kernels.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" @@ -50,15 +50,16 @@ inline int pad_vocab_size(int vocab_size, int tp) return (vocab_size + tp - 1) / tp * tp; } -template -LlamaV2::LlamaV2(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx, - int max_batch_size, - std::shared_ptr> weights): +LlamaV2::LlamaV2(DataType dtype, + const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx, + int max_batch_size, + std::shared_ptr weights): + dtype_{dtype}, param_(model), attn_param_(attn), lora_param_(lora), @@ -76,10 +77,7 @@ LlamaV2::LlamaV2(const ModelParam& model, local_kv_head_num_(model.kv_head_num / engine.attn_tp_size), weights_(std::move(weights)), stream_(ctx.stream), - cublas_wrapper_(ctx.cublas_wrapper.get()), - allocator_(ctx.allocator.get()), - linear_(ctx.linear.get()), - is_free_buffer_after_forward_(false), + linear_(*ctx.linear), debug_(isDebug()) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -88,34 +86,19 @@ LlamaV2::LlamaV2(const ModelParam& model, use_allgather_2d_ = true; } - unified_decoder_ = std::make_unique>(model, engine, attn, moe, lora, ctx); + unified_decoder_ = std::make_unique(model, engine, attn, moe, lora, ctx); - dynamic_decode_layer_ = std::make_unique>(vocab_size_, - vocab_size_padded_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - (cudaDeviceProp*)&ctx.cuda_device_prop); - - unified_decoder_->allocateBuffer(max_batch_size); -} - -template -LlamaV2::~LlamaV2() -{ - dynamic_decode_layer_.reset(); - unified_decoder_.reset(); + dynamic_decode_ = std::make_unique( + dtype_, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop); } -template -void LlamaV2::updateEmbedding(T* decoder_input, - const int bsz, - const int* h_input_length, - const Sequence** sequences, - int token_num, - int* lora_mask, - bool* have_embeddings) +void LlamaV2::updateEmbedding(char* decoder_input, + const int bsz, + const int* h_input_length, + const Sequence** sequences, + int token_num, + int* lora_mask, + bool* have_embeddings) { if (isTuning()) return; @@ -130,6 +113,8 @@ void LlamaV2::updateEmbedding(T* decoder_input, mask_ptr = mask.data(); } + const size_t elem_size = byte_size(dtype_, 1); + for (int i = 0; i < bsz; i++) { const auto& seq = *sequences[i]; const auto& embeddings = seq.input_embeddings; @@ -148,16 +133,16 @@ void LlamaV2::updateEmbedding(T* decoder_input, // calculate intersection of [begin, end) and [seq.cache_len, seq.cache_len + h_input_length[i]) begin = std::max(begin, seq.cache_len); end = std::min(end, seq.cache_len + h_input_length[i]); - size_t byte_size = (end - begin) * 
hidden_units_ * sizeof(T); - T* dst_ptr = decoder_input + off_dst * hidden_units_; - auto src_ptr = embeddings[j].data() + off_src * hidden_units_ * sizeof(T); - cudaMemcpyAsync(dst_ptr, src_ptr, byte_size, cudaMemcpyDefault, stream_); + size_t byte_size = elem_size * (end - begin) * hidden_units_; + char* dst_ptr = decoder_input + elem_size * off_dst * hidden_units_; + auto src_ptr = embeddings[j].data() + elem_size * off_src * hidden_units_; + check_cuda_error(cudaMemcpyAsync(dst_ptr, src_ptr, byte_size, cudaMemcpyDefault, stream_)); if (lora_mask != nullptr) { std::fill_n(mask_ptr + off_dst, (end - begin), 1); *have_embeddings = true; } } - decoder_input += h_input_length[i] * hidden_units_; + decoder_input += elem_size * h_input_length[i] * hidden_units_; mask_ptr += h_input_length[i]; } @@ -168,271 +153,206 @@ void LlamaV2::updateEmbedding(T* decoder_input, sync_check_cuda_error(); } -template -void LlamaV2::forwardUnified(T* out, - T* decoder_output, - T* decoder_input, - void** block_ptrs, - const int* cu_block_cnts, - const int* input_ids, - const int* h_input_length, - const int* h_context_length, - const float* rope_theta, - const bool* finished, - size_t token_num, - const int* local_token_nums, - int dc_batch_size, - int pf_batch_size, - int* lora_mask, - const Sequence** sequences) +void LlamaV2::Forward(Buffer_ input_ids, + Tensor hidden_states_out, + Tensor decoder_out, + Buffer kv_block_ptrs, + Buffer cu_block_nums, + Buffer_ h_input_length, + Buffer_ h_context_length, + Buffer rope_base, + Buffer finished, + Buffer local_token_nums, + Buffer lora_mask, + int decode_num, + int prefil_num, + const Sequence** sequences) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); + Tensor input_embeds; + + const int token_num = input_ids.size(); + if (token_num) { + const auto& embedding_table = weights_->pre_decoder_embedding.weight; + TM_CHECK_EQ(embedding_table.shape(1) * tp_size_, hidden_units_); + + input_embeds = Tensor{{token_num, (int)hidden_units_}, dtype_, kDEVICE}; + if (tp_size_ == 1) { - invokeInputIdsEmbeddingLookupPosEncoding(decoder_input, - nullptr, // processed somewhere else - weights_->pre_decoder_embedding_table, - static_cast(nullptr), - pPromptTuningParam{}, - input_ids, - 0, // only used for position encoding - token_num, - token_num, - 1, - hidden_units_, - stream_); + invokeEmbeddingLookup(input_embeds, input_ids, embedding_table, stream_); sync_check_cuda_error(); } - else { - const size_t local_hidden_units = hidden_units_ / tp_size_; - const size_t slice = token_num * local_hidden_units; - invokeInputIdsEmbeddingLookupPosEncoding(decoder_output + tp_rank_ * slice, - nullptr, // processed somewhere else - weights_->pre_decoder_embedding_table, - static_cast(nullptr), - pPromptTuningParam{}, - input_ids, - 0, // only used for position encoding - token_num, - token_num, - 1, - local_hidden_units, - stream_); + else if (use_allgather_2d_) { + const auto local_hidden_units = embedding_table.shape(1); + Tensor temp{hidden_states_out.buffer(), {token_num, tp_size_, local_hidden_units}}; + + auto local = temp.slice({0, tp_rank_, 0}, {-1, 1, -1}).squeeze(1); + + invokeEmbeddingLookup(local, input_ids, embedding_table, stream_); sync_check_cuda_error(); - comm_->d_comm->AllGather(decoder_output + tp_rank_ * slice, - decoder_output, - slice, - getTensorType(), - comm_->d_tp_group, - stream_); + comm_->d_comm->AllGather2D(local.raw_data(), + temp.raw_data(), + hidden_units_, + local_hidden_units, + local_hidden_units, + token_num, + local.dtype(), + {true, true}, + 
comm_->d_tp_group, + stream_); sync_check_cuda_error(); - invokeInPlaceTranspose102( - decoder_input, decoder_output, tp_size_, token_num, local_hidden_units, false, stream_); + Copy(temp.buffer(), input_embeds.buffer()); + } + else { + const auto local_hidden_units = embedding_table.shape(1); + Tensor temp{hidden_states_out.buffer(), {tp_size_, token_num, local_hidden_units}}; + + auto local = temp.slice(tp_rank_).squeeze(0); + + invokeEmbeddingLookup(local, input_ids, embedding_table, stream_); + sync_check_cuda_error(); + comm_->d_comm->AllGather( + local.raw_data(), temp.raw_data(), local.size(), dtype_, comm_->d_tp_group, stream_); + sync_check_cuda_error(); + + invokeInPlaceTranspose102((uint16_t*)input_embeds.raw_data(), + (uint16_t*)temp.raw_data(), + tp_size_, + token_num, + local_hidden_units, + false, + stream_); sync_check_cuda_error(); } - count_and_fix(decoder_input, token_num * hidden_units_, "embedding", 1); + TM_DEBUG_TENSOR(input_embeds, "embeddings", 1); } bool have_embeddings = false; if (token_num) { - updateEmbedding(decoder_input, - dc_batch_size + pf_batch_size, - h_input_length, + // Copy input embeddings from corresponding sequences + updateEmbedding((char*)input_embeds.raw_data(), + h_input_length.size(), + h_input_length.data(), sequences, token_num, - lora_mask, + lora_mask ? lora_mask.data() : nullptr, &have_embeddings); sync_check_cuda_error(); } - const auto dtype = getTensorType(); - const size_t bsz = dc_batch_size + pf_batch_size; - - TensorMap inputs{ - {"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_input}}, - {"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}}, - {"h_q_len", {MEMORY_CPU, TYPE_INT32, {bsz}, h_input_length}}, - {"h_k_len", {MEMORY_CPU, TYPE_INT32, {bsz}, h_context_length}}, - {"finished", {MEMORY_GPU, TYPE_BOOL, {bsz}, finished}}, - {"dc_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &dc_batch_size}}, - {"pf_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &pf_batch_size}}, - {"rope_theta", {MEMORY_GPU, TYPE_FP32, {hidden_units_}, rope_theta}}, - {"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {bsz}, cu_block_cnts}}, - {"local_token_nums", {MEMORY_GPU, TYPE_INT32, {1}, local_token_nums}}, - }; - - TensorMap outputs{{"decoder_output", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_output}}, - {"block_ptrs", {MEMORY_GPU, TYPE_UINT64, {bsz}, block_ptrs}}, - {"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, out}}}; - - if (lora_mask != nullptr && have_embeddings) { - inputs.insert({"lora_mask", {MEMORY_GPU, TYPE_INT32, {token_num}, lora_mask}}); - } - - unified_decoder_->forward(&outputs, &inputs, &weights_->decoder_layer_weights); + TensorMap args{{"decoder_input", input_embeds}, + {"decoder_output", hidden_states_out.view({-1, (int)hidden_units_}).borrow()}, + {"last_token_hidden_units", decoder_out}, + {"output_norm_weight", weights_->output_norm_weight}, + {"h_q_len", h_input_length}, + {"h_k_len", h_context_length}, + {"finished", finished}, + {"decode_num", Buffer{&decode_num, 1, kCPU}}, + {"prefil_num", Buffer{&prefil_num, 1, kCPU}}, + {"rope_base", rope_base}, + {"cu_block_nums", cu_block_nums}, + {"kv_block_ptrs", kv_block_ptrs}, + {"local_token_nums", local_token_nums}}; + + unified_decoder_->Forward(args, weights_->decoder_layer_weights); } -template -void LlamaV2::postDecodeEmbedding(T* logits, T* local_logits, const T* decoder_output, int batch_size) +Tensor LlamaV2::postDecodeEmbedding(const Tensor& features, Buffer local_logits) { NvtxScope 
scope("postDecodeEmbedding"); TM_LOG_DEBUG(__PRETTY_FUNCTION__); - cudaDataType_t data_type = getCudaDataType(); - float alpha = 1.f; - float beta = 0.f; - FT_CHECK(vocab_size_padded_ % tp_size_ == 0); - const size_t local_vocab_size = vocab_size_padded_ / tp_size_; - - auto invoke_gemm = [&](int first, int n, auto C, size_t batch_stride_C, size_t rank_stride_C) { - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - local_vocab_size, // m - n, - hidden_units_, // k - &alpha, - weights_->post_decoder_embedding_kernel, - data_type, - hidden_units_, // k - decoder_output + first * hidden_units_, - data_type, - hidden_units_, // k - &beta, - C + first * batch_stride_C + tp_rank_ * rank_stride_C, - data_type, - batch_stride_C, // ldc - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - }; + TM_CHECK(vocab_size_padded_ % tp_size_ == 0) << vocab_size_padded_ << " " << tp_size_; + + const int bsz = features.shape(0); + const int local_vocab_size = vocab_size_padded_ / tp_size_; if (tp_size_ == 1) { - invoke_gemm(0, batch_size, logits, vocab_size_padded_, 0); + Tensor logits{local_logits, {bsz, (int)vocab_size_padded_}}; + linear_.forward(features, weights_->post_decoder_embedding, LlamaLinear::kGemm, logits); sync_check_cuda_error(); + + TM_DEBUG_TENSOR(logits, "logits", 1); + return logits; } - else if (use_allgather_2d_ == false) { - FT_CHECK(logits != local_logits); - const size_t slice = batch_size * local_vocab_size; - invoke_gemm(0, batch_size, local_logits, local_vocab_size, slice); - sync_check_cuda_error(); - comm_->d_comm->AllGather( - local_logits + tp_rank_ * slice, local_logits, slice, getTensorType(), comm_->d_tp_group, stream_); + else if (use_allgather_2d_) { + Tensor logits{local_logits, {bsz, tp_size_, local_vocab_size}}; + Tensor local = logits.slice({0, tp_rank_, 0}, {-1, 1, -1}); + linear_.forward(features, weights_->post_decoder_embedding, LlamaLinear::kGemm, local.squeeze(1)); sync_check_cuda_error(); - invokeTransposeAxis01(logits, local_logits, tp_size_, batch_size, local_vocab_size, stream_); + comm_->d_comm->AllGather2D(local.raw_data(), + logits.raw_data(), + vocab_size_padded_, + local_vocab_size, + local_vocab_size, + bsz, + logits.dtype(), + {true, true}, + comm_->d_tp_group, + stream_); sync_check_cuda_error(); + return logits.view({bsz, -1}); } else { - FT_CHECK(logits == local_logits); - const int max_stages = 1; - const int min_stage_tokens = 512; - const int step = std::max(std::min(batch_size, min_stage_tokens), (batch_size + max_stages - 1) / max_stages); - cudaStream_t comm_stream = stream_; - cudaEvent_t comm_event{}; - if (step < batch_size) { - check_cuda_error(cudaStreamCreateWithFlags(&comm_stream, cudaStreamNonBlocking)); - check_cuda_error(cudaEventCreateWithFlags(&comm_event, cudaEventDisableTiming)); - } - for (int first = 0; first < batch_size; first += step) { - const int n = std::min(first + step, batch_size) - first; - invoke_gemm(first, n, local_logits, vocab_size_padded_, local_vocab_size); - sync_check_cuda_error(); - if (comm_stream != stream_) { - check_cuda_error(cudaEventRecord(comm_event, stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream, comm_event)); - } - comm_->d_comm->AllGather2D(local_logits + first * vocab_size_padded_ + tp_rank_ * local_vocab_size, - local_logits + first * vocab_size_padded_, - vocab_size_padded_, - local_vocab_size, - local_vocab_size, - n, - getTensorType(), - {first == 0, first + n == batch_size}, - comm_->d_tp_group, - comm_stream); - sync_check_cuda_error(); - } - if (comm_stream != stream_) { - 
check_cuda_error(cudaEventRecord(comm_event, comm_stream)); - check_cuda_error(cudaStreamWaitEvent(stream_, comm_event)); - check_cuda_error(cudaEventDestroy(comm_event)); - check_cuda_error(cudaStreamDestroy(comm_stream)); - } + Tensor logits{local_logits, {tp_size_, bsz, local_vocab_size}}; + Tensor local = logits.slice({tp_rank_, 0, 0}, {1, -1, -1}); + linear_.forward(features, weights_->post_decoder_embedding, LlamaLinear::kGemm, local.squeeze(0)); + sync_check_cuda_error(); + comm_->d_comm->AllGather( + local.raw_data(), logits.raw_data(), local.size(), local.dtype(), comm_->d_tp_group, stream_); + sync_check_cuda_error(); + Tensor out{{bsz, (int)vocab_size_padded_}, features.dtype(), features.device()}; + invokeTransposeAxis01( + (uint16_t*)out.raw_data(), (uint16_t*)logits.raw_data(), tp_size_, bsz, local_vocab_size, stream_); + sync_check_cuda_error(); + return out; } } -template -void LlamaV2::dynamicDecode(int* token_ids, - bool* finished, - int* sequence_length, - bool* should_stop, - curandState_t* curand_state, - TensorMap* inputs, - TensorMap* outputs, - const T* logits, - const uint32_t* seq_limit_len, - const int* context_length, - int step, - int ite, - size_t max_context_len, - size_t token_ids_len, - size_t batch_size) +void LlamaV2::dynamicDecode(Buffer token_ids, + Buffer finished, + Buffer sequence_length, + Tensor curand_state, + Tensor logits, + Buffer seq_limit_len, + Buffer init_context_length, + Buffer context_length, + Buffer prompt_length, + Buffer sampled_logprobs, + Buffer sampled_indexes, + Buffer sampled_nums, + int step, + int max_context_len) { NvtxScope scope("dynamicDecode"); TM_LOG_DEBUG(__PRETTY_FUNCTION__); - int local_batch_size = (int)batch_size; - - std::unordered_map dynamic_decode_input_tensors{ - {"logits", {MEMORY_GPU, getTensorType(), {batch_size, (size_t)1, vocab_size_padded_}, logits}}, - {"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}}, - {"max_input_length", {MEMORY_CPU, TYPE_INT32, {1}, &max_context_len}}, - {"sequence_limit_length", {MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len}}, - {"input_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size, 1}, context_length}}, - {"ite", {MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, - {"local_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}, + TensorMap args{ + {"logits", logits}, + {"step", Buffer{&step, 1, kCPU}}, + {"max_input_length", Buffer{&max_context_len, 1, kCPU}}, + {"sequence_limit_length", seq_limit_len}, + {"init_context_length", init_context_length}, + {"context_length", context_length}, + {"prompt_length", prompt_length}, + {"output_ids", token_ids}, // inout + {"finished", finished}, // inout + {"sequence_length", sequence_length}, // inout + {"curand_state", curand_state}, // inout }; - const std::vector optional_inputs{"end_ids", - "stop_words_list", - "bad_words_list", - "runtime_top_k", - "runtime_top_p", - "temperature", - "repetition_penalty"}; - for (const auto& key : optional_inputs) { - if (inputs->isExist(key)) { - dynamic_decode_input_tensors.insert({key, inputs->at(key)}); - } + if (sampled_logprobs) { + args.emplace("sampled_logprobs", sampled_logprobs); + args.emplace("sampled_indexes", sampled_indexes); + args.emplace("sampled_nums", sampled_nums); } - std::unordered_map dynamic_decode_output_tensors{ - {"output_ids", {MEMORY_GPU, TYPE_INT32, {token_ids_len, batch_size, 1U}, token_ids}}, - {"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}}, - {"sequence_length", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}}, - {"should_stop", {MEMORY_CPU, 
TYPE_BOOL, {1}, should_stop}}, - {"curand_state", {MEMORY_GPU, TYPE_VOID, {batch_size}, curand_state}}}; - - const std::vector optional_outputs{ - "cum_log_probs", "output_log_probs", "sampled_indexes", "sampled_logprobs", "sampled_nums"}; - for (const auto& key : optional_outputs) { - if (outputs->isExist(key)) { - dynamic_decode_output_tensors.insert({key, outputs->at(key)}); - } - } - - dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); + dynamic_decode_->Forward(args); } -template class LlamaV2; -#ifdef ENABLE_FP32 -template class LlamaV2; -#endif -#ifdef ENABLE_BF16 -template class LlamaV2<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 445c778c5d..e799070b3a 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -29,24 +29,22 @@ #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/unified_decoder.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" namespace turbomind { -template +class LlamaBatch; + class LlamaV2 { public: - ~LlamaV2(); - - LlamaV2(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx, - int max_batch_size, - std::shared_ptr> weights); + LlamaV2(DataType dtype, + const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx, + int max_batch_size, + std::shared_ptr weights); size_t vocab_size() const noexcept { @@ -54,7 +52,7 @@ class LlamaV2 { } private: - void updateEmbedding(T* decoder_input, + void updateEmbedding(char* decoder_input, const int bsz, const int* h_input_length, const Sequence** sequences, @@ -62,43 +60,42 @@ class LlamaV2 { int* lora_mask, bool* have_embeddings); - void forwardUnified(T* out, - T* decoder_output, - T* decoder_input, - void** block_ptrs, - const int* cu_block_cnts, - const int* input_ids, - const int* h_input_length, - const int* h_context_length, - const float* rope_theta, - const bool* finished, - size_t token_num, - const int* local_token_nums, - int dc_batch_size, - int pf_batch_size, - int* lora_mask, - const Sequence** sequences); - - void postDecodeEmbedding(T* logits, T* local_logits, const T* decoder_output, int batch_size); - - void dynamicDecode(int* token_ids, - bool* finished, - int* sequence_length, - bool* should_stop, - curandState_t* curand_state, - TensorMap* inputs, - TensorMap* outputs, - const T* logits, - const uint32_t* seq_limit_len, - const int* context_length, - int step, - int ite, - size_t max_context_len, - size_t token_ids_len, - size_t batch_size); + void Forward(Buffer_ input_ids, + Tensor hidden_states_out, + Tensor decoder_out, + Buffer kv_block_ptrs, + Buffer cu_block_nums, + Buffer_ h_input_length, + Buffer_ h_context_length, + Buffer rope_base, + Buffer finished, + Buffer local_token_nums, + Buffer lora_mask, + int decode_num, + int prefil_num, + const Sequence** sequences); + + Tensor postDecodeEmbedding(const Tensor& features, Buffer local_logits); + + void dynamicDecode(Buffer token_ids, + Buffer finished, + Buffer sequence_length, + Tensor curand_state, + Tensor logits, + Buffer seq_limit_len, + Buffer init_context_length, + Buffer context_length, + Buffer prompt_length, + Buffer sampled_logprobs, // 
<- indicator + Buffer sampled_indexes, + Buffer sampled_nums, + int step, + int max_context_len); private: - friend class LlamaBatch; + friend class LlamaBatch; + + const DataType dtype_; const ModelParam param_; const AttentionParam attn_param_; @@ -118,21 +115,18 @@ class LlamaV2 { const size_t local_head_num_; const size_t local_kv_head_num_; - const std::shared_ptr> weights_{}; + const std::shared_ptr weights_; - // Refs into `Context`, make the pointer constant (not the pointed objects) - cudaStream_t const stream_; - cublasMMWrapper* const cublas_wrapper_; - IAllocator* const allocator_; - LlamaLinear* const linear_; + // Refs into `Context`, make the pointer constant (not the pointed objects) + cudaStream_t const stream_; + LlamaLinear& linear_; bool use_allgather_2d_{false}; - const bool is_free_buffer_after_forward_; const bool debug_; - std::unique_ptr> unified_decoder_; - std::unique_ptr> dynamic_decode_layer_; + std::unique_ptr unified_decoder_; + std::unique_ptr dynamic_decode_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 4aca3c0056..0a23f986d6 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -18,26 +18,30 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/memory_utils.h" -#include namespace turbomind { -template -LlamaWeight::LlamaWeight(const ModelParam& model, - const EngineParam& engine_param, - const LoraParam& lora_param, - const MoeParam& moe_param): +LlamaWeight::LlamaWeight(DataType data_type, + const ModelParam& model, + const EngineParam& engine_param, + const LoraParam& lora_param, + const MoeParam& moe_param): hidden_units_(model.hidden_units), inter_size_(model.inter_size), vocab_size_(model.vocab_size), vocab_size_padded_(model.vocab_size), embedding_size_(model.embedding_size), num_layer_(model.layer_num), - weight_type_(model.weight_type), + data_type_{data_type}, + weight_type_{model.weight_type}, tp_size_(engine_param.attn_tp_size), tp_rank_(engine_param.attn_tp_rank) { @@ -51,138 +55,65 @@ LlamaWeight::LlamaWeight(const ModelParam& model, } FT_CHECK(hidden_units_ % tp_size_ == 0); - check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + stream_ = core::Stream::create(); + alloca_ = core::Allocator{stream_, false}; + + core::ContextGuard guard = context(); + + TM_CHECK_EQ(vocab_size_padded_ % tp_size_, 0); + TM_CHECK_EQ(hidden_units_ % tp_size_, 0); + + pre_decoder_embedding.emplace(embedding_size_, hidden_units_ / tp_size_, data_type, false, data_type, 1); + post_decoder_embedding.emplace(hidden_units_, vocab_size_padded_ / tp_size_, data_type, false, data_type, 1); + register_module("tok_embeddings", pre_decoder_embedding, tp_rank_); + register_module("output", post_decoder_embedding, tp_rank_); decoder_layer_weights.reserve(num_layer_); - for (unsigned l = 0; l < num_layer_; ++l) { + for (int i = 0; i < num_layer_; ++i) { decoder_layer_weights.emplace_back( - new LlamaDecoderLayerWeight(l, model, engine_param, lora_param, moe_param)); - 
decoder_layer_weights.back()->malloc(stream_); + new LlamaDecoderLayerWeight(data_type, i, model, engine_param, lora_param, moe_param)); + register_module("layers", *decoder_layer_weights.back(), i); } - FT_CHECK(vocab_size_padded_ % tp_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tp_size_, stream_); - deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tp_size_, stream_); - - // Wait for allocations - check_cuda_error(cudaStreamSynchronize(stream_)); + output_norm_weight = Tensor{{hidden_units_}, data_type_, kDEVICE}; + register_parameter("norm.weight", output_norm_weight); } -template -LlamaWeight::~LlamaWeight() +LlamaWeight::~LlamaWeight() { - deviceFree(pre_decoder_embedding_table, stream_); - deviceFree(output_norm_weight, stream_); - deviceFree(post_decoder_embedding_kernel, stream_); + core::ContextGuard guard = context(); + + pre_decoder_embedding = {}; + post_decoder_embedding = {}; + output_norm_weight = {}; for (auto& p : decoder_layer_weights) { - p->free(stream_); delete p; } decoder_layer_weights.clear(); // Wait for deallocations - check_cuda_error(cudaStreamSynchronize(stream_)); - check_cuda_error(cudaStreamDestroy(stream_)); - stream_ = {}; -} - -template -void LlamaWeight::loadModel(std::string dir_path) -{ - FtCudaDataType model_file_type = FtCudaDataType::FP16; - if (weight_type_ == WeightType::kBF16) { - model_file_type = FtCudaDataType::BF16; - } - dir_path += '/'; - - loadWeightFromBin((T*)pre_decoder_embedding_table, - {embedding_size_ * hidden_units_ / tp_size_}, - dir_path + "tok_embeddings." + std::to_string(tp_rank_) + ".weight", - model_file_type); - - loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type); - - loadWeightFromBin((T*)post_decoder_embedding_kernel, - {hidden_units_ * vocab_size_padded_ / tp_size_}, - dir_path + "output." + std::to_string(tp_rank_) + ".weight", - model_file_type); - - for (unsigned layer = 0; layer < num_layer_; ++layer) { - decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type); - } + core::Context::stream().Sync(); } -template -TensorMap LlamaWeight::getParams() +core::ContextGuard LlamaWeight::context() const { - TensorMap output; - - output.insert("tok_embeddings." + std::to_string(tp_rank_) + ".weight", - Tensor{MEMORY_GPU, - getTensorType(), - {embedding_size_ * hidden_units_ / tp_size_ * sizeof(T)}, - pre_decoder_embedding_table}); - - output.insert("norm.weight", - Tensor{MEMORY_GPU, getTensorType(), {hidden_units_ * sizeof(T)}, output_norm_weight}); - - output.insert("output." 
+ std::to_string(tp_rank_) + ".weight", - Tensor{MEMORY_GPU, - getTensorType(), - {hidden_units_ * vocab_size_padded_ * sizeof(T) / tp_size_}, - post_decoder_embedding_kernel}); - - // transformer layers - for (size_t i = 0; i < num_layer_; i++) { - std::string prefix = fmtstr("layers.%d", i); - TensorMap layeri = decoder_layer_weights[i]->getParams(prefix); - for (auto [name, tensor] : layeri) { - output.insert(name, tensor); - } - } - - return output; + return core::ContextGuard{stream_, alloca_}; } -template -void LlamaWeight::prepare(const cudaDeviceProp& prop) +void LlamaWeight::prepare(const cudaDeviceProp& prop) { - const auto workspace_size = [&] { - size_t size{}; - for (const auto& layer : decoder_layer_weights) { - size = std::max(size, layer->workspace_size()); - } - return size; - }(); - - char* workspace{}; - - TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d", workspace_size); + core::ContextGuard guard = context(); // Wait for the weights to be filled externally check_cuda_error(cudaDeviceSynchronize()); - if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size, stream_); - } + auto stream = core::Context::stream().handle(); + for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop, stream_); + layer->prepare(prop, stream); } - - deviceFree(workspace, stream_); - - check_cuda_error(cudaStreamSynchronize(stream_)); } -#ifdef ENABLE_FP32 -template struct LlamaWeight; -#endif -template struct LlamaWeight; -#ifdef ENABLE_BF16 -template struct LlamaWeight<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index 8bc77dc26b..08a3f7d7d7 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -20,16 +20,18 @@ #pragma once +#include "src/turbomind/core/context.h" #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/llama_params.h" namespace turbomind { -template -struct LlamaWeight { +struct LlamaWeight: core::Module { LlamaWeight() = default; - LlamaWeight(const ModelParam& model_param, + LlamaWeight(DataType data_type, + const ModelParam& model_param, const EngineParam& engine_param, const LoraParam& lora_param, const MoeParam& moe_param); @@ -39,31 +41,34 @@ struct LlamaWeight { LlamaWeight(const LlamaWeight&) = delete; LlamaWeight& operator=(const LlamaWeight&) = delete; - void loadModel(std::string dir_path); + void prepare(const cudaDeviceProp& prop); - TensorMap getParams(); + core::ContextGuard context() const; - void prepare(const cudaDeviceProp& prop); + std::vector decoder_layer_weights; - std::vector*> decoder_layer_weights; + LlamaDenseWeight pre_decoder_embedding; + LlamaDenseWeight post_decoder_embedding; - T* pre_decoder_embedding_table{}; - T* output_norm_weight{}; - T* post_decoder_embedding_kernel{}; + Tensor output_norm_weight; private: - size_t hidden_units_; - size_t vocab_size_; - size_t vocab_size_padded_; - size_t embedding_size_; - size_t num_layer_; - WeightType weight_type_; - size_t tp_size_; // this will follow attn tp param - size_t tp_rank_; + int hidden_units_; + int vocab_size_; + int vocab_size_padded_; + int embedding_size_; + int num_layer_; + + DataType data_type_; + DataType weight_type_; + + int tp_size_; // this will follow attn tp param + int tp_rank_; std::vector inter_size_; - cudaStream_t stream_; + core::Stream stream_; + 
core::Allocator alloca_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 9497f42164..623ae3e332 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -3,15 +3,12 @@ #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/kernels/attention/block.h" #include "src/turbomind/models/llama/BlockManager.h" -#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" #include #include #include #include -#include - namespace turbomind { SequenceManager::SequenceManager(size_t layer_num, @@ -20,7 +17,7 @@ SequenceManager::SequenceManager(size_t layer_num, int chunk_size, bool enable_prefix_caching, int rank, - IAllocator* allocator, + core::Allocator allocator, GetFreeMemSize get_free_size): block_seq_len_(block_config.block_len_), rank_(rank) { diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index a71a556aaa..3e17ff3553 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -2,9 +2,12 @@ #pragma once +#include + +#include "src/turbomind/core/allocator.h" + #include "src/turbomind/models/llama/BlockManager.h" #include "src/turbomind/models/llama/BlockTrie.h" -#include namespace turbomind { @@ -78,7 +81,7 @@ class SequenceManager { int chunk_size, bool enable_prefix_caching, int rank, - IAllocator* allocator, + core::Allocator allocator, GetFreeMemSize get_free_size); SequenceManager(const SequenceManager&) = delete; diff --git a/src/turbomind/models/llama/context.h b/src/turbomind/models/llama/context.h index 062db42247..33b7be29ac 100644 --- a/src/turbomind/models/llama/context.h +++ b/src/turbomind/models/llama/context.h @@ -10,9 +10,8 @@ #include #include "src/turbomind/comm/device_comm.h" +#include "src/turbomind/core/core.h" #include "src/turbomind/models/llama/LlamaLinear.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" namespace turbomind { @@ -26,76 +25,21 @@ struct Communicators { }; // Execution context for the model -template struct Context { - cudaStream_t stream; - std::unique_ptr> allocator; - cublasHandle_t cublas_handle; - cublasLtHandle_t cublasLt_handle; - std::unique_ptr cublas_algo_map; - std::unique_ptr cublas_wrapper_mutex; - std::unique_ptr cublas_wrapper; - std::unique_ptr> linear; - Communicators comm; - cudaDeviceProp cuda_device_prop; - - Context(int device_id) + core::Stream core_stream; + core::Allocator allocator; + cudaStream_t stream; + std::unique_ptr linear; + cudaDeviceProp device_prop; + Communicators comm; // initialize later + + Context(int device_id): + core_stream{core::Stream::create()}, + allocator{core::Allocator(core_stream, false)}, + stream{core_stream.handle()}, + linear{std::make_unique(stream)} { - check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - allocator = std::make_unique>(device_id, false); - allocator->setStream(stream); - - cublasCreate(&cublas_handle); - cublasLtCreate(&cublasLt_handle); - cublasSetStream(cublas_handle, stream); - - if (0) { - cublasSetWorkspace(cublas_handle, nullptr, 0); - cublasSetMathMode(cublas_handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); - } - - cublas_algo_map = std::make_unique("gemm_config.in"); - cublas_wrapper_mutex = std::make_unique(); - cublas_wrapper = 
std::make_unique( - cublas_handle, cublasLt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get()); - linear = std::make_unique>(cublas_wrapper.get(), stream); - - check_cuda_error(cudaGetDeviceProperties(&cuda_device_prop, device_id)); - - if (std::is_same::value) { - cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); - } -#ifdef ENABLE_FP32 - else if (std::is_same::value) { - cublas_wrapper->setFP32GemmConfig(); - } -#endif -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - cublas_wrapper->setBF16GemmConfig(); - } -#endif - } - - ~Context() - { - linear.reset(); - cublas_wrapper.reset(); - cublas_algo_map.reset(); - - cublasDestroy(cublas_handle); - cublas_handle = {}; - - cublasLtDestroy(cublasLt_handle); - cublasLt_handle = {}; - - allocator.reset(); - - // `comm` destroyed by infer threads collectively - - cudaStreamDestroy(stream); - stream = {}; + check_cuda_error(cudaGetDeviceProperties(&device_prop, device_id)); } }; diff --git a/src/turbomind/models/llama/llama_decoder_kernels.cu b/src/turbomind/models/llama/llama_decoder_kernels.cu deleted file mode 100644 index f0ed63ca72..0000000000 --- a/src/turbomind/models/llama/llama_decoder_kernels.cu +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/llama_decoder_kernels.h" -#include "src/turbomind/utils/cuda_type_utils.cuh" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include - -namespace cg = cooperative_groups; - -namespace turbomind { - -template -struct res_norm_ops_t { -}; - -template -struct res_norm_t { - res_norm_ops_t f; - __device__ uint4 addvec(const uint4& a, const uint4& b, const uint4& bias, float& accum) const - { - uint4 c; - c.x = f.cast(f.add(f.cast(a.x), f.cast(b.x), f.cast(bias.x), accum)); - c.y = f.cast(f.add(f.cast(a.y), f.cast(b.y), f.cast(bias.y), accum)); - c.z = f.cast(f.add(f.cast(a.z), f.cast(b.z), f.cast(bias.z), accum)); - c.w = f.cast(f.add(f.cast(a.w), f.cast(b.w), f.cast(bias.w), accum)); - return c; - } - __device__ uint4 normvec(const uint4& u, const uint4& s, float factor) const - { - uint4 v; - v.x = f.cast(f.norm(f.cast(u.x), f.cast(s.x), factor)); - v.y = f.cast(f.norm(f.cast(u.y), f.cast(s.y), factor)); - v.z = f.cast(f.norm(f.cast(u.z), f.cast(s.z), factor)); - v.w = f.cast(f.norm(f.cast(u.w), f.cast(s.w), factor)); - return v; - } -}; - -template<> -struct res_norm_ops_t { - __device__ float2 cast(const uint& x) const - { - return __half22float2(reinterpret_cast(x)); - } - __device__ uint cast(const float2& x) const - { - auto y = __float22half2_rn(x); - return reinterpret_cast(y); - } - __device__ float2 add(const float2& a, const float2& b, const float2& bias, float& accum) const - { - float2 c{a.x + b.x + bias.x, a.y + b.y + bias.y}; - accum += c.x * c.x + c.y * c.y; - return c; - } - __device__ float2 norm(const float2& a, const float2& s, float factor) const - { - return {a.x * s.x * factor, a.y * s.y * factor}; - } -}; - -template<> -struct res_norm_ops_t { - __device__ float cast(const uint& x) const - { - return reinterpret_cast(x); - } - __device__ uint cast(const float& x) const - { - return reinterpret_cast(x); - } - __device__ float add(const float& a, const float& b, const float& bias, float& accum) const - { - float c = a + b + bias; - accum += c * c; - return c; - } - __device__ float norm(const float& a, const float& s, float factor) const - { - return a * s * factor; - } -}; - -#ifdef 
ENABLE_BF16 -template<> -struct res_norm_ops_t<__nv_bfloat16> { - __device__ float2 cast(const uint& x) const - { - return cuda_cast(reinterpret_cast(x)); - } - __device__ uint cast(const float2& x) const - { - auto y = cuda_cast<__nv_bfloat162, float2>(x); - return reinterpret_cast(y); - } - __device__ float2 add(const float2& a, const float2& b, const float2& bias, float& accum) const - { - float2 c{a.x + b.x + bias.x, a.y + b.y + bias.y}; - accum += c.x * c.x + c.y * c.y; - return c; - } - __device__ float2 norm(const float2& a, const float2& s, float factor) const - { - return {a.x * s.x * factor, a.y * s.y * factor}; - } -}; - -#endif - -template -__device__ T blockReduceSum(const cg::thread_block& block, T value) -{ - __shared__ float partial[32]; - - auto tile = cg::tiled_partition<32>(block); - value = cg::reduce(tile, value, cg::plus{}); - - if (tile.thread_rank() == 0) { - partial[tile.meta_group_rank()] = value; - } - - block.sync(); - - value = tile.thread_rank() < tile.meta_group_size() ? partial[tile.thread_rank()] : T{}; - return cg::reduce(tile, value, cg::plus{}); -} - -// r' = r + x -// x' = norm(r') * scales -template -__global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data, - T* __restrict__ x_data, - const T* __restrict__ bias, - const T* __restrict__ scale, - float eps, - int batch_size, - int n_dims) -{ - auto block = cg::this_thread_block(); - auto grid = cg::this_grid(); - - constexpr int PACK_DIM = sizeof(uint4) / sizeof(T); - - const auto batch_idx = block.group_index().x; - uint4* __restrict__ r_ptr = reinterpret_cast(r_data + batch_idx * n_dims); - uint4* __restrict__ x_ptr = reinterpret_cast(x_data + batch_idx * n_dims); - const uint4* __restrict__ b_ptr = reinterpret_cast(bias); - - res_norm_t ops; - - float thread_sum{}; - for (auto i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.size()) { - auto r = r_ptr[i]; - auto x = x_ptr[i]; - uint4 b = b_ptr ? b_ptr[i] : uint4{}; - r = ops.addvec(r, x, b, thread_sum); - r_ptr[i] = r; - } - - auto total_sum = blockReduceSum(block, thread_sum); - - float s_inv_mean = rsqrt(total_sum / n_dims + eps); - - const uint4* __restrict__ s_ptr = reinterpret_cast(scale); - for (uint i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.size()) { - auto r = r_ptr[i]; - auto s = s_ptr[i]; - auto o = ops.normvec(r, s, s_inv_mean); - x_ptr[i] = o; - } -} - -template -void invokeFusedAddBiasResidualRMSNorm( - T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream) -{ - constexpr int PACK_DIM = sizeof(uint4) / sizeof(T); - FT_CHECK(n_dims % PACK_DIM == 0); - const int n_pack = n_dims / PACK_DIM; - const int n_iter = ((n_pack + 1023) / 1024); // iterations when block size == 1024 - int n_threads = (n_pack + n_iter - 1) / n_iter; // adjust block size to avoid tail effect - n_threads = (n_threads + 31) / 32 * 32; // round up to the nearest multiple of warp size - - fusedAddBiasResidualNorm<<>>( - residual, in_out, bias, scale, eps, batch_size, n_dims); -} - -template -__global__ void maskOutput(T* output, const int* mask, int dim) -{ - int batch_idx = blockIdx.x; - output += dim * batch_idx; - int masked = mask[batch_idx]; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { - output[i] = (masked) ? 
output[i] : T(); - } -} - -template -void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream) -{ - maskOutput<<>>(output, mask, dim); -} - -#ifdef ENABLE_FP32 -template void -invokeFusedAddBiasResidualRMSNorm(float*, float*, const float*, const float*, float, int, int, cudaStream_t); -template void invokeMask(float* output, const int* mask, int batch_size, int dim, cudaStream_t stream); -#endif -template void invokeFusedAddBiasResidualRMSNorm(half*, half*, const half*, const half*, float, int, int, cudaStream_t); -template void invokeMask(half* output, const int* mask, int batch_size, int dim, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeFusedAddBiasResidualRMSNorm( - __nv_bfloat16*, __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); -template void invokeMask(__nv_bfloat16* output, const int* mask, int batch_size, int dim, cudaStream_t stream); -#endif -} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_decoder_kernels.h b/src/turbomind/models/llama/llama_decoder_kernels.h deleted file mode 100644 index 9d4dc51fe7..0000000000 --- a/src/turbomind/models/llama/llama_decoder_kernels.h +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include - -namespace turbomind { - -template -void invokeFusedAddBiasResidualRMSNorm( - T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream); - -template -void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc deleted file mode 100644 index f9a0191e4b..0000000000 --- a/src/turbomind/models/llama/llama_gemm.cc +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Copied from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/gpt_gemm.cc - -#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h" -#include "src/turbomind/utils/memory_utils.h" - -namespace ft = turbomind; - -int main(int argc, char* argv[]) -{ - if (argc < 9 || argc > 11) { - TM_LOG_ERROR("./bin/llama_gemm batch_size \\ \n" - " beam_width \\ \n" - " max_input_len \\ \n" - " head_number \\ \n" - " size_per_head \\ \n" - " inter_size \\ \n" - " vocab_size \\ \n" - " data_type \\ \n" - " tensor_para_size \\\n" - " is_append (append new config into exist gemm_config.ini or not)"); - TM_LOG_ERROR("e.g. 
./bin/llama_gemm 8 4 32 96 128 49152 51200 1 8 1"); - return 0; - } - - const int batch_size = atoi(argv[1]); - const int beam_width = atoi(argv[2]); - const int max_input_len = atoi(argv[3]); - const int head_num = atoi(argv[4]); - const int size_per_head = atoi(argv[5]); - const int inter_size = atoi(argv[6]); - const int vocab_size = atoi(argv[7]); - const ft::CublasDataType data_type = static_cast(atoi(argv[8])); // 0 FP32, 1 FP16, 2 BF 16 - const int tensor_para_size = argc < 10 ? 1 : atoi(argv[9]); - const bool is_append = argc < 11 ? false : (bool)(atoi(argv[10])); - - TM_LOG_INFO("Arguments:"); - TM_LOG_INFO(" batch_size: %d", batch_size); - TM_LOG_INFO(" beam_width: %d", beam_width); - TM_LOG_INFO(" max_input_len: %d", max_input_len); - TM_LOG_INFO(" head_num: %d", head_num); - TM_LOG_INFO(" size_per_head: %d", size_per_head); - TM_LOG_INFO(" inter_size: %d", inter_size); - TM_LOG_INFO(" vocab_size: %d", vocab_size); - TM_LOG_INFO(" data_type: %d", data_type); - TM_LOG_INFO(" tensor_para_size: %d", tensor_para_size); - TM_LOG_INFO(" is_append: %d", (int)is_append); - std::cout << std::endl; - - void* gemm_test_buf; - size_t buf_size_in_byte = ft::calGptGemmTestBufSizeInByte(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - data_type); - size_t total, free; - ft::check_cuda_error(cudaMemGetInfo(&free, &total)); - if (free < buf_size_in_byte + 10 * 1024 * 1024) { - printf("[ERROR] There is no enough device memory for gemm test!\n" - " %ld Bytes is needed, but only %ld Bytes is free.\n", - buf_size_in_byte, - free); - gemm_test_buf = NULL; - return -1; - } - else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, nullptr, false); - } - - if (0) {} -#ifdef ENABLE_FP32 - else if (data_type == ft::FLOAT_DATATYPE) { - ft::generate_gpt_gemm_config(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - is_append); - } -#endif - else if (data_type == ft::HALF_DATATYPE) { - ft::generate_gpt_gemm_config(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - is_append); - } -#ifdef ENABLE_BF16 - else if (data_type == ft::BFLOAT16_DATATYPE) { - ft::generate_gpt_gemm_config<__nv_bfloat16>(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - is_append); - } -#endif -#ifdef ENABLE_FP8 - else if (data_type == ft::FP8_DATATYPE) { - ft::generate_gpt_gemm_config<__nv_fp8_e4m3>(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - false); - } -#endif - else { - printf("[ERROR] data type only supports fp32(0), fp16(1), bf16(2), fp8(4). \n"); - return -1; - } - - ft::check_cuda_error(cudaFree(gemm_test_buf)); - return 0; -} diff --git a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index 879a39d409..4a49460439 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -1,242 +1,20 @@ // Copyright (c) OpenMMLab. All rights reserved. 
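Note: the fusedAddBiasResidualNorm kernel deleted above and the rootMeanSquareNorm kernels removed in the hunk below implement the same RMS-norm semantics (r' = r + x + bias, then x' = r' / rms(r') * scale); they are presumably superseded by the kernels under src/turbomind/kernels/norm/ that the new code includes elsewhere in this diff. A reference-only scalar CUDA sketch of that computation (not part of this change; the kernel name and launch shape are hypothetical):

// One block per token row of length n; r is the residual, x the hidden state, b an optional bias.
template<typename T>
__global__ void fused_add_rmsnorm_ref(T* r, T* x, const T* b, const T* scale, float eps, int n)
{
    r += (size_t)blockIdx.x * n;
    x += (size_t)blockIdx.x * n;

    __shared__ float ssum;
    if (threadIdx.x == 0) ssum = 0.f;
    __syncthreads();

    float local = 0.f;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        float v = (float)r[i] + (float)x[i] + (b ? (float)b[i] : 0.f);  // r' = r + x + bias
        r[i] = (T)v;
        local += v * v;
    }
    atomicAdd(&ssum, local);  // naive block reduction; the deleted kernel uses cooperative groups instead
    __syncthreads();

    const float inv_rms = rsqrtf(ssum / n + eps);
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        x[i] = (T)((float)r[i] * inv_rms * (float)scale[i]);            // x' = norm(r') * scale
    }
}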
-#include "src/turbomind/kernels/core/array_ops.h" -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/llama_kernels.h" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_type_utils.cuh" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/dispatch.h" -#include "src/turbomind/utils/logger.h" #include #include -#include +#include #include #include -namespace turbomind { - -// fp16, bf16 -// n is divided by 2 for this impl -template -__global__ void rootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n) -{ - using T2 = typename TypeConverter::Type; - __shared__ float s_inv_mean; - float mean = 0.f; - - T2* out_ptr = (T2*)out; - const T2* input_ptr = (const T2*)input; - const T2* scale_ptr = (const T2*)scale; - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float2 tmp2 = cuda_cast(input_ptr[blockIdx.x * n + idx]); - mean += tmp2.x * tmp2.x; - mean += tmp2.y * tmp2.y; - } - - mean = blockReduceSum(mean); - if (threadIdx.x == 0) { - s_inv_mean = rsqrt(.5f * mean / (float)n + eps); - } - __syncthreads(); - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float2 tmp2 = cuda_cast(input_ptr[blockIdx.x * n + idx]); - float2 sca2 = cuda_cast(scale_ptr[idx]); - tmp2.x = tmp2.x * s_inv_mean * sca2.x; - tmp2.y = tmp2.y * s_inv_mean * sca2.y; - out_ptr[blockIdx.x * n + idx] = cuda_cast(tmp2); - } -} - -template<> -__global__ void rootMeanSquareNorm(float* out, const float* input, const float* scale, float eps, int m, int n) -{ - __shared__ float s_inv_mean; - float mean = 0.f; - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float tmp = input[blockIdx.x * n + idx]; - mean += tmp * tmp; - } - - mean = blockReduceSum(mean); - if (threadIdx.x == 0) { - s_inv_mean = rsqrt(mean / static_cast(n) + eps); - } - __syncthreads(); - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float tmp = input[blockIdx.x * n + idx]; - out[blockIdx.x * n + idx] = tmp * s_inv_mean * scale[idx]; - } -} - -template -void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream) -{ - if (sizeof(T) == 2) { - FT_CHECK(n % 2 == 0); - n /= 2; - } - dim3 grid(m); - dim3 block(std::min(n, 1024)); - rootMeanSquareNorm<<>>(out, input, scale, eps, m, n); -} - -template void invokeRootMeanSquareNorm(float*, const float*, const float*, float, int, int, cudaStream_t); -template void invokeRootMeanSquareNorm(half*, const half*, const half*, float, int, int, cudaStream_t); -#ifdef ENABLE_BF16 -template void -invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); -#endif - -// #ifdef ENABLE_BF16 - -// template void invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); - -// #endif - -template -__device__ T saturate_cast(T0 x) -{ - return x; -} - -template<> -__device__ half saturate_cast(float x) -{ - return (x > 64512.f || x < -64512.f) ? (x > 0.f ? 
64512.f : -64512.f) : x; -} - -template -__global__ void addResidual(T* out, const T* in, size_t n) -{ - auto idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; - if (idx < n) { - out[idx] = static_cast(static_cast(out[idx]) + static_cast(in[idx])); - } -} - -template -void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream) -{ - auto total = static_cast(m) * n; - dim3 block(std::min((unsigned long)total, 1024UL)); - dim3 grid((total + block.x - 1) / block.x); - - addResidual<<>>(out, in, total); -} - -template void invokeAddResidual(float*, const float*, int, int, cudaStream_t); -template void invokeAddResidual(half*, const half*, int, int, cudaStream_t); - -// ids [seq_len, batch_size] -// input_ids [batch_size, max_input_len] -__global__ void -fixInputIds(int* ids, const int* input_ids, const int* input_lengths, int batch_size, int seq_len, int max_input_len) -{ - int seq_id = threadIdx.x; - int batch_id = blockIdx.x; - for (; seq_id < input_lengths[batch_id]; seq_id += blockDim.x) { - ids[seq_id * batch_size + batch_id] = input_ids[batch_id * max_input_len + seq_id]; - } -} - -void invokeFixInputIds(int* ids, - const int* input_ids, - const int* input_lengths, - int batch_size, - int seq_len, - int max_input_len, - cudaStream_t st) -{ - dim3 block(std::min(1024, max_input_len)); - dim3 grid(batch_size); - fixInputIds<<>>(ids, input_ids, input_lengths, batch_size, seq_len, max_input_len); -} - -template -__global__ void sliceCausalMask(T* mask, int seq_len, int key_len, int step) -{ - mask += (size_t)blockIdx.x * seq_len * key_len; - for (int i = threadIdx.x; i < seq_len * key_len; i += blockDim.x) { - int row = i / key_len; - int col = i % key_len; - if (col <= row + step) { - mask[i] = static_cast(1.f); - } - else { - mask[i] = static_cast(0.f); - } - } -} - -// [step: step+Q, :] of the K*K causal mask -template -void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream) -{ - FT_CHECK(step == key_len - seq_len); - sliceCausalMask<<>>(mask, seq_len, key_len, step); -} - -template void invokeSliceCausalMask(half*, int, int, int, int, cudaStream_t); -template void invokeSliceCausalMask(float*, int, int, int, int, cudaStream_t); - -// mask [bsz, max_q_len, max_k_len] - -template -__global__ void createCausalMasks(T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len) -{ - const auto q_len = q_lens ? q_lens[blockIdx.x] : max_q_len; - const auto k_len = k_lens ? 
k_lens[blockIdx.x] : max_k_len; - mask += blockIdx.x * max_q_len * max_k_len; - for (int i = threadIdx.x; i < max_q_len * max_k_len; i += blockDim.x) { - const int q = i / max_k_len; // [0, max_q_len) - const int k = i % max_k_len; // [0, max_k_len) - bool is_valid = q < q_len && k < k_len && k <= q + (k_len - q_len); - mask[i] = static_cast(is_valid); - } -} - -template -void invokeCreateCausalMasks( - T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream) -{ - createCausalMasks<<>>(mask, q_lens, k_lens, max_q_len, max_k_len); -} - -template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t); -template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 -template<> -__global__ void createCausalMasks<__nv_bfloat16>( - __nv_bfloat16* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len) -{ - const auto q_len = q_lens[blockIdx.x]; - const auto k_len = k_lens[blockIdx.x]; - mask += blockIdx.x * max_q_len * max_k_len; - for (int i = threadIdx.x; i < max_q_len * max_k_len; i += blockDim.x) { - const int q = i / max_k_len; // [0, max_q_len) - const int k = i % max_k_len; // [0, max_k_len) - bool is_valid = q < q_len && k < k_len && k <= q + (k_len - q_len); - mask[i] = static_cast<__nv_bfloat16>(float(is_valid)); - } -} -template void invokeCreateCausalMasks(__nv_bfloat16* mask, const int*, const int*, int, int, int, cudaStream_t); -#endif - -namespace { +#include -template -__global__ void KernelWrapper(Params params) -{ - Kernel{}(params); -}; +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/macro.h" +#include "src/turbomind/models/llama/llama_kernels.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/dispatch.h" -} // namespace +namespace turbomind { __global__ void gatherOutput(int* output_ids, const int* ids, @@ -477,19 +255,12 @@ __global__ void getFeatureOfLastToken(T* output, const T* input, const int* cu_s } } -template void invokeGetFeatureOfLastToken( - T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream) + uint16_t* output, const uint16_t* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream) { getFeatureOfLastToken<<>>(output, input, cu_seqlens, dims); } -template void invokeGetFeatureOfLastToken(half*, const half*, const int*, int, int, cudaStream_t); -template void invokeGetFeatureOfLastToken(float*, const float*, const int*, int, int, cudaStream_t); -#ifdef ENABLE_BF16 -template void invokeGetFeatureOfLastToken(__nv_bfloat16*, const __nv_bfloat16*, const int*, int, int, cudaStream_t); -#endif // ENABLE_BF16 - template struct BatchedCopyParam { Array src_ptr; @@ -560,4 +331,29 @@ void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cud }); } +template +__global__ void maskOutput(T* output, const int* mask, int dim) +{ + int batch_idx = blockIdx.x; + output += dim * batch_idx; + int masked = mask[batch_idx]; + for (int i = threadIdx.x; i < dim; i += blockDim.x) { + output[i] = (masked) ? 
output[i] : T(); + } +} + +template +void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream) +{ + maskOutput<<>>(output, mask, dim); +} + +#ifdef ENABLE_FP32 +template void invokeMask(float* output, const int* mask, int batch_size, int dim, cudaStream_t stream); +#endif +template void invokeMask(half* output, const int* mask, int batch_size, int dim, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeMask(__nv_bfloat16* output, const int* mask, int batch_size, int dim, cudaStream_t stream); +#endif + } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index aaade1a513..82dbeb13e8 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -2,72 +2,11 @@ #pragma once -#include "src/turbomind/kernels/gpt_kernels.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include +#include +#include namespace turbomind { -template -void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream); - -template -void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream); - -void invokeFixInputIds(int* ids, - const int* input_ids, - const int* input_lengths, - int batch_size, - int seq_len, - int max_input_len, - cudaStream_t st); - -template -void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream); - -template -void invokeCreateCausalMasks( - T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream); - -template -void invokeExtendKVCache(void** k_dst_ptrs, - void** v_dst_ptrs, - const T* k_src, - const T* v_src, - const int* cu_block_counts, - const int* query_length, - const int* context_length, - int batch_size, - int block_length, - size_t dst_layer_offset, - int max_q_len, - int head_dim, - int head_num, - int quant, - const float* kv_scale, - cudaStream_t stream); - -template -void invokeTransposeKVCache(T* key_cache_trans, - T* val_cache_trans, - const T** key_cache, - const T** val_cache, - size_t layer_offset, - int batch_size, - const int* key_length, - int max_kv_len, - int max_seq_len, - int size_per_head, - int head_num, - int head_n_rep, - cudaStream_t stream, - int quant_policy, - const float* kv_scale); - void invokeGatherOutput(int* output_ids, const int* ids, const int* context_length, @@ -115,56 +54,10 @@ void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cud void invokePadLastTokenIds( int* token_ids, const int* context_length, int max_context_len, int batch_size, cudaStream_t stream); -template void invokeGetFeatureOfLastToken( - T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream); - -void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st); + uint16_t* output, const uint16_t* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream); template -inline void dump(const T* x, int size, cudaStream_t st, const char* msg, bool full = false) -{ - std::vector h_x(size); - cudaMemcpyAsync(h_x.data(), x, sizeof(T) * size, cudaMemcpyDefault, st); - cudaStreamSynchronize(st); - fprintf(stderr, "\n%s:\n", msg); - std::vector h_y(h_x.begin(), h_x.end()); - float asum = 0.f; - for (const auto& x : h_y) { - asum += std::fabs(x); - } - if (full) { - for (int i = 0; i < 
size; ++i) { - printf("%d %.8f\n", i, h_y[i]); - } - } - else { - for (int i = 0; i < 8; ++i) { - fprintf(stderr, "%.8f\n", h_y[i]); - } - for (int i = size - 8; i < size; ++i) { - fprintf(stderr, "%.8f\n", h_y[i]); - } - } - fprintf(stderr, "\nasum = %f\n", asum); - // getchar(); -} - -template -struct TempBuffer { - TempBuffer(size_t size) - { - cudaMalloc(&data, size); - } - T* data; -}; - -inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_t st) -{ - int h_seq_len = -1; - cudaMemcpyAsync(&h_seq_len, d_seq_len, sizeof(int), cudaMemcpyDefault, st); - cudaStreamSynchronize(st); - TM_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len); -} +void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index a5dd8bcb49..4b88e10fb8 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -7,34 +7,34 @@ #include #include +#include "src/turbomind/core/data_type.h" #include "src/turbomind/models/llama/llama_rope.h" -#include "src/turbomind/models/llama/weight_type.h" namespace turbomind { struct MLAParam { - size_t q_lora_rank; - size_t kv_lora_rank; - size_t qk_rope_dim; - size_t v_head_dim; + int q_lora_rank; + int kv_lora_rank; + int qk_rope_dim; + int v_head_dim; }; struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t vocab_size; - size_t embedding_size; - float norm_eps; - int quant_policy; - bool attn_bias; - WeightType weight_type; - int group_size; - MLAParam mla; - bool qk_norm; - int tune_layer_num; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + size_t embedding_size; + float norm_eps; + int quant_policy; + bool attn_bias; + DataType weight_type; + int group_size; + MLAParam mla; + bool qk_norm; + int tune_layer_num; std::vector inter_size; }; @@ -81,7 +81,7 @@ struct EngineParam { bool enable_prefix_caching; // chunking params - int max_prefill_token_num; + int max_forward_token_num; int max_context_token_num; int num_tokens_per_iter; int max_prefill_iters; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index eaa450ae20..e4220a8e47 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -70,7 +70,8 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) Tacc asum{}; Tacc rsum{}; - Tacc amean{}; + Tacc amean_r{}; + Tacc amean_x{}; for (size_t i = 0; i < size; ++i) { Tacc x = (Tacc)h_b[i]; Tacc r = (Tacc)h_a[i]; @@ -78,10 +79,18 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); asum += abs_diff; rsum += rel_diff; - amean += std::abs(r); + amean_x += std::abs(x); + amean_r += std::abs(r); } - std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; + fprintf(stderr, + "%12s%12f%12f%12f%12f%12f\n", + key.c_str(), + (float)amean_x / (float)size, + (float)amean_r / (float)size, + (float)asum, + (float)asum / (float)size, + (float)rsum / (float)size); check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); 
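For readability: the reworked CmpRead above prints one fixed-width row per tensor with five unlabeled columns, which are (in order) the mean absolute value of each of the two tensors being compared (x and r in the loop), the total absolute difference, the mean absolute difference, and the mean relative difference. A host-side equivalent of the accumulation, for illustration only and not part of the change:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Computes the same five statistics that CmpRead prints, on two host vectors.
void cmp_stats(const char* key, const std::vector<float>& x, const std::vector<float>& r, float eps = 1e-8f)
{
    double amean_x = 0, amean_r = 0, asum = 0, rsum = 0;
    for (size_t i = 0; i < x.size(); ++i) {
        const float abs_diff = std::abs(x[i] - r[i]);
        const float rel_diff = abs_diff / std::max(std::max(std::abs(r[i]), std::abs(x[i])), eps);
        amean_x += std::abs(x[i]);
        amean_r += std::abs(r[i]);
        asum += abs_diff;
        rsum += rel_diff;
    }
    const double n = (double)x.size();
    std::fprintf(stderr, "%12s%12f%12f%12f%12f%12f\n", key, amean_x / n, amean_r / n, asum, asum / n, rsum / n);
}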
check_cuda_error(cudaStreamSynchronize(stream)); @@ -124,19 +133,6 @@ template void Compare(__nv_bfloat16* ptr, size_t size, std::string key, CmpMode template void CheckNan(const float* ptr, size_t size, std::string key, cudaStream_t stream); template void CheckNan(const half* ptr, size_t size, std::string key, cudaStream_t stream); -std::string format(const std::pair& p) -{ - std::stringstream ss; - ss << p.first << " ["; - bool first = true; - for (const auto& x : p.second.shape) { - ss << (first ? "" : ", ") << x; - first = false; - } - ss << "]"; - return ss.str(); -} - size_t curandStateGetSize() { return sizeof(curandState_t); diff --git a/src/turbomind/models/llama/llama_utils.h b/src/turbomind/models/llama/llama_utils.h index e50364bbd1..193bbfb87c 100644 --- a/src/turbomind/models/llama/llama_utils.h +++ b/src/turbomind/models/llama/llama_utils.h @@ -1,7 +1,6 @@ // Copyright (c) OpenMMLab. All rights reserved. #pragma once -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nvtx_utils.h" #include #include @@ -62,8 +61,6 @@ std::string Concat(std::string key, Args&&... args) return key; } -std::string format(const std::pair& p); - size_t curandStateGetSize(); bool isDebug(); diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu index 2f9e786f2a..74478401e2 100644 --- a/src/turbomind/models/llama/mla_utils.cu +++ b/src/turbomind/models/llama/mla_utils.cu @@ -1,5 +1,10 @@ // Copyright (c) OpenMMLab. All rights reserved. + +#include + +#include "src/turbomind/core/check.h" #include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { @@ -78,16 +83,37 @@ void invokeMLACopyQKV(T* qkv, qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); } -template void invokeMLACopyQKV(uint16_t* qkv, - const uint16_t* q, - const uint16_t* kv_a, - const uint16_t* kv_b, - int token_num, - int head_num, - int nope_dim, - int rope_dim, - int kv_lora_rank, - int v_head_dim, - cudaStream_t stream); +void MLACopyQKV(DataType dtype, + void* qkv, + const void* q, + const void* kv_a, + const void* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto t) { + using T = decltype(t); + invokeMLACopyQKV((T*)qkv, + (const T*)q, + (const T*)kv_a, + (const T*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + + TM_CHECK_EQ(byte_size(dtype, 1), 2) << "unsupported data type: " << dtype; + + return invoke(uint16_t{}); +} } // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h index bc06a352f9..255318306f 100644 --- a/src/turbomind/models/llama/mla_utils.h +++ b/src/turbomind/models/llama/mla_utils.h @@ -1,57 +1,23 @@ // Copyright (c) OpenMMLab. All rights reserved. 
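The rewritten mla_utils above replaces the templated dispatch helper with a single type-erased entry point that takes a runtime DataType and internally reinterprets any 2-byte element type (fp16/bf16) as uint16_t. A call-site sketch, mirroring the usage that appears in unified_attention_layer.cc later in this diff (variable names and comments are illustrative, following the dimension arithmetic in forward_mla):

// qkv: [token_num, q_kv_head_num, head_dim] output buffer, as allocated in forward_mla
MLACopyQKV(dtype,            // must be a 2-byte dtype; enforced by TM_CHECK_EQ(byte_size(dtype, 1), 2)
           qkv.raw_data(),
           q.raw_data(),     // output of q_proj / q_b_proj
           kv_a.raw_data(),  // normalized latent KV from kv_a_proj (plus the rope part)
           kv_b.raw_data(),  // output of kv_b_proj
           token_num,
           head_num,
           nope_dim,
           rope_dim,
           kv_lora_rank,
           v_head_dim,
           stream);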
#pragma once -#include #include -#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/core/data_type.h" namespace turbomind { -template -void invokeMLACopyQKV(T* qkv, - const T* q, - const T* kv_a, - const T* kv_b, - int token_num, - int head_num, - int nope_dim, - int rope_dim, - int kv_lora_rank, - int v_head_dim, - cudaStream_t stream); - -template -void dispatchMLACopyQKV(T* qkv, - const T* q, - const T* kv_a, - const T* kv_b, - int token_num, - int head_num, - int nope_dim, - int rope_dim, - int kv_lora_rank, - int v_head_dim, - cudaStream_t stream) -{ - auto invoke = [&](auto x) { - using type = decltype(x); - invokeMLACopyQKV((type*)qkv, - (const type*)q, - (const type*)kv_a, - (const type*)kv_b, - token_num, - head_num, - nope_dim, - rope_dim, - kv_lora_rank, - v_head_dim, - stream); - }; - if constexpr (sizeof(T) == 2) { - return invoke(uint16_t{}); - } - FT_CHECK(0); -} +void MLACopyQKV(DataType dtype, + void* qkv, + const void* q, + const void* kv_a, + const void* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index ab5d42bd7b..5b84da56e7 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -1,131 +1,94 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/models/llama/moe_ffn_layer.h" +#include + #include "src/turbomind/kernels/activation_kernels.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/monotonic.h" -#include "src/turbomind/utils/nvtx_utils.h" -#include "src/turbomind/utils/string_utils.h" -#include -#include namespace turbomind { -template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) +MoeFfnLayer::MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx): + inter_size_(param.inter_size / engine.mlp_tp_size), + hidden_dim_(model.hidden_units), + param_(param), + stream_(ctx.stream), + linear_(*ctx.linear) { - char* base = 0; - - auto allocate = [&](void* base) { - Monotonic alloc{base}; - alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); - alloc(&logits_, tokens * expert_num); - alloc(&masks_, expert_num * padded); - alloc(&f2n_, param_.experts_per_token * tokens); - alloc(&en2f_, param_.experts_per_token * tokens); - alloc(&scales_, param_.experts_per_token * tokens); - alloc(&shared_scales_, tokens); - return (char*)alloc.ptr() - (char*)base; - }; - - const auto workspace_size = allocate(0); - - workspace_ = (char*)allocator_->reMalloc(workspace_, workspace_size); - - allocate(workspace_); -} + TM_CHECK(!param.expert_num.empty()); -template -void MoeFfnLayer::FreeBuffer() -{ - allocator_->free((void**)&workspace_); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); + + if (param_.method == MoeParam::kFused) { + context_ = + std::make_unique(max_expert_num, param.experts_per_token, ctx.device_prop, stream_); + 
} + else { + expert_ffn_ = std::make_unique(model, ctx); + } - allocator_->free((void**)&accum_); - allocator_->free((void**)&offsets_); + h_offsets_ = {max_expert_num + 1, kCPUpinned}; - allocator_->free((void**)&h_offsets_, true); + const int max_token_num = engine.max_forward_token_num; + const int pad_token_num = (max_token_num + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + + masks_ = {max_expert_num * pad_token_num, kDEVICE}; + f2n_ = {param_.experts_per_token * max_token_num, kDEVICE}; + en2f_ = {param_.experts_per_token * max_token_num, kDEVICE}; + scales_ = {param_.experts_per_token * max_token_num, kDEVICE}; + offsets_ = {max_expert_num + 1, kDEVICE}; + accum_ = {max_expert_num * kMoeGateMaxTiles, kDEVICE}; + + shared_scales_ = {max_token_num, kDEVICE}; } -template -void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight) +Tensor_ MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& gate) { - const float alpha = 1.f; - const float beta = 0.f; - cublas_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, - tokens, - weight.input_dims, - &alpha, - weight.kernel, - getCudaDataType(), - weight.output_dims, - input, - getCudaDataType(), - hidden_dim_, - &beta, - logits, - CUDA_R_32F, - weight.output_dims, - CUDA_R_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + auto& weight = gate.weight; + TM_CHECK_EQ(input.shape(1), weight.shape(0)); + Tensor_ logits{{input.shape(0), weight.shape(1)}, kDEVICE}; + linear_.forward(input, gate, LlamaLinear::kGemm, logits); + sync_check_cuda_error(); + return logits; } -template -void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::Forward(ForwardParam& p) { + const int tokens = p.input.shape(0); + const auto& moe = *p.weights; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; const int expert_num = moe.experts.size(); FT_CHECK(expert_num); - const size_t inter_buf_factor = [&] { - if (param_.method == MoeParam::kNaive) { - return 0; // managed by ffn - } - else if (moe.block.is_fused_silu) { - return 1; - } - else { - return 2; - } - }(); - - AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); - - gate(logits_, input, tokens, moe.gate); - sync_check_cuda_error(); - - // if (tensor_para_.rank_ == 0) { - // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); - // } + auto logits = Gate(p.input, moe.gate); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); - check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); + check_cuda_error(cudaMemsetAsync(accum_.data(), 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(masks_.data(), -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); bool softmax = true; if (param_.topk_method == "group_limited_greedy") { invokeMoeSoftmaxMaskTopKGroups( - logits_, tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + logits.data(), tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); sync_check_cuda_error(); softmax = false; } /// TODO: fix illegal memory access even if NaN are present in logits - invokeMoeGate_V2(f2n_, - en2f_, - offsets_, - scales_, - masks_, - accum_, - logits_, + invokeMoeGate_V2(f2n_.data(), + en2f_.data(), + offsets_.data(), + scales_.data(), + masks_.data(), + 
accum_.data(), + logits.data(), tokens, padded, expert_num, @@ -147,143 +110,87 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } - check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); + check_cuda_error(cudaMemcpyAsync( + offsets_.data(), h_offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } + temp_ = Tensor{{param_.experts_per_token * tokens, hidden_dim_}, p.input.dtype(), p.input.device()}; + if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + invokeMoeDispatch(temp_, p.input, f2n_.data(), param_.experts_per_token, stream_); sync_check_cuda_error(); - check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); + check_cuda_error(cudaMemcpyAsync( + h_offsets_.data(), offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); - } + TM_CHECK_EQ(h_offsets_[expert_num], tokens * param_.experts_per_token); for (int i = 0; i < expert_num; ++i) { - - FT_CHECK(moe.experts[i].is_fused_silu == false); - - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - auto io = inout_buf_ + h_offsets_[i] * hidden_dim_; - - TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {count, hidden_dim_}, io}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; - TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {count, hidden_dim_}, io}}}; - - expert_ffn_->forward(&ffn_outputs, &ffn_inputs, &moe.experts[i]); + if (int count = h_offsets_[i + 1] - h_offsets_[i]) { + auto io = temp_.slice({h_offsets_[i], 0}, {count, -1}); + expert_ffn_->forward({io, io, moe.experts.at(i).get(), p.layer_id}); } } } else { - context_->update(expert_num, param_.experts_per_token, offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_.data()); auto& block = moe.block; - linear_->forward_moe(inter_buf_, - {input, (int)hidden_dim_}, - f2n_, - offsets_, - tokens * param_.experts_per_token, - block.fused_gating_intermediate, - block.is_fused_silu ? LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm, - context_.get()); - sync_check_cuda_error(); - auto mode = kCmpRead; + const int inter_dim = block.is_fused_silu ? inter_size_ : inter_size_ * 2; + Tensor inter{{tokens * param_.experts_per_token, inter_dim}, p.input.dtype(), p.input.device()}; - // if (tensor_para_.rank_ == 0) { - // Compare(inter_buf_, // - // tokens * param_.experts_per_token * inter_size_ * 2, - // "inter_buf", - // mode, - // stream_); - // } + linear_.forward_moe(inter, + p.input, + f2n_.data(), + offsets_.data(), + block.fused_gating_intermediate, + block.is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm, + context_.get()); + sync_check_cuda_error(); if (!block.is_fused_silu) { - invokeGenericActivation_v2(inter_buf_, - inter_buf_ + inter_size_, - inter_size_ * 2, - tokens * param_.experts_per_token, - inter_size_, + invokeGenericActivation_v3(inter.slice({0, 0}, {-1, inter_size_}), // + inter.slice({0, inter_size_}, {-1, -1}), stream_); sync_check_cuda_error(); } - linear_->forward_moe(inout_buf_, - {inter_buf_, block.is_fused_silu ? (int)inter_size_ : (int)inter_size_ * 2}, - nullptr, - offsets_, - tokens * param_.experts_per_token, - block.output, - LlamaLinear::kGemm, - context_.get()); + linear_.forward_moe(temp_, + inter.slice({0, 0}, {-1, inter_size_}), + nullptr, + offsets_.data(), + block.output, + LlamaLinear::kGemm, + context_.get()); sync_check_cuda_error(); - auto mode1 = kCmpRead; - - // if (tensor_para_.rank_ == 0) { - // Compare(inter_buf_2_, // - // tokens * param_.experts_per_token * inter_size_, - // "inter_buf_2_", - // mode1, - // stream_); - // Compare(inout_buf_, // - // tokens * param_.experts_per_token * hidden_dim_, - // "inout_buf", - // mode1, - // stream_); - // } - } - - if (moe.shared_gate.kernel) { - gate(shared_scales_, input, tokens, moe.shared_gate); } } -template -void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::Combine(ForwardParam& p) { - invokeMoeReduce(output, - inout_buf_, - scales_, - en2f_, - moe.shared_gate.kernel ? shared_scales_ : nullptr, - tokens, - param_.experts_per_token, - hidden_dim_, - output_scale, - stream_); - sync_check_cuda_error(); -} + auto& moe = *p.weights; -template -void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) -{ - std::vector logits(token_num * expert_num); - check_cuda_error( - cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); - check_cuda_error(cudaStreamSynchronize(stream_)); - - auto ptr = logits.data(); - std::cout << "layer_id: " << layer_id << std::endl; - for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < expert_num; ++e) { - std::cout << *ptr++ << " "; - } - std::cout << std::endl; + Tensor_ shared_scales; + + if (moe.shared_gate.weight) { + shared_scales = Gate(p.input, moe.shared_gate); } -} -#ifdef ENABLE_FP32 -template class MoeFfnLayer; -#endif -template class MoeFfnLayer; -#ifdef ENABLE_BF16 -template class MoeFfnLayer<__nv_bfloat16>; -#endif + invokeMoeCombine(p.output, + temp_, + scales_.data(), + en2f_.data(), + shared_scales.data_or((float*)nullptr), + param_.experts_per_token, + p.scale, + stream_); + sync_check_cuda_error(); + + temp_ = {}; +} } // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 67c13609bb..abad2402cf 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -7,89 +7,54 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaFfnLayer.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/cublasMMWrapper.h" -#include namespace turbomind { -template class MoeFfnLayer { public: - MoeFfnLayer(ModelParam model, const MoeParam& param, size_t tp_size, const Context& ctx): - inter_size_(param.inter_size / tp_size), - hidden_dim_(model.hidden_units), - param_(param), - dtype_(getTensorType()), - stream_(ctx.stream), - cublas_(ctx.cublas_wrapper.get()), - linear_(ctx.linear.get()), - 
allocator_(ctx.allocator.get()) - { - FT_CHECK(!param.expert_num.empty()); - const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); + MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx); - if (param_.method == MoeParam::kFused) { - context_ = std::make_unique( - max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); - } - else { - expert_ffn_ = std::make_unique>(model, ctx); - } + struct ForwardParam { + Tensor input; + Tensor output; + const MoeFfnWeight* weights; + float scale; + int layer_id; + }; - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); + void Forward(ForwardParam& p); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); - } - - void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); - - void FreeBuffer(); - - ~MoeFfnLayer() - { - FreeBuffer(); - } - - void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - - void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); - - void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - - void dump_logits(int token_num, int layer_id, int expert_num); + void Combine(ForwardParam& p); private: - const size_t inter_size_; - const size_t hidden_dim_; - const MoeParam param_; - const DataType dtype_; - cudaStream_t const stream_; - cublasMMWrapper* const cublas_; - LlamaLinear* const linear_; - IAllocator* const allocator_; - - std::unique_ptr> expert_ffn_; - std::unique_ptr context_; + Tensor_ Gate(const Tensor& input, const LlamaDenseWeight& gate); - int* h_offsets_{}; + void dump_logits(int token_num, int layer_id, int expert_num); - char* workspace_{}; + const int inter_size_; + const int hidden_dim_; + const MoeParam param_; - T* inout_buf_{}; // [n * e, hidden_dim] - T* inter_buf_{}; // [n * e, inter_size] + cudaStream_t const stream_; + LlamaLinear& linear_; - float* logits_{}; - int* masks_{}; + std::unique_ptr expert_ffn_; + std::unique_ptr context_; - int* f2n_{}; - int* en2f_{}; - float* scales_{}; + /////////////////////////////////////////////////////// + /// runtime states + Buffer_ h_offsets_; - float* shared_scales_{}; + Buffer_ masks_; + Buffer_ f2n_; + Buffer_ en2f_; + Buffer_ scales_; + Buffer_ shared_scales_; + Buffer_ accum_; + Buffer_ offsets_; - int* accum_{}; - int* offsets_{}; + Tensor temp_; + /////////////////////////////////////////////////////// }; } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 30efbdedf2..692a68997b 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -21,27 +21,49 @@ #include #include +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/tensor.h" #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" #include "src/turbomind/kernels/norm/rms_norm.h" + #include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/llama_kernels.h" + #include "src/turbomind/models/llama/llama_utils.h" #include 
"src/turbomind/models/llama/mla_utils.h" #include "src/turbomind/models/llama/unified_attention_layer.h" -#include "src/turbomind/utils/Tensor.h" + #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { -template -UnifiedAttentionLayer::UnifiedAttentionLayer( - const ModelParam& model, const AttentionParam& attn, const LoraParam& lora, size_t tp_size, const Context& ctx): +UnifiedAttentionLayer::~UnifiedAttentionLayer() +{ + for (auto& s : streams_) { + s = {}; + } + + check_cuda_error(cudaEventDestroy(aux_event_)); + check_cuda_error(cudaEventDestroy(qkv_event_)); + check_cuda_error(cudaStreamDestroy(aux_stream_)); + + aux_event_ = qkv_event_ = {}; + aux_stream_ = {}; +} + +UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, + const AttentionParam& attn, + const EngineParam& engine, + const LoraParam& lora, + int tp_size, + const Context& ctx): head_num_(model.head_num), kv_head_num_(model.kv_head_num), size_per_head_(model.head_dim), @@ -53,11 +75,11 @@ UnifiedAttentionLayer::UnifiedAttentionLayer( lora_param_(lora), context_(ctx), stream_(ctx.stream), - linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), + linear_(*ctx.linear), arch_(getSMVersion()) { - FT_CHECK(head_num_ % kv_head_num_ == 0); + TM_CHECK_EQ(head_num_ % tp_size, 0) << head_num_ << " " << tp_size; + TM_CHECK_EQ(head_num_ % kv_head_num_, 0) << head_num_ << " " << kv_head_num_; check_cuda_error(cudaStreamCreateWithFlags(&aux_stream_, cudaStreamNonBlocking)); check_cuda_error(cudaEventCreateWithFlags(&qkv_event_, cudaEventDisableTiming)); @@ -68,77 +90,59 @@ UnifiedAttentionLayer::UnifiedAttentionLayer( init_rope_kernel_param(param_.rope, rope_param_); - allocateWorkspace(); + partial_M_ = Tensor_({kMaxWorkspaceTokens, local_head_num_}, kDEVICE); + partial_L_ = Tensor_({kMaxWorkspaceTokens, local_head_num_}, kDEVICE); + partial_O_ = Tensor_({kMaxWorkspaceTokens, local_head_num_, size_per_head_}, kDEVICE); + split_cnt_ = Tensor_({kMaxWorkspaceTokens}, kDEVICE); + barriers_ = Tensor_({kMaxWorkspaceTokens, local_head_num_}, kDEVICE); + + Clear(split_cnt_.buffer()); + Clear(barriers_.buffer()); + + const auto max_batch_size = engine.max_batch_size; + + d_cu_x_len_ = {2 * (max_batch_size + 1), kDEVICE}; + h_cu_x_len_ = {2 * (max_batch_size + 1), kCPUpinned}; + event_ = Event::create(); } -template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t max_lora_rank) +void UnifiedAttentionLayer::Initialize(TensorMap& args) { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - if (max_lora_rank) { - lora_buf_ = (T*)allocator_->reMalloc(lora_buf_, sizeof(T) * q_count * max_lora_rank); - } + h_q_len_ = args.at("h_q_len").buffer(); + h_k_len_ = args.at("h_k_len").buffer(); - const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; + const int bsz = h_q_len_.size(); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * q_count * local_q_kv_head_num * size_per_head_, false); + d_cu_q_len_ = d_cu_x_len_.data(); + h_cu_q_len_ = h_cu_x_len_.data(); + d_cu_k_len_ = d_cu_q_len_ + bsz + 1; + h_cu_k_len_ = h_cu_q_len_ + bsz + 1; - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * q_count * local_head_num_ * size_per_head_, false); + h_cu_q_len_[0] = h_cu_k_len_[0] = 0; - // Pad the tmp buffer for linear KV cache by `MAX_CTA_S` to avoid illegal accesses - tmp_kv_buf_ = (T*)allocator_->reMalloc( - 
tmp_kv_buf_, sizeof(T) * local_kv_head_num_ * 2 * (k_count + MAX_CTA_S) * size_per_head_, false); + std::inclusive_scan(h_q_len_.data(), h_q_len_.data() + bsz, h_cu_q_len_ + 1); + std::inclusive_scan(h_k_len_.data(), h_k_len_.data() + bsz, h_cu_k_len_ + 1); - is_allocate_buffer_ = true; -} + Copy(h_cu_x_len_.slice(0, 2 * bsz + 2), d_cu_x_len_.slice(0, 2 * bsz + 2)); -template -void UnifiedAttentionLayer::allocateWorkspace() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - FT_CHECK(!is_allocate_workspace_); - partial_M_ = (float*)allocator_->malloc(sizeof(float) * kMaxWorkspaceTokens * local_head_num_); - partial_L_ = (float*)allocator_->malloc(sizeof(float) * kMaxWorkspaceTokens * local_head_num_); - partial_O_ = (float*)allocator_->malloc(sizeof(float) * kMaxWorkspaceTokens * local_head_num_ * size_per_head_); - split_cnt_ = (int*)allocator_->malloc(sizeof(int) * kMaxWorkspaceTokens); - barriers_ = (int*)allocator_->malloc(sizeof(int) * kMaxWorkspaceTokens * local_head_num_, true, false); - is_allocate_workspace_ = true; -} + event_.Record(core::Context::stream()); -template -void UnifiedAttentionLayer::freeWorkspace() -{ - if (is_allocate_workspace_) { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); + decode_num_ = *args.at("decode_num").data(); + prefil_num_ = *args.at("prefil_num").data(); - allocator_->free((void**)&partial_M_); - allocator_->free((void**)&partial_L_); - allocator_->free((void**)&partial_O_); - allocator_->free((void**)&split_cnt_); - allocator_->free((void**)&barriers_); + finished_ = args.at("finished").buffer(); + rope_base_ = args.at("rope_base").buffer(); - is_allocate_workspace_ = false; - } + cu_block_nums_ = args.at("cu_block_nums").buffer(); + kv_block_ptrs_ = args.at("kv_block_ptrs").buffer(); } -template -void UnifiedAttentionLayer::freeBuffer() +void UnifiedAttentionLayer::Finalize() { - if (is_allocate_buffer_) { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - allocator_->free((void**)&qkv_buf_); - allocator_->free((void**)&qkv_buf_3_); - allocator_->free((void**)&tmp_kv_buf_); - allocator_->free((void**)&lora_buf_); - - is_allocate_buffer_ = false; - } + event_.Sync(); } -template -inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights) +void UnifiedAttentionLayer::Forward(ForwardParam p) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -165,100 +169,66 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa ///////////////////////////////////////////// /// parse inputs - const int token_num = inputs->at("input_query").shape[0]; - const int layer_id = inputs->getVal("layer_id"); - - const int dc_batch_size = inputs->getVal("dc_batch_size"); - const int pf_batch_size = inputs->getVal("pf_batch_size"); - const int batch_size = dc_batch_size + pf_batch_size; - - int* h_q_len = inputs->getPtr("h_q_len"); - int* h_k_len = inputs->getPtr("h_k_len"); - int* cu_q_len = inputs->getPtr("cu_q_len"); - int* cu_k_len = inputs->getPtr("cu_k_len"); - int* h_cu_q_len = inputs->getPtr("h_cu_q_len"); - int* h_cu_k_len = inputs->getPtr("h_cu_k_len"); - - bool* is_finished = inputs->getPtr("finished"); - float* rope_theta = inputs->getPtr("rope_theta"); - - void** block_ptrs = outputs->getPtr("block_ptrs"); - int* cu_block_count = inputs->getPtr("cu_block_counts"); - - T* attention_input = inputs->getPtr("input_query"); - T* attention_out = outputs->getPtr("hidden_features"); + const int token_num = p.input.shape(0); if (token_num == 0) { return; } - ///////////////////////////////////////////// - /// allocate buffers - 
allocateBuffer(token_num, // shared - h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill - batch_size, - std::max(weights->qkv.lora.r, weights->output.lora.r)); + const int layer_id = p.layer_id; + + const auto& weights = *p.weights; // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - // static int count = 0; - - // if (tensor_para_.rank_ == 0) { - // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); - // } - - int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + Tensor qkv; - if (weights->qkv.output_dims) { - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward( - qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_buf_, lora_mask); + if (weights.qkv.output_dim) { + // [token_num, hidden_dim] -> [token_num, local_q_kv_head_num, head_dim] + qkv = linear_.forward(p.input, weights.qkv, LlamaLinear::kGemm); sync_check_cuda_error(); if (model_param_.qk_norm) { - qk_norm(qkv_buf_, token_num, *weights); + qk_norm(qkv, weights); } } else { - forward_mla(attention_input, token_num, *weights); + qkv = forward_mla(p.input, weights); } - // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; + TM_DEBUG_TENSOR(qkv, Concat("qkv", layer_id), 3); - count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); + auto invoke = [&](auto t) -> Tensor { + using T = decltype(t); + return core_attention(qkv, p, weights); + }; - // std::cerr << "token num: " << token_num << "\n"; + Tensor attn = [&]() -> Tensor { TM_DISPATCH_PRIMARY_DTYPES_RET(qkv.dtype(), invoke); }(); - // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { - // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); - // } + TM_DEBUG_TENSOR(attn, Concat("attn", layer_id), 3); - if constexpr (0) { - std::vector tmp(token_num * weights->qkv.output_dims); - cudaMemcpyAsync(tmp.data(), qkv_buf_, sizeof(T) * tmp.size(), cudaMemcpyDefault, stream_); - cudaStreamSynchronize(stream_); - int i = 0; - for (auto& x : tmp) { - std::cout << (float)x << " "; - if (++i == 256) { - break; - } - } - std::cout << "\n"; - i = 0; - for (auto it = tmp.rbegin(); it != tmp.rend(); ++it) { - std::cout << (float)*it << " "; - if (++i == 256) { - break; - } - } - std::cout << "\n"; - } + ////////////////////////////////////////////// + /// output gemm -> + (void)linear_.forward(attn, weights.output, LlamaLinear::kGemm, p.output); + sync_check_cuda_error(); +} + +template +Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, const WeightType& weights) +{ + const auto device = qkv.device(); + const auto dtype = qkv.dtype(); + + const int batch_size = decode_num_ + prefil_num_; + const int q_count = qkv.shape(0); + const int k_count = h_cu_k_len_[batch_size] - h_cu_k_len_[decode_num_]; + const int layer_id = p.layer_id; + + const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - // FT_CHECK(0); + Tensor attn{{q_count, (int)local_head_num_ * (int)size_per_head_}, dtype, device}; + Tensor tmp_kv{{2, (int)local_kv_head_num_, k_count + MAX_CTA_S, (int)size_per_head_}, dtype, device}; auto stream_ptr = streams_.data(); @@ -266,39 +236,40 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa 
AttentionParams params{}; // Batch offset for `out` and `q` are computed inside the kernel - params.out = qkv_buf_3_; + params.out = (T*)attn.raw_data(); - params.q = (T*)qkv_buf_; + params.q = (T*)qkv.raw_data(); params.k = params.q + local_head_num_ * size_per_head_; params.v = params.k + local_kv_head_num_ * size_per_head_; params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_; - if (weights->qkv.bias) { - params.q_bias = weights->qkv.bias; + if (weights.qkv.bias) { + params.q_bias = (T*)weights.qkv.bias.data_or(nullptr); params.k_bias = params.q_bias + local_head_num_ * size_per_head_; params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_; } - params.token_num = h_cu_q_len[offset + batch_size] - h_cu_q_len[offset]; + params.token_num = h_cu_q_len_[offset + batch_size] - h_cu_q_len_[offset]; params.batch_size = batch_size; - params.max_q_len = *std::max_element(h_q_len + offset, h_q_len + offset + batch_size); - params.max_k_len = *std::max_element(h_k_len + offset, h_k_len + offset + batch_size); + /// TODO: maximum on buffer slice + params.max_q_len = *std::max_element(h_q_len_.data() + offset, h_q_len_.data() + offset + batch_size); + params.max_k_len = *std::max_element(h_k_len_.data() + offset, h_k_len_.data() + offset + batch_size); // Decoding use only - params.block_iter_params = BlockIteratorParams{(char**)block_ptrs, // - (int*)cu_block_count + offset, + params.block_iter_params = BlockIteratorParams{(char**)kv_block_ptrs_.data(), // + cu_block_nums_.data() + offset, layer_id, (int)param_.cache_block_seq_len}; // Prefilling use only - const int sum_k_len = h_cu_k_len[offset + pf_batch_size] - h_cu_k_len[offset]; - params.linear_iter_params = LinearIteratorParams{tmp_kv_buf_, // + const int sum_k_len = h_cu_k_len_[offset + prefil_num_] - h_cu_k_len_[offset]; + params.linear_iter_params = LinearIteratorParams{tmp_kv.raw_data(), // int(2 * sum_k_len * size_per_head_), int(sum_k_len * size_per_head_)}; - params.finished = is_finished + offset; - params.cu_q_len = cu_q_len + offset; - params.cu_k_len = cu_k_len + offset; + params.finished = finished_.data() + offset; + params.cu_q_len = d_cu_q_len_ + offset; + params.cu_k_len = d_cu_k_len_ + offset; params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; @@ -315,7 +286,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // rotary embedding if (rope_param_.type == RopeType::kDynamic) { - rope_param_.base = rope_theta + offset; + rope_param_.base = const_cast(rope_base_.data()) + offset; } params.rope_param = rope_param_; @@ -324,12 +295,11 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.max_position_embeddings = param_.max_position_embeddings; // Decoding use only for now - FT_CHECK(barriers_); - params.split_cnt = split_cnt_; - params.partial_L = partial_L_; - params.partial_M = partial_M_; - params.partial_O = partial_O_; - params.locks = barriers_; + params.split_cnt = split_cnt_.data(); + params.partial_L = partial_L_.data(); + params.partial_M = partial_M_.data(); + params.partial_O = partial_O_.data(); + params.locks = barriers_.data(); params.max_split_k = std::min(std::max(1, kMaxWorkspaceTokens / params.token_num), max_kv_splits); params.arch = arch_; @@ -342,18 +312,18 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa cudaStream_t pf_stream = stream_; cudaStream_t dc_stream = stream_; - if (pf_batch_size && dc_batch_size) { + if (decode_num_ && prefil_num_) { 
pf_stream = aux_stream_; check_cuda_error(cudaEventRecord(qkv_event_, stream_)); check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_)); } - if (pf_batch_size && !isTuning()) { - const int offset = dc_batch_size; - const int sum_k_len = h_cu_k_len[offset + pf_batch_size] - h_cu_k_len[offset]; + if (prefil_num_ && !isTuning()) { + const int offset = decode_num_; + const int sum_k_len = h_cu_k_len_[offset + prefil_num_] - h_cu_k_len_[offset]; // We are executing prefill & decoding kernels concurrently, but only have 1 workspace // disable split kv for prefill for now - auto params = CreateParams(offset, pf_batch_size, 1, pf_stream); + auto params = CreateParams(offset, prefil_num_, 1, pf_stream); if constexpr (sizeof(T) == 2) { invokeProcessKV_v2_(params); sync_check_cuda_error(); @@ -367,170 +337,106 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa } } - if (dc_batch_size && !isTuning()) { - auto params = CreateParams(0, dc_batch_size, kMaxKVSplits, dc_stream); + if (decode_num_ && !isTuning()) { + auto params = CreateParams(0, decode_num_, kMaxKVSplits, dc_stream); if constexpr (sizeof(T) == 2) { dispatchDecoding(params); sync_check_cuda_error(); } } - if (pf_batch_size && dc_batch_size) { + if (decode_num_ && prefil_num_) { check_cuda_error(cudaEventRecord(aux_event_, aux_stream_)); check_cuda_error(cudaStreamWaitEvent(stream_, aux_event_)); } - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_3_, num_token * weights->output.input_dims, "qkv_buf_3", kCmpRead, stream_); - - // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); - // } - if (isTuning()) { rng_.set_stream(stream_); - rng_.GenerateUniform(qkv_buf_3_, token_num * weights->output.input_dims, .02f, -.01f); - } - - count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3); - - ////////////////////////////////////////////// - /// output gemm -> - linear_->forward( - attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_buf_, lora_mask); - sync_check_cuda_error(); - - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); - - // if (tensor_para_.rank_ == 0) { - // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); - // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); - // } - - if (is_free_buffer_after_forward_ == true) { - freeBuffer(); + rng_.GenerateUniform(attn.data(), attn.size(), .02f, -.01f); } - sync_check_cuda_error(); - // ++count; + return attn; } -template -void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +Tensor UnifiedAttentionLayer::forward_mla(const Tensor& hidden_state, const WeightType& w) { - const int q_lora_rank = w.q_a_proj.output_dims; - const int kv_lora_rank = w.kv_b_proj.input_dims; - const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; - const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; - const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + const int q_lora_rank = w.q_a_proj.output_dim; + const int kv_lora_rank = w.kv_b_proj.input_dim; + const int qk_rope_dim = w.kv_a_proj.output_dim - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dim, w.q_proj.output_dim) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dim / local_head_num_ - qk_nope_dim; + + 
const auto token_num = hidden_state.shape(0); + const auto dtype = hidden_state.dtype(); - T* q{}; + Tensor q; - if (w.q_proj.kernel) { - deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); - linear_->forward(q, inputs, token_num, w.q_proj); + if (w.q_proj.weight) { + q = linear_.forward(hidden_state, w.q_proj); sync_check_cuda_error(); } else { - T* q_a{}; - deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); - - linear_->forward(q_a, inputs, token_num, w.q_a_proj); + Tensor q_a = linear_.forward(hidden_state, w.q_a_proj); sync_check_cuda_error(); - invokeRMSNorm(q_a, - q_lora_rank, - q_a, - q_lora_rank, - w.q_a_layernorm, - q_lora_rank, - token_num, - model_param_.norm_eps, - stream_); + invokeRMSNorm(q_a, q_a, w.q_a_layernorm, model_param_.norm_eps, stream_); sync_check_cuda_error(); - deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); - linear_->forward(q, q_a, token_num, w.q_b_proj); + q = linear_.forward(q_a, w.q_b_proj); sync_check_cuda_error(); - - deviceFree(q_a, stream_); } - T* kv_a{}; - const int kv_a_dim = w.kv_a_proj.output_dims; - deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); - - linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + Tensor kv_a_k_pe = linear_.forward(hidden_state, w.kv_a_proj); sync_check_cuda_error(); - invokeRMSNorm( - kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + auto kv_a = kv_a_k_pe.slice({0, 0}, {-1, kv_lora_rank}); + invokeRMSNorm(kv_a, kv_a, w.kv_a_layernorm, model_param_.norm_eps, stream_); sync_check_cuda_error(); - T* kv_b{}; - deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + Tensor kv_b = linear_.forward(kv_a, w.kv_b_proj); sync_check_cuda_error(); - linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); - sync_check_cuda_error(); + const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - dispatchMLACopyQKV(qkv_buf_, - q, - kv_a, - kv_b, - token_num, - local_head_num_, - qk_nope_dim, - qk_rope_dim, - kv_lora_rank, - v_head_dim, - stream_); + Tensor qkv{{token_num, local_q_kv_head_num, (int)size_per_head_}, dtype, hidden_state.device()}; + MLACopyQKV(dtype, + qkv.raw_data(), + q.raw_data(), + kv_a.raw_data(), + kv_b.raw_data(), + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); sync_check_cuda_error(); - deviceFree(q, stream_); - deviceFree(kv_a, stream_); - deviceFree(kv_b, stream_); + return qkv; } -template -void UnifiedAttentionLayer::qk_norm(T* qkv, int token_num, const WeightType& weights) +void UnifiedAttentionLayer::qk_norm(Tensor& qkv, const WeightType& weights) { check_cuda_error(cudaEventRecord(qkv_event_, stream_)); check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_)); - FT_CHECK(model_param_.attn_bias == false); - - invokeQkRMSNorm(qkv_buf_, - weights.qkv.output_dims, - weights.q_a_layernorm, - getTensorType(), - size_per_head_, - local_head_num_, - token_num, - model_param_.norm_eps, - stream_); + TM_CHECK(model_param_.attn_bias == false) << "not implemented"; + + const auto token_num = qkv.shape(0); + + auto qkv3 = qkv.view({token_num, -1, (int)size_per_head_}); + + auto q = qkv3.slice({0, 0, 0}, {-1, (int)local_head_num_, -1}); + invokeRMSNormQK(q, weights.q_a_layernorm, model_param_.norm_eps, stream_); sync_check_cuda_error(); - invokeQkRMSNorm(qkv_buf_ + size_per_head_ * local_head_num_, - weights.qkv.output_dims, - weights.kv_a_layernorm, - getTensorType(), - 
size_per_head_, - local_kv_head_num_, - token_num, - model_param_.norm_eps, - aux_stream_); + auto k = qkv3.slice({0, (int)local_head_num_, 0}, {-1, (int)local_kv_head_num_, -1}); + invokeRMSNormQK(k, weights.kv_a_layernorm, model_param_.norm_eps, aux_stream_); sync_check_cuda_error(); check_cuda_error(cudaEventRecord(aux_event_, aux_stream_)); check_cuda_error(cudaStreamWaitEvent(stream_, aux_event_)); } -#ifdef ENABLE_FP32 -template class UnifiedAttentionLayer; -#endif -template class UnifiedAttentionLayer; -#ifdef ENABLE_BF16 -template class UnifiedAttentionLayer<__nv_bfloat16>; -#endif // ENABLE_BF16 - } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index 2dd114e0b4..a498b3b881 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -21,116 +21,79 @@ #pragma once +#include + #include +#include "src/turbomind/core/core.h" #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template class UnifiedAttentionLayer { public: - using WeightType = LlamaAttentionWeight; + using WeightType = LlamaAttentionWeight; static constexpr int kMaxKVSplits = 128; static constexpr int kMaxWorkspaceTokens = 4096; - void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); + struct ForwardParam { + Tensor input; + Tensor output; + const WeightType* weights; + int layer_id; + }; - void allocateWorkspace(); - void freeWorkspace(); + ~UnifiedAttentionLayer(); - ~UnifiedAttentionLayer() - { - freeBuffer(); - freeWorkspace(); + UnifiedAttentionLayer(const ModelParam& model, + const AttentionParam& attn, + const EngineParam& engine, + const LoraParam& lora, + int tp_size, + const Context& context); - for (auto& s : streams_) { - s = {}; - } + void Forward(ForwardParam p); - check_cuda_error(cudaEventDestroy(aux_event_)); - check_cuda_error(cudaEventDestroy(qkv_event_)); - check_cuda_error(cudaStreamDestroy(aux_stream_)); + void Initialize(TensorMap& args); - aux_event_ = qkv_event_ = {}; - aux_stream_ = {}; - } + void Finalize(); - UnifiedAttentionLayer(const ModelParam& model, - const AttentionParam& attn, - const LoraParam& lora, - size_t tp_size, - const Context& context); - - void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); - - void prefill(T* output, - T* tmp_kv_buffer, - const T* qkv, - void** block_ptrs, - const int* cu_q_len, - const int* cu_k_len, - const int* input_length, - const int* context_length, - const int* cu_block_count, - const bool* is_finished, - const float* rope_theta, - int pf_batch_size, - int pf_num_token, - size_t layer_offset, - int pf_max_q_len, - int pf_max_k_len, - int pf_session_len, - const WeightType* weights); - - void decode(T* output, - const T* qkv, - void** block_ptrs, - const int* cu_q_len, - const int* cu_block_count, - const int* input_length, - const int* context_length, - const bool* is_finished, - const float* rope_theta, - size_t layer_offset, - int batch_size, - int dc_sum_seq_len, - int dc_max_seq_len, - int max_split_k, - const WeightType* weights); + const int* d_cu_q_len() + { + return 
d_cu_q_len_; + } private: - void forward_mla(const T* inputs, int token_num, const WeightType& weights); + Tensor forward_mla(const Tensor& hidden_state, const WeightType& weights); + + /// TODO: dropping the `T` here requires deep refactor of attention dispatch + template + Tensor core_attention(Tensor& qkv, const ForwardParam& p, const WeightType& weights); - void qk_norm(T* qkv, int token_num, const WeightType& weights); + void qk_norm(Tensor& qkv, const WeightType& weights); private: - const size_t head_num_; - const size_t kv_head_num_; - const size_t size_per_head_; - const size_t hidden_units_; - const size_t local_head_num_; - const size_t local_kv_head_num_; + const int head_num_; + const int kv_head_num_; + const int size_per_head_; + const int hidden_units_; + const int local_head_num_; + const int local_kv_head_num_; const AttentionParam param_; const ModelParam model_param_; const LoraParam lora_param_; - const Context& context_; - - cudaStream_t const stream_; - LlamaLinear* const linear_; - IAllocator* const allocator_; - const int arch_{}; + const Context& context_; - const bool is_free_buffer_after_forward_{false}; + cudaStream_t const stream_; + LlamaLinear& linear_; + const int arch_{}; cudaStream_t aux_stream_; cudaEvent_t qkv_event_; @@ -142,28 +105,37 @@ class UnifiedAttentionLayer { RopeKernelParam rope_param_{}; - T* qkv_buf_{}; - T* q_buf_2_{}; - T* k_buf_2_{}; - T* v_buf_2_{}; - T* k_cache_buf_{}; - T* v_cache_buf_{}; - T* qk_buf_{}; - float* qk_buf_float_{}; - T* qkv_buf_2_{}; - T* qkv_buf_3_{}; - T* lora_buf_{}; - - float* partial_M_{}; - float* partial_L_{}; - float* partial_O_{}; - int* split_cnt_{}; - int* barriers_{}; // always zero - - T* tmp_kv_buf_{}; - - bool is_allocate_buffer_ = false; - bool is_allocate_workspace_ = false; + /////////////////////////////////////////////////////// + /// runtime states + int decode_num_; + int prefil_num_; + + Tensor_ partial_M_; + Tensor_ partial_L_; + Tensor_ partial_O_; + Tensor_ split_cnt_; + Tensor_ barriers_; // always zero + + Event event_; + + Buffer_ h_q_len_; + Buffer_ h_k_len_; + + Buffer_ d_cu_x_len_; + Buffer_ h_cu_x_len_; + + // references into d/h_cu_x_len_ + int* d_cu_q_len_; + int* d_cu_k_len_; + int* h_cu_q_len_; + int* h_cu_k_len_; + + Buffer_ finished_; + Buffer_ rope_base_; + + Buffer_ cu_block_nums_; + Buffer_ kv_block_ptrs_; + /////////////////////////////////////////////////////// }; } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index d801539483..c875c7852f 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,30 +1,28 @@ -#include -#include #include +#include + +#include #include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/models/llama/unified_decoder.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template -UnifiedDecoder::UnifiedDecoder(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx): 
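The decoder hunks that follow drive the attention layer through the new `ForwardParam` struct and an `Initialize` / per-layer `Forward` / `Finalize` sequence instead of the old `TensorMap` plumbing. The sketch below only condenses that calling convention; the types are stand-ins rather than the project's real `Tensor`/`TensorMap`, and the comments describe the apparent intent, not guaranteed behavior.

    // Condensed mock of the Initialize/Forward/Finalize protocol used in this diff.
    #include <vector>

    struct Tensor {};   // stand-in
    struct Weights {};  // stand-in

    struct AttentionLayer {
        struct ForwardParam {
            Tensor         input;
            Tensor         output;
            const Weights* weights;
            int            layer_id;
        };
        void Initialize() {}            // once per batch: lengths, block tables, rope bases
        void Forward(ForwardParam) {}   // attention for one layer, in place
        void Finalize() {}              // drop per-batch state
    };

    inline void run_decoder(AttentionLayer& attn, const std::vector<Weights>& w, Tensor& hidden)
    {
        attn.Initialize();
        for (int layer = 0; layer < (int)w.size(); ++layer) {
            attn.Forward({hidden, hidden, &w[layer], layer});
        }
        attn.Finalize();
    }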
+UnifiedDecoder::UnifiedDecoder(const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx): layer_num_(model.layer_num), hidden_units_(model.hidden_units), attn_tp_size_(engine.attn_tp_size), @@ -34,91 +32,39 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, attn_tp_group_(ctx.comm.d_tp_group), rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), - allocator_(ctx.allocator.get()), d_comm_(ctx.comm.d_comm), - dtype_(getTensorType()), tune_layer_num_(model.tune_layer_num) { - attn_layer_ = std::make_unique>(model, attn, lora, attn_tp_size_, ctx); + attn_layer_ = std::make_unique(model, attn, engine, lora, attn_tp_size_, ctx); if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { - moe_ffn_layer_ = std::make_unique>(model, moe, mlp_tp_size_, ctx); + moe_ffn_layer_ = std::make_unique(model, moe, engine, ctx); } if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { - ffn_layer_ = std::make_unique>(model, ctx); + ffn_layer_ = std::make_unique(model, ctx); } - - check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); -} - -template -UnifiedDecoder::~UnifiedDecoder() -{ - freeBuffer(); - check_cuda_error(cudaEventDestroy(ev_h_cu_x_)); -} - -template -void UnifiedDecoder::allocateBuffer(size_t batch_size) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - cu_q_len_ = (int*)allocator_->reMalloc(cu_q_len_, 2 * sizeof(int) * (batch_size + 1), false); - h_cu_q_len_ = (int*)allocator_->reMalloc(h_cu_q_len_, 2 * sizeof(int) * (batch_size + 1), false, true); -} - -template -void UnifiedDecoder::freeBuffer() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - allocator_->free((void**)&cu_q_len_); - allocator_->free((void**)&h_cu_q_len_, true); -} - -template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const WeightType* weight) -{ - TensorMap inputs(*_inputs); - inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - inputs.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}); - inputs.insert("cu_q_len", {MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_q_len_}); - inputs.insert("cu_k_len", {MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_k_len_}); - inputs.insert("h_cu_q_len", {MEMORY_CPU, TYPE_INT32, {batch_size + 1}, h_cu_q_len_}); - inputs.insert("h_cu_k_len", {MEMORY_CPU, TYPE_INT32, {batch_size + 1}, h_cu_k_len_}); - - TensorMap outputs(*_outputs); - outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - - attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } -template -void UnifiedDecoder::AllreduceResidualRMSnorm(T* hidden_states, - T* residual, - const T* bias, - const T* weight, - int token_num, - int group0, - int group1, - const int* local_token_nums) +void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& bias, + const Tensor& weight, + int token_num, + int group0, + int group1, + const int* local_token_nums) { + const auto dtype = hidden_states.dtype(); if (0) {} else if (group0 || group1) { - d_comm_->AllreduceResidualBiasRMSnormEx(hidden_states, - residual, - bias, - weight, + d_comm_->AllreduceResidualBiasRMSnormEx(hidden_states.raw_data(), + residual.raw_data(), + bias.data_or((void*)nullptr), + weight.raw_data(), rmsnorm_eps_, hidden_units_, - dtype_, + dtype, group0, group1, 
local_token_nums, @@ -126,19 +72,33 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(T* hidden_states, sync_check_cuda_error(); } else if (d_comm_) { - d_comm_->AllreduceResidualBiasRMSnorm( - hidden_states, residual, bias, weight, rmsnorm_eps_, hidden_units_, token_num, dtype_, 0, stream_); + d_comm_->AllreduceResidualBiasRMSnorm(hidden_states.raw_data(), + residual.raw_data(), + bias.data_or((void*)nullptr), + weight.raw_data(), + rmsnorm_eps_, + hidden_units_, + token_num, + dtype, + 0, + stream_); sync_check_cuda_error(); } else { - invokeBiasResidualRMSNorm( - residual, hidden_states, weight, bias, hidden_units_, token_num, rmsnorm_eps_, stream_); + invokeResidualBiasRMSNorm(hidden_states.raw_data(), + residual.raw_data(), + weight.raw_data(), + bias.data_or((void*)nullptr), + dtype, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); } } -template -void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, const std::vector* weights) +void UnifiedDecoder::Forward(TensorMap& args, const std::vector& weights) { /** * input tensors: @@ -158,70 +118,42 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con * \param block_ptrs [total_block_counts], void* */ - const size_t token_num = inputs->at("decoder_input").shape[0]; - - const int pf_batch_size = inputs->getVal("pf_batch_size"); - const int dc_batch_size = inputs->getVal("dc_batch_size"); - const int batch_size = pf_batch_size + dc_batch_size; - - const int* h_q_len = inputs->getPtr("h_q_len"); - const int* h_k_len = inputs->getPtr("h_k_len"); + const int decode_num = *args.at("decode_num").data(); + const int prefil_num = *args.at("prefil_num").data(); + const int batch_size = prefil_num + decode_num; - T* residual = inputs->getPtr("decoder_input"); - T* hidden_states = outputs->getPtr("decoder_output"); + constexpr auto device = kDEVICE; - T* last_token_hidden_units = outputs->getPtr("last_token_hidden_units"); + Tensor_ local_token_nums = args.at("local_token_nums"); - { // compute cumulative lengths + Tensor local_residual = args.at("decoder_input"); + Tensor global_hidden_states = args.at("decoder_output"); - h_cu_k_len_ = h_cu_q_len_ + batch_size + 1; - cu_k_len_ = cu_q_len_ + batch_size + 1; + Tensor local_hidden_states = global_hidden_states; - h_cu_q_len_[0] = h_cu_k_len_[0] = 0; + const auto global_token_num = global_hidden_states.shape(0); + const auto local_token_num = local_residual.shape(0); - for (int i = 1; i <= batch_size; ++i) { - h_cu_q_len_[i] = h_cu_q_len_[i - 1] + h_q_len[i - 1]; - h_cu_k_len_[i] = h_cu_k_len_[i - 1] + h_k_len[i - 1]; - } - - check_cuda_error( - cudaMemcpyAsync(cu_q_len_, h_cu_q_len_, 2 * sizeof(int) * (batch_size + 1), cudaMemcpyDefault, stream_)); - - check_cuda_error(cudaEventRecord(ev_h_cu_x_, stream_)); + if (attn_dp_size_ > 1) { // Offset hidden states buffer for mixed DP + TM_CHECK_EQ(local_token_nums.size(), attn_dp_size_); + std::vector cumul_token_nums(attn_dp_size_ + 1, 0); + std::inclusive_scan( + local_token_nums.data(), local_token_nums.data() + attn_dp_size_, cumul_token_nums.begin() + 1); + const int offset = cumul_token_nums[attn_dp_rank_]; + local_hidden_states = global_hidden_states.slice({offset, 0}, {local_token_num, -1}); } - const int pf_offset = dc_batch_size; + attn_layer_->Initialize(args); - /// Offset hidden states buffer for mixed DP - T* global_hidden_states = hidden_states; - size_t global_token_num = token_num; - const int* local_token_nums = inputs->getPtr("local_token_nums", nullptr); - if 
(attn_dp_size_ > 1) { - FT_CHECK(local_token_nums); - std::vector cumul_token_nums(attn_dp_size_ + 1, 0); - std::inclusive_scan(local_token_nums, local_token_nums + attn_dp_size_, cumul_token_nums.begin() + 1); - hidden_states = hidden_states + (size_t)cumul_token_nums[attn_dp_rank_] * hidden_units_; - global_token_num = cumul_token_nums.back(); - // TM_LOG_ERROR("rank %d, global_token_num %d, offset %d", - // attn_dp_rank_, - // global_token_num, - // cumul_token_nums[attn_dp_rank_]); - } + TM_DEBUG_TENSOR(local_residual, "res", 1); + TM_DEBUG_TENSOR(weights.at(0)->self_attn_norm, "norm_weight", 2); - ///////////////////////////////////////////// - /// RMSNorm - invokeRMSNorm(hidden_states, - residual, - weights->at(0)->self_attn_norm_weights, - hidden_units_, - token_num, - rmsnorm_eps_, - stream_); + invokeRMSNorm(local_hidden_states, local_residual, weights.at(0)->self_attn_norm, rmsnorm_eps_, stream_); sync_check_cuda_error(); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("norm0", 0), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("norm0", 0), 2); - for (size_t layer = 0; layer < layer_num_; ++layer) { + for (int layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous if (isTuning() && layer >= tune_layer_num_) { @@ -230,113 +162,99 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ///////////////////////////////////////////// /// self-attention - forwardSelfAttn(hidden_states, // - outputs, - inputs, - token_num, - batch_size, - layer, - weights->at(layer)); + attn_layer_->Forward({local_hidden_states, // + local_hidden_states, + weights.at(layer)->self_attn_weights.get(), + layer}); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("attn_block", layer), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("attn_block", layer), 2); AllreduceResidualRMSnorm(global_hidden_states, - residual, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - token_num, + local_residual, + weights.at(layer)->self_attn_weights->output.bias, + weights.at(layer)->ffn_norm, + local_token_num, attn_tp_group_, 0, - local_token_nums); + local_token_nums.data()); - count_and_fix(residual, token_num * hidden_units_, Concat("residual0", layer), 2); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("norm1", layer), 2); + TM_DEBUG_TENSOR(local_residual, Concat("residual0", layer), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("norm1", layer), 2); //////////////////////////////////////////// /// feed-forward network - const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); - if (is_moe) { - // Writes to internal buffer - moe_ffn_layer_->forward( - nullptr, global_hidden_states, global_token_num, layer, weights->at(layer)->moe_weights); + std::optional moe_fwd_param; + + if (weights.at(layer)->moe_weights) { + moe_fwd_param = MoeFfnLayer::ForwardParam{global_hidden_states, + global_hidden_states, + weights.at(layer)->moe_weights.get(), + ffn_layer_ ? 
1.f : 0.f, + layer}; + moe_ffn_layer_->Forward(*moe_fwd_param); } - if (weights->at(layer)->ffn_weights.output.kernel) { - int layer_id = layer; // int is needed - TensorMap ffn_inputs{ - {"ffn_input", {MEMORY_GPU, dtype_, {global_token_num, hidden_units_}, global_hidden_states}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, - }; - TensorMap ffn_outputs{ - {"ffn_output", {MEMORY_GPU, dtype_, {global_token_num, hidden_units_}, global_hidden_states}}, - }; - if (inputs->isExist("lora_mask")) { - ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); - } - ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); + if (weights.at(layer)->ffn_weights) { + ffn_layer_->forward( + {global_hidden_states, global_hidden_states, weights.at(layer)->ffn_weights.get(), (int)layer}); } - if (is_moe) { - moe_ffn_layer_->reduce( - global_hidden_states, global_token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); + if (moe_fwd_param) { + moe_ffn_layer_->Combine(*moe_fwd_param); } - count_and_fix(global_hidden_states, global_token_num * hidden_units_, Concat("ffn_block", layer), 2); + TM_DEBUG_TENSOR(global_hidden_states, Concat("ffn_block", layer), 2); - const bool is_last_layer = layer == layer_num_ - 1; + const bool last = layer == layer_num_ - 1; - auto scale_weight = !is_last_layer ? weights->at(layer + 1)->self_attn_norm_weights : - inputs->at("output_norm_weight").getPtr(); + auto& scale_weight = !last ? weights.at(layer + 1)->self_attn_norm : args.at("output_norm_weight"); AllreduceResidualRMSnorm(global_hidden_states, - residual, - weights->at(layer)->ffn_weights.output.bias, + local_residual, + {}, scale_weight, - token_num, + local_token_num, 0, attn_tp_group_, - local_token_nums); + local_token_nums.data()); sync_check_cuda_error(); - count_and_fix(residual, token_num * hidden_units_, Concat("residual1", layer), 2); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("norm0", layer + 1), 2); + TM_DEBUG_TENSOR(local_residual, Concat("residual1", layer), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("norm0", layer + 1), 2); } - if (dc_batch_size) { + /// TODO + using T = uint16_t; + + auto last_token_hidden_units = (T*)args.at("last_token_hidden_units").raw_data(); + + if (decode_num) { check_cuda_error(cudaMemcpyAsync(last_token_hidden_units, - hidden_states, - sizeof(T) * dc_batch_size * hidden_units_, + (T*)local_hidden_states.raw_data(), + sizeof(T) * decode_num * hidden_units_, cudaMemcpyDefault, stream_)); - count_and_fix(last_token_hidden_units, dc_batch_size * hidden_units_, "dc_out", 2); + // TM_DEBUG_RAW(last_token_hidden_units, decode_num * hidden_units_, "dc_out", 2); } - if (pf_batch_size) { - invokeGetFeatureOfLastToken(last_token_hidden_units + pf_offset * hidden_units_, // - hidden_states, - cu_q_len_ + pf_offset, + if (prefil_num) { + invokeGetFeatureOfLastToken(last_token_hidden_units + decode_num * hidden_units_, // + (T*)local_hidden_states.raw_data(), + attn_layer_->d_cu_q_len() + decode_num, hidden_units_, - pf_batch_size, + prefil_num, stream_); sync_check_cuda_error(); - count_and_fix(last_token_hidden_units + pf_offset * hidden_units_, pf_batch_size * hidden_units_, "pf_out", 2); + // TM_DEBUG_RAW(last_token_hidden_units + decode_num * hidden_units_, prefil_num * hidden_units_, "pf_out", 2); } - if (is_free_buffer_after_forward_) { - freeBuffer(); - } + Buffer out( + (void*)last_token_hidden_units, (decode_num + prefil_num) * hidden_units_, local_residual.dtype(), kDEVICE); - // Wait for 
`h_cu_q/k_len_` to be consumed - check_cuda_error(cudaEventSynchronize(ev_h_cu_x_)); -} + TM_DEBUG_TENSOR(out, "out", 1); -#ifdef ENABLE_FP32 -template class UnifiedDecoder; -#endif -template class UnifiedDecoder; -#ifdef ENABLE_BF16 -template class UnifiedDecoder<__nv_bfloat16>; -#endif // ENABLE_BF16 + attn_layer_->Finalize(); +} } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 3dcb3e04a0..dd03293744 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -7,16 +7,24 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" -#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template class UnifiedDecoder { -private: - void freeBuffer(); +public: + using WeightType = LlamaDecoderLayerWeight; + UnifiedDecoder(const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx); + + void Forward(TensorMap& args, const std::vector& weights); + +private: const size_t layer_num_; const size_t hidden_units_; @@ -29,58 +37,23 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; - IAllocator* const allocator_; comm::DeviceCommImpl* const d_comm_; - const DataType dtype_; - const int tune_layer_num_; - bool is_free_buffer_after_forward_{}; - - int* cu_q_len_{}; - int* cu_k_len_{}; - - int* h_cu_q_len_{}; - int* h_cu_k_len_{}; - - std::unique_ptr> attn_layer_; - std::unique_ptr> ffn_layer_; - std::unique_ptr> moe_ffn_layer_; - - cudaEvent_t ev_h_cu_x_{}; - - using WeightType = LlamaDecoderLayerWeight; - - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const WeightType* weight); - - void AllreduceResidualRMSnorm(T* hidden_states, - T* residual, - const T* bias, - const T* weight, - int token_num, - int t0, - int t1, - const int* local_token_nums); - -public: - UnifiedDecoder(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx); - - void allocateBuffer(size_t max_batch_size); + const int tune_layer_num_; - ~UnifiedDecoder(); + std::unique_ptr attn_layer_; + std::unique_ptr ffn_layer_; + std::unique_ptr moe_ffn_layer_; - void forward(TensorMap* outputs, const TensorMap* inputs, const std::vector* weights); + void AllreduceResidualRMSnorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& bias, + const Tensor& weight, + int token_num, + int t0, + int t1, + const int* local_token_nums); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h deleted file mode 100644 index bc2f49a08e..0000000000 --- a/src/turbomind/models/llama/weight_type.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace turbomind { - -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return 
WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -} // namespace turbomind diff --git a/src/turbomind/python/CMakeLists.txt b/src/turbomind/python/CMakeLists.txt index bc7b063e95..8e8c07de2a 100644 --- a/src/turbomind/python/CMakeLists.txt +++ b/src/turbomind/python/CMakeLists.txt @@ -13,8 +13,7 @@ if(NOT pybind11_FOUND) endif() pybind11_add_module(${PROJECT_NAME} bind.cpp) -target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend - LlamaTritonBackend) +target_link_libraries(${PROJECT_NAME} PRIVATE LlamaTritonBackend) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14) set(_INSTALL_CUDA_RPATH diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 1dea57375b..a25daab2f7 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -12,44 +12,42 @@ #include #include +#include "src/turbomind/core/tensor.h" #include "src/turbomind/engine/model_request.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; -using ft::ManagedTensor; -using ft::Tensor; +using ft::core::Tensor; // prepare to bind container -using TensorMap = std::unordered_map; +using TensorMap = ft::core::TensorMap; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(const ft::Tensor& tensor) +DLDevice getDLDevice(const Tensor& tensor) { int device_id = 0; - if (tensor.where == ft::MEMORY_GPU) { + if (tensor.device().type == ft::kDEVICE) { cudaPointerAttributes ptr_attr{}; - cudaPointerGetAttributes(&ptr_attr, tensor.data); + cudaPointerGetAttributes(&ptr_attr, tensor.raw_data()); device_id = ptr_attr.device; } DLDevice device{kDLCPU, device_id}; - switch (tensor.where) { - case ft::MEMORY_CPU: + switch (tensor.device().type) { + case ft::kCPU: device.device_type = DLDeviceType::kDLCPU; break; - case ft::MEMORY_CPU_PINNED: + case ft::kCPUpinned: device.device_type = DLDeviceType::kDLCUDAHost; break; - case ft::MEMORY_GPU: + case ft::kDEVICE: device.device_type = DLDeviceType::kDLCUDA; break; default: @@ -59,179 +57,170 @@ DLDevice getDLDevice(const ft::Tensor& tensor) return device; } -DLManagedTensor* TritonTensorToDLManagedTensor(ManagedTensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(Tensor& tensor) { - DLDevice device = getDLDevice(*tensor); - + DLDevice device = getDLDevice(tensor); DLDataType data_type{0, 0, 1}; - switch (tensor->type) { - case ft::TYPE_BOOL: + using ft::data_type_v; + switch (tensor.dtype()) { + case data_type_v: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; break; - case ft::TYPE_UINT8: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 8; break; - case ft::TYPE_UINT16: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 16; break; - case ft::TYPE_UINT32: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 
32; break; - case ft::TYPE_UINT64: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 64; break; - case ft::TYPE_INT8: - case ft::TYPE_BYTES: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 8; break; - case ft::TYPE_INT16: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 16; break; - case ft::TYPE_INT32: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 32; break; - case ft::TYPE_INT64: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 64; break; - case ft::TYPE_FP16: + case data_type_v: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 16; break; - case ft::TYPE_FP32: + case data_type_v: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 32; break; - case ft::TYPE_FP64: + case data_type_v: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 64; break; - case ft::TYPE_BF16: + case data_type_v: data_type.code = DLDataTypeCode::kDLBfloat; data_type.bits = 16; break; default: break; } - ManagedTensor* ctx = new ManagedTensor(tensor); - DLTensor dl_tensor{const_cast((*ctx)->data), + + static_assert(sizeof(int64_t) == sizeof(tensor.shape(0))); + + Tensor* ctx = new Tensor(tensor); + DLTensor dl_tensor{const_cast(ctx->raw_data()), device, - (int32_t)((*ctx)->shape.size()), + (int32_t)(ctx->ndim()), data_type, - reinterpret_cast(const_cast((*ctx)->shape.data())), + (int64_t*)ctx->shape().data(), (int64_t*)(nullptr), 0}; return new DLManagedTensor{dl_tensor, ctx, [](DLManagedTensor* dlmt) { // - // auto& x = *(ManagedTensor*)dlmt->manager_ctx; - // std::stringstream ss; - // ss << "("; - // for (const auto& d : x->shape) { - // ss << d << ","; - // } - // ss << ")"; - // std::cerr << "turbomind tensor dtor " << ss.str() << " " << std::endl; - delete (ManagedTensor*)dlmt->manager_ctx; + delete (Tensor*)dlmt->manager_ctx; delete dlmt; }}; } -ft::MemoryType getMemoryType(DLDevice device) +ft::DeviceType getMemoryType(DLDevice device) { switch (device.device_type) { case DLDeviceType::kDLCUDAHost: - return ft::MemoryType::MEMORY_CPU_PINNED; + return ft::DeviceType::kCPUpinned; case DLDeviceType::kDLCUDA: - return ft::MemoryType::MEMORY_GPU; + return ft::DeviceType::kDEVICE; case DLDeviceType::kDLCPU: default: - return ft::MemoryType::MEMORY_CPU; + return ft::DeviceType::kCPU; } } ft::DataType getDataType(DLDataType data_type) { + using ft::data_type_v; switch (data_type.code) { case DLDataTypeCode::kDLUInt: switch (data_type.bits) { case 8: - return ft::TYPE_UINT8; + return data_type_v; case 16: - return ft::TYPE_UINT16; + return data_type_v; case 32: - return ft::TYPE_UINT32; + return data_type_v; case 64: - return ft::TYPE_UINT64; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLInt: switch (data_type.bits) { case 8: - return ft::TYPE_INT8; + return data_type_v; case 16: - return ft::TYPE_INT16; + return data_type_v; case 32: - return ft::TYPE_INT32; + return data_type_v; case 64: - return ft::TYPE_INT64; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLFloat: switch (data_type.bits) { case 16: - return ft::TYPE_FP16; + return data_type_v; case 32: - return ft::TYPE_FP32; + return data_type_v; case 64: - return ft::TYPE_FP64; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLBfloat: switch (data_type.bits) { case 16: - return ft::TYPE_BF16; + 
return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLBool: - return ft::TYPE_BOOL; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); auto dtype = getDataType(dl_tensor.dtype); assert(dl_tensor.ndim > 0); - std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); - auto data = dl_tensor.data; - - auto ret = std::make_shared(); - ret->tensor = Tensor(where, dtype, std::move(shape), data); - ret->data_holder.reset((void*)nullptr, [tensor](void*) { - // std::cerr << "dlpack tensor dtor" << std::endl; - if (tensor->deleter) { - tensor->deleter(tensor); - } - }); - return ret; + std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); + + std::shared_ptr ptr{dl_tensor.data, [tensor](void* p) { + if (tensor->deleter) { + tensor->deleter(tensor); + } + }}; + return std::make_shared(ptr, std::move(shape), dtype, where); } static void safe_memcpy(void* dst, const void* src, size_t size) @@ -352,80 +341,54 @@ PYBIND11_MODULE(_turbomind, m) .def("consume", [](ft::AtomicRequestState& s) { return s.exchange(nullptr); }); // data type - py::enum_(m, "DataType") - .value("TYPE_INVALID", ft::DataType::TYPE_INVALID) - .value("TYPE_BOOL", ft::DataType::TYPE_BOOL) - .value("TYPE_UINT8", ft::DataType::TYPE_UINT8) - .value("TYPE_UINT16", ft::DataType::TYPE_UINT16) - .value("TYPE_UINT32", ft::DataType::TYPE_UINT32) - .value("TYPE_UINT64", ft::DataType::TYPE_UINT64) - .value("TYPE_INT8", ft::DataType::TYPE_INT8) - .value("TYPE_INT16", ft::DataType::TYPE_INT16) - .value("TYPE_INT32", ft::DataType::TYPE_INT32) - .value("TYPE_INT64", ft::DataType::TYPE_INT64) - .value("TYPE_FP16", ft::DataType::TYPE_FP16) - .value("TYPE_FP32", ft::DataType::TYPE_FP32) - .value("TYPE_FP64", ft::DataType::TYPE_FP64) - .value("TYPE_BYTES", ft::DataType::TYPE_BYTES) - .value("TYPE_BF16", ft::DataType::TYPE_BF16); - - // memory type - py::enum_(m, "MemoryType") - .value("MEMORY_CPU", ft::MemoryType::MEMORY_CPU) - .value("MEMORY_CPU_PINNED", ft::MemoryType::MEMORY_CPU_PINNED) - .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); + { + using namespace turbomind; + py::enum_(m, "DataType") + .value("TYPE_INVALID", kNull) + .value("TYPE_BOOL", kBool) + .value("TYPE_UINT8", kUint8) + .value("TYPE_UINT16", kUint16) + .value("TYPE_UINT32", kUint32) + .value("TYPE_UINT64", kUint64) + .value("TYPE_INT8", kInt8) + .value("TYPE_INT16", kInt16) + .value("TYPE_INT32", kInt32) + .value("TYPE_INT64", kInt64) + .value("TYPE_FP16", kFloat16) + .value("TYPE_FP32", kFloat32) + .value("TYPE_FP64", kFloat64) + .value("TYPE_BF16", kBfloat16); + + // memory type + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", ft::DeviceType::kCPU) + .value("MEMORY_CPU_PINNED", ft::DeviceType::kCPUpinned) + .value("MEMORY_GPU", ft::DeviceType::kDEVICE); + } // tensor - py::class_>(m, "Tensor") - .def_property_readonly("where", [](const ManagedTensor& t) { return t->where; }) - .def_property_readonly("type", [](const ManagedTensor& t) { return t->type; }) - .def_property_readonly("shape", [](const ManagedTensor& t) { return t->shape; }) - .def_property_readonly("data", [](const ManagedTensor& t) { return t->data; }) - .def( - "view", - [](const ManagedTensor& self, ft::DataType new_type) { - auto x = self; - x->type = new_type; - return 
std::make_shared(std::move(x)); - }, - "new_type"_a) - .def( - "view", - [](const ManagedTensor& self, std::vector new_shape) { - auto x = self; - x->shape = new_shape; - return std::make_shared(std::move(x)); - }, - "new_shape"_a) + py::class_>(m, "Tensor") + .def_property_readonly("where", [](const Tensor& t) { return t.device().type; }) + .def_property_readonly("type", [](const Tensor& t) { return t.dtype(); }) + .def_property_readonly("shape", [](const Tensor& t) { return t.shape(); }) + .def_property_readonly("data", [](const Tensor& t) { return t.raw_data(); }) .def( "copy_from", - [](ManagedTensor& self, py::object obj) { + [](Tensor& self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); // take ownership of capsule's payload cap.set_name("used_dltensor"); - switch (self->type) { - case ft::TYPE_FP16: - case ft::TYPE_FP32: - case ft::TYPE_INT32: - case ft::TYPE_BF16: { - auto num_element = std::accumulate( - (*src)->shape.begin(), (*src)->shape.end(), 1LL, std::multiplies()); - auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; - ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - safe_memcpy(const_cast(self->data), (*src)->data, num_bytes); - break; - } - default: - ft::FT_CHECK(0); - } + + TM_CHECK_EQ(self.byte_size(), src->byte_size()); + safe_memcpy(self.raw_data(), src->raw_data(), self.byte_size()); }, "tensor"_a) .def( "__dlpack__", - [](ManagedTensor& self, long stream) { + [](Tensor& self, long stream) { DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = @@ -441,8 +404,8 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](const ManagedTensor& self) { - auto device = getDLDevice(*self); + .def("__dlpack_device__", [](const Tensor& self) { + auto device = getDLDevice(self); return std::tuple(int(device.device_type), device.device_id); }); m.def( @@ -458,9 +421,9 @@ PYBIND11_MODULE(_turbomind, m) }, "dl_managed_tensor"_a); - // transformer model instance - using ft::ModelRequest; py::bind_map>(m, "TensorMap"); + + using ft::ModelRequest; py::class_(m, "ModelRequest") .def( "forward", @@ -507,87 +470,80 @@ PYBIND11_MODULE(_turbomind, m) "session_id"_a); // transformer model - using ft::AbstractTransformerModel; using ft::LlamaTritonModel; - py::class_>(m, "AbstractTransformerModel") + py::class_>(m, "AbstractTransformerModel") .def_static( "create_llama_model", [](std::string model_dir, std::string config, - std::string data_type) -> std::shared_ptr { + std::string weight_type) -> std::shared_ptr { auto gil_factory = [] { // // erase the type return std::static_pointer_cast(std::make_shared()); }; - auto no_gil_deleter = [](AbstractTransformerModel* ptr) { + auto no_gil_deleter = [](LlamaTritonModel* ptr) { pybind11::gil_scoped_release release; delete ptr; }; - if (data_type == "half" || data_type == "fp16" || data_type == "float16" || data_type == "int4") { - std::shared_ptr> model( - new LlamaTritonModel(model_dir, config, gil_factory), no_gil_deleter); - return model; + turbomind::DataType data_type{}; + + if (weight_type == "half" || weight_type == "fp16" || weight_type == "float16" + || weight_type == "int4") { + data_type = turbomind::kFloat16; } - else if (data_type == "bf16" || data_type == "bfloat16") { + else if (weight_type == "bf16" || weight_type 
== "bfloat16") { #ifdef ENABLE_BF16 - std::shared_ptr> model( - new LlamaTritonModel<__nv_bfloat16>(model_dir, config, gil_factory), no_gil_deleter); - return model; + data_type = turbomind::kBfloat16; #else throw std::runtime_error("Error: turbomind has not been built with bf16 support."); #endif } else { #ifdef ENABLE_FP32 - auto model = std::make_shared>(model_dir, config, gil_factory); - return model; + data_type = turbomind::kF32; #else throw std::runtime_error("Error: turbomind has not been built with fp32 support."); #endif } + + std::shared_ptr model(new LlamaTritonModel(data_type, model_dir, config, gil_factory), + no_gil_deleter); + return model; }, "model_dir"_a, - "config"_a = "", - "data_type"_a = "half") + "config"_a = "", + "weight_type"_a = "half") .def( "create_model_instance", - [](AbstractTransformerModel* model, int deviceId) { return model->createModelInstance(deviceId); }, + [](LlamaTritonModel* model, int deviceId) { return model->createModelInstance(deviceId); }, py::call_guard(), "device_id"_a) .def("create_shared_weights", - &AbstractTransformerModel::createSharedWeights, + &LlamaTritonModel::createSharedWeights, py::call_guard(), "device_id"_a, "rank"_a) .def( "get_params", - [](AbstractTransformerModel* model, int deviceId, int rank) { - auto output = model->getParams(deviceId, rank); - TensorMap ret; - for (const auto& [k, v] : output) { - // export reference to weight data only (no ownership) - ret.emplace(k, ManagedTensor{v}); - } - return ret; - }, + [](LlamaTritonModel* model, int deviceId, int rank) { return model->getParams(deviceId, rank); }, py::call_guard(), "device_id"_a, "rank"_a) .def( "process_weight", - [](AbstractTransformerModel* model, int deviceId, int rank) { model->processWeights(deviceId, rank); }, + [](LlamaTritonModel* model, int deviceId, int rank) { model->processWeights(deviceId, rank); }, py::call_guard(), "device_id"_a, "rank"_a) .def( "create_engine", - [](AbstractTransformerModel* model, int deviceId, int rank) { model->createEngine(deviceId, rank); }, + [](LlamaTritonModel* model, int deviceId, int rank) { model->createEngine(deviceId, rank); }, py::call_guard(), "device_id"_a, "rank"_a) - .def("__str__", &AbstractTransformerModel::toString) - .def("__repr__", &AbstractTransformerModel::toString) - .def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize) - .def("get_pipeline_para_size", &AbstractTransformerModel::getPipelineParaSize); + .def("__str__", &LlamaTritonModel::toString) + .def("__repr__", &LlamaTritonModel::toString) + .def("get_tensor_para_size", &LlamaTritonModel::getTensorParaSize) + .def("get_pipeline_para_size", &LlamaTritonModel::getPipelineParaSize); } diff --git a/src/turbomind/triton_backend/CMakeLists.txt b/src/turbomind/triton_backend/CMakeLists.txt index e152073204..08c8e4e884 100644 --- a/src/turbomind/triton_backend/CMakeLists.txt +++ b/src/turbomind/triton_backend/CMakeLists.txt @@ -1,35 +1,2 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
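The `create_llama_model` binding above wraps the model in a `shared_ptr` whose deleter drops the GIL before running the C++ destructor, so engine teardown does not block other Python threads. The same pattern in isolation, as a sketch rather than the project's code (only `pybind11::gil_scoped_release` is a real API here):

    // Sketch: shared_ptr with a GIL-releasing deleter, mirroring `no_gil_deleter` above.
    #include <memory>
    #include <pybind11/pybind11.h>

    template <class T>
    std::shared_ptr<T> make_shared_no_gil(T* raw)
    {
        return std::shared_ptr<T>(raw, [](T* p) {
            pybind11::gil_scoped_release release;  // destruction may wait on CUDA/NCCL work
            delete p;
        });
    }

Construction still happens with the GIL held; only destruction runs GIL-free, matching the lambda in the binding above.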
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -cmake_minimum_required (VERSION 3.18) - -project(tritonturbomindbackend LANGUAGES C CXX) - -add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp) -set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) -install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR}) add_subdirectory(llama) diff --git a/src/turbomind/triton_backend/llama/CMakeLists.txt b/src/turbomind/triton_backend/llama/CMakeLists.txt index 7f745d64b9..756f5ac67d 100644 --- a/src/turbomind/triton_backend/llama/CMakeLists.txt +++ b/src/turbomind/triton_backend/llama/CMakeLists.txt @@ -26,11 +26,10 @@ add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files}) set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(LlamaTritonBackend PUBLIC - TransformerTritonBackend Llama device_comm host_comm - tensor + core memory_utils CUDA::cublasLt yaml-cpp::yaml-cpp) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index af48fbba3f..a1b33a8316 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -20,19 +20,23 @@ #include #include +#include #include + #include #include "src/turbomind/comm/device_comm.h" #include "src/turbomind/comm/host_comm.h" +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/tensor.h" #include "src/turbomind/engine/gateway.h" #include "src/turbomind/engine/model_request.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" @@ -141,8 +145,7 @@ std::map> getLoraPattern(std::string patte return res; } -template -void LlamaTritonModel::handleMissingParams() +void LlamaTritonModel::handleMissingParams() { if (model_param_.kv_head_num == 0) { model_param_.kv_head_num = model_param_.head_num; @@ -173,12 +176,6 @@ void LlamaTritonModel::handleMissingParams() TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)engine_param_.session_len); } - if (!engine_param_.max_prefill_token_num) { - engine_param_.max_prefill_token_num = 8192; - TM_LOG_WARNING("[LlamaTritonModel] `max_prefill_token_num` is not set, 
default to %d.", - (int)engine_param_.max_prefill_token_num); - } - if (!engine_param_.max_context_token_num) { engine_param_.max_context_token_num = engine_param_.session_len; TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.", @@ -219,8 +216,7 @@ void LlamaTritonModel::handleMissingParams() } } -template -LlamaTritonModel::~LlamaTritonModel() +LlamaTritonModel::~LlamaTritonModel() { FT_CHECK(weights_.size() == engines_.size()); @@ -235,11 +231,17 @@ LlamaTritonModel::~LlamaTritonModel() } } -template -LlamaTritonModel::LlamaTritonModel(std::string model_dir, - std::string config, - std::function()> ffi_ctx_factory): - model_param_{}, attn_param_{}, moe_param_{}, lora_param_{}, engine_param_{}, weights_(getDeviceCount()) +LlamaTritonModel::LlamaTritonModel(DataType dtype, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory): + dtype_{dtype}, + model_param_{}, + attn_param_{}, + moe_param_{}, + lora_param_{}, + engine_param_{}, + weights_(getDeviceCount()) { FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); @@ -298,8 +300,10 @@ LlamaTritonModel::LlamaTritonModel(std::string mod // rotary embedding parameters parse_rope_param(attention_reader["rope_param"], attn_param_.rope); - engine_param_.max_batch_size = engine_reader["max_batch_size"].as(0); - engine_param_.max_prefill_token_num = engine_reader["max_prefill_token_num"].as(0); + engine_param_.max_batch_size = engine_reader["max_batch_size"].as(0); + auto max_forward_token_num = engine_reader["max_prefill_token_num"].as(0); + max_forward_token_num += engine_param_.max_batch_size; + engine_param_.max_context_token_num = engine_reader["max_context_token_num"].as(0); engine_param_.session_len = model_reader["session_len"].as(0); @@ -319,6 +323,11 @@ LlamaTritonModel::LlamaTritonModel(std::string mod engine_param_.mlp_tp_size = engine_reader["mlp_tp_size"].as(); engine_param_.mlp_tp_rank = 0; + { + auto tp = engine_param_.attn_tp_size; + engine_param_.max_forward_token_num = ((size_t)max_forward_token_num + tp - 1) / tp * tp; + } + comm_size_ = engine_param_.attn_dp_size * engine_param_.attn_tp_size; FT_CHECK(engine_param_.mlp_tp_size == comm_size_); @@ -355,19 +364,19 @@ LlamaTritonModel::LlamaTritonModel(std::string mod const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - model_param_.weight_type = WeightType::kFP16; + model_param_.weight_type = kFloat16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - model_param_.weight_type = WeightType::kBF16; + model_param_.weight_type = kBfloat16; } else if (weight_type_str == "fp32") { - model_param_.weight_type = WeightType::kFP32; + model_param_.weight_type = kFloat32; } else if (weight_type_str == "int8") { - model_param_.weight_type = WeightType::kINT8; + model_param_.weight_type = kUint8; } else if (weight_type_str == "int4") { - model_param_.weight_type = WeightType::kINT4; + model_param_.weight_type = kUint4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -402,51 +411,33 @@ LlamaTritonModel::LlamaTritonModel(std::string mod TM_LOG_INFO("%s", toString().c_str()); } -template -std::unique_ptr LlamaTritonModel::createModelInstance(int device_id) +std::unique_ptr LlamaTritonModel::createModelInstance(int device_id) { check_cuda_error(cudaSetDevice(device_id)); FT_CHECK(engines_[device_id] != nullptr); - return std::make_unique(gateway_.get(), - 
getTensorType(), - engine_param_.session_len, - model_param_.vocab_size, - model_param_.hidden_units); + return std::make_unique( + gateway_.get(), dtype_, engine_param_.session_len, model_param_.vocab_size, model_param_.hidden_units); } -template -void LlamaTritonModel::createSharedWeights(int device_id, int rank) noexcept +void LlamaTritonModel::createSharedWeights(int device_id, int rank) { check_cuda_error(cudaSetDevice(device_id)); - weights_[rank] = std::make_shared>(model_param_, engine_params_.at(rank), lora_param_, moe_param_); + weights_[rank] = + std::make_shared(dtype_, model_param_, engine_params_.at(rank), lora_param_, moe_param_); // model inited with model_dir - if (model_dir_ != "") { - weights_[device_id]->loadModel(model_dir_); - } + // if (model_dir_ != "") { + // weights_[device_id]->loadModel(model_dir_); + // } } -template -std::unordered_map LlamaTritonModel::getParams(int device_id, int rank) noexcept +TensorMap LlamaTritonModel::getParams(int device_id, int rank) { - check_cuda_error(cudaSetDevice(device_id)); - - // shared_weight should be created before getParams - FT_CHECK(weights_[rank] != nullptr); - - TensorMap output = weights_[rank]->getParams(); - - std::unordered_map result; - for (auto [name, tensor] : output) { - result.insert({{name, Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}}}); - } - - return result; + return TM_CHECK_NOTNULL(weights_[rank])->get_parameters(); } -template -void LlamaTritonModel::processWeights(int device_id, int rank) noexcept +void LlamaTritonModel::processWeights(int device_id, int rank) { check_cuda_error(cudaSetDevice(device_id)); FT_CHECK(weights_[device_id] != nullptr); @@ -458,8 +449,7 @@ void LlamaTritonModel::processWeights(int device_id, int rank) noexcept sync_check_cuda_error(); } -template -Communicators LlamaTritonModel::createCommSplits(int rank) +Communicators LlamaTritonModel::createCommSplits(int rank) { Communicators comm{}; @@ -483,12 +473,13 @@ Communicators LlamaTritonModel::createCommSplits(int rank) return comm; } -template -void LlamaTritonModel::createEngine(int device_id, int rank) +void LlamaTritonModel::createEngine(int device_id, int rank) { check_cuda_error(cudaSetDevice(device_id)); - auto ctx = std::make_unique>(device_id); + auto ctx = std::make_unique(device_id); + + core::ContextGuard guard{ctx->core_stream, ctx->allocator, Allocator{kCPUpinned}}; ctx->comm = createCommSplits(rank); @@ -499,25 +490,27 @@ void LlamaTritonModel::createEngine(int device_id, int rank) h_comm->Sync(); - auto model = std::make_unique>(model_param_, // - engine_param, - attn_param_, - moe_param_, - lora_param_, - *ctx, - engine_param_.max_batch_size, - weights_[device_id]); + auto model = std::make_unique(dtype_, + model_param_, // + engine_param, + attn_param_, + moe_param_, + lora_param_, + *ctx, + engine_param_.max_batch_size, + weights_[device_id]); h_comm->Sync(); try { const int dp_rank = engine_param.outer_dp_rank * engine_param.attn_dp_size + engine_param.attn_dp_rank; - engines_[device_id] = std::make_unique>(engine_param_, // - std::move(model), - std::move(ctx), - gateway_, - device_id, - dp_rank); + engines_[device_id] = std::make_unique(dtype_, + engine_param_, // + std::move(model), + std::move(ctx), + gateway_, + device_id, + dp_rank); } catch (const std::exception& e) { TM_LOG_ERROR("[Engine][Init] %s", e.what()); @@ -544,8 +537,7 @@ void LlamaTritonModel::createEngine(int device_id, int rank) engine.Start(); } -template -std::string LlamaTritonModel::toString() +std::string 
LlamaTritonModel::toString() { std::stringstream ss; ss << "Model: " // @@ -556,7 +548,6 @@ std::string LlamaTritonModel::toString() << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size << "\nattn_bias: " << model_param_.attn_bias << "\nqk_norm: " << model_param_.qk_norm << "\nmax_batch_size: " << engine_param_.max_batch_size - << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter << "\nmax_prefill_iters: " << engine_param_.max_prefill_iters << "\nsession_len: " << engine_param_.session_len @@ -574,24 +565,14 @@ std::string LlamaTritonModel::toString() return ss.str(); } -template -int LlamaTritonModel::getTensorParaSize() +int LlamaTritonModel::getTensorParaSize() { return engine_param_.attn_tp_size; } -template -int LlamaTritonModel::getPipelineParaSize() +int LlamaTritonModel::getPipelineParaSize() { return 1; } -#ifdef ENABLE_FP32 -template struct LlamaTritonModel; -#endif -template struct LlamaTritonModel; -#ifdef ENABLE_BF16 -template struct LlamaTritonModel<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 12fc3abffc..f58c982fd8 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -21,38 +21,44 @@ #pragma once #include +#include +#include #include "src/turbomind/comm/device_comm.h" + #include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/model_request.h" + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" - namespace turbomind { -template -class LlamaTritonModel: public AbstractTransformerModel { +class LlamaTritonModel { public: - LlamaTritonModel(std::string model_dir, std::string config, std::function()> ffi_ctx_factory); + LlamaTritonModel(DataType dtype, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory); + + ~LlamaTritonModel(); - ~LlamaTritonModel() override; + std::unique_ptr createModelInstance(int deviceId); - std::unique_ptr createModelInstance(int deviceId) override; + void createSharedWeights(int deviceId, int rank); - void createSharedWeights(int deviceId, int rank) noexcept override; + TensorMap getParams(int deviceId, int rank); - std::unordered_map getParams(int deviceId, int rank) noexcept override; + void processWeights(int deviceId, int rank); - void processWeights(int deviceId, int rank) noexcept override; + void createEngine(int device_id, int rank); - void createEngine(int device_id, int rank) override; + std::string toString(); - std::string toString() override; - int getTensorParaSize() override; - int getPipelineParaSize() override; + int getTensorParaSize(); + int getPipelineParaSize(); private: void handleMissingParams(); @@ -60,6 +66,7 @@ class LlamaTritonModel: public AbstractTransformerModel { Communicators createCommSplits(int rank); private: + DataType dtype_; ModelParam model_param_; AttentionParam attn_param_; MoeParam moe_param_; @@ -76,8 +83,8 @@ class LlamaTritonModel: public AbstractTransformerModel { std::shared_ptr gateway_; // Weights & engine instances for the ranks - 
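With the class templates removed, the element type is now carried at runtime: the constructor hunk above maps the `weight_type` string onto a `DataType` tag that is threaded into the weights and engine. The helper below condenses that mapping for reference; the enum and function are illustrative stand-ins, not the project's `turbomind::DataType`, and the packing comments are interpretation.

    // Condensed from the weight_type parsing above (illustrative helper only).
    #include <stdexcept>
    #include <string>

    enum class WeightDType { kFloat16, kBfloat16, kFloat32, kUint8, kUint4 };

    inline WeightDType parse_weight_type(const std::string& s)
    {
        if (s == "fp16" || s == "float16") return WeightDType::kFloat16;
        if (s == "bf16" || s == "bfloat16") return WeightDType::kBfloat16;
        if (s == "fp32")                    return WeightDType::kFloat32;
        if (s == "int8")                    return WeightDType::kUint8;  // int8 weights kept as uint8 storage
        if (s == "int4")                    return WeightDType::kUint4;  // 4-bit weights in packed storage
        throw std::invalid_argument("unsupported weight type: " + s);
    }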
std::vector>> weights_; - std::vector>> engines_; + std::vector> weights_; + std::vector> engines_; bool is_fp16_; diff --git a/src/turbomind/triton_backend/transformer_triton_backend.cpp b/src/turbomind/triton_backend/transformer_triton_backend.cpp deleted file mode 100644 index 5268ad723c..0000000000 --- a/src/turbomind/triton_backend/transformer_triton_backend.cpp +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp - -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" - -namespace turbomind { - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp deleted file mode 100644 index 6ebcdc9e11..0000000000 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp - -#pragma once - -#include "src/turbomind/comm/device_comm.h" -#include -#include -#include - -#ifdef __linux__ -#include -#endif - -#include "src/turbomind/utils/Tensor.h" - -#include "src/turbomind/engine/model_request.h" - -namespace turbomind { - -using triton_stream_cb_t = std::function>, void*)>; - -struct AbstractTransformerModel; -struct AbstractTransformerModelInstance; - -struct AbstractTransformerModelInstance { - virtual ~AbstractTransformerModelInstance() = default; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - void registerCallback(triton_stream_cb_t cb, void* ctx) - { - stream_cb_ = cb; - stream_ctx_ = ctx; - } - - void unRegisterCallback() - { - stream_cb_ = nullptr; - stream_ctx_ = nullptr; - } - - triton_stream_cb_t stream_cb_ = nullptr; - void* stream_ctx_ = nullptr; -}; - -struct AbstractTransformerModel { - - virtual ~AbstractTransformerModel() = default; - - virtual std::unique_ptr createModelInstance(int deviceId) = 0; - - virtual void createSharedWeights(int deviceId, int rank) = 0; - - virtual std::unordered_map getParams(int deviceId, int rank) = 0; - - virtual void processWeights(int deviceId, int rank) = 0; - - virtual void createEngine(int device_id, int rank) = 0; - - virtual std::string toString() = 0; - - virtual int getTensorParaSize() = 0; - virtual int getPipelineParaSize() = 0; -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/CMakeLists.txt b/src/turbomind/utils/CMakeLists.txt index fe6584543a..f9aa832696 100644 --- a/src/turbomind/utils/CMakeLists.txt +++ b/src/turbomind/utils/CMakeLists.txt @@ -16,31 +16,16 @@ cmake_minimum_required(VERSION 3.8) find_package(CUDAToolkit REQUIRED) -add_subdirectory(gemm_test) - add_library(cuda_utils STATIC cuda_utils.cc) set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cuda_utils PUBLIC CUDA::cudart) +target_link_libraries(cuda_utils PUBLIC CUDA::cudart CUDA::cuda_driver) add_library(logger STATIC logger.cc) set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(logger PUBLIC CUDA::cudart) -add_library(cublasAlgoMap STATIC cublasAlgoMap.cc) -set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cublasAlgoMap PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cuda_utils logger) - -add_library(cublasMMWrapper STATIC cublasMMWrapper.cc) -set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cublasMMWrapper PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cublasAlgoMap cuda_utils logger) -if (SPARSITY_SUPPORT) -target_link_libraries(cublasMMWrapper PUBLIC CUDA::cusparse -lcusparseLt) -endif() - add_library(nvtx_utils STATIC nvtx_utils.cc) set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) @@ -53,38 +38,7 @@ endif() add_library(memory_utils STATIC memory_utils.cu) set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET memory_utils PROPERTY 
CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(memory_utils PUBLIC cuda_utils logger tensor) - -# add_library(mpi_utils STATIC mpi_utils.cc) -# set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON) -# set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -# if (BUILD_MULTI_GPU) -# target_link_libraries(mpi_utils PUBLIC ${MPI_CXX_LIBRARIES} logger) -# endif() - -add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc) -set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cublasINT8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand cublasAlgoMap cublasMMWrapper cuda_utils logger) - -add_library(gemm STATIC gemm.cc) -set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(gemm PUBLIC - CUDA::cublas CUDA::cublasLt CUDA::cudart CUDA::curand - cublasAlgoMap memory_utils cuda_utils logger) -if (SPARSITY_SUPPORT) - target_link_libraries(gemm PUBLIC CUDA::cusparse -lcusparseLt) -endif() - -add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu) -set_property(TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(tensor STATIC Tensor.cc) -set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(tensor PUBLIC cuda_utils logger) +target_link_libraries(memory_utils PUBLIC cuda_utils logger) add_library(anomaly_handler STATIC anomaly_handler.cu) set_property(TARGET anomaly_handler PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/src/turbomind/utils/Tensor.cc b/src/turbomind/utils/Tensor.cc deleted file mode 100644 index 7a2cedac13..0000000000 --- a/src/turbomind/utils/Tensor.cc +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/string_utils.h" - -#include "stdlib.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fs = std::filesystem; -namespace turbomind { - -Tensor::Tensor(): - // a none tensor. 
- where(MEMORY_CPU), - type(TYPE_INVALID), - shape({}), - data(nullptr), - offsets({}) // only a record to record offset -{ -} - -Tensor::Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): - where(_where), type(_type), shape(_shape), data(const_cast(_data)) -{ -} - -Tensor::Tensor(const MemoryType _where, - const DataType _type, - const std::vector _shape, - const void* _data, - const std::vector _offset): - where(_where), type(_type), shape(_shape), data(const_cast(_data)), offsets(_offset) -{ -} - -void Tensor::parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data) -{ - const char magic[] = "\x93" - "NUMPY"; - char magic_test[sizeof(magic)] = "\0"; - - size_t n_elems = fread((void*)magic_test, sizeof(char), sizeof(magic) - 1, f_ptr); - if (n_elems != sizeof(magic) - 1 || std::string(magic) != std::string(magic_test)) { - throw std::runtime_error("Could read magic token in NPY file"); - } - - uint8_t npy_major = 0; - uint8_t npy_minor = 0; - n_elems = fread((void*)&npy_major, sizeof(uint8_t), 1, f_ptr); - n_elems += fread((void*)&npy_minor, sizeof(uint8_t), 1, f_ptr); - - if (npy_major == 1) { - uint16_t header_len_u16 = 0; - n_elems = fread((void*)&header_len_u16, sizeof(uint16_t), 1, f_ptr); - header_len = header_len_u16; - } - else if (npy_major == 2) { - uint32_t header_len_u32 = 0; - n_elems = fread((void*)&header_len_u32, sizeof(uint32_t), 1, f_ptr); - header_len = header_len_u32; - } - else { - throw std::runtime_error("Unsupported npy version: " + std::to_string(npy_major)); - } - - start_data = 8 + 2 * npy_major + header_len; -} - -int Tensor::parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector& shape) -{ - char* header_c = (char*)malloc(header_len * sizeof(char)); - size_t n_elems = fread((void*)header_c, sizeof(char), header_len, f_ptr); - if (n_elems != header_len) { - free(header_c); - return -1; - } - std::string header(header_c, header_len); - free(header_c); - - size_t start, end; - start = header.find("'descr'") + 7; - start = header.find("'", start); - end = header.find("'", start + 1); - type = typeFromNumpyDesc(header.substr(start + 1, end - start - 1)); - - start = header.find("'fortran_order'") + 15; - start = header.find(":", start); - end = header.find(",", start + 1); - if (header.substr(start + 1, end - start - 1).find("False") == std::string::npos) { - throw std::runtime_error("Unsupported value for fortran_order while reading npy file"); - } - - start = header.find("'shape'") + 7; - start = header.find("(", start); - end = header.find(")", start + 1); - - std::istringstream shape_stream(header.substr(start + 1, end - start - 1)); - std::string token; - - shape.clear(); - while (std::getline(shape_stream, token, ',')) { - if (token.find_first_not_of(' ') == std::string::npos) { - break; - } - shape.push_back(std::stoul(token)); - } - - return 0; -} - -Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where) -{ - DataType type; - std::vector shape; - - FILE* f_ptr = fopen(npy_file.c_str(), "rb"); - if (f_ptr == nullptr) { - throw std::runtime_error("Could not open file " + npy_file); - } - uint32_t header_len, start_data; - parseNpyIntro(f_ptr, header_len, start_data); - parseNpyHeader(f_ptr, header_len, type, shape); - - const size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - void* data_cpu = malloc(size * Tensor::getTypeSize(type)); - void* data = data_cpu; - - size_t n_elems = fread(data_cpu, Tensor::getTypeSize(type), 
size, f_ptr); - FT_CHECK_WITH_INFO(n_elems == size, "reading tensor failed"); - if (where == MEMORY_GPU) { - cudaMalloc(&data, size * Tensor::getTypeSize(type)); - cudaMemcpy(data, data_cpu, size * Tensor::getTypeSize(type), cudaMemcpyHostToDevice); - free(data_cpu); - } - - fclose(f_ptr); - return Tensor(where, type, shape, data); -} - -size_t Tensor::size() const -{ - if (data == nullptr || shape.size() == 0) { - return 0; - } - return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies()); -} - -size_t Tensor::sizeBytes() const -{ - return size() * Tensor::getTypeSize(type); -} - -std::string Tensor::whereToString() const -{ - static const std::unordered_map mem_to_string{ - {MEMORY_CPU, "CPU"}, {MEMORY_CPU_PINNED, "CPU_PINNED"}, {MEMORY_GPU, "GPU"}}; - return mem_to_string.at(where); -} - -std::string Tensor::toString() const -{ - std::string memtype_str = whereToString(); - - static const std::unordered_map type_to_string{ - {TYPE_BOOL, "BOOL"}, - {TYPE_UINT8, "UINT8"}, - {TYPE_UINT16, "UINT16"}, - {TYPE_UINT32, "UINT32"}, - {TYPE_UINT64, "UINT64"}, - {TYPE_INT8, "INT8"}, - {TYPE_INT16, "INT16"}, - {TYPE_INT32, "INT32"}, - {TYPE_INT64, "INT64"}, - {TYPE_BF16, "BF16"}, - {TYPE_FP16, "FP16"}, - {TYPE_FP32, "FP32"}, - {TYPE_FP64, "FP64"}, - {TYPE_BYTES, "BYTES"}, - {TYPE_INVALID, "INVALID"}, - {TYPE_FP8_E4M3, "E4M3"}, - {TYPE_VOID, "VOID"}, - }; - return fmtstr("Tensor[where=%s, type=%s, shape=%s, data=%p]", - memtype_str.c_str(), - type_to_string.at(type).c_str(), - vec2str(shape).c_str(), - data); -} - -DataType Tensor::typeFromNumpyDesc(std::string type) -{ - static const std::unordered_map type_map{{"?", TYPE_BOOL}, - {"b", TYPE_BYTES}, - {"u1", TYPE_UINT8}, - {"u2", TYPE_UINT16}, - {"u4", TYPE_UINT32}, - {"u8", TYPE_UINT64}, - {"i1", TYPE_INT8}, - {"i2", TYPE_INT16}, - {"i4", TYPE_INT32}, - {"i8", TYPE_INT64}, - {"f2", TYPE_FP16}, - {"f4", TYPE_FP32}, - {"f8", TYPE_FP64}}; - return type_map.at(type); -} - -size_t Tensor::getTypeSize(DataType type) -{ - static const std::unordered_map type_map{{TYPE_BOOL, sizeof(bool)}, - {TYPE_BYTES, sizeof(char)}, - {TYPE_UINT8, sizeof(uint8_t)}, - {TYPE_UINT16, sizeof(uint16_t)}, - {TYPE_UINT32, sizeof(uint32_t)}, - {TYPE_UINT64, sizeof(uint64_t)}, - {TYPE_INT8, sizeof(int8_t)}, - {TYPE_INT16, sizeof(int16_t)}, - {TYPE_INT32, sizeof(int32_t)}, - {TYPE_INT64, sizeof(int64_t)}, -#ifdef ENABLE_BF16 - {TYPE_BF16, sizeof(__nv_bfloat16)}, -#endif -#ifdef ENABLE_FP8 - {TYPE_FP8_E4M3, sizeof(__nv_fp8_e4m3)}, -#endif - {TYPE_FP16, sizeof(half)}, - {TYPE_FP32, sizeof(float)}, - {TYPE_FP64, sizeof(double)}}; - return type_map.at(type); -} - -std::string Tensor::getNumpyTypeDesc(DataType type) const -{ - static const std::unordered_map type_map{{TYPE_INVALID, "x"}, - {TYPE_BOOL, "?"}, - {TYPE_BYTES, "b"}, - {TYPE_UINT8, "u1"}, - {TYPE_UINT16, "u2"}, - {TYPE_UINT32, "u4"}, - {TYPE_UINT64, "u8"}, - {TYPE_INT8, "i1"}, - {TYPE_INT16, "i2"}, - {TYPE_INT32, "i4"}, - {TYPE_INT64, "i8"}, - {TYPE_FP16, "f2"}, - {TYPE_FP32, "f4"}, - {TYPE_FP64, "f8"}}; - - if (type == TYPE_BF16) { - TM_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't " - "support bfloat16 as of now, it will be properly extended if numpy supports. " - "Please refer for the discussions https://github.com/numpy/numpy/issues/19808."); - } - - return type_map.count(type) > 0 ? 
type_map.at(type) : "x"; -} - -void Tensor::saveNpy(const std::string& filename) const -{ - // Save tensor to NPY 1.0 format (see https://numpy.org/neps/nep-0001-npy-format.html) - void* cpu_data = (void*)data; - bool is_data_temp = false; - size_t tensor_size = size(); - if (where == MemoryType::MEMORY_GPU) { - cpu_data = malloc(tensor_size * Tensor::getTypeSize(type)); - is_data_temp = true; - cudaDeviceSynchronize(); - cudaMemcpy(cpu_data, data, tensor_size * Tensor::getTypeSize(type), cudaMemcpyDeviceToHost); - } - - const char magic[] = "\x93" - "NUMPY"; - const uint8_t npy_major = 1; - const uint8_t npy_minor = 0; - - std::stringstream header_stream; - header_stream << "{'descr': '" << getNumpyTypeDesc(type) << "', 'fortran_order': False, 'shape': ("; - for (size_t i = 0; i < shape.size(); ++i) { - header_stream << shape[i]; - if (i + 1 < shape.size() || shape.size() == 1) { - header_stream << ", "; - } - } - header_stream << ")}"; - int base_length = 6 + 4 + header_stream.str().size(); - int pad_length = 16 * ((base_length + 1 + 15) / 16); // Take ceiling of base_length + 1 (for '\n' ending) - for (int i = 0; i < pad_length - base_length; ++i) { - header_stream << ((i == pad_length - base_length - 1) ? "\n" : "\x20"); - } - std::string header = header_stream.str(); - const uint16_t header_len = header.size(); - - FILE* f_ptr = fopen(filename.c_str(), "wb"); - FT_CHECK_WITH_INFO(f_ptr != nullptr, fmtstr("Unable to open %s for writing.\n", filename.c_str())); - - fwrite(magic, sizeof(char), sizeof(magic) - 1, f_ptr); - fwrite(&npy_major, sizeof(uint8_t), 1, f_ptr); - fwrite(&npy_minor, sizeof(uint8_t), 1, f_ptr); - fwrite(&header_len, sizeof(uint16_t), 1, f_ptr); - fwrite(header.c_str(), sizeof(char), header_len, f_ptr); - fwrite(cpu_data, Tensor::getTypeSize(type), tensor_size, f_ptr); - - fclose(f_ptr); - - if (is_data_temp) { - free(cpu_data); - } -} - -Tensor Tensor::slice(std::vector shape, size_t offset) const -{ - if (this->data != nullptr) { - size_t n_elts = this->size(); - size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - FT_CHECK_WITH_INFO( - n_sliced_elts + offset <= n_elts, - fmtstr("The number (%ld) of elements of sliced tensor exceeds that (%ld) of the original tensor", - n_sliced_elts + offset, - n_elts)); - } - return Tensor(this->where, this->type, shape, this->getPtrWithOffset(offset)); -} - -TensorMap::TensorMap(const std::unordered_map& tensor_map) -{ - for (auto& kv : tensor_map) { - insert(kv.first, kv.second); - } -} - -TensorMap::TensorMap(const std::vector& tensor_map) -{ - for (size_t i = 0; i < tensor_map.size(); i++) { - insert(std::to_string(i), tensor_map[i]); - } -} - -TensorMap::TensorMap(std::initializer_list> tensor_map) -{ - for (auto& pair : tensor_map) { - insert(pair.first, pair.second); - } -} - -TensorMap::~TensorMap() -{ - tensor_map_.clear(); -} - -std::vector TensorMap::keys() const -{ - std::vector key_names; - for (auto& kv : tensor_map_) { - key_names.push_back(kv.first); - } - return key_names; -} - -std::string TensorMap::toString() -{ - std::stringstream ss; - ss << "{"; - std::vector key_names = keys(); - for (size_t i = 0; i < tensor_map_.size(); ++i) { - ss << key_names[i] << ": " << at(key_names[i]).toString(); - if (i < tensor_map_.size() - 1) { - ss << ", "; - } - } - ss << "}"; - return ss.str(); -} - -TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) -{ - TensorMap ret_tensor; - for (auto const& entry : fs::directory_iterator{base_folder}) { - std::string filename = 
entry.path().stem().string(); - size_t len = filename.length(); - if (len < 4 || filename.compare(len - 4, 4, ".npy")) { - continue; - } - - size_t pos = filename.find('-'); - FT_CHECK_WITH_INFO(pos != std::string::npos, fmtstr("Invalid filename: %s\n", filename.c_str())); - - MemoryType where; - if (filename.compare(0, pos, "GPU") == 0) { - where = MEMORY_GPU; - } - else if (filename.compare(0, pos, "CPU") == 0) { - where = MEMORY_CPU; - } - else if (filename.compare(0, pos, "CPU_PINNED") == 0) { - where = MEMORY_CPU_PINNED; - } - else { - FT_CHECK_WITH_INFO(false, fmtstr("Invalid filename: %s\n", filename.c_str())); - } - std::string key = filename.substr(pos + 1, len - pos - 5); - - ret_tensor.tensor_map_.insert({key, Tensor::loadNpy(base_folder + "/" + filename, where)}); - } - return ret_tensor; -} - -void TensorMap::saveNpy(const std::string& base_folder) -{ - bool ret = fs::exists(base_folder) | fs::create_directory(base_folder); - FT_CHECK_WITH_INFO(ret == true, fmtstr("Could not create folder %s.\n", base_folder.c_str())); - for (const auto& item : tensor_map_) { - item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy"); - } -} - -} // namespace turbomind diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h deleted file mode 100644 index bf9840314c..0000000000 --- a/src/turbomind/utils/Tensor.h +++ /dev/null @@ -1,582 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/macro.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/string_utils.h" - -#include "stdlib.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -typedef enum datatype_enum -{ - TYPE_INVALID, - TYPE_BOOL, - TYPE_UINT8, - TYPE_UINT16, - TYPE_UINT32, - TYPE_UINT64, - TYPE_INT8, - TYPE_INT16, - TYPE_INT32, - TYPE_INT64, - TYPE_FP16, - TYPE_FP32, - TYPE_FP64, - TYPE_BYTES, - TYPE_BF16, - TYPE_FP8_E4M3, - TYPE_STR, - TYPE_VOID, -} DataType; - -template -DataType getTensorType() -{ - if (std::is_same::value || std::is_same::value) { - return TYPE_FP32; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_FP16; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value || std::is_same::value) { - return TYPE_BF16; - } -#endif -#ifdef ENABLE_FP8 - else if (std::is_same::value || std::is_same::value) { - return TYPE_FP8_E4M3; - } -#endif - else if (std::is_same::value || std::is_same::value) { - return TYPE_INT32; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_INT8; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_UINT32; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_UINT64; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_BOOL; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_BYTES; - } - else if (std::is_pointer_v && sizeof(T) == sizeof(uint64_t)) { - return TYPE_UINT64; - } - else { - return TYPE_INVALID; - } -} - -static inline size_t get_elem_size(DataType type) -{ - switch (type) { - case DataType::TYPE_FP16: - case DataType::TYPE_BF16: - case DataType::TYPE_INT16: - return 2; - case DataType::TYPE_FP32: - case DataType::TYPE_INT32: - case DataType::TYPE_UINT32: - return 4; - case DataType::TYPE_UINT64: - case DataType::TYPE_INT64: - return 8; - case DataType::TYPE_UINT8: - return 1; - default: - throw std::runtime_error("not supported"); - } -} - -typedef enum memorytype_enum -{ - MEMORY_CPU, - MEMORY_CPU_PINNED, - MEMORY_GPU -} MemoryType; - -struct Tensor { - MemoryType where; - DataType type; - std::vector shape; - void* data; - std::vector offsets = std::vector{}; - - Tensor(); - Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data); - Tensor(const MemoryType _where, - const DataType _type, - const std::vector _shape, - const void* _data, - const std::vector _offset); - - size_t size() const; - size_t sizeBytes() const; - - std::string whereToString() const; - std::string toString() const; - std::string getNumpyTypeDesc(DataType type) const; - - void saveNpy(const std::string& filename) const; - static Tensor loadNpy(const std::string& npy_file, const MemoryType where); - - static DataType typeFromNumpyDesc(std::string type); - static size_t getTypeSize(DataType type); - - template - inline T getVal(size_t index) const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - FT_CHECK(where == MEMORY_CPU); - FT_CHECK(data != nullptr); - FT_CHECK_WITH_INFO(index < size(), "index is larger than buffer size"); - - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - return ((T*)data)[index]; - } - - template - inline T getVal() 
const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - return getVal(0); - } - - template - inline T* getPtr() const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (getTensorType() != type) { - TM_LOG_DEBUG("getPtr with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - return (T*)data; - } - - inline void* getPtrWithOffset(size_t offset) const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (data == nullptr) { - return (void*)data; - } - else { - FT_CHECK_WITH_INFO(offset < size(), "offset is larger than buffer size"); - return (void*)((char*)data + offset * Tensor::getTypeSize(type)); - } - } - - template - inline T* getPtrWithOffset(size_t offset) const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - if (data == nullptr) { - return (T*)data; - } - else { - FT_CHECK_WITH_INFO(offset < size(), - fmtstr("offset (%lu) is larger than buffer size (%lu)", offset, size())); - return ((T*)data) + offset; - } - } - - template - T max() const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED, - "max() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - size_t max_idx = 0; - T max_val = getVal(max_idx); - for (size_t i = 1; i < size(); ++i) { - T val = getVal(i); - if (val > max_val) { - max_idx = i; - max_val = val; - } - } - return max_val; - } - - template - T min() const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED, - "min() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - size_t min_idx = 0; - T min_val = getVal(min_idx); - for (size_t i = 1; i < size(); ++i) { - T val = getVal(i); - if (val < min_val) { - min_idx = i; - min_val = val; - } - } - return min_val; - } - - template - T any(T val) const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED, - "any() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - for (size_t i = 0; i < size(); ++i) { - if (getVal(i) == val) { - return true; - } - } - return false; - } - - template - T all(T val) const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == 
MEMORY_CPU_PINNED, - "all() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - for (size_t i = 0; i < size(); ++i) { - if (getVal(i) != val) { - return false; - } - } - return true; - } - - void updateShape(size_t idx, size_t val) - { - // TODO: find a better way to update the shape - std::vector& shape_ref = const_cast&>(shape); - shape_ref[idx] = val; - } - - Tensor slice(std::vector shape, size_t offset = 0) const; - -private: - static void parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data); - static int parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector& shape); -}; - -class TensorMap { -private: - std::unordered_map tensor_map_; - - inline bool isValid(const Tensor& tensor) - { - return tensor.size() > 0 && tensor.data != nullptr; - } - -public: - TensorMap() = default; - TensorMap(const std::unordered_map& tensor_map); - TensorMap(const std::vector& tensor_map); - TensorMap(std::initializer_list> tensor_map); - ~TensorMap(); - - inline size_t size() const - { - return tensor_map_.size(); - } - - inline bool isExist(const std::string& key) const - { - TM_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str()); - return tensor_map_.find(key) != tensor_map_.end(); - } - - std::vector keys() const; - - inline void insert(const std::string& key, const Tensor& value) - { - FT_CHECK_WITH_INFO(!isExist(key), fmtstr("Duplicated key %s", key.c_str())); - tensor_map_.insert({key, value}); - } - - inline void insertIfValid(const std::string& key, const Tensor& value) - { - if (isValid(value)) { - insert({key, value}); - } - } - - inline void insert(std::pair p) - { - tensor_map_.insert(p); - } - - // prevent converting int or size_t to string automatically - Tensor at(int tmp) = delete; - Tensor at(size_t tmp) = delete; - - inline Tensor& at(const std::string& key) - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key); - } - - inline Tensor at(const std::string& key) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key); - } - - inline Tensor& at(const std::string& key, Tensor& default_tensor) - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - inline Tensor at(const std::string& key, Tensor& default_tensor) const - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - inline Tensor& at(const std::string& key, Tensor&& default_tensor) - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - inline Tensor at(const std::string& key, Tensor&& default_tensor) const - { - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - template - inline T getVal(const std::string& key) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getVal(); - } - - template - inline T getVal(const std::string& key, T default_value) const - { - if 
(isExist(key)) { - return tensor_map_.at(key).getVal(); - } - return default_value; - } - - template - inline T getValWithOffset(const std::string& key, size_t index) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getVal(index); - } - - template - inline T getValWithOffset(const std::string& key, size_t index, T default_value) const - { - if (isExist(key)) { - return tensor_map_.at(key).getVal(index); - } - return default_value; - } - - template - inline T* getPtr(const std::string& key) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getPtr(); - } - - template - inline T* getPtr(const std::string& key, T* default_ptr) const - { - if (isExist(key)) { - return tensor_map_.at(key).getPtr(); - } - return default_ptr; - } - - template - inline T* getPtrWithOffset(const std::string& key, size_t index) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getPtrWithOffset(index); - } - - template - inline T* getPtrWithOffset(const std::string& key, size_t index, T* default_ptr) const - { - if (isExist(key)) { - return tensor_map_.at(key).getPtrWithOffset(index); - } - return default_ptr; - } - - inline std::unordered_map getMap() const - { - return tensor_map_; - } - - inline std::unordered_map::iterator begin() - { - return tensor_map_.begin(); - } - - inline std::unordered_map::iterator end() - { - return tensor_map_.end(); - } - - inline std::unordered_map& get() - { - return tensor_map_; - } - - inline std::unordered_map::const_iterator begin() const - { - return tensor_map_.begin(); - } - - inline std::unordered_map::const_iterator end() const - { - return tensor_map_.end(); - } - - int count(const std::string& key) const - { - return tensor_map_.count(key); - } - - bool empty() const - { - return tensor_map_.empty(); - } - - std::string toString(); - static TensorMap fromNpyFolder(const std::string& base_folder); - void saveNpy(const std::string& base_folder); -}; - -struct ManagedTensor { - Tensor tensor; - std::shared_ptr data_holder; - - Tensor* operator->() noexcept - { - return &tensor; - } - - const Tensor* operator->() const noexcept - { - return &tensor; - } - - Tensor& operator*() noexcept - { - return tensor; - } - - const Tensor& operator*() const noexcept - { - return tensor; - } -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h deleted file mode 100644 index 88c299c3de..0000000000 --- a/src/turbomind/utils/allocator.h +++ /dev/null @@ -1,493 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * Memory Allocator - **/ - -#pragma once - -#include "cuda_utils.h" -#include "src/turbomind/macro.h" -#include -#include -#include - -#ifdef GOOGLE_CUDA -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/errors.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#endif - -#ifdef TORCH_CUDA -#include "torch/extension.h" -#include -#endif - -#include "src/turbomind/utils/logger.h" - -#if defined(CUDART_VERSION) && CUDART_VERSION < 11020 -#define CUDA_MEMORY_POOL_DISABLED -#endif - -namespace turbomind { - -enum class AllocatorType -{ - CUDA, - TF, - TH -}; - -enum class ReallocType -{ - INCREASE, - REUSE, - DECREASE, -}; - -class IAllocator { -public: - virtual ~IAllocator(){}; - - virtual void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) = 0; - virtual void free(void** ptr, bool is_host = false) = 0; - virtual void setStream(cudaStream_t stream) = 0; - virtual cudaStream_t returnStream() = 0; - virtual void memSet(void* ptr, const int val, const size_t size) = 0; - - template - void* reMalloc(T* ptr, size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - size = ((size + 31) / 32) * 32; // make the buffer align with 32 bytes - void* void_ptr = (void*)ptr; - void* ptr_address = getAddress(void_ptr); - if (isExist(ptr_address)) { - ReallocType realloc_type = isReMalloc(ptr_address, size); - if (realloc_type == ReallocType::INCREASE) { - TM_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr); - free((void**)(&void_ptr), is_host); - return malloc(size, is_set_zero, is_host); - } -#if !defined(CUDA_MEMORY_POOL_DISABLED) - else if (realloc_type == ReallocType::DECREASE) { - TM_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr); - free((void**)(&void_ptr), is_host); - return malloc(size, is_set_zero, is_host); - } -#endif - else { - TM_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size); - if (is_set_zero) { - memSet(void_ptr, 0, size); - } - return void_ptr; - } - } - else { - TM_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr); - return malloc(size, is_set_zero, is_host); - } - } - -protected: - virtual bool isExist(void* address) const = 0; - virtual ReallocType isReMalloc(void* address, size_t size) const = 0; - - void* getAddress(void* ptr) const - { - return ptr; - } -}; - -template -class Allocator; - -template<> -class Allocator: public IAllocator { -private: - enum class MemoryType - { - HOST, - DEVICE - }; - - const int device_id_; - bool enable_peer_access_{false}; - cudaStream_t stream_ = 0; // initialize as default stream - cudaMemPool_t mempool_{}; - std::unordered_map> pointer_mapping_; - - bool isExist(void* address) const - { - return pointer_mapping_.count(address) > 0; - } - ReallocType isReMalloc(void* address, size_t size) const - { - FT_CHECK(isExist(address)); - if (pointer_mapping_.at(address).first < size) { - return ReallocType::INCREASE; - } - else if (pointer_mapping_.at(address).first == size) { - return ReallocType::REUSE; - } - else { - return ReallocType::DECREASE; - } - } - -public: - Allocator(int device_id, bool 
enable_peer_access = false): - device_id_(device_id), enable_peer_access_(enable_peer_access) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); -#if defined(CUDA_MEMORY_POOL_DISABLED) - TM_LOG_WARNING( - "Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free." - "Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP"); -#else - - if (enable_peer_access) { - cudaMemPoolProps props{}; - props.allocType = cudaMemAllocationTypePinned; - props.handleTypes = cudaMemHandleTypeNone; - props.location.type = cudaMemLocationTypeDevice; - props.location.id = device_id; - check_cuda_error(cudaMemPoolCreate(&mempool_, &props)); - cudaMemAccessDesc desc = {}; - int peer_access_available = 0; - int device_count = 1; - check_cuda_error(cudaGetDeviceCount(&device_count)); - for (int i = 0; i < device_count; i++) { - if (i == device_id) { - continue; - } - check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i)); - if (!peer_access_available) { - TM_LOG_WARNING("Devicle " + std::to_string(device_id) + " peer access Device " + std::to_string(i) - + " is not available."); - continue; - } - desc.location.type = cudaMemLocationTypeDevice; - desc.location.id = i; - desc.flags = cudaMemAccessFlagsProtReadWrite; - check_cuda_error(cudaMemPoolSetAccess(mempool_, &desc, 1)); - } - } - else { - check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool_, device_id)); - } - // set memory pool threshold to avoid shrinking the pool - uint64_t setVal = UINT64_MAX; - check_cuda_error(cudaMemPoolSetAttribute(mempool_, cudaMemPoolAttrReleaseThreshold, &setVal)); -#endif - } - - virtual ~Allocator() - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - while (!pointer_mapping_.empty()) { - auto ptr = pointer_mapping_.begin()->first; - auto size_and_type = pointer_mapping_.begin()->second; - free(&ptr, size_and_type.second == MemoryType::HOST); - } - if (enable_peer_access_) { // We own the pool in this case - check_cuda_error(cudaMemPoolDestroy(mempool_)); - mempool_ = {}; - } - } - - void setStream(cudaStream_t stream) - { - stream_ = stream; - } - - cudaStream_t returnStream() - { - return stream_; - }; - - void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (size == 0) { - return nullptr; - } - void* ptr = nullptr; - int o_device = 0; - - check_cuda_error(getSetDevice(device_id_, &o_device)); - if (is_host) { - check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32)); - } - else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32)); -#else - check_cuda_error(cudaMallocFromPoolAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, mempool_, stream_)); -#endif - } - if (is_set_zero) { - check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_)); - } - check_cuda_error(getSetDevice(o_device)); - TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size); - - pointer_mapping_.insert({getAddress(ptr), {size, is_host ? 
MemoryType::HOST : MemoryType::DEVICE}}); - - return ptr; - } - - void free(void** ptr, bool _ = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - void* address = getAddress(*ptr); - if (*ptr != nullptr) { - int o_device = 0; - if (pointer_mapping_.count(address)) { - const auto is_host = pointer_mapping_.at(address).second == MemoryType::HOST; - TM_LOG_DEBUG("Free buffer %p", address); - check_cuda_error(getSetDevice(device_id_, &o_device)); - if (is_host) { - check_cuda_error(cudaFreeHost(*ptr)); - } - else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaFree(*ptr)); -#else - check_cuda_error(cudaFreeAsync(*ptr, stream_)); -#endif - } - check_cuda_error(getSetDevice(o_device)); - pointer_mapping_.erase(address); - } - else { - FT_CHECK_WITH_INFO(0, - fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); - } - } - *ptr = nullptr; - return; - } - - void memSet(void* ptr, const int val, const size_t size) - { - check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_)); - } -}; - -#ifdef GOOGLE_CUDA -using namespace tensorflow; -template<> -class Allocator: public IAllocator { - OpKernelContext* context_; - std::unordered_map* pointer_mapping_; - cudaStream_t stream_; - - bool isExist(void* address) const - { - return pointer_mapping_->count(address) > 0; - } - ReallocType isReMalloc(void* address, size_t size) const - { - FT_CHECK(isExist(address)); - size_t current_buffer_size = 1; - for (int i = 0; i < pointer_mapping_->at(address).dims(); i++) { - current_buffer_size *= pointer_mapping_->at(address).dim_size(i); - } - TM_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size); - if (current_buffer_size < size) { - return ReallocType::INCREASE; - } - else if (current_buffer_size == size) { - return ReallocType::REUSE; - } - else { - return ReallocType::DECREASE; - } - } - -public: - Allocator(OpKernelContext* context, cudaStream_t stream): context_(context), stream_(stream) - { - pointer_mapping_ = new std::unordered_map(); - } - - void setStream(cudaStream_t stream) - { - stream_ = stream; - } - - cudaStream_t returnStream() - { - return stream_; - }; - - void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - tensorflow::Tensor buf; - long long int buf_size = ((long long int)ceil(size / 32.) 
* 32); - tensorflow::Status status; - if (is_host) { - tensorflow::AllocatorAttributes pinned_allocator; - pinned_allocator.set_on_host(true); - pinned_allocator.set_gpu_compatible(true); - status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf, pinned_allocator); - } - else { - status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf); - } - - if (status != tensorflow::Status::OK()) { - throw std::runtime_error("TF error: context->allocate_temp failed"); - } - - auto flat = buf.flat(); - void* ptr = (void*)flat.data(); - if (is_set_zero) { - cudaMemsetAsync(ptr, 0, buf_size, stream_); - } - pointer_mapping_->insert({getAddress(ptr), buf}); - - return ptr; - } - - void free(void** ptr, bool is_host = false) const - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - void* address = getAddress(*ptr); - pointer_mapping_->erase(address); - *ptr = nullptr; - return; - } - - virtual ~Allocator() - { - while (!pointer_mapping_->empty()) { - void* ptr = pointer_mapping_->begin()->second.flat().data(); - free(&ptr); - } - pointer_mapping_->clear(); - delete pointer_mapping_; - } - - void memSet(void* ptr, const int val, const size_t size) - { - check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_)); - } -}; -#endif - -#ifdef TORCH_CUDA -template<> -class Allocator: public IAllocator { - std::unordered_map* pointer_mapping_; - - bool isExist(void* address) const - { - return pointer_mapping_->count(address) > 0; - } - ReallocType isReMalloc(void* address, size_t size) const - { - FT_CHECK(isExist(address)); - size_t current_buffer_size = 1; - for (int i = 0; i < pointer_mapping_->at(address).dim(); i++) { - current_buffer_size *= pointer_mapping_->at(address).size(i); - } - TM_LOG_DEBUG( - "current_buffer_size: %d, original buffer: %p, new buffer: %d", current_buffer_size, address, size); - if (current_buffer_size < size) { - return ReallocType::INCREASE; - } - else if (current_buffer_size == size) { - return ReallocType::REUSE; - } - else { - return ReallocType::DECREASE; - } - } - -public: - Allocator() - { - pointer_mapping_ = new std::unordered_map(); - } - - void setStream(cudaStream_t stream) - { - // nothing to do here; - } - - cudaStream_t returnStream() - { - // nothing to do here; - return 0; - }; - - void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - int64_t buf_size = static_cast(ceil(size / 32.)) * 32; - torch::Tensor buf; - if (is_host) { - buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCPU).pinned_memory(true)); - } - else { - buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCUDA)); - } - void* ptr = buf.data_ptr(); - if (is_set_zero) { - cudaMemset(ptr, 0, buf_size); - } - TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size); - pointer_mapping_->insert({getAddress(ptr), buf}); - return ptr; - } - - void free(void** ptr, bool is_host = false) const - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - void* address = getAddress(*ptr); - pointer_mapping_->erase(address); - *ptr = nullptr; - return; - } - - virtual ~Allocator() - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - while (!pointer_mapping_->empty()) { - void* ptr = pointer_mapping_->begin()->second.data_ptr(); - free(&ptr); - } - pointer_mapping_->clear(); - delete pointer_mapping_; - } - - void memSet(void* ptr, const int val, const size_t size) - { - check_cuda_error(cudaMemset(ptr, val, size)); - } -}; -#endif -} // namespace turbomind diff --git a/src/turbomind/utils/anomaly_handler.cu 
b/src/turbomind/utils/anomaly_handler.cu index e4e1eb6228..693e7c3569 100644 --- a/src/turbomind/utils/anomaly_handler.cu +++ b/src/turbomind/utils/anomaly_handler.cu @@ -1,8 +1,5 @@ -#include "src/turbomind/utils/anomaly_handler.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" + #include #include #include @@ -10,6 +7,13 @@ #include #include +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/anomaly_handler.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/memory_utils.h" + namespace turbomind { static std::optional parse_float(const std::string& s, const std::string& key) @@ -378,10 +382,25 @@ void AnomalyHandler::FixLogits(T* logits, int batch_size, int level) impl_->invokeFixLogitsAnomaly(logits, batch_size, level); } +int AnomalyHandler::level() noexcept +{ + return Impl::g_level; +} + template void AnomalyHandler::FixLogits(float*, int, int); template void AnomalyHandler::FixLogits(half*, int, int); #ifdef ENABLE_BF16 template void AnomalyHandler::FixLogits(__nv_bfloat16*, int, int); #endif +void DebugTensor(Tensor& tensor, const std::string& key, int level) +{ + auto invoke = [&](auto t) { + using T = decltype(t); + AnomalyHandler::instance().CountAndFix((T*)tensor.raw_data(), tensor.size(), key, level); + // Compare((T*)tensor.raw_data(), tensor.size(), key, compare_mode, core::Context::stream().handle()); + }; + TM_DISPATCH_DTYPES(tensor.dtype(), invoke, float, half_t, bfloat16_t); +} + } // namespace turbomind diff --git a/src/turbomind/utils/anomaly_handler.h b/src/turbomind/utils/anomaly_handler.h index 9603b8e781..00325183f9 100644 --- a/src/turbomind/utils/anomaly_handler.h +++ b/src/turbomind/utils/anomaly_handler.h @@ -4,11 +4,14 @@ #pragma once #include +#include #include #include #include #include +#include "src/turbomind/core/core.h" + namespace turbomind { class AnomalyHandler { @@ -21,6 +24,8 @@ class AnomalyHandler { static AnomalyHandler& instance(); + static int level() noexcept; + void Init(int rank, int vocab_size, int fallback, int max_batch_size, cudaStream_t stream) noexcept; template @@ -47,4 +52,21 @@ void count_and_fix(T* data, size_t size, std::string key, int level) AnomalyHandler::instance().CountAndFix(data, size, key, level); } +void DebugTensor(Tensor& tensor, const std::string& key, int level); + +inline void DebugTensor(Tensor&& tensor, const std::string& key, int level) +{ + DebugTensor(tensor, key, level); +} + +#define TM_DEBUG_RAW(ptr, size, key, __level) \ + if (::turbomind::AnomalyHandler::level() >= __level) { \ + ::turbomind::count_and_fix(ptr, size, key, __level); \ + } + +#define TM_DEBUG_TENSOR(tensor, key, __level) \ + if (::turbomind::AnomalyHandler::level() >= __level) { \ + ::turbomind::DebugTensor(tensor, key, __level); \ + } + } // namespace turbomind diff --git a/src/turbomind/utils/cublasAlgoMap.cc b/src/turbomind/utils/cublasAlgoMap.cc deleted file mode 100644 index 1f9d5743c4..0000000000 --- a/src/turbomind/utils/cublasAlgoMap.cc +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasAlgoMap.h" - -namespace turbomind { - -cublasAlgoMap::cublasAlgoMap(const std::string filename, const std::string sp_config_filename): - config_filename_(filename), sp_config_filename_(sp_config_filename) -{ - loadGemmConfig(); - loadSpGemmConfig(); -} - -cublasAlgoMap::cublasAlgoMap(const cublasAlgoMap& algo_map): - config_filename_(algo_map.config_filename_), - sp_config_filename_(algo_map.sp_config_filename_), - algo_map_(algo_map.algo_map_), - sp_algo_map_(algo_map.sp_algo_map_) -{ -} - -cublasAlgoMap::~cublasAlgoMap() -{ - algo_map_.clear(); -} - -void cublasAlgoMap::loadGemmConfig() -{ - FILE* fd; - fd = fopen(config_filename_.c_str(), "r"); - if (fd == NULL) { - std::cout << "[WARNING] " << config_filename_ << " is not found; using default GEMM algo" << std::endl; - return; - } - - int batchCount2, m2, n2, k2, algoId, customOption, tile, splitK_val; - int batch_size, seq_len, head_num, size_per_head, dataType; - int swizzle, reductionScheme, workspaceSize, stages; - int inner_shapeId, cluster_shapeId, mma_shapeId, cga_shapeId, sche_mode; - float exec_time; - char tmp[1024]; - if (!fgets(tmp, 1024, fd)) { - printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__); - exit(-1); - } - while (fscanf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "%d %d " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "%d %d %d " -#endif - "%f\n", - &batch_size, - &seq_len, - &head_num, - &size_per_head, - &dataType, - &batchCount2, - &n2, - &m2, - &k2, - &algoId, - &customOption, - &tile, - &splitK_val, - &swizzle, - &reductionScheme, - &workspaceSize, - &stages, -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - &inner_shapeId, - &cluster_shapeId, -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - &mma_shapeId, - &cga_shapeId, - &sche_mode, -#endif - &exec_time) - != EOF) { - if (dataType != FLOAT_DATATYPE && dataType != HALF_DATATYPE && dataType != BFLOAT16_DATATYPE - && dataType != INT8_DATATYPE && dataType != FP8_DATATYPE) { - printf("[WARNING][readAlgoFromConfig] wrong dataType %d!\n", dataType); - continue; - } - cublasAlgoConfig_t markStr{batchCount2, m2, n2, k2, static_cast(dataType)}; - // workspaceSize should be zero - if (algo_map_.find(markStr) == algo_map_.end()) { - algo_map_[markStr].algoId = algoId; - algo_map_[markStr].customOption = customOption; - algo_map_[markStr].tile = tile; - algo_map_[markStr].splitK_val = splitK_val; - algo_map_[markStr].swizzle = swizzle; - algo_map_[markStr].reductionScheme = reductionScheme; - algo_map_[markStr].workspaceSize = workspaceSize; - algo_map_[markStr].stages = stages; -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - algo_map_[markStr].inner_shapeId = (uint16_t)inner_shapeId; - algo_map_[markStr].cluster_shapeId = (uint16_t)cluster_shapeId; -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - algo_map_[markStr].mma_shapeId = 
(uint16_t)mma_shapeId; - algo_map_[markStr].cga_shapeId = (uint16_t)cga_shapeId; - algo_map_[markStr].sche_mode = (uint16_t)sche_mode; -#endif - algo_map_[markStr].exec_time = exec_time; - } - } - fclose(fd); -} - -bool cublasAlgoMap::isExist( - const int batch_count, const int m, const int n, const int k, const CublasDataType data_type) -{ - cublasAlgoConfig_t mark{batch_count, n, m, k, data_type}; - return algo_map_.find(mark) != algo_map_.end(); -} - -cublasLtMatmulAlgo_info -cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type) -{ - cublasAlgoConfig_t mark{batch_count, n, m, k, data_type}; - if (algo_map_.find(mark) != algo_map_.end()) { - return algo_map_[mark]; - } - else { - cublasLtMatmulAlgo_info tmp_algo; - tmp_algo.algoId = - static_cast(data_type == FLOAT_DATATYPE ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP); - tmp_algo.customOption = -1; - tmp_algo.tile = -1; - tmp_algo.splitK_val = -1; - tmp_algo.swizzle = -1; - tmp_algo.reductionScheme = -1; - tmp_algo.workspaceSize = -1; - tmp_algo.stages = -1; - tmp_algo.exec_time = -1.0f; - return tmp_algo; - } -} - -void cublasAlgoMap::loadSpGemmConfig() -{ - if (sp_config_filename_.empty()) { - return; - } - FILE* fd = fopen(sp_config_filename_.c_str(), "r"); - if (fd == NULL) { - printf("[WARNING] %s is not found; using SPGEMM algo id 0\n", sp_config_filename_.c_str()); - return; - } - sp_algo_map_.clear(); - int batch_size, seq_len, head_num, size_per_head, data_type; - int batchCount, m, n, k, algoId; - float exec_time; - char tmp[1024]; - if (!fgets(tmp, 1024, fd)) { - printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__); - exit(-1); - } - while (fscanf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - &batch_size, - &seq_len, - &head_num, - &size_per_head, - &data_type, - &batchCount, - &m, - &n, - &k, - &algoId, - &exec_time) - != EOF) { - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batchCount, m, n, k); - std::string markStr(mark); - sp_algo_map_[markStr] = algoId; - } - fclose(fd); -} - -int cublasAlgoMap::getSpAlgo(const int batch_count, const int m, const int n, const int k) -{ - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k); - if (sp_algo_map_.find(mark) != sp_algo_map_.end()) { - return sp_algo_map_[mark]; - } - else { - // for remove padding, select algo 1 for simplicity - return 0; - } -} - -bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n, const int k) -{ - // not available to use cusparselt. - if (m % 8 != 0 || n % 8 != 0 || k % 8 != 0) { - return false; - } - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k); - if (sp_algo_map_.find(mark) != sp_algo_map_.end()) { - return sp_algo_map_[mark] != -1; - } - else { - // no gemm test case, choose sparse according to sparse flag - return true; - } -} - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasAlgoMap.h b/src/turbomind/utils/cublasAlgoMap.h deleted file mode 100644 index 3e5b534a1b..0000000000 --- a/src/turbomind/utils/cublasAlgoMap.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include -#include -#include -#include - -#pragma once -namespace turbomind { - -#define GEMM_NUM 6 -#define GEMM_CONFIG "gemm_config.in" -#define IGEMM_CONFIG "igemm_config.in" -#define SPGEMM_CONFIG "spgemm_config.in" -#define SPIGEMM_CONFIG "spigemm_config.in" - -typedef struct { - int algoId, customOption, tile, splitK_val; - int swizzle, reductionScheme, workspaceSize; - // only used in cublasLt >= 11.0 - int stages; -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - uint16_t inner_shapeId, cluster_shapeId; -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - uint16_t mma_shapeId, cga_shapeId, sche_mode; -#endif - float exec_time; -} cublasLtMatmulAlgo_info; - -/* Structure to store information about different run trials */ -typedef struct { - cublasLtMatmulAlgo_t algo; - cublasStatus_t status; - float time; - size_t workspaceSize; // actual memory workspace needed - cublasMath_t mathMode; - cublasLtReductionScheme_t reductionScheme; - int customOption; - float wavesCount; -} customMatmulPerf_t; - -struct cublasAlgoConfig_t { - int batch_count; - int m; - int n; - int k; - CublasDataType data_type; - bool operator==(cublasAlgoConfig_t const& config) const - { - return (batch_count == config.batch_count) && (m == config.m) && (n == config.n) && (k == config.k) - && (data_type == config.data_type); - } -}; - -class cublasAlgoConfig_hasher { -public: - std::size_t operator()(cublasAlgoConfig_t const& config) const - { - return config.batch_count * 98317ull ^ config.m * 49157ull ^ config.n * 24593ull ^ config.k * 196613ull - ^ static_cast(config.data_type) * 6151ull; - } -}; - -class cublasAlgoMap { -private: - std::unordered_map algo_map_; - std::string config_filename_; - std::string sp_config_filename_; - std::map sp_algo_map_; - -public: - cublasAlgoMap(){}; - explicit cublasAlgoMap(const std::string filename, const std::string sp_config_filename = ""); - cublasAlgoMap(const cublasAlgoMap& map); - ~cublasAlgoMap(); - void loadGemmConfig(); - void loadSpGemmConfig(); - int getSpAlgo(const int batch_count, const int m, const int n, const int k); - bool isUseSparse(const int batch_count, const int m, const int n, const int k); - - bool isExist(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type); - - cublasLtMatmulAlgo_info - getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type); -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasINT8MMWrapper.cc b/src/turbomind/utils/cublasINT8MMWrapper.cc deleted file mode 100644 index 9afd21d088..0000000000 --- a/src/turbomind/utils/cublasINT8MMWrapper.cc +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasINT8MMWrapper.h" - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#endif - -namespace turbomind { -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4): - cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, nullptr), - use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) -{ -} - -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4): - cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, nullptr), - use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) -{ -} - -#ifdef SPARSITY_ENABLED -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4): - cublasMMWrapper(nullptr, cublaslt_handle, cusparselt_handle, stream, cublas_algo_map, mu, nullptr), - use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) -{ -} -#endif - -cublasINT8MMWrapper::~cublasINT8MMWrapper() -{ - mu_ = nullptr; -} - -cublasINT8MMWrapper::cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper): -#ifdef SPARSITY_ENABLED - cublasMMWrapper(nullptr, - wrapper.cublaslt_handle_, - wrapper.cusparselt_handle_, - wrapper.stream_, - wrapper.cublas_algo_map_, - wrapper.mu_, - wrapper.allocator_), -#else - cublasMMWrapper( - nullptr, wrapper.cublaslt_handle_, wrapper.stream_, wrapper.cublas_algo_map_, wrapper.mu_, wrapper.allocator_), -#endif - use_ORDER_COL32_2R_4R4_(wrapper.use_ORDER_COL32_2R_4R4_) -{ -} - -// for int8 cublasLtMM with algo -// ATransform should be m*n, CUBLASLT_ORDER_COL32 -// kernel should be n*k, CUBLASLT_ORDER_COL4_4R2_8C or CUBLASLT_ORDER_COL32_2R_4R4 -// res is m*n, CUBLASLT_ORDER_COL32 -void cublasINT8MMWrapper::Gemm(int* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const int8_t* ATransform, - const int8_t* kernel) -{ - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4_) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - int ldbTransform; - if (use_ORDER_COL32_2R_4R4_) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - int ldcTransform = 32 * 
m; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform); - cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldcTransform); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec)); - } - - int alphaI = 1; - int betaI = 0; - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) { - // printf("find algo %s\n", markStr.c_str()); - findAlgo = 1; - - cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, - computeType, - CUDA_R_32I, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_32I, - CUDA_R_32I, - tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages)); -#endif - } - else { - findAlgo = 1; - int algoId; - if (use_ORDER_COL32_2R_4R4_) { - algoId = 7; - } - else { - algoId = 6; - } - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit( - cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, 
CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - if (use_ORDER_COL32_2R_4R4_) { - stages = 15; - } - else { - stages = 13; - } - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)); -#endif - } - - cublasLtMatmul(cublaslt_handle_, - matmulDesc, - &alphaI, - ATransform, - AtransformDesc, - kernel, - BtransformDesc, - &betaI, - res, - CtransformDesc, - res, - CtransformDesc, - (findAlgo == 1 ? (&algo) : NULL), - NULL, - 0, - stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -// for int8 IO cublasLtMM with algo -// ATransform should be m*k CUBLASLT_ORDER_COL32 -// kernel should be n*k CUBLASLT_ORDER_COL4_4R2_8C -// res is m*n CUBLASLT_ORDER_COL32 -void cublasINT8MMWrapper::Gemm(int8_t* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const float alpha, - const int8_t* ATransform, - const int8_t* kernel) -{ - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; - // int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE - // cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; - cudaDataType_t scaleType = CUDA_R_32F; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4_) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - - int ldbTransform; - if (use_ORDER_COL32_2R_4R4_) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - - int ldcTransform = 32 * m; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scaleType, sizeof(scaleType)); - // cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode, - // sizeof(cublasLtPointerMode_t)); - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform); - cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, 
CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, m, n, ldcTransform); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec)); - } - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) { - findAlgo = 1; - - cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, - computeType, - CUDA_R_32F, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_8I, - tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages)); -#endif - } - else { - findAlgo = 1; - int algoId; - if (use_ORDER_COL32_2R_4R4_) { - algoId = 7; - } - else { - algoId = 6; - } - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit( - cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - if (use_ORDER_COL32_2R_4R4_) { - stages = 15; - } - else { - stages = 13; - } - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)); -#endif - } - - float beta = 0.0f; - cublasLtMatmul(cublaslt_handle_, - matmulDesc, - &alpha, - 
ATransform, - AtransformDesc, - kernel, - BtransformDesc, - &beta, - res, - CtransformDesc, - res, - CtransformDesc, - (findAlgo == 1 ? (&algo) : NULL), - NULL, - 0, - stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -template -int cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights) -{ - - int fusedINT8QKV_type = 0; - const int8_t* Q_weight = (const int8_t*)(attention_weights->query_weight.kernel); - const int8_t* K_weight = (const int8_t*)(attention_weights->key_weight.kernel); - const int8_t* V_weight = (const int8_t*)(attention_weights->value_weight.kernel); - // for QKV weight are DataType_ & continue - if ((attention_weights->query_weight.kernel + n * k == attention_weights->key_weight.kernel) - && (attention_weights->key_weight.kernel + n * k == attention_weights->value_weight.kernel)) { - fusedINT8QKV_type = 1; - } - // for QVK weight are int8 & continue - else if ((Q_weight + n * k == K_weight) && (K_weight + n * k == V_weight)) { - fusedINT8QKV_type = 2; - } - return fusedINT8QKV_type; -} - -bool cublasINT8MMWrapper::getUseOrderCol322R4R4() -{ - return use_ORDER_COL32_2R_4R4_; -} - -template int -cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights); - -template int -cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights); - -#ifdef SPARSITY_ENABLED -// A is sparse weight [m,k], non transposed row major -// B is activation input [k, n], non transposed col major -void cublasINT8MMWrapper::SpGemm( - const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C) -{ - cudaDataType_t Atype = CUDA_R_8I; - cudaDataType_t Btype = CUDA_R_8I; - cudaDataType_t Ctype = CUDA_R_8I; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I; - cusparseOrder_t col_order = CUSPARSE_ORDER_COL; - cusparseOrder_t row_order = CUSPARSE_ORDER_ROW; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - - auto num_A_rows = m; - auto num_A_cols = k; - auto num_B_rows = k; - auto num_B_cols = n; - auto num_C_rows = m; - auto num_C_cols = n; - unsigned alignment = 16; - auto lda = num_A_cols; - auto ldb = num_B_rows; - auto ldc = num_C_rows; - float _beta(0.0f); - - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", 1, m, n, k); - if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) { - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - else { - // initializing MatDesc takes a lot of time - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - sp_mat_A_desc_map_[mark] = mat_A; - sp_mat_B_desc_map_[mark] = mat_B; - sp_mat_C_desc_map_[mark] = mat_C; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &sp_mat_A_desc_map_[mark], - num_A_rows, - num_A_cols, - lda, - alignment, - Atype, - row_order, - CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, 
num_B_cols, ldb, alignment, Btype, col_order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype, col_order)) - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - mu_->lock(); - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols); - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size)) - - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream_}; - CHECK_CUSPARSE( - cusparseLtMatmul(&cusparselt_handle_, &plan, &alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - sync_check_cuda_error(); - mu_->unlock(); -} -#endif -} // namespace turbomind diff --git a/src/turbomind/utils/cublasINT8MMWrapper.h b/src/turbomind/utils/cublasINT8MMWrapper.h deleted file mode 100644 index 631ef1f842..0000000000 --- a/src/turbomind/utils/cublasINT8MMWrapper.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "cuda_utils.h" -#include "src/turbomind/layers/attention_layers/AttentionWeight.h" -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cublasMMWrapper.h" -#include -#include -#include -#include -#include -#include - -#pragma once -namespace turbomind { - -class cublasINT8MMWrapper: public cublasMMWrapper { -private: - bool use_ORDER_COL32_2R_4R4_; - -public: - cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4); - - cublasINT8MMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4); -#ifdef SPARSITY_ENABLED - cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4); -#endif - - ~cublasINT8MMWrapper(); - - cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper); - - void Gemm(int* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const int8_t* ATransform, - const int8_t* kernel); - - void Gemm(int8_t* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const float alpha, - const int8_t* ATransform, - const int8_t* kernel); - - template - int getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights); - - bool getUseOrderCol322R4R4(); - -#ifdef SPARSITY_ENABLED - void SpGemm(const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C); -#endif -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasMMWrapper.cc b/src/turbomind/utils/cublasMMWrapper.cc deleted file mode 100644 index cd70298b64..0000000000 --- a/src/turbomind/utils/cublasMMWrapper.cc +++ /dev/null @@ -1,1102 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasMMWrapper.h" -#include "cuda_utils.h" -#include "src/turbomind/macro.h" - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! 
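
// [Editorial aside, not part of the patch] The cublasAlgoMap removed above keys its tuned-GEMM
// lookups on a (batch, m, n, k, dtype) struct stored in an std::unordered_map with a hand-rolled
// hasher. A minimal, self-contained sketch of that pattern follows; GemmKey, GemmKeyHasher and
// AlgoTable are illustrative names (the real map stores a full cublasLtMatmulAlgo_info record,
// an int algorithm id stands in here).
#include <cstddef>
#include <unordered_map>

struct GemmKey {
    int batch, m, n, k, dtype;
    bool operator==(const GemmKey& o) const
    {
        return batch == o.batch && m == o.m && n == o.n && k == o.k && dtype == o.dtype;
    }
};

struct GemmKeyHasher {
    std::size_t operator()(const GemmKey& key) const
    {
        // Mix the fields with the same distinct prime multipliers the original hasher uses.
        return key.batch * 98317ull ^ key.m * 49157ull ^ key.n * 24593ull ^ key.k * 196613ull
               ^ static_cast<std::size_t>(key.dtype) * 6151ull;
    }
};

// Maps a problem shape to the id of the best cublasLt algorithm found offline.
using AlgoTable = std::unordered_map<GemmKey, int, GemmKeyHasher>;
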
-#endif - -namespace turbomind { -cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - IAllocator* allocator): - cublas_handle_(cublas_handle), - cublaslt_handle_(cublaslt_handle), - stream_(stream), - cublas_algo_map_(cublas_algo_map), - mu_(mu), - allocator_(allocator) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (allocator_ != nullptr) { - cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false); - } -} - -#ifdef SPARSITY_ENABLED -cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - IAllocator* allocator): - cublas_handle_(cublas_handle), - cublaslt_handle_(cublaslt_handle), - cusparselt_handle_(cusparselt_handle), - stream_(stream), - cublas_algo_map_(cublas_algo_map), - mu_(mu), - allocator_(allocator) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (allocator_ != nullptr) { - cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false); - } -} -#endif - -cublasMMWrapper::~cublasMMWrapper() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - mu_ = nullptr; - if (allocator_ != nullptr) { - allocator_->free((void**)(&cublas_workspace_)); - allocator_ = nullptr; - } -} - -cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper): - cublas_handle_(wrapper.cublas_handle_), - cublaslt_handle_(wrapper.cublaslt_handle_), -#ifdef SPARSITY_ENABLED - cusparselt_handle_(wrapper.cusparselt_handle_), -#endif - stream_(wrapper.stream_), - cublas_algo_map_(wrapper.cublas_algo_map_), - mu_(wrapper.mu_), - allocator_(wrapper.allocator_) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (allocator_ != nullptr) { - cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false); - } -} - -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* alpha, - const void* A, - cudaDataType_t Atype, - int lda, - const void* B, - cudaDataType_t Btype, - int ldb, - const void* beta, - void* C, - cudaDataType_t Ctype, - int ldc, - cudaDataType_t computeType, - cublasGemmAlgo_t algo) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - mu_->lock(); - check_cuda_error(cublasGemmEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - sync_check_cuda_error(); - mu_->unlock(); -} - -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* C, - const int ldc) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f); -} - -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* C, - const int ldc, - float f_alpha, - float f_beta) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - half h_alpha = (half)(f_alpha); - half h_beta = (half)(f_beta); - - mu_->lock(); - // TODO: default cublas libs - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - bool using_cublasLt = (Atype_ == CUDA_R_16F) ? 
true : false; - int batch_count = 1; - // fp32 use cublas as default - // fp16 use cublasLt as default - const void* alpha = is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - - int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(Atype_)); - - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - if (findAlgo) { - if (info.stages != -1) { - using_cublasLt = true; - } - else { - using_cublasLt = false; - } - } - - if (using_cublasLt) { - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cudaDataType_t scaleType; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType; -#else - cudaDataType_t computeType; -#endif - - if (is_fp16_computeType) { -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_16F; -#else - computeType = CUDA_R_16F; -#endif - scaleType = CUDA_R_16F; - } - else { -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_32F; -#else - computeType = CUDA_R_32F; -#endif - scaleType = CUDA_R_32F; - } - - // -------------------------------------- - // Create descriptors for the original matrices - cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); - cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); - cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&operationDesc, computeType); -#endif - - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); - - cublasLtMatmulAlgo_t algo; - void* workSpace = cublas_workspace_; - int workspaceSize = cublas_workspace_ == NULL ? 
0 : CUBLAS_WORKSPACE_SIZE; - if (findAlgo) { - if (info.workspaceSize > workspaceSize) { - findAlgo = 0; - } - else { - cublasLtMatmulAlgoInit( - cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(info.reductionScheme), - sizeof(info.reductionScheme)); - -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages)); -#endif - -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, - &(info.cluster_shapeId), - sizeof(info.cluster_shapeId)); -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode)); -#endif - } - } - - cublasLtMatmul(cublaslt_handle_, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - C, - Cdesc, - (findAlgo == 1 ? 
(&algo) : NULL), - workSpace, - workspaceSize, - stream_); - - cublasLtMatmulDescDestroy(operationDesc); - cublasLtMatrixLayoutDestroy(Adesc); - cublasLtMatrixLayoutDestroy(Bdesc); - cublasLtMatrixLayoutDestroy(Cdesc); - sync_check_cuda_error(); - } - else { - int cublasAlgo = info.algoId; - check_cuda_error(cublasGemmEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype_, - lda, - B, - Btype_, - ldb, - beta, - C, - Ctype_, - ldc, - computeType_, - static_cast(cublasAlgo))); - } - mu_->unlock(); -} - -void cublasMMWrapper::setFP32GemmConfig() -{ - Atype_ = CUDA_R_32F; - Btype_ = CUDA_R_32F; - Ctype_ = CUDA_R_32F; - computeType_ = CUDA_R_32F; -} - -void cublasMMWrapper::setFP16GemmConfig() -{ - Atype_ = CUDA_R_16F; - Btype_ = CUDA_R_16F; - Ctype_ = CUDA_R_16F; - computeType_ = CUDA_R_32F; -} - -#ifdef ENABLE_BF16 -void cublasMMWrapper::setBF16GemmConfig() -{ - Atype_ = CUDA_R_16BF; - Btype_ = CUDA_R_16BF; - Ctype_ = CUDA_R_16BF; - computeType_ = CUDA_R_32F; -} -#endif - -void cublasMMWrapper::setGemmConfig(cudaDataType_t aType, - cudaDataType_t bType, - cudaDataType_t cType, - cudaDataType_t computeType) -{ - Atype_ = aType; - Btype_ = bType; - Ctype_ = cType; - computeType_ = computeType; -} - -CublasDataType cublasMMWrapper::getCublasDataType(cudaDataType_t data_type) -{ - if (data_type == CUDA_R_16F) { - return HALF_DATATYPE; - } - else if (data_type == CUDA_R_32F) { - return FLOAT_DATATYPE; - } -#ifdef ENABLE_BF16 - else if (data_type == CUDA_R_16BF) { - return BFLOAT16_DATATYPE; - } -#endif - return FLOAT_DATATYPE; -} - -#if (CUDART_VERSION >= 11000) -// input, weight, output are row-major -// only works for cublas 11.x -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - const void* bias, - void* C, - const int ldc) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - cudaDataType_t Atype, Btype, Ctype; - cublasComputeType_t computeType; - cudaDataType_t scaleType; - float alpha_float = 1.0f; - float beta_float = 0.0f; - half alpha_half = half(1.0f); - half beta_half = half(0.0f); - void * alpha, *beta; - - // int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - if (Atype_ == CUDA_R_32F) { - computeType = CUBLAS_COMPUTE_32F_FAST_TF32; - Atype = CUDA_R_32F; - Btype = CUDA_R_32F; - Ctype = CUDA_R_32F; - scaleType = CUDA_R_32F; - alpha = &alpha_float; - beta = &beta_float; - } - else if (Atype_ == CUDA_R_16BF) { - computeType = CUBLAS_COMPUTE_32F_FAST_TF32; - Atype = CUDA_R_16BF; - Btype = CUDA_R_16BF; - Ctype = CUDA_R_16BF; - scaleType = CUDA_R_32F; - alpha = &alpha_float; - beta = &beta_float; - } - else { - computeType = CUBLAS_COMPUTE_16F; - Atype = CUDA_R_16F; - Btype = CUDA_R_16F; - Ctype = CUDA_R_16F; - scaleType = CUDA_R_16F; - alpha = &alpha_half; - beta = &beta_half; - } - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS; - cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda); - cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? 
n : k, ldb); - cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc); - - cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*)); - check_cuda_error(cublasLtMatmul( - cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_)); - cublasLtMatrixLayoutDestroy(Adesc); - cublasLtMatrixLayoutDestroy(Bdesc); - cublasLtMatrixLayoutDestroy(Cdesc); - cublasLtMatmulDescDestroy(operationDesc); -} -#endif -void cublasMMWrapper::setStream(cudaStream_t stream) -{ - stream_ = stream; -} - -void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const int64_t strideA, - const void* B, - const int ldb, - const int64_t strideB, - void* C, - const int ldc, - const int64_t strideC, - const int batch_count, - const float f_alpha, - const float f_beta) -{ - half h_alpha = (half)f_alpha; - half h_beta = (half)f_beta; - - mu_->lock(); - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - const void* alpha = - is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - - check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype_, - lda, - strideA, - B, - Btype_, - ldb, - strideB, - beta, - C, - Ctype_, - ldc, - strideC, - batch_count, - computeType_, - static_cast(info.algoId))); - - mu_->unlock(); -} - -void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const float f_alpha, - const void* A, - cudaDataType_t AType, - const int lda, - const int64_t strideA, - const void* B, - cudaDataType_t BType, - const int ldb, - const int64_t strideB, - const float f_beta, - void* C, - cudaDataType_t CType, - const int ldc, - const int64_t strideC, - const int batch_count, - cudaDataType_t computeType) -{ - half h_alpha = (half)f_alpha; - half h_beta = (half)f_beta; - - mu_->lock(); - int is_fp16_computeType = computeType == CUDA_R_16F ? 1 : 0; - const void* alpha = - is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? 
reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - - check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - AType, - lda, - strideA, - B, - BType, - ldb, - strideB, - beta, - C, - CType, - ldc, - strideC, - batch_count, - computeType, - static_cast(info.algoId))); - - mu_->unlock(); -} - -void cublasMMWrapper::batchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* const* A, - const int lda, - const void* const* B, - const int ldb, - void* const* C, - const int ldc, - const int batch_count) -{ - float f_alpha = static_cast(1.0f); - float f_beta = static_cast(0.0f); - - half h_alpha = (half)1.0f; - half h_beta = (half)0.0f; - - mu_->lock(); - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - const void* alpha = is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - - check_cuda_error(cublasGemmBatchedEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype_, - lda, - B, - Btype_, - ldb, - beta, - C, - Ctype_, - ldc, - batch_count, - computeType_, - static_cast(info.algoId))); - mu_->unlock(); -} - -bool cublasMMWrapper::isFuseBatchGemm(const int batch_count, const int m, const int k, const int n) -{ - CublasDataType data_type = getCublasDataType(Atype_); - - if (cublas_algo_map_->isExist(batch_count, m, k, n, data_type) == false - || cublas_algo_map_->isExist(1, m, k, n, data_type) == false) { - return false; - } - else { - return cublas_algo_map_->getAlgo(batch_count, m, k, n, data_type).exec_time - < 3 * cublas_algo_map_->getAlgo(1, m, k, n, data_type).exec_time; - } -} - -#ifdef SPARSITY_ENABLED -void cublasMMWrapper::SpGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const void* B, - void* C) -{ - if (Atype_ != CUDA_R_16F || Btype_ != CUDA_R_16F || Ctype_ != CUDA_R_16F) { - throw std::runtime_error("\n[TM][ERROR] sparse GEMM only supports FP16 data type now."); - } - static bool not_printed_fp32_accumulation_warning = true; - if (computeType_ != CUDA_R_16F && not_printed_fp32_accumulation_warning) { - printf("[TM][WARNING] cublasMMWrapper sets to FP32 compute type, " - "but sparse gemm will use FP16 compute type since cusparselt " - "supports FP16 accumulation only.\n"); - not_printed_fp32_accumulation_warning = false; - } - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = (transa == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; - cusparseOperation_t opB = (transb == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - - bool is_rowmajor = (order == CUSPARSE_ORDER_ROW); - bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE); - bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE); - auto num_A_rows = (isA_transposed) ? k : m; - auto num_A_cols = (isA_transposed) ? m : k; - auto num_B_rows = (isB_transposed) ? 
n : k; - auto num_B_cols = (isB_transposed) ? k : n; - auto num_C_rows = m; - auto num_C_cols = n; - unsigned alignment = 16; - auto lda = (is_rowmajor) ? num_A_cols : num_A_rows; - auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows; - auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows; - float _alpha(1.0f); - float _beta(0.0f); - - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", 1, m, n, k); - if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) { - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - else { - // initializing MatDesc takes a lot of time - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - sp_mat_A_desc_map_[mark] = mat_A; - sp_mat_B_desc_map_[mark] = mat_B; - sp_mat_C_desc_map_[mark] = mat_C; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &sp_mat_A_desc_map_[mark], - num_A_rows, - num_A_cols, - lda, - alignment, - Atype_, - order, - CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, num_B_cols, ldb, alignment, Btype_, order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype_, order)) - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - mu_->lock(); - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols); - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size)) - - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream_}; - CHECK_CUSPARSE( - cusparseLtMatmul(&cusparselt_handle_, &plan, &_alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - sync_check_cuda_error(); - mu_->unlock(); -} - -size_t cublasMMWrapper::getSparseMatrixSize(int m, int k) -{ - // Get a compressed matrix size of shape (m, k) used in cusparselt. 
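
// [Editorial aside, not part of the patch] Both SpGemm paths above cache cusparseLt matrix
// descriptors under a "batch_m_n_k" string key (built with sprintf("%d_%d_%d_%d", ...)) so the
// expensive descriptor initialisation runs only once per shape. The keying idiom, reduced to a
// self-contained sketch (descriptorForShape is an illustrative name; Desc must be
// default-constructible, mirroring the "insert uninitialised, then init in place" flow):
#include <cstdio>
#include <map>
#include <string>

template<typename Desc>
Desc& descriptorForShape(std::map<std::string, Desc>& cache, int batch, int m, int n, int k)
{
    char key[64];
    std::snprintf(key, sizeof(key), "%d_%d_%d_%d", batch, m, n, k);
    // operator[] default-constructs the descriptor on first use; callers check whether the
    // shape was already present before deciding to (re)initialise it.
    return cache[key];
}
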
- auto Atype_ = CUDA_R_16F; - cusparseOrder_t order = CUSPARSE_ORDER_COL; - unsigned alignment = 16; - int num_A_rows = m; - int num_A_cols = k; - int lda = num_A_rows; - - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &mat_A, - num_A_rows, - num_A_cols, - lda, - alignment, - Atype_, - order, - CUSPARSELT_SPARSITY_50_PERCENT)); - size_t compressed_size = 0; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &mat_A, &compressed_size)); - return compressed_size; -} - -void cublasMMWrapper::compressMatrix(const void* input, void* output, const int m, const int k) -{ - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseLtMatDescriptor_t mat_A; - unsigned alignment = 16; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &cusparselt_handle_, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &mat_A, true, opA, input, output, stream_)) - sync_check_cuda_error(); -} - -bool cublasMMWrapper::isUseSparse(const int batch_count, const int m, const int n, const int k) -{ - return cublas_algo_map_->isUseSparse(batch_count, m, n, k); -} -#endif - -std::pair cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - cudaStream_t stream) -{ -#if (CUBLAS_VERSION) <= 11601 - FT_CHECK_WITH_INFO(false, "CUBLAS version too low."); - return {false, cublasLtMatmulAlgo_t{}}; -#else - size_t returnSize; - int32_t pointer_mode; - cublasLtMatmulDescGetAttribute( - computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize); - - std::vector heuristics(200); - cublasLtMatmulPreference_t preference; - check_cuda_error(cublasLtMatmulPreferenceCreate(&preference)); - check_cuda_error(cublasLtMatmulPreferenceInit(preference)); - uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE; - check_cuda_error(cublasLtMatmulPreferenceSetAttribute( - preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); -#if (CUBLAS_VERSION) <= 12000 - uint32_t pointer_mode_mask = 0; - check_cuda_error(cublasLtMatmulPreferenceSetAttribute( - preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask))); -#endif - - int return_count = 0; - auto ret = cublasLtMatmulAlgoGetHeuristic(lightHandle, - computeDesc, - Adesc, - Bdesc, - Cdesc, - Ddesc, - preference, - heuristics.size(), - heuristics.data(), - &return_count); - heuristics.resize(return_count); - - std::map> algo_results; - for (const auto& heuristic : heuristics) { - cublasLtMatmulAlgo_t algo = heuristic.algo; - int32_t algo_id; - cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize); - - cudaEvent_t start_event, stop_event; - cudaEventCreate(&start_event); - cudaEventCreate(&stop_event); - - float my_alpha = 1.0f; - float my_beta = 0.0f; - - for (int i = 0; i < 11; i++) { - float duration_ms; - cudaEventRecord(start_event, stream); - check_cuda_error(cublasLtMatmul(lightHandle, - computeDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - cublas_workspace_, - CUBLAS_WORKSPACE_SIZE, - stream)); 
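
// [Editorial aside, not part of the patch] findBestAlgo above times each heuristic with CUDA
// events over 11 runs, sorts the samples, and later selects results[5], i.e. the median, to
// suppress timing jitter. The timing-and-median idiom in isolation (medianElapsedMs is an
// illustrative helper; the launch callable stands in for the cublasLtMatmul call, and error
// checking is omitted for brevity):
#include <algorithm>
#include <vector>
#include <cuda_runtime.h>

template<typename LaunchFn>
float medianElapsedMs(cudaStream_t stream, LaunchFn&& launch, int runs = 11)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    std::vector<float> ms(runs);
    for (int i = 0; i < runs; ++i) {
        cudaEventRecord(start, stream);
        launch(stream);                  // the operation under test
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);      // wait so the elapsed time is valid
        cudaEventElapsedTime(&ms[i], start, stop);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    std::sort(ms.begin(), ms.end());
    return ms[runs / 2];                 // median sample, as the original selection does
}
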
- cudaEventRecord(stop_event, stream); - cudaEventSynchronize(stop_event); - cudaEventElapsedTime(&duration_ms, start_event, stop_event); - - algo_results[algo_id].push_back(duration_ms); - } - std::sort(algo_results[algo_id].begin(), algo_results[algo_id].end()); - } - - cublasLtMatmulHeuristicResult_t result; - float best_time = INFINITY; - for (const auto& heuristic : heuristics) { - cublasLtMatmulAlgo_t algo = heuristic.algo; - int32_t algo_id; - cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize); - const auto& results = algo_results[algo_id]; - - if (results.size() > 0 && results[5] < best_time) { - best_time = results[5]; - result = heuristic; - } - } - - return {best_time != INFINITY, result.algo}; -#endif -} - -cublasMMWrapper::MatrixLayout cublasMMWrapper::createMatrixLayout(cublasLtMatrixLayout_t Mdesc) -{ - size_t returnSize; - MatrixLayout m_layout; - - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &std::get<0>(m_layout), sizeof(std::get<0>(m_layout)), &returnSize); - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &std::get<1>(m_layout), sizeof(std::get<1>(m_layout)), &returnSize); - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_ROWS, &std::get<2>(m_layout), sizeof(std::get<2>(m_layout)), &returnSize); - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_COLS, &std::get<3>(m_layout), sizeof(std::get<3>(m_layout)), &returnSize); - - return m_layout; -} - -cublasStatus_t cublasMMWrapper::cublasLtMatmulWrapper(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t* algo, - void* workspace, - size_t workspaceSizeInBytes, - cudaStream_t stream) -{ - cache_idx_t cache_idx{ - computeDesc, - {createMatrixLayout(Adesc), createMatrixLayout(Bdesc), createMatrixLayout(Cdesc), createMatrixLayout(Ddesc)}}; - - cublasLtMatmulAlgo_t algo_value; - bool found_algo = false; - if (algo == nullptr) { - if (algo_cache.find(cache_idx) == algo_cache.end()) { - auto result = - findBestAlgo(lightHandle, computeDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, D, Ddesc, stream); - if (result.first) { - algo_cache[cache_idx] = result.second; - algo_value = result.second; - found_algo = true; - } - } - else { - algo_value = algo_cache[cache_idx]; - found_algo = true; - } - } - - return cublasLtMatmul(lightHandle, - computeDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - found_algo ? &algo_value : algo, - workspace, - workspaceSizeInBytes, - stream); -} - -void cublasMMWrapper::_Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - void* C, - const int ldc, - const void* alpha, - const int mode, - const bool per_column_scaling) -{ - /* mode: - * - 0: int8 * int8 -> int32 -> int8 - * - 1: int8 * int8 -> int32 -> int32 - */ -#if (CUBLAS_VERSION) <= 11601 - FT_CHECK_WITH_INFO(false, "CUBLAS version too low."); -#else - - mu_->lock(); - const auto op_a = CUBLAS_OP_T; - const auto op_b = CUBLAS_OP_N; - const auto dataType = CUDA_R_8I; - const auto resultType = mode == 0 ? CUDA_R_8I : CUDA_R_32I; - const auto computeType = CUBLAS_COMPUTE_32I; - const auto scaleType = mode == 0 ? 
CUDA_R_32F : CUDA_R_32I; - const int batch_count = 1; - const void* beta; - - int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(dataType)); - - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(dataType)); - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - - // -------------------------------------- - // Create descriptors for the original matrices - check_cuda_error(cublasLtMatrixLayoutCreate(&Adesc, dataType, k, m, lda)); - check_cuda_error(cublasLtMatrixLayoutCreate(&Bdesc, dataType, k, n, ldb)); - check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, resultType, m, n, ldc)); - - check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType)); - - auto pointer_mode = CUBLASLT_POINTER_MODE_HOST; - if (mode == 0) { - pointer_mode = - per_column_scaling ? CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST : CUBLASLT_POINTER_MODE_DEVICE; - } - check_cuda_error( - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_a, sizeof(cublasOperation_t))); - check_cuda_error( - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_b, sizeof(cublasOperation_t))); - check_cuda_error( - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSC, &op_b, sizeof(cublasOperation_t))); - check_cuda_error(cublasLtMatmulDescSetAttribute( - operationDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode))); - - const int32_t int_one = 1; - const int32_t int_zero = 0; - const float float_zero = 0; - if (mode == 0) { - beta = per_column_scaling ? &float_zero : NULL; - } - else { - alpha = &int_one; - beta = &int_zero; - } - - cublasLtMatmulAlgo_t algo; - void* workSpace = cublas_workspace_; - int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; - - sync_check_cuda_error(); - auto ret = cublasLtMatmulWrapper(cublaslt_handle_, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - C, - Cdesc, - NULL, - workSpace, - workspaceSize, - stream_); - check_cuda_error(ret); - sync_check_cuda_error(); - - cublasLtMatmulDescDestroy(operationDesc); - cublasLtMatrixLayoutDestroy(Adesc); - cublasLtMatrixLayoutDestroy(Bdesc); - cublasLtMatrixLayoutDestroy(Cdesc); - sync_check_cuda_error(); - mu_->unlock(); -#endif -} - -void cublasMMWrapper::Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int8_t* C, - const int ldc, - const float* alpha, - const bool per_column_scaling) -{ - return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, alpha, 0, per_column_scaling); -} - -void cublasMMWrapper::Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int32_t* C, - const int ldc) -{ - return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, (float*)nullptr, 1, false); -} - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasMMWrapper.h b/src/turbomind/utils/cublasMMWrapper.h deleted file mode 100644 index 0f90a44057..0000000000 --- a/src/turbomind/utils/cublasMMWrapper.h +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuda_utils.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasAlgoMap.h" -#include -#include -#include -#include -#include -#include -#include - -#pragma once -namespace turbomind { - -class cublasMMWrapper { -protected: - cublasHandle_t cublas_handle_; - cublasLtHandle_t cublaslt_handle_; -#ifdef SPARSITY_ENABLED - cusparseLtHandle_t cusparselt_handle_; - std::map sp_mat_A_desc_map_; - std::map sp_mat_B_desc_map_; - std::map sp_mat_C_desc_map_; -#endif - - cudaDataType_t Atype_; - cudaDataType_t Btype_; - cudaDataType_t Ctype_; - cudaDataType_t computeType_; - - cudaStream_t stream_; - cublasAlgoMap* cublas_algo_map_; - std::mutex* mu_; - - IAllocator* allocator_ = nullptr; - void* cublas_workspace_ = nullptr; - - friend class cublasINT8MMWrapper; - - void _Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - void* C, - const int ldc, - const void* alpha, - const int mode, - const bool per_column_scaling); - -public: - cublasMMWrapper(cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - IAllocator* allocator); - -#ifdef SPARSITY_ENABLED - cublasMMWrapper(cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - IAllocator* allocator); -#endif - - virtual ~cublasMMWrapper(); - - cublasMMWrapper(const cublasMMWrapper& wrapper); - - virtual void cublasVersionCheck() - { - return; - }; - cublasStatus_t cublasLtMatmulWrapper(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t* algo, - void* workspace, - size_t workspaceSizeInBytes, - cudaStream_t stream); - - std::pair findBestAlgo(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - cudaStream_t stream); - - using MatrixLayout = std::tuple; - using cache_idx_t = std::tuple>; - std::map algo_cache; - - MatrixLayout createMatrixLayout(cublasLtMatrixLayout_t Mdesc); - - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* alpha, - const void* A, - cudaDataType_t Atype, - int lda, - const void* B, - cudaDataType_t Btype, - int ldb, - const void* beta, - void* C, - cudaDataType_t Ctype, - int ldc, - cudaDataType_t computeType, - cublasGemmAlgo_t algo); - - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* 
C, - const int ldc); - - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* C, - const int ldc, - float f_alpha, - float f_beta); - - void Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int8_t* C, - const int ldc, - const float* alpha, - const bool per_column_scaling = false); - - void Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int32_t* C, - const int ldc); - - void setFP32GemmConfig(); - void setFP16GemmConfig(); -#ifdef ENABLE_BF16 - void setBF16GemmConfig(); -#endif - void setStream(cudaStream_t stream); - - void setGemmConfig(cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType); - - CublasDataType getCublasDataType(cudaDataType_t data_type); - -#if (CUDART_VERSION >= 11000) - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - const void* bias, - void* C, - const int ldc); -#endif - - void stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const int64_t strideA, - const void* B, - const int ldb, - const int64_t strideB, - void* C, - const int ldc, - const int64_t strideC, - const int batchCount, - const float f_alpha = 1.0f, - const float f_beta = 0.0f); - - void stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const float f_alpha, - const void* A, - cudaDataType_t AType, - const int lda, - const int64_t strideA, - const void* B, - cudaDataType_t BType, - const int ldb, - const int64_t strideB, - const float f_beta, - void* C, - cudaDataType_t CType, - const int ldc, - const int64_t strideC, - const int batch_count, - cudaDataType_t computeType); - - void batchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* const* A, - const int lda, - const void* const* B, - const int ldb, - void* const* C, - const int ldc, - const int batch_count); - - bool isFuseBatchGemm(const int batch_count, const int m, const int k, const int n); - -#ifdef SPARSITY_ENABLED - void SpGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const void* B, - void* C); - - size_t getSparseMatrixSize(int m, int k); - void compressMatrix(const void* input, void* output, const int m, const int k); - - bool isUseSparse(const int batch_count, const int m, const int n, const int k); -#endif -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/cuda_fp8_utils.cu b/src/turbomind/utils/cuda_fp8_utils.cu deleted file mode 100644 index 5651dab2e7..0000000000 --- a/src/turbomind/utils/cuda_fp8_utils.cu +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuda_fp8_utils.h" - -namespace turbomind { -#ifdef ENABLE_FP8 - -template -__global__ void quantizeMatrix(T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n) -{ - for (uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; i < size; i += blockDim.x * gridDim.x) { - if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) { - output[i] = T_OUT((float)(input[i]) * __ldg(input_scale + (i % n))); - } - else { - output[i] = T_OUT((float)(input[i]) * __ldg(input_scale)); - } - } -} - -template -void invokeQuantizeMatrix( - T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream) -{ - dim3 grid(32); - dim3 block(256); - quantizeMatrix<<>>(output, input_scale, input, size, n); -} - -#define defineinvokeQuantizeMatrix(type_out, type_in, mode) \ - template void invokeQuantizeMatrix(type_out * output, \ - float const* input_scale, \ - type_in const* input, \ - uint32_t size, \ - uint32_t n, \ - cudaStream_t stream); - -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR); -#ifdef ENABLE_BF16 -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR); -#endif - -template -__global__ void fakeQuantize(T_OUT* dst, const T_IN* src, const int size) -{ - for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - T_FAKE tmp = (T_FAKE)((float)src[tid]); - dst[tid] = (T_OUT)((float)tmp); - } -} - -template -void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream) -{ - fakeQuantize<<<256, 256, 0, stream>>>(dst, src, size); -} - -template void -invokeFakeQuantize(float* dst, const float* src, const int size, cudaStream_t stream); -template void -invokeFakeQuantize(half* dst, const half* src, const int size, cudaStream_t stream); -template void invokeFakeQuantize<__nv_bfloat16, __nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* dst, - const __nv_bfloat16* src, - const int size, - cudaStream_t stream); - -template -__global__ void computeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n) -{ - float max = -10000.f; - for (int i = 0; i < k; i++) { - float val = fabs((float)weights[i * n + blockIdx.x * blockDim.x + threadIdx.x]); - max = max > val ? 
max : val; - if (threadIdx.x == 0 && blockIdx.x == 0 && i % 100 == 0) { - printf("max: %f, val: %f \n", max, val); - } - } - // quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f; - // quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = FP8_E4M3_MAX / max; - quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = std::max(max / FP8_E4M3_MAX, 1.0f / 32.f); -} - -template -void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream) -{ - dim3 block(256); - dim3 grid; - grid.x = (n + 255) / 256; - computeFP8QuantizeScale<<>>(quant_ptr, weights, k, n); -} - -#ifdef ENABLE_BF16 -template void invokeComputeFP8QuantizeScale( - float* quant_ptr, const __nv_bfloat16* weights, const int k, const int n, cudaStream_t stream); -#endif -template void -invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream); - -#endif // ENABLE_FP8 -} // namespace turbomind diff --git a/src/turbomind/utils/cuda_fp8_utils.h b/src/turbomind/utils/cuda_fp8_utils.h deleted file mode 100644 index ba7f91c8bf..0000000000 --- a/src/turbomind/utils/cuda_fp8_utils.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#ifdef ENABLE_FP8 -#include -#include -#include - -// #define FP8_MHA -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900 -#define FUSE_GEMM_ACT -#endif -#define FP8_GEMM_OUTPUT_QUANT_DISABLE - -#ifdef FUSE_GEMM_ACT -#define USE_QGMMA -#endif - -namespace turbomind { - -const float FP8_E4M3_MAX = 480.0f; - -enum QUANTIZE_MODE -{ - PER_CHANNEL, - PER_TENSOR, - PER_CHANNEL_WEIGHT_PER_TENSOR_ACT -}; - -// Packed Data Type -typedef struct __CUDA_ALIGN__(32) { - float array[8]; -} float8; - -typedef struct __CUDA_ALIGN__(16) { - half array[8]; -} half8; - -#ifdef ENABLE_BF16 -typedef struct __CUDA_ALIGN__(4) { - __nv_bfloat16 array[2]; -} __nv_bfloat16_2; - -typedef struct __CUDA_ALIGN__(8) { - __nv_bfloat162 x, y; -} __nv_bfloat162_2_xy; - -typedef struct __CUDA_ALIGN__(8) { - __nv_bfloat16 array[4]; -} __nv_bfloat164; - -typedef struct __CUDA_ALIGN__(8) { - __nv_bfloat162 array[2]; -} __nv_bfloat162_2; - -typedef struct __CUDA_ALIGN__(16) { - __nv_bfloat16 array[8]; -} __nv_bfloat168; - -typedef struct __CUDA_ALIGN__(16) { - __nv_bfloat162 array[4]; -} __nv_bfloat162_4; - -typedef struct __CUDA_ALIGN__(32) { - __nv_bfloat16 array[16]; -} __nv_bfloat1616; -#endif - -#ifdef ENABLE_FP8 -typedef struct __CUDA_ALIGN__(2) { - __nv_fp8_e4m3 array[2]; -} __nv_fp8_2_e4m3; - -typedef struct __CUDA_ALIGN__(4) { - __nv_fp8_e4m3 array[4]; -} __nv_fp8_4_e4m3; - -typedef struct __CUDA_ALIGN__(4) { - __nv_fp8x2_e4m3 array[2]; -} __nv_fp8x2_x2_e4m3; - -typedef struct __CUDA_ALIGN__(8) { - __nv_fp8_e4m3 array[8]; -} __nv_fp8_8_e4m3; - -typedef struct __CUDA_ALIGN__(8) { - __nv_fp8x2_e4m3 array[4]; -} __nv_fp8x2_x4_e4m3; - -typedef struct __CUDA_ALIGN__(16) { - __nv_fp8_e4m3 array[16]; -} __nv_fp8x16_e4m3; -#endif - -// only BF16 and FP8 -template -struct PackType { - using type = float; -}; - -#ifdef ENABLE_BF16 -template<> -struct PackType<__nv_bfloat16, 2> { - using type = __nv_bfloat16_2; -}; - -template<> -struct PackType<__nv_bfloat16, 4> { - using type = __nv_bfloat164; -}; - -template<> -struct PackType<__nv_bfloat16, 8> { - using type = __nv_bfloat168; -}; -#endif - -#ifdef ENABLE_FP8 -template<> -struct PackType<__nv_fp8_e4m3, 2> { - using type = __nv_fp8_2_e4m3; -}; - -template<> -struct PackType<__nv_fp8_e4m3, 4> { - using type = __nv_fp8_4_e4m3; -}; - -template<> -struct PackType<__nv_fp8_e4m3, 8> { - using type = __nv_fp8_8_e4m3; -}; -#endif - -__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, const __nv_fp8x4_e4m3* in) -{ - const char4 tmp_val = reinterpret_cast(in)[0]; - *out1 = __nv_bfloat162((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - *out2 = __nv_bfloat162((float)reinterpret_cast(&tmp_val.z)[0], - (float)reinterpret_cast(&tmp_val.w)[0]); -} - -__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(const __nv_fp8x2_e4m3* in) -{ - const char2 tmp_val = reinterpret_cast(in)[0]; - __nv_bfloat162 out = __nv_bfloat162((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - return out; -} - -__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, const __nv_fp8x4_e4m3* in) -{ - const char4 tmp_val = reinterpret_cast(in)[0]; - *out1 = half2((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - *out2 = half2((float)reinterpret_cast(&tmp_val.z)[0], - (float)reinterpret_cast(&tmp_val.w)[0]); -} - -__inline__ __device__ half2 fp8x2_e4m3_to_half2(const __nv_fp8x2_e4m3* in) -{ - const char2 tmp_val = 
reinterpret_cast(in)[0]; - half2 out = half2((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - return out; -} - -template -void invokeQuantizeMatrix( - T_OUT* output, float const* input_qua_amax_ptr, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream); - -template -void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream); - -template -void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream); - -} // namespace turbomind -#endif // ENABLE_FP8 diff --git a/src/turbomind/utils/cuda_type_utils.cuh b/src/turbomind/utils/cuda_type_utils.cuh index f7f7b95273..0b03442c74 100644 --- a/src/turbomind/utils/cuda_type_utils.cuh +++ b/src/turbomind/utils/cuda_type_utils.cuh @@ -18,7 +18,6 @@ #include "src/turbomind/utils/cuda_bf16_fallbacks.cuh" #include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" #include #include diff --git a/src/turbomind/utils/cuda_utils.cc b/src/turbomind/utils/cuda_utils.cc index 95b6e87c5c..455b7826cc 100644 --- a/src/turbomind/utils/cuda_utils.cc +++ b/src/turbomind/utils/cuda_utils.cc @@ -16,124 +16,31 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/macro.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" #include namespace turbomind { -/* **************************** debug tools ********************************* */ - -template -void print_to_file(const T* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode) -{ - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); - printf("[INFO] file: %s with size %d.\n", file, size); - std::ofstream outFile(file, open_mode); - if (outFile) { - T* tmp = new T[size]; - check_cuda_error(cudaMemcpyAsync(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost, stream)); - for (int i = 0; i < size; ++i) { - float val = (float)(tmp[i]); - outFile << val << std::endl; - } - delete[] tmp; - } - else { - throw std::runtime_error(std::string("[TM][ERROR] Cannot open file: ") + file + "\n"); - } - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); -} - -template void -print_to_file(const float* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode); -template void -print_to_file(const half* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode); -#ifdef ENABLE_BF16 -template void print_to_file( - const __nv_bfloat16* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode); -#endif - -template -void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name) +void syncAndCheck(const char* const file, int const line) { - if (buf == nullptr) { - TM_LOG_WARNING("It is an nullptr, skip!"); - return; - } - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); - T* h_tmp = new T[size]; - cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream); - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); - double sum = 0.0f; - uint64_t zero_count = 0; - float max_val = -1e10; - bool find_inf = false; - for (uint i = 0; i < size; i++) { - if (std::isinf((float)(h_tmp[i]))) { - find_inf = true; - continue; - } - sum += abs((double)h_tmp[i]); - if ((float)h_tmp[i] == 0.0f) { - zero_count++; + // When FT_DEBUG_LEVEL=DEBUG, must check error + static char* level_name = std::getenv("TM_DEBUG_LEVEL"); + if (level_name != 
nullptr) { + static std::string level = std::string(level_name); + if (level == "DEBUG") { + cudaDeviceSynchronize(); + cudaError_t result = cudaGetLastError(); + if (result) { + TM_LOG_ERROR((std::string("CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + file + ":" + + std::to_string(line)) + .c_str()); + std::abort(); + } + TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); } - max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i])); } - printf("[TM][INFO] %20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s", - name.c_str(), - size, - sum / size, - sum, - max_val, - find_inf ? "true" : "false"); - std::cout << std::endl; - delete[] h_tmp; - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); } -template void print_abs_mean(const float* buf, uint size, cudaStream_t stream, std::string name); -template void print_abs_mean(const half* buf, uint size, cudaStream_t stream, std::string name); -#ifdef ENABLE_BF16 -template void print_abs_mean(const __nv_bfloat16* buf, uint size, cudaStream_t stream, std::string name); -#endif -template void print_abs_mean(const int* buf, uint size, cudaStream_t stream, std::string name); -template void print_abs_mean(const uint* buf, uint size, cudaStream_t stream, std::string name); -template void print_abs_mean(const int8_t* buf, uint size, cudaStream_t stream, std::string name); -#ifdef ENABLE_FP8 -template void print_abs_mean(const __nv_fp8_e4m3* buf, uint size, cudaStream_t stream, std::string name); -#endif - -template -void print_to_screen(const T* result, const int size) -{ - if (result == nullptr) { - TM_LOG_WARNING("It is an nullptr, skip! \n"); - return; - } - T* tmp = reinterpret_cast(malloc(sizeof(T) * size)); - check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost)); - for (int i = 0; i < size; ++i) { - printf("%d, %f\n", i, static_cast(tmp[i])); - } - free(tmp); -} - -template void print_to_screen(const float* result, const int size); -template void print_to_screen(const half* result, const int size); -#ifdef ENABLE_BF16 -template void print_to_screen(const __nv_bfloat16* result, const int size); -#endif -template void print_to_screen(const int* result, const int size); -template void print_to_screen(const uint* result, const int size); -template void print_to_screen(const bool* result, const int size); -#ifdef ENABLE_FP8 -template void print_to_screen(const __nv_fp8_e4m3* result, const int size); -#endif +/* **************************** debug tools ********************************* */ template void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr) @@ -335,35 +242,38 @@ template void check_abs_mean_val(const __nv_bfloat16* result, const int size); /* ***************************** common utils ****************************** */ -cudaError_t getSetDevice(int i_device, int* o_device) +int getSMVersion() { - int current_dev_id = 0; - cudaError_t err = cudaSuccess; + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int sm_major = 0; + int sm_minor = 0; + check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); + check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); + return sm_major * 10 + sm_minor; +} - if (o_device != NULL) { - err = cudaGetDevice(¤t_dev_id); - if (err != cudaSuccess) { - return err; - } - if (current_dev_id == i_device) { - *o_device = i_device; - } - else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return 
err; - } - *o_device = current_dev_id; - } - } - else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return err; - } - } +std::string getDeviceName() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + cudaDeviceProp props; + check_cuda_error(cudaGetDeviceProperties(&props, device)); + return std::string(props.name); +} - return cudaSuccess; +int getDevice() +{ + int current_dev_id = 0; + check_cuda_error(cudaGetDevice(¤t_dev_id)); + return current_dev_id; +} + +int getDeviceCount() +{ + int count = 0; + check_cuda_error(cudaGetDeviceCount(&count)); + return count; } bool is_16xx_series(const char* name) diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index d764bb343a..543d90812a 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -37,46 +37,6 @@ namespace turbomind { -#define MAX_CONFIG_NUM 20 -#define COL32_ 32 -// workspace for cublas gemm : 32MB -#define CUBLAS_WORKSPACE_SIZE 33554432 - -typedef struct __align__(4) -{ - half x, y, z, w; -} -half4; - -/* **************************** type definition ***************************** */ - -enum CublasDataType -{ - FLOAT_DATATYPE = 0, - HALF_DATATYPE = 1, - BFLOAT16_DATATYPE = 2, - INT8_DATATYPE = 3, - FP8_DATATYPE = 4 -}; - -enum FtCudaDataType -{ - FP32 = 0, - FP16 = 1, - BF16 = 2, - INT8 = 3, - FP8 = 4 -}; - -enum class OperationType -{ - FP32, - FP16, - BF16, - INT8, - FP8 -}; - /* **************************** debug tools ********************************* */ static const char* _cudaGetErrorEnum(cudaError_t error) { @@ -123,40 +83,17 @@ template void check(T result, char const* const func, const char* const file, int const line) { if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " - + file + ":" + std::to_string(line) + " \n"); + TM_LOG_ERROR((std::string("CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + file + ":" + + std::to_string(line)) + .c_str()); + std::abort(); } } #define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) #define check_cuda_error_2(val, file, line) check((val), #val, file, line) -inline void syncAndCheck(const char* const file, int const line) -{ - // When FT_DEBUG_LEVEL=DEBUG, must check error - static char* level_name = std::getenv("TM_DEBUG_LEVEL"); - if (level_name != nullptr) { - static std::string level = std::string(level_name); - if (level == "DEBUG") { - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) - + " " + file + ":" + std::to_string(line) + " \n"); - } - TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); - } - } - -#ifndef NDEBUG - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " - + file + ":" + std::to_string(line) + " \n"); - } -#endif -} +void syncAndCheck(const char* const file, int const line); #define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__) @@ -179,19 +116,6 @@ inline void syncAndCheck(const char* const file, int const line) } \ } -template -void print_to_file(const T* result, - const int size, - const char* file, - cudaStream_t stream = 0, - std::ios::openmode open_mode = std::ios::out); - -template -void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name = ""); 
- -template -void print_to_screen(const T* result, const int size); - template void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr); @@ -223,10 +147,10 @@ inline void myAssert(bool result, const char* const file, int const line, std::s } } -#define FT_CHECK(val) myAssert(val, __FILE__, __LINE__) +#define FT_CHECK(val) myAssert(bool(val), __FILE__, __LINE__) #define FT_CHECK_WITH_INFO(val, info) \ do { \ - bool is_valid_val = (val); \ + bool is_valid_val = bool(val); \ if (!is_valid_val) { \ turbomind::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \ } \ @@ -234,89 +158,11 @@ inline void myAssert(bool result, const char* const file, int const line, std::s #define FT_THROW(info) throwRuntimeError(__FILE__, __LINE__, info) -#ifdef SPARSITY_ENABLED -#define CHECK_CUSPARSE(func) \ - { \ - cusparseStatus_t status = (func); \ - if (status != CUSPARSE_STATUS_SUCCESS) { \ - throw std::runtime_error(std::string("[TM][ERROR] CUSPARSE API failed at line ") \ - + std::to_string(__LINE__) + " in file " + __FILE__ + ": " \ - + cusparseGetErrorString(status) + " " + std::to_string(status)); \ - } \ - } -#endif - -/*************Time Handling**************/ -class CudaTimer { -private: - cudaEvent_t event_start_; - cudaEvent_t event_stop_; - cudaStream_t stream_; - -public: - explicit CudaTimer(cudaStream_t stream = 0) - { - stream_ = stream; - } - void start() - { - check_cuda_error(cudaEventCreate(&event_start_)); - check_cuda_error(cudaEventCreate(&event_stop_)); - check_cuda_error(cudaEventRecord(event_start_, stream_)); - } - float stop() - { - float time; - check_cuda_error(cudaEventRecord(event_stop_, stream_)); - check_cuda_error(cudaEventSynchronize(event_stop_)); - check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_)); - check_cuda_error(cudaEventDestroy(event_start_)); - check_cuda_error(cudaEventDestroy(event_stop_)); - return time; - } - ~CudaTimer() {} -}; - /* ***************************** common utils ****************************** */ -inline void print_mem_usage(std::string time = "after allocation") -{ - size_t free_bytes, total_bytes; - check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes)); - float free = static_cast(free_bytes) / 1024.0 / 1024.0 / 1024.0; - float total = static_cast(total_bytes) / 1024.0 / 1024.0 / 1024.0; - float used = total - free; - printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used); -} - -inline int getSMVersion() -{ - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - int sm_major = 0; - int sm_minor = 0; - check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); - check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); - return sm_major * 10 + sm_minor; -} - -inline int getMaxSharedMemoryPerBlock() -{ - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - int max_shared_memory_size = 0; - check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, cudaDevAttrMaxSharedMemoryPerBlock, device)); - return max_shared_memory_size; -} +int getSMVersion(); -inline std::string getDeviceName() -{ - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - cudaDeviceProp props; - check_cuda_error(cudaGetDeviceProperties(&props, device)); - return std::string(props.name); -} +std::string getDeviceName(); template inline T div_up(T a, T n) @@ -324,175 +170,9 @@ inline T div_up(T a, T n) return (a + n - 1) / n; } -cudaError_t getSetDevice(int i_device, int* o_device = NULL); - 
-inline int getDevice() -{ - int current_dev_id = 0; - check_cuda_error(cudaGetDevice(¤t_dev_id)); - return current_dev_id; -} - -inline int getDeviceCount() -{ - int count = 0; - check_cuda_error(cudaGetDeviceCount(&count)); - return count; -} +int getDevice(); -template -CublasDataType getCublasDataType() -{ - if (std::is_same::value) { - return HALF_DATATYPE; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - return BFLOAT16_DATATYPE; - } -#endif - else if (std::is_same::value) { - return FLOAT_DATATYPE; - } - else { - FT_CHECK(false); - return FLOAT_DATATYPE; - } -} - -template -cudaDataType_t getCudaDataType() -{ - if (std::is_same::value) { - return CUDA_R_16F; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - return CUDA_R_16BF; - } -#endif - else if (std::is_same::value) { - return CUDA_R_32F; - } - else { - FT_CHECK(false); - return CUDA_R_32F; - } -} - -template -struct getTypeFromCudaDataType { - using Type = float; -}; - -template<> -struct getTypeFromCudaDataType { - using Type = half; -}; - -#ifdef ENABLE_BF16 -template<> -struct getTypeFromCudaDataType { - using Type = __nv_bfloat16; -}; -#endif - -// clang-format off -template struct packed_type; -template <> struct packed_type { using type = float; }; // we don't need to pack float by default -template <> struct packed_type { using type = half2; }; - -#ifdef ENABLE_BF16 -template<> -struct packed_type<__nv_bfloat16> { - using type = __nv_bfloat162; -}; -#endif - -template struct num_elems; -template <> struct num_elems { static constexpr int value = 1; }; -template <> struct num_elems { static constexpr int value = 2; }; -template <> struct num_elems { static constexpr int value = 4; }; -template <> struct num_elems { static constexpr int value = 1; }; -template <> struct num_elems { static constexpr int value = 2; }; -#ifdef ENABLE_BF16 -template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; -template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; -#endif - -template struct packed_as; -template struct packed_as { using type = T; }; -template<> struct packed_as { using type = half2; }; -template<> struct packed_as { using type = float2; }; -template<> struct packed_as { using type = int16_t; }; -template<> struct packed_as { using type = int2; }; -template<> struct packed_as { using type = half; }; -#ifdef ENABLE_BF16 -template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; -template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; -#endif - -inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } -inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } -// clang-format on - -template -void compareTwoTensor( - const T1* pred, const T2* ref, const int size, const int print_size = 0, const std::string filename = "") -{ - T1* h_pred = new T1[size]; - T2* h_ref = new T2[size]; - check_cuda_error(cudaMemcpy(h_pred, pred, size * sizeof(T1), cudaMemcpyDeviceToHost)); - check_cuda_error(cudaMemcpy(h_ref, ref, size * sizeof(T2), cudaMemcpyDeviceToHost)); - - FILE* fd = nullptr; - if (filename != "") { - fd = fopen(filename.c_str(), "w"); - fprintf(fd, "| %10s | %10s | %10s | %10s | \n", "pred", "ref", "abs_diff", "rel_diff(%)"); - } - - if (print_size > 0) { - TM_LOG_INFO(" id | pred | ref |abs diff | rel diff (%) |"); - } - float mean_abs_diff = 0.0f; - float mean_rel_diff = 0.0f; - int count = 0; - for (int i = 0; i < size; i++) { 
- if (i < print_size) { - TM_LOG_INFO("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |", - i, - (float)h_pred[i], - (float)h_ref[i], - abs((float)h_pred[i] - (float)h_ref[i]), - abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); - } - if ((float)h_pred[i] == 0) { - continue; - } - count += 1; - mean_abs_diff += abs((float)h_pred[i] - (float)h_ref[i]); - mean_rel_diff += abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f; - - if (fd != nullptr) { - fprintf(fd, - "| %10.5f | %10.5f | %10.5f | %11.5f |\n", - (float)h_pred[i], - (float)h_ref[i], - abs((float)h_pred[i] - (float)h_ref[i]), - abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); - } - } - mean_abs_diff = mean_abs_diff / (float)count; - mean_rel_diff = mean_rel_diff / (float)count; - TM_LOG_INFO("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); - - if (fd != nullptr) { - fprintf(fd, "mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); - fclose(fd); - } - delete[] h_pred; - delete[] h_ref; -} +int getDeviceCount(); bool is_16xx_series(const char* name); diff --git a/src/turbomind/utils/gemm.cc b/src/turbomind/utils/gemm.cc deleted file mode 100644 index 097c9a19e9..0000000000 --- a/src/turbomind/utils/gemm.cc +++ /dev/null @@ -1,1184 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/gemm.h" - -namespace turbomind { - -/* ***************************** GEMM Impl ******************************** */ - -Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file) -{ - allocator_ = allocator; - stream_ = stream; - mutex_ = new std::mutex(); // mutex per process - check_cuda_error(cublasCreate(&cublas_handle_)); - check_cuda_error(cublasLtCreate(&cublaslt_handle_)); - check_cuda_error(cublasSetStream(cublas_handle_, stream)); - - if (allocator_ != nullptr) { - workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE); - } - loadGemmConfig(config_file); -} - -Gemm::~Gemm() -{ - if (allocator_ != nullptr) { - allocator_->free((void**)(&workspace_)); - allocator_ = nullptr; - } - cublasLtDestroy(cublaslt_handle_); - cublasDestroy(cublas_handle_); - delete cublas_algo_map_; - delete mutex_; -} - -std::string Gemm::toString() -{ - const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* compute_type_str = compute_type_ == TYPE_FP16 ? 
"FP16" : "FP32"; - return fmtstr( - "Gemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]", a_type_str, b_type_str, c_type_str, compute_type_str); -} - -void Gemm::setAllocator(IAllocator* allocator) -{ - if (allocator_ != nullptr && workspace_ != nullptr) { - allocator_->free((void**)(&workspace_)); - } - allocator_ = allocator; - if (allocator_ != nullptr) { - workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE); - } -} - -void Gemm::setCudaStream(cudaStream_t& stream) -{ - stream_ = stream; - cublasSetStream(cublas_handle_, stream); -} - -void Gemm::setComputeType(DataType compute_type) -{ - checkDataTypeValidity(compute_type); - compute_type_ = compute_type; -} - -void Gemm::setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type) -{ - checkDataTypeValidity(a_type); - checkDataTypeValidity(b_type); - checkDataTypeValidity(c_type); - a_type_ = a_type; - b_type_ = b_type; - c_type_ = c_type; - setComputeType(compute_type); -} - -template -void Gemm::setDefaultTypes() -{ - if (std::is_same::value) { - setTypes(TYPE_FP32, TYPE_FP32, TYPE_FP32, TYPE_FP32); - } - else if (std::is_same::value) { - setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP16); - } - else { - throw GemmNotSupportedException("Gemm supports float or half type."); - } -} - -void Gemm::loadGemmConfig(std::string config_file) -{ - if (cublas_algo_map_ != nullptr) { - delete cublas_algo_map_; // unload the previous cublas map. - } - cublas_algo_map_ = new cublasAlgoMap(config_file); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - (transa == GEMM_OP_N) ? k : m, - (const void*)weight.kernel, - b_type_, - (transb == GEMM_OP_N) ? n : k, - output, - c_type_, - n, - alpha, - beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - (transa == GEMM_OP_N) ? k : m, - (const void*)weight.kernel, - b_type_, - (transb == GEMM_OP_N) ? n : k, - output, - c_type_, - n, - alpha, - beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const void* B, - const size_t ldb, - void* C, - const size_t ldc, - const float alpha, - const float beta) -{ - gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha, - const float beta) -{ - TM_LOG_TRACE("Gemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc); - - // Implementation copied from cublasMMWrapper::Gemm - // Switch A and B since both cublas and cublasLt assume a column major layout, - // while A and B are both row major layout. - const void* a_data_ptr = B; - const void* b_data_ptr = A; - - cublasOperation_t a_op = getCublasOperation(transb); - cublasOperation_t b_op = getCublasOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - // swap m and n - const size_t _m = n; - const size_t _n = m; - - // swap lda and ldb; - const size_t _lda = ldb; - const size_t _ldb = lda; - - mutex_->lock(); - // Use cublas as default in FP32 and cublasLt as default in FP16 - bool is_fp16_compute_type = compute_type_ == TYPE_FP16; - bool using_cublasLt = Atype == TYPE_FP16; - int batch_count = 1; - - half h_alpha = (half)alpha; - half h_beta = (half)beta; - const void* alpha_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_alpha) : reinterpret_cast(&alpha); - const void* beta_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_beta) : reinterpret_cast(&beta); - - // TODO: unify CUBLAS_DATA_TYPE and DataType. - int findAlgo = - cublas_algo_map_->isExist(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE); - cublasLtMatmulAlgo_info info = - cublas_algo_map_->getAlgo(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE); - if (findAlgo) { - using_cublasLt = (info.stages != -1); - } - - if (using_cublasLt) { - const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k; - const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m; - const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n; - const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? 
_n : k; - - cublasLtMatmulDesc_t matmul_desc = NULL; - cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL; - cudaDataType_t scale_type = getCublasDataType(compute_type_); - auto compute_type = getCublasComputeType(compute_type_); - - // -------------------------------------- - // Create descriptors for the original matrices - cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda); - cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb); - cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type); -#else - cublasLtMatmulDescCreate(&matmul_desc, compute_type); -#endif - - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t)); - - cublasLtMatmulAlgo_t algo; - void* workspace = workspace_; - int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE; - if (findAlgo) { - if (info.workspaceSize > workspace_size) { - findAlgo = 0; - } - else { - cublasLtMatmulAlgoInit( - cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages)); -#endif - } - } - - cublasLtMatmul(cublaslt_handle_, - matmul_desc, - alpha_ptr, - a_data_ptr, - a_desc, - b_data_ptr, - b_desc, - beta_ptr, - C, - c_desc, - C, - c_desc, - (findAlgo == 1 ? (&algo) : NULL), - workspace, - workspace_size, - stream_); - - cublasLtMatmulDescDestroy(matmul_desc); - cublasLtMatrixLayoutDestroy(a_desc); - cublasLtMatrixLayoutDestroy(b_desc); - cublasLtMatrixLayoutDestroy(c_desc); - sync_check_cuda_error(); - } - else { - cudaDataType_t compute_type = getCublasDataType(compute_type_); - int cublas_algo = info.algoId; - check_cuda_error(cublasGemmEx(cublas_handle_, - a_op, - b_op, - _m, - _n, - k, - alpha_ptr, - a_data_ptr, - a_type, - _lda, - b_data_ptr, - b_type, - _ldb, - beta_ptr, - C, - c_type, - ldc, - compute_type, - static_cast(cublas_algo))); - sync_check_cuda_error(); - } - mutex_->unlock(); -} - -void Gemm::batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const void* const* B, - void* const* C, - const size_t batch_size, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta); -} - -void Gemm::batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const size_t lda, - const void* const* B, - const size_t ldb, - void* const* C, - const size_t ldc, - const size_t batch_size, - const float alpha, - const float beta) -{ - batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta); -} - -void Gemm::batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const DataType Atype, - const size_t lda, - const void* const* B, - const DataType Btype, - const size_t ldb, - void* const* C, - const DataType Ctype, - const size_t ldc, - const size_t batch_size, - const float alpha, - const float beta) -{ - TM_LOG_TRACE( - "Gemm::batchedGemm [b=%ld m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", batch_size, m, n, k, lda, ldb, ldc); - - // Switch A and B. - const void* const* a_data_ptr = B; - const void* const* b_data_ptr = A; - - cublasOperation_t a_op = getCublasOperation(transb); - cublasOperation_t b_op = getCublasOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - // swap m and n, lda and ldb - const size_t _m = n; - const size_t _n = m; - const size_t _lda = ldb; - const size_t _ldb = lda; - - half h_alpha = (half)alpha; - half h_beta = (half)beta; - - mutex_->lock(); - bool is_fp16_compute_type = compute_type_ == TYPE_FP16; - const void* alpha_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_alpha) : reinterpret_cast(&alpha); - const void* beta_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_beta) : reinterpret_cast(&beta); - cublasLtMatmulAlgo_info info = - cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE); - - check_cuda_error(cublasGemmBatchedEx(cublas_handle_, - a_op, - b_op, - _m, - _n, - k, - alpha_ptr, - a_data_ptr, - a_type, - _lda, - b_data_ptr, - b_type, - _ldb, - beta_ptr, - C, - c_type, - ldc, - batch_size, - getCublasComputeType(compute_type_), - static_cast(info.algoId))); - mutex_->unlock(); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const size_t batch_size, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - int64_t stridea = m * k; - int64_t strideb = k * n; - int64_t stridec = m * n; - - stridedBatchedGemm(transa, - transb, - m, - n, - k, - A, - a_type_, - lda, - stridea, - B, - b_type_, - ldb, - strideb, - C, - c_type_, - ldc, - stridec, - batch_size, - compute_type_, - alpha, - beta); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const int64_t strideA, - const void* B, - const int64_t strideB, - void* C, - const int64_t strideC, - const size_t batch_size, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - stridedBatchedGemm(transa, - transb, - m, - n, - k, - A, - a_type_, - lda, - strideA, - B, - b_type_, - ldb, - strideB, - C, - c_type_, - ldc, - strideC, - batch_size, - compute_type_, - alpha, - beta); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const int64_t strideA, - const void* B, - const size_t ldb, - const int64_t strideB, - void* C, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - const float alpha, - const float beta) -{ - stridedBatchedGemm(transa, - transb, - m, - n, - k, - A, - a_type_, - lda, - strideA, - B, - b_type_, - ldb, - strideB, - C, - c_type_, - ldc, - strideC, - batch_size, - compute_type_, - alpha, - beta); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - DataType Atype, - const size_t lda, - const int64_t strideA, - const void* B, - DataType Btype, - const size_t ldb, - const int64_t strideB, - void* C, - DataType Ctype, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - DataType compute_type, - const float alpha, - const float beta) -{ - TM_LOG_TRACE("Gemm::stridedBatchedGemm [b=%ld, m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", - batch_size, - m, - n, - k, - lda, - ldb, - ldc); - - // Switch A and B. - const void* a_data_ptr = B; - const void* b_data_ptr = A; - - cublasOperation_t a_op = getCublasOperation(transb); - cublasOperation_t b_op = getCublasOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - // swap m and n, lda and ldb, stride A and B - const size_t _m = n; - const size_t _n = m; - const size_t _lda = ldb; - const size_t _ldb = lda; - const int64_t _stridea = strideB; - const int64_t _strideb = strideA; - - half h_alpha = (half)alpha; - half h_beta = (half)beta; - - mutex_->lock(); - bool is_fp16_compute_type = compute_type_ == TYPE_FP16; - const void* alpha_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_alpha) : reinterpret_cast(&alpha); - const void* beta_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_beta) : reinterpret_cast(&beta); - cublasLtMatmulAlgo_info info = - cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? 
HALF_DATATYPE : FLOAT_DATATYPE); - - check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_, - a_op, - b_op, - _m, - _n, - k, - alpha_ptr, - a_data_ptr, - a_type, - _lda, - _stridea, - b_data_ptr, - b_type, - _ldb, - _strideb, - beta_ptr, - C, - c_type, - ldc, - strideC, - batch_size, - getCublasComputeType(compute_type), - static_cast(info.algoId))); - mutex_->unlock(); -} - -void Gemm::checkDataTypeValidity(const DataType& type) -{ - if (type != TYPE_FP32 && type != TYPE_FP16) { - throw GemmNotSupportedException("Gemm supports TYPE_FP16 or TYPE_FP32"); - } -} - -/* ************************* End of GEMM Impl **************************** */ - -// void Int8Gemm::gemm(Tensor& C, -// const GemmOp transa, -// const GemmOp transb, -// const Tensor& A, -// const Tensor& B, -// const float alpha, -// const float beta) -// { - -// } - -/* ************************* SpGEMM Impl *********************************** */ -#ifdef SPARSITY_ENABLED -SpGemm::SpGemm(IAllocator* allocator, cudaStream_t stream, std::string config_file, std::string spconfig_file): - Gemm(allocator, stream, config_file) -{ - CHECK_CUSPARSE(cusparseLtInit(&cusparselt_handle_)); - // TODO(jaedeokk): - // Let's make cublasAlgoMap load gemm/spgemm config separtely, - // allowing us to inherit Gemm's constructor. - // cublas_algo_map_.loadSpGemmConfig(spconfig_file); // enable this line later. - - a_type_ = TYPE_FP16; - b_type_ = TYPE_FP16; - c_type_ = TYPE_FP16; - compute_type_ = TYPE_FP16; -} - -SpGemm::~SpGemm() -{ - cusparseLtDestroy(&cusparselt_handle_); - // Need to destroy matmul description cache. - for (auto& kv : a_desc_map_) { // kv = (mark, a_desc) - cusparseLtMatDescriptorDestroy(&a_desc_map_[kv.first]); - } - for (auto& kv : b_desc_map_) { // kv = (mark, b_desc) - cusparseLtMatDescriptorDestroy(&b_desc_map_[kv.first]); - } - for (auto& kv : c_desc_map_) { // kv = (mark, c_desc) - cusparseLtMatDescriptorDestroy(&c_desc_map_[kv.first]); - } -} - -std::string SpGemm::toString() -{ - const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* compute_type_str = compute_type_ == TYPE_FP16 ? "FP16" : "FP32"; - return fmtstr("SpGemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]", - a_type_str, - b_type_str, - c_type_str, - compute_type_str); -} - -void SpGemm::loadGemmConfig(std::string config_file, std::string spconfig_file) -{ - if (cublas_algo_map_ != nullptr) { - delete cublas_algo_map_; // unload algo map. - } - cublas_algo_map_ = new cublasAlgoMap(config_file, spconfig_file); -} - -void SpGemm::checkDataTypeValidity(const DataType& type) -{ - if (type != TYPE_FP16) { - throw GemmNotSupportedException("Sparse GEMM only supports FP16 data type now."); - } -} - -bool SpGemm::useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k) -{ - return !cublas_algo_map_->isUseSparse(batch_size, m, n, k); -} - -// Temporal gemm helper mtehod to use template T. -template -void SpGemm::weightGemmHelper(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - if (useBaseGemm(1, m, n, k) || weight.sp_kernel == nullptr) { - Gemm::gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - lda, - (const void*)weight.kernel, - b_type_, - ldb, - output, - c_type_, - ldc, - alpha, - beta); - } - else { - gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - lda, - (const void*)weight.sp_kernel, - b_type_, - ldb, - output, - c_type_, - ldc, - alpha, - beta); - } -} - -void SpGemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - weightGemmHelper(transa, transb, m, n, k, input, weight, output, alpha, beta); -} -void SpGemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - weightGemmHelper(transa, transb, m, n, k, input, weight, output, alpha, beta); -} - -void SpGemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha, - const float beta) -{ - TM_LOG_TRACE("SpGemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc); - checkDataTypeValidity(Atype); - checkDataTypeValidity(Btype); - checkDataTypeValidity(Ctype); - checkDataTypeValidity(compute_type_); - - if (useBaseGemm(1, m, n, k)) { - // Compute by the base GEMM. - Gemm::gemm(transa, transb, m, n, k, A, Atype, lda, B, Btype, ldb, C, Ctype, ldc, alpha, beta); - return; - } - - // Switch A/B due to column major layout in computation. - // Typical usecase of Gemm family is to compute Y = X * W where X is an - // input tensor and W is a kernel weight. Compression takes a lot time - // so only the kernel weight (which is fixed in inference time) can be - // sparse. Using B as sparse seems not stable, unfortunately. - // (e.g. caching matrix descriptions is not correctly working.) - // Thus, SpGemm considers a column major layout in computation to make - // C^T = B^T * A^T, where a kernel weight "B" is located at the front. - const void* a_data = B; - const void* b_data = A; - - cusparseOrder_t order = CUSPARSE_ORDER_COL; - - cusparseOperation_t opA = getCusparseOperation(transb); - cusparseOperation_t opB = getCusparseOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - const size_t _m = n; - const size_t _n = m; - const size_t _lda = ldb; - const size_t _ldb = lda; - - const size_t a_rows = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? _m : k; - const size_t a_cols = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _m; - const size_t b_rows = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _n; - const size_t b_cols = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? 
_n : k; - const size_t c_rows = _m; - const size_t c_cols = _n; - - const unsigned alignment = 16; - cusparseComputeType compute_type = getCusparseComputeType(compute_type_); - - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - - char mark[256]; - sprintf(mark, "%d_%ld_%ld_%ld_%s_%s", 1, m, n, k, getGemmOpString(transb).c_str(), getGemmOpString(transa).c_str()); - if (a_desc_map_.find(mark) != a_desc_map_.end()) { - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &a_desc_map_[mark], - &b_desc_map_[mark], - &c_desc_map_[mark], - &c_desc_map_[mark], - compute_type)); - } - else { - // initializing MatDesc takes a lot of time - cusparseLtMatDescriptor_t a_desc, b_desc, c_desc; - a_desc_map_[mark] = a_desc; - b_desc_map_[mark] = b_desc; - c_desc_map_[mark] = c_desc; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &a_desc_map_[mark], - a_rows, - a_cols, - _lda, - alignment, - a_type, - order, - CUSPARSELT_SPARSITY_50_PERCENT)); - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &b_desc_map_[mark], b_rows, b_cols, _ldb, alignment, b_type, order)); - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &c_desc_map_[mark], c_rows, c_cols, ldc, alignment, c_type, order)); - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &a_desc_map_[mark], - &b_desc_map_[mark], - &c_desc_map_[mark], - &c_desc_map_[mark], - compute_type)); - } - - mutex_->lock(); - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); - int alg = cublas_algo_map_->getSpAlgo(1, a_rows, b_cols, a_cols); - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))); - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size)); - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size)); - - void* d_workspace = nullptr; // Can we use the workspace of the class? - int num_streams = 1; - cudaStream_t streams[1] = {stream_}; - CHECK_CUSPARSE(cusparseLtMatmul( - &cusparselt_handle_, &plan, &alpha, a_data, b_data, &beta, C, C, d_workspace, streams, num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - mutex_->unlock(); - sync_check_cuda_error(); -} -#endif - -/* ************************* End of SpGEMM Impl ************************** */ - -/* ***************************** GEMM utils ****************************** */ - -std::shared_ptr createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse, bool quantized) -{ - TM_LOG_TRACE( - "Create Gemm instance [sparse=%s, quantized=%s]", sparse ? "true" : "false", quantized ? "true" : "false"); - std::shared_ptr gemm; - if (!sparse) { - if (!quantized) { - gemm = std::make_shared(allocator, stream); - } - else { - throw GemmNotSupportedException("Int8 Gemm is not supported yet"); - } - } - else { -#ifdef SPARSITY_ENABLED - if (sparse && !quantized) { - gemm = std::make_shared(allocator, stream); - } - else { - throw GemmNotSupportedException("Int8 Sparse Gemm is not supported yet"); - } -#else - throw GemmNotSupportedException("Sparsity support is not enabled. 
To enabled sparisty, " - "please provide `-DSPARSITY_SUPPORT` flag for compilation."); -#endif - } - return gemm; -} - -cudaDataType_t getCublasDataType(DataType dtype) -{ - switch (dtype) { - case TYPE_FP16: - return CUDA_R_16F; - case TYPE_FP32: - return CUDA_R_32F; - default: - throw GemmNotSupportedException("Not supported data type."); - } -} - -#if (CUDART_VERSION >= 11000) -cublasComputeType_t getCublasComputeType(DataType ctype) -{ - switch (ctype) { - case TYPE_FP16: - return CUBLAS_COMPUTE_16F; - case TYPE_FP32: - return CUBLAS_COMPUTE_32F; - default: - throw GemmNotSupportedException("Not supported cublas compute type."); - } -} -#else -cudaDataType_t getCublasComputeType(DataType ctype) -{ - switch (ctype) { - case TYPE_FP16: - return CUDA_R_16F; - case TYPE_FP32: - return CUDA_R_32F; - default: - throw GemmNotSupportedException("Not supported cublas compute type."); - } -} -#endif - -cublasOperation_t getCublasOperation(GemmOp op) -{ - switch (op) { - case GEMM_OP_N: - return CUBLAS_OP_N; - case GEMM_OP_T: - return CUBLAS_OP_T; - default: - throw GemmNotSupportedException("Unknown GemmOp provided."); - } -} - -std::string getGemmOpString(const GemmOp& op) -{ - switch (op) { - case GEMM_OP_T: - return "T"; - case GEMM_OP_N: - return "N"; - } - throw GemmNotSupportedException("Unknown GemmOp provided."); -} - -#ifdef SPARSITY_ENABLED -cusparseOperation_t getCusparseOperation(GemmOp op) -{ - switch (op) { - case GEMM_OP_N: - return CUSPARSE_OPERATION_NON_TRANSPOSE; - case GEMM_OP_T: - return CUSPARSE_OPERATION_TRANSPOSE; - default: - throw GemmNotSupportedException("Unknown GemmOp provided."); - } -} - -cusparseComputeType getCusparseComputeType(DataType ctype) -{ - if (ctype != TYPE_FP16) { - throw GemmNotSupportedException("Sparse GEMM supports TYPE_FP16 compute type only."); - } - return CUSPARSE_COMPUTE_16F; -} - -void pruneMatrixB(void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans) -{ - TM_LOG_TRACE("Prune matrix B [k=%ld, n=%ld, op=%s]", k, n, getGemmOpString(trans).c_str()); - - // Due to A/B switching, the matrix B will be used as a matrix A. - const cusparseOrder_t order = CUSPARSE_ORDER_COL; - const size_t rows = (trans == GEMM_OP_N) ? n : k; - const size_t cols = (trans == GEMM_OP_N) ? k : n; - const size_t ld = rows; - const unsigned alignment = 16; - - const cusparseLtPruneAlg_t prune_alg = CUSPARSELT_PRUNE_SPMMA_STRIP; - const cusparseOperation_t op = getCusparseOperation(trans); - const cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0. - - // 0: B is sparse, 1: A is sparse - // B matrix will be used as A matrix at the SpGemm::gemm. - const int is_sparse_a = 1; - - // TODO: Let the resource manager handle GPU-related resources later. - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseLtMatDescriptor_t mat_desc; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_desc, rows, cols, ld, alignment, dtype, order, CUSPARSELT_SPARSITY_50_PERCENT)); - CHECK_CUSPARSE(cusparseLtSpMMAPrune2(&handle, &mat_desc, is_sparse_a, op, data, data, prune_alg, stream)); - CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc)); - CHECK_CUSPARSE(cusparseLtDestroy(&handle)); -} - -size_t compressMatrixB(void** output, - IAllocator& allocator, - const cudaStream_t& stream, - const void* input, - const size_t k, - const size_t n, - const GemmOp trans) -{ - TM_LOG_TRACE("compressMatrix [k=%ld, n=%ld, dtype=FP16]", k, n); - - // swap A/B due to column/row major layout mismatch. 
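-    // As a concrete reading of this swap (the same mapping used in SpGemm::gemm above),
-    // take the common GEMM_OP_N/GEMM_OP_N case of a row-major request
-    //     C(m x n) = A(m x k) * B(k x n).
-    // The code evaluates the column-major equivalent
-    //     C^T(n x m) = B^T(n x k) * A^T(k x m),
-    // so cuSPARSELt sees the weight B as its structured "A" operand
-    //     (a_data = B, opA = op(transb), _m = n, _lda = ldb)
-    // and the activation A as its dense "B" operand
-    //     (b_data = A, opB = op(transa), _n = m, _ldb = lda),
-    // while C keeps ldc with CUSPARSE_ORDER_COL. The rows/cols/ld computed below follow
-    // the same convention, with the weight described as the structured matrix (is_sparse_a = 1).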
- cusparseOrder_t order = CUSPARSE_ORDER_COL; - const size_t rows = (trans == GEMM_OP_N) ? n : k; - const size_t cols = (trans == GEMM_OP_N) ? k : n; - const size_t ld = rows; - - cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0. - cusparseLtSparsity_t sparsity = CUSPARSELT_SPARSITY_50_PERCENT; - cusparseOperation_t op = getCusparseOperation(trans); - cusparseLtMatDescriptor_t mat_desc; - const unsigned alignment = 16; - const int is_sparse_a = 1; // 0: B is sparse, 1: A is sparse - - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - - CHECK_CUSPARSE( - cusparseLtStructuredDescriptorInit(&handle, &mat_desc, rows, cols, ld, alignment, dtype, order, sparsity)) - - size_t compressed_size = 0; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_desc, &compressed_size)); - if (compressed_size == 0) { - throw GemmInvalidException("Fail to compute correct compressed_size, got 0. This error may be " - "caused by a too small input matrix."); - } - - *output = allocator.malloc(compressed_size, false); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_desc, is_sparse_a, op, input, *output, stream)) - - CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc)); - CHECK_CUSPARSE(cusparseLtDestroy(&handle)); - return compressed_size; -} - -#endif - -/* ************************* End of GEMM utils **************************** */ - -} // end of namespace turbomind diff --git a/src/turbomind/utils/gemm.h b/src/turbomind/utils/gemm.h deleted file mode 100644 index 7cc5502da9..0000000000 --- a/src/turbomind/utils/gemm.h +++ /dev/null @@ -1,681 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO: Need to remove the dependency of the layer module. -// e.g. refactor Weight class to some base module. -#include "src/turbomind/layers/DenseWeight.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#endif - -// cublas default workspace size: 32MB. Let me make this as a Gemm property. -#define WORKSPACE_SIZE 33554432 - -namespace turbomind { - -// A wrapper of cublas or cusparse matrix operator. -// - GEMM_OP_N = CUBLAS_OP_N or CUSPARSE_OP_N -// - GEMM_OP_T = CUBLAS_OP_T or CUSPARSE_OP_T -enum GemmOp -{ - GEMM_OP_N, - GEMM_OP_T -}; - -// A base class of the GEMM family. -// In the current version Gemm is as a base class as well as an interface. -class Gemm { - -public: - Gemm() = delete; // Disable a default constructor - /** - * A Gemm class. - * - * NOTE: - * A, B, C are assumed to have a row major layout, while a backend cuda libraries - * assumes a column major layout. 
However, a family of Gemm has already handled - * such discrepancy internally. Please use naively without a trick like switching - * inputs A and B that aligns the matrix layout. - * - * Restriction: Supported in/out data or compute types: TYPE_FP16, TYPE_FP32. - * - * TODO: - * Unify resource allocation/release from a singleton GPU resource managers. - * Thus, allocator, stream can be replaced by a resource handler later. - * E.g. Gemm(std::shared_ptr resource_manager), and - * stream_ = resource_manager.getCudaStream(); - * buffer = resource_manager.malloc(...); - * - * @param allocator Resource allocator. - * @param stream A CUDA stream. - * @param config_file A file path of a GEMM configuration. - */ - Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file = GEMM_CONFIG); - Gemm(Gemm const& other) = delete; - virtual ~Gemm(); - - virtual std::string toString(); - - /** - * @brief Set GEMM compute type. - * - * @param compute_type The data type of accumulation type inside GEMM computation. - * (Choices: TYPE_FP16, TYPE_FP32) - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. - */ - void setComputeType(DataType compute_type); - - /** - * @brief Set matrix data types and compute precision. - * - * Supported data or compute types: TYPE_FP16, TYPE_FP32 - * - * @param a_type The data type of a matrix A. - * @param b_type The data type of a matrix B. - * @param c_type The data type of a matrix C. - * @param compute_type The data type of accumulation type inside GEMM computation. - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. - */ - void setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type); - - /** - * @brief Set matrix data and compute types by default values. - * - * Default configs: - * - T=float : data type=TYPE_FP32, compute type=TYPE_FP32 - * - T=half : data type=TYPE_FP16, compute type=TYPE_FP32 - */ - template - void setDefaultTypes(); - - void loadGemmConfig(std::string config_file); - - void setAllocator(IAllocator* allocator); - void setCudaStream(cudaStream_t& stream); - - // Th APIs below are to see how the interface will change - // if it cooperates with Tensor. To enable it, we need to - // update the Tensor class. For instance, data is need to - // be of type (void*) rather than (const void*) to pass it - // as the output C of gemm. - // virtual void gemm(Tensor& C, - // const GemmOp transa, - // const GemmOp transb, - // const Tensor& A, - // const Tensor& B, - // const float alpha = 1.0f, - // const float beta = 0.0f); - // - // virtual void batchedMatmul(std::vector Carray, - // const GemmOp transa, - // const GemmOp transb, - // const std::vector Aarray, - // const std::vector Barray, - // const float alpha = 1.0f, - // const float beta = 0.0f); - // - // virtual void stridedBatchedGemm(Tensor& C, - // const GemmOp transa, - // const GemmOp transb, - // const Tensor& A, - // const Tensor& B, - // const float alpha = 1.0f, - // const float beta = 0.0f); - - // TODO: - // This function cooperates with a Weight object to simply Gemm calls - // inside layers, computing the following formula - // output(C) = input(A) * weight_kernel(B) - // where weight_kernel can be changed according to Gemm functions. - // DenseWeight is of a template struct, not allowing override the method. 
- // We temperally add an interface here for two cases float/half, - // but to finialze this function, we need an interface of a weight class - // which is not a template class. - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f); - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const void* B, - const size_t ldb, - void* C, - const size_t ldc, - const float alpha = 1.0f, - const float beta = 0.0f); - /** - * @brief Compute the matrix multiplication `C = \alpha * op(A) * op(B) + \beta * C`. - * - * @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T). - * @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T). - * @param m A number of rows of a matrix op(A) and C. - * @param n A number of columns of a matrix op(B) or C. - * @param k A number of columns of op(A) and rows of op(B). - * @param A A device pointer of a matrix A of dimension (m x lda). - * @param Atype A data type of A (TYPE_FP16 or TYPE_FP32) - * @param lda A leading dimension of the matrix A. - * @param B A device pointer of a matrix B of dimension (k x ldb). - * @param Btype A data type of B (TYPE_FP16 or TYPE_FP32) - * @param ldb A leading dimension of the matrix B. - * @param C (Output) A device pointer of a matrix C of dimension (m x ldc). - * @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32) - * @param ldc A leading dimension of the matrix C. - * @param alpha A scale factor for A*B (default: 1.0f). - * @param beta A scale factor for C (default: 0.0f). - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. 
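As a usage sketch of the call shape documented above (illustrative only: it assumes the removed gemm.h is still available, FP16 data with FP32 accumulation, and row-major, non-transposed operands; names such as denseGemmExample are hypothetical):

#include <cuda_fp16.h>
#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// Row-major C(m x n) = A(m x k) * B(k x n); leading dimensions are the row lengths.
void denseGemmExample(IAllocator* allocator, cudaStream_t stream,
                      const half* d_A, const half* d_B, half* d_C,
                      size_t m, size_t n, size_t k)
{
    Gemm gemm(allocator, stream);                                // reads GEMM_CONFIG by default
    gemm.setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP32);   // FP16 in/out, FP32 accumulation
    gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
              d_A, TYPE_FP16, /*lda=*/k,
              d_B, TYPE_FP16, /*ldb=*/n,
              d_C, TYPE_FP16, /*ldc=*/n);                        // default alpha = 1.0f, beta = 0.0f
}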
- */ - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const void* const* B, - void* const* C, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const size_t lda, - const void* const* B, - const size_t ldb, - void* const* C, - const size_t ldc, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - /** - * @brief Compute the matrix multiplication of batch of matrices As and Bs - * - * For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1, - * `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`. - * - * @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T). - * @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T). - * @param m A number of rows of a matrix op(A) and C. - * @param n A number of columns of a matrix op(B) or C. - * @param k A number of columns of op(A) and rows of op(B). - * @param A An array of device pointers of batch of input A matrices. - * @param Atype A data type of A (TYPE_FP16 or TYPE_FP32) - * @param lda A leading dimension of the matrix A. - * @param B An array of device pointers of batch of input B matrices. - * @param Btype A data type of B (TYPE_FP16 or TYPE_FP32) - * @param ldb A leading dimension of the matrix B. - * @param C (Output) An array of device pointers of batch of output C matrices. - * @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32) - * @param ldc A leading dimension of the matrix C. - * @param alpha A scale factor for A*B (default: 1.0f). - * @param beta A scale factor for C (default: 0.0f). - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. 
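The batched overload takes arrays of per-matrix device pointers; a minimal call-shape sketch follows (names are illustrative, and the pointer arrays are assumed to be prepared the way the removed encoder GEMM test prepares them, i.e. device-resident arrays of device pointers):

#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// C[i] = A[i] * B[i] for i = 0 .. batch_size-1, all FP16 with FP32 accumulation.
void batchedGemmExample(Gemm& gemm,
                        const void* const* dA_array,  // batch_size pointers to the A[i] matrices
                        const void* const* dB_array,  // batch_size pointers to the B[i] matrices
                        void* const*       dC_array,  // batch_size pointers to the C[i] matrices
                        size_t m, size_t n, size_t k, size_t batch_size)
{
    gemm.batchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
                     dA_array, TYPE_FP16, /*lda=*/k,
                     dB_array, TYPE_FP16, /*ldb=*/n,
                     dC_array, TYPE_FP16, /*ldc=*/n,
                     batch_size);
}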
- */ - virtual void batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const DataType Atype, - const size_t lda, - const void* const* B, - const DataType Btype, - const size_t ldb, - void* const* C, - const DataType Ctype, - const size_t ldc, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const int64_t strideA, - const void* B, - const int64_t strideB, - void* C, - const int64_t strideC, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const int64_t strideA, - const void* B, - const size_t ldb, - const int64_t strideB, - void* C, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - /** - * @brief Compute the strided matrix multiplication of batch of matrices As and Bs - * - * For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1, - * `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`. - * - * @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T). - * @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T). - * @param m A number of rows of a matrix op(A) and C. - * @param n A number of columns of a matrix op(B) or C. - * @param k A number of columns of op(A) and rows of op(B). - * @param A An array of device pointers of batch of input A matrices. - * @param Atype A data type of A (TYPE_FP16 or TYPE_FP32) - * @param lda A leading dimension of the matrix A. - * @param strideA An offset in number of elements between matrix A[i] and A[i+1]. - * @param B An array of device pointers of batch of input B matrices. - * @param Btype A data type of B (TYPE_FP16 or TYPE_FP32) - * @param ldb A leading dimension of the matrix B. - * @param strideB An offset in number of elements between matrix B[i] and B[i+1]. - * @param C (Output) An array of device pointers of batch of output C matrices. - * @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32) - * @param ldc A leading dimension of the matrix C. - * @param strideC An offset in number of elements between matrix C[i] and C[i+1]. - * @param compute_type An accumulation type of GEMM. - * @param alpha A scale factor for A*B (default: 1.0f). - * @param beta A scale factor for C (default: 0.0f). - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. 
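For the strided variant documented above, where batch i lives at a fixed element offset from batch 0, a similar sketch (again illustrative; contiguous, non-transposed FP16 batches with FP32 accumulation are assumed):

#include <cuda_fp16.h>
#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// C[i](m x n) = A[i](m x k) * B[i](k x n), with batch i at A + i*strideA, B + i*strideB, C + i*strideC.
void stridedBatchedGemmExample(Gemm& gemm,
                               const half* d_A, const half* d_B, half* d_C,
                               size_t m, size_t n, size_t k, size_t batch_size)
{
    gemm.stridedBatchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
                            d_A, TYPE_FP16, /*lda=*/k, /*strideA=*/int64_t(m * k),
                            d_B, TYPE_FP16, /*ldb=*/n, /*strideB=*/int64_t(k * n),
                            d_C, TYPE_FP16, /*ldc=*/n, /*strideC=*/int64_t(m * n),
                            batch_size, /*compute_type=*/TYPE_FP32);
}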
- */ - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - DataType Atype, - const size_t lda, - const int64_t strideA, - const void* B, - DataType Btype, - const size_t ldb, - const int64_t strideB, - void* C, - DataType Ctype, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - DataType compute_type, - const float alpha = 1.0f, - const float beta = 0.0f); - -protected: - IAllocator* allocator_ = nullptr; - cudaStream_t stream_; - std::mutex* mutex_ = nullptr; - cublasAlgoMap* cublas_algo_map_ = nullptr; - - cublasHandle_t cublas_handle_; - cublasLtHandle_t cublaslt_handle_; - void* workspace_ = nullptr; - - // use FP32 as default - DataType a_type_ = TYPE_FP32; - DataType b_type_ = TYPE_FP32; - DataType c_type_ = TYPE_FP32; - DataType compute_type_ = TYPE_FP32; - - // Check if data and inputs are valid in the Gemm class. - virtual void checkDataTypeValidity(const DataType& type); -}; - -// class Int8Gemm : public Gemm { - -// protected: -// bool use_ORDER_COL32_2R_4R4_; // what is this? -// }; - -#ifdef SPARSITY_ENABLED - -/** - * A Sparse Gemm class. - * - * NOTE: - * A, B, C are assumed to have a row major layout. - * There are two restrictions: - * - It supports the case when the matrix B is sparse. - * - Supported only TYPE_FP16 for in/out data or compute types. - */ -class SpGemm: public Gemm { - -protected: - cusparseLtHandle_t cusparselt_handle_; - std::map a_desc_map_; - std::map b_desc_map_; - std::map c_desc_map_; - bool useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k); - -public: - using Gemm::setComputeType; - using Gemm::setTypes; - using Gemm::setDefaultTypes; - using Gemm::setAllocator; - using Gemm::setCudaStream; - using Gemm::gemm; - using Gemm::batchedGemm; - using Gemm::stridedBatchedGemm; - - /** - * @param allocator Resource allocator. - * @param stream A CUDA stream. - * @param config_file A file path of a GEMM configuration. - */ - // TODO: Let's unify algo map loading part. - SpGemm(IAllocator* allocator, - cudaStream_t stream, - std::string config_file = GEMM_CONFIG, - std::string spconfig_file = SPGEMM_CONFIG); - ~SpGemm(); - std::string toString() override; - void loadGemmConfig(std::string config_file, std::string spconfig_file); - - // Template method cannot be overridden. - void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f) override; - void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f) override; - - void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha = 1.0f, - const float beta = 0.0f) override; - -private: - void checkDataTypeValidity(const DataType& type) override; - - // Temporal gemm helper mtehod to use template T. 
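Finally, a sketch of how the removed sparse-weight pieces fit together end to end. This is illustrative only: it assumes a build with SPARSITY_SUPPORT (otherwise createGemm with sparse = true throws), the template arguments elided in this diff text are guessed as half/Gemm, and weight.kernel / weight.sp_kernel are assumed to point at the dense and compressed buffers shown here, as SpGemm::gemm expects:

#include <cuda_fp16.h>
#include "src/turbomind/layers/DenseWeight.h"
#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// output(m x n) = input(m x k) * kernel(k x n), with the FP16 kernel 2:4-sparse on the GPU.
void sparseWeightExample(IAllocator* allocator, cudaStream_t stream,
                         half* d_kernel,                   // dense [k x n] weight, pruned in place
                         const DenseWeight<half>& weight,  // kernel / sp_kernel assumed to reference
                                                           // d_kernel and the compressed buffer below
                         const half* d_input, half* d_output,
                         size_t m, size_t n, size_t k)
{
    // One-time weight preparation: prune to 2:4 sparsity, then compress for cuSPARSELt.
    void* d_compressed = nullptr;
    pruneMatrixB(d_kernel, stream, k, n, GEMM_OP_N);
    compressMatrixB(&d_compressed, *allocator, stream, d_kernel, k, n, GEMM_OP_N);

    // sparse = true yields an SpGemm; it still falls back to the dense base Gemm when
    // useBaseGemm() deems the problem small or weight.sp_kernel is null.
    std::shared_ptr<Gemm> gemm = createGemm(allocator, stream, /*sparse=*/true);
    gemm->setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP16);  // SpGemm supports FP16 only
    gemm->gemm(GEMM_OP_N, GEMM_OP_N, m, n, k, d_input, weight, d_output);
}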
- template - void weightGemmHelper(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta); -}; - -// class Int8SpGemm : public Int8Gemm, public SpGemm { - -// }; -#endif - -/* ***************************** GEMM Exceptions ******************************* */ - -class GemmInvalidShapeException: public std::exception { -private: - std::string msg_ = "Invalid matrix shapes."; - -public: - explicit GemmInvalidShapeException() = default; - - template - explicit GemmInvalidShapeException(const std::string format, const Args&... args): msg_(fmtstr(format, args...)) - { - } - - const char* what() const throw() - { - return msg_.c_str(); - } -}; - -class GemmNotSupportedException: public std::exception { -private: - std::string msg_ = "Not supported exception."; - -public: - explicit GemmNotSupportedException() = default; - - template - explicit GemmNotSupportedException(const std::string format, const Args&... args): msg_(fmtstr(format, args...)) - { - } - - const char* what() const throw() - { - return msg_.c_str(); - } -}; - -class GemmInvalidException: public std::exception { -private: - std::string msg_ = "Invalid use of gemm."; - -public: - explicit GemmInvalidException() = default; - - template - explicit GemmInvalidException(const std::string format, const Args&... args): msg_(fmtstr(format, args...)) - { - } - - const char* what() const throw() - { - return msg_.c_str(); - } -}; - -/* ************************ End of GEMM Exceptions ************************ */ - -/* ***************************** GEMM utils ******************************* */ - -/** - * @brief Create method for the Gemm family. - * - * @param allocator Resource allocator. - * @param stream A CUDA stream. - * @param sparse Whether to use sparse GEMM - * @param quantized Whether to use int8 quantized GEMM. - * @return A shared pointer of a GemmCls instance. - */ -std::shared_ptr -createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse = false, bool quantized = false); - -cudaDataType_t getCublasDataType(DataType dtype); -#if (CUDART_VERSION >= 11000) -cublasComputeType_t getCublasComputeType(DataType dtype); -#else -cudaDataType_t getCublasComputeType(DataType dtype); -#endif -cublasOperation_t getCublasOperation(GemmOp op); -std::string getGemmOpString(const GemmOp& op); - -#ifdef SPARSITY_ENABLED -cusparseOperation_t getCusparseOperation(GemmOp op); -cusparseComputeType getCusparseComputeType(DataType dtype); - -/** - * @brief Prune a weight matrix (in-place). - * - * SpGemm supports a case when the sparse matrix is B in C=A*B. - * - * @param data A data pointer - * @param stream A cuda stream object. - * @param k A number of rows of op(B). - * @param n A number of columns of op(B). - * @param trans A transpose operation that will be applied to the matrix - * (default: GEMM_OP_N). - */ -void pruneMatrixB( - void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans = GEMM_OP_N); - -/** - * @brief Compress the B matrix in a specific sparsity format. - * - * @param output A pointer where to allocate memory buffer to store a compressed matrix. - * @param alloactor A resource allocator. - * @param stream A cuda stream object. - * @param input An input matrix to compress. - * @param k A number of rows of op(B). - * @param n A number of columns of op(B). 
- * @param trans A transpose operation that will be applied to the matrix (default: GEMM_OP_N). - * - * @return A size of the allocated device buffer of the compressed matrix. - * - * @throw GemmInvalidException if the input matrix does not have 2:4 sparsity. - * or if fail to compute a correct buffer size to store the compressed matrix. - * @throw std::runtime_error if any exception inside CUDA. - */ -size_t compressMatrixB(void** output, - IAllocator& allocator, - const cudaStream_t& stream, - const void* input, - const size_t k, - const size_t n, - const GemmOp trans = GEMM_OP_N); - -#endif - -/* ************************* End of GEMM utils **************************** */ - -} // end of namespace turbomind diff --git a/src/turbomind/utils/gemm_test/CMakeLists.txt b/src/turbomind/utils/gemm_test/CMakeLists.txt deleted file mode 100644 index 3e65f65a36..0000000000 --- a/src/turbomind/utils/gemm_test/CMakeLists.txt +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -cmake_minimum_required(VERSION 3.8) - -find_package(CUDAToolkit REQUIRED) - -set(gemm_func_files - gemm_func.cc -) - -set(encoder_gemm_func_files - encoder_gemm_func.cc -) - -set(encoder_igemm_func_files - encoder_igemm_func.cc -) - -set(decoding_gemm_func_files - decoding_gemm_func.cc -) - -set(gpt_gemm_func_files - gpt_gemm_func.cc -) - -set(xlnet_gemm_func_files - xlnet_gemm_func.cc -) - -set(t5_gemm_func_files - t5_gemm_func.cc -) - -set(swin_igemm_func_files - swin_igemm_func.cc -) - -set(swin_gemm_func_files - swin_gemm_func.cc -) - -add_library(gemm_func STATIC ${gemm_func_files}) -target_link_libraries(gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger) -set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files}) -target_link_libraries(encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -if (SPARSITY_SUPPORT) -target_link_libraries(encoder_gemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files}) -target_link_libraries(encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger) -if (SPARSITY_SUPPORT) -target_link_libraries(encoder_igemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files}) -target_link_libraries(decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -set_property(TARGET decoding_gemm_func 
PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files}) -target_link_libraries(gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -if (SPARSITY_SUPPORT) - target_link_libraries(gpt_gemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files}) -target_link_libraries(xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(t5_gemm_func STATIC ${t5_gemm_func_files}) -target_link_libraries(t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -if (SPARSITY_SUPPORT) - target_link_libraries(t5_gemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(swin_igemm_func STATIC ${swin_igemm_func_files}) -target_link_libraries(swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger) -set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(swin_gemm_func STATIC ${swin_gemm_func_files}) -target_link_libraries(swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/utils/gemm_test/decoding_gemm_func.cc b/src/turbomind/utils/gemm_test/decoding_gemm_func.cc deleted file mode 100644 index 068ae98d81..0000000000 --- a/src/turbomind/utils/gemm_test/decoding_gemm_func.cc +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/decoding_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_decoding_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; - -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int hidden_units = head_num * size_per_head; - const int gemm_num = 6; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1}; - char mess[gemm_num][256]; - - // gemm 0 - M[0] = batch_size * beam_width; - K[0] = hidden_units; - N[0] = K[0] * 3; - strcpy(mess[0], "from_tensor * weightQKV"); - - // gemm 1 - M[1] = batch_size * beam_width; - K[1] = hidden_units; - N[1] = K[1]; - strcpy(mess[1], "attr * output_kernel"); - - // gemm2 - M[2] = batch_size * beam_width * max_mem_seq_len; - K[2] = mem_hidden_units; - N[2] = hidden_units; - strcpy(mess[2], "mem_tensor * weightK/V in cross attention"); - - // gemm 3 - M[3] = batch_size * beam_width; - K[3] = hidden_units; - N[3] = inter_size; - strcpy(mess[3], "ffn gemm1 "); - - // gemm 4 - M[4] = batch_size * beam_width; - K[4] = inter_size; - N[4] = hidden_units; - strcpy(mess[4], "ffn gemm2"); - - // gemm5 - M[5] = batch_size * beam_width; - K[5] = hidden_units; - N[5] = ceil(vocab_size / 8.) 
* 8; - strcpy(mess[5], "decoder_output * embedding_kernel -> embedding_output"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - float exec_time = 99999.0f; - int fast_algo = 0; - int seq_len = i == 2 ? 
max_mem_seq_len : 1; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - } - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - if (data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - LtHgemmCustomFind(ltHandle, - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure(batch_size * beam_width, - seq_len, - head_num, - size_per_head, - n, - m, - k, - perfResults[0], - fd, - data_type, - 0); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Decoding Gemm Testing End***\n"); - return; -} - -template void generate_decoding_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); - -template void generate_decoding_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); - -#ifdef ENABLE_BF16 -template void generate_decoding_gemm_config<__nv_bfloat16>(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); -#endif - -size_t calDecodingGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int head_num, - int size_per_head, - int inter_size, - int memory_hidden_units, - int 
vocab_size, - CublasDataType data_type) -{ - size_t buf_size_in_byte = 0; - const size_t tensor_para_size = 1; - const size_t hidden_units = head_num * size_per_head; - const size_t local_head_num = head_num / tensor_para_size; - const size_t local_hidden_units = local_head_num * size_per_head; - - // int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - // Because we always use float for some buffer, set the wordSize to float directly. - int wordSize = sizeof(float); - - size_t m = batch_size * beam_width; - std::vector buff_size; - // for qkv gemm - buff_size.push_back(m * hidden_units + hidden_units * 3 * local_hidden_units + m * 3 * local_hidden_units); - // for attention output gemm - buff_size.push_back(m * hidden_units + hidden_units * local_hidden_units + m * local_hidden_units); - // for memory_tensor gemm - buff_size.push_back(m * max_mem_seq_len * memory_hidden_units + memory_hidden_units * local_hidden_units - + m * max_mem_seq_len * local_hidden_units); - // for context ffn gemm - buff_size.push_back(m * inter_size / tensor_para_size + hidden_units * inter_size / tensor_para_size - + m * hidden_units); - // for vocab - buff_size.push_back(m * hidden_units + hidden_units * ceil(vocab_size / 8.) * 8 / tensor_para_size - + m * ceil(vocab_size / 8.) * 8 / tensor_para_size); - - for (auto t : buff_size) { - buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t; - } - buf_size_in_byte *= wordSize; - buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0); - - return buf_size_in_byte; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/decoding_gemm_func.h b/src/turbomind/utils/gemm_test/decoding_gemm_func.h deleted file mode 100644 index 9f17b358b7..0000000000 --- a/src/turbomind/utils/gemm_test/decoding_gemm_func.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_decoding_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); - -size_t calDecodingGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int head_num, - int size_per_head, - int inter_size, - int memory_hidden_units, - int vocab_size, - CublasDataType data_type); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_gemm_func.cc b/src/turbomind/utils/gemm_test/encoder_gemm_func.cc deleted file mode 100644 index 9acd82c6ca..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_gemm_func.cc +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/gemm_test/encoder_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_encoder_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer_in, bool isAppend, int tensor_para_size) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; - -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 7; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num] = {1, 1, 1, 
1, 1, 1, 1}; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - // gemm1 - M[0] = batch_size * seq_len; - K[0] = head_num * size_per_head; - N[0] = (head_num / tensor_para_size) * size_per_head; - strcpy(mess[0], "from_tensor * weightQ/K/V"); - - // gemm2 - M[1] = M[0]; - K[1] = head_num * size_per_head; - N[1] = 4 * head_num * size_per_head / tensor_para_size; - strcpy(mess[1], "attr_output * inter_kernel"); - - // gemm3 - M[2] = M[0]; - K[2] = 4 * head_num * size_per_head / tensor_para_size; - N[2] = head_num * size_per_head; - strcpy(mess[2], "inter_matmul * output_kernel"); - - M[3] = seq_len; - N[3] = seq_len; - K[3] = size_per_head; - batchCount[3] = batch_size * (head_num / tensor_para_size); - strcpy(mess[3], "attention batched Gemm1"); - - M[4] = seq_len; - N[4] = size_per_head; - K[4] = seq_len; - batchCount[4] = batch_size * (head_num / tensor_para_size); - strcpy(mess[4], "attention batched Gemm2"); - - M[5] = batch_size * seq_len; - N[5] = (head_num / tensor_para_size) * size_per_head; - K[5] = head_num * size_per_head; - batchCount[5] = 3; - strcpy(mess[5], "from_tensor * weight_QKV in BatchGemm"); - - M[6] = batch_size * seq_len; - K[6] = (head_num / tensor_para_size) * size_per_head; - N[6] = head_num * size_per_head; - strcpy(mess[6], "attr * output_kernel"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - // if(i != 0 && i != 5) continue; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - // array of pointer for batchedGemm - T* harray[12]; - harray[0] = (T*)buffer; - harray[1] = (T*)((char*)buffer + sizeof(T) * m * k); - harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k); - harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k); - harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n); 
- harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n); - harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n); - harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n); - harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n); - - T** darray = 0; - check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12)); - cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice); - T** dAarray = darray; - T** dBarray = darray + 4; - T** dCarray = darray + 8; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i < 3) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else if (i == 3) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - seq_len, - seq_len, - size_per_head, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - size_per_head, - seq_len * size_per_head, - &beta, - d_C, - CType, - seq_len, - seq_len * seq_len, - batch_size * head_num, - computeType, - static_cast(algo)); - } - else if (i == 4) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_per_head, - seq_len, - seq_len, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - seq_len, - seq_len * seq_len, - &beta, - d_C, - CType, - size_per_head, - seq_len * size_per_head, - batch_size * head_num, - computeType, - static_cast(algo)); - } - else if (i == 5) { - status = cublasGemmBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - (const void* const*)dBarray, - BType, - n, - (const void* const*)dAarray, - AType, - k, - &beta, - (void* const*)dCarray, - CType, - n, - 3, - computeType, - static_cast(algo)); - } - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - } - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - if (i < 3 && data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - LtHgemmCustomFind(ltHandle, - batch_size, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure( - batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0); - exec_time = perfResults[0].time; - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif 
(CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - exec_times[i] = exec_time; - cudaFree(darray); - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Encoder Gemm Testing End***\n"); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) { - do_sparse_test = true; - } - if (do_sparse_test && sizeof(T) == sizeof(half)) { - printf("***cusparseLt Gemm Testing Begin***\n"); - // only first 3 cases can be sparse - const int spgemm_num = 3; - if (!isAppend) { - fd = fopen(SPGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - fprintf( - fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n"); - } - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < spgemm_num; ++i) { - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - T* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int alg = 0; alg < 4; ++alg) { - 
cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order)) - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - exec_time /= ites; - if (exec_time >= exec_times[i]) { - fast_algo = -1; - } - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - batch_size, - seq_len, - head_num, - size_per_head, - HALF_DATATYPE, - batchCount[i], - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); - } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - return; -} - -template void generate_encoder_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size); -template void generate_encoder_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size); -#ifdef ENABLE_BF16 -template void generate_encoder_gemm_config<__nv_bfloat16>( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size); -#endif - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_gemm_func.h b/src/turbomind/utils/gemm_test/encoder_gemm_func.h deleted file mode 100644 index 35c62ca771..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_gemm_func.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_encoder_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - void* buffer, - bool isAppend = true, - int tensor_para_size = 1); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_igemm_func.cc b/src/turbomind/utils/gemm_test/encoder_igemm_func.cc deleted file mode 100644 index c2cf26bf82..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_igemm_func.cc +++ /dev/null @@ -1,1334 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "encoder_igemm_func.h" -#include "src/turbomind/macro.h" -#include - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! 
-#endif - -namespace turbomind { - -int batch_size_; -int seq_len_; -int head_num_; -int size_per_head_; - -static const char* showStatus(cublasStatus_t error) -{ - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - - return ""; -} - -// Utility function to print customMatmulPerf_t structure -int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint) -{ - int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages; - - const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo; - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL); - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); -#else - stages = 0; -#endif - - printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d " - "time %f workspace=%d mathMode=%d waves=%f\n", - algoId, - tile, - matmulTileName[tile], - numSplitsK, - reductionScheme, - swizzle, - customOption, - stages, - perf.status, - perf.time, - (int)perf.workspaceSize, - (int)perf.mathMode, - perf.wavesCount); - - // chose the fastest algo that does not need workspace - if ((int)perf.workspaceSize == 0 && hasPrint == 0) { - fprintf(fout, - "%d %d %d %d %d ### 1 %d %d %d %d %d %d %d %d %d %d %d %f\n", - batch_size_, - seq_len_, - head_num_, - size_per_head_, - INT8_DATATYPE, - m, - n, - k, - algoId, - customOption, - tile, - numSplitsK, - swizzle, - reductionScheme, - (int)perf.workspaceSize, - stages, - perf.time); - return 1; - } - else { - return hasPrint; - } -} - -int printBatchPerfStructure( - int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint) -{ - int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages; - - const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo; - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL); - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL); - 
cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); -#else - stages = 0; -#endif - - printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d " - "time %f workspace=%d mathMode=%d waves=%f\n", - algoId, - tile, - matmulTileName[tile], - numSplitsK, - reductionScheme, - swizzle, - customOption, - stages, - perf.status, - perf.time, - (int)perf.workspaceSize, - (int)perf.mathMode, - perf.wavesCount); - - // chose the fastest algo that does not need workspace - if ((int)perf.workspaceSize == 0 && hasPrint == 0) { - fprintf(fout, - "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d %f\n", - batch_size_, - seq_len_, - head_num_, - size_per_head_, - INT8_DATATYPE, - batchCount, - m, - n, - k, - algoId, - customOption, - tile, - numSplitsK, - swizzle, - reductionScheme, - (int)perf.workspaceSize, - stages, - perf.time); - return 1; - } - else { - return hasPrint; - } -} - -static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b) -{ - return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time)); -} - -static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU) - cublasLtMatmulDesc_t operationDesc, - const void* alpha, /* host or device pointer */ - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, /* host or device pointer */ - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t& algo, - int kernelRepeats, - void* workSpace, - size_t workSpaceSizeInBytes, - customMatmulPerf_t& perfResults, - cudaStream_t stream) -{ - cublasLtMatmulHeuristicResult_t heurResult; - /* Looping over the Algo */ - int repeats = kernelRepeats; - cublasStatus_t algoStatus = - cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult); - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - if (heurResult.workspaceSize <= workSpaceSizeInBytes) { - cublasStatus_t oneRunStatus; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int loop = 0; loop < repeats; loop++) { - oneRunStatus = cublasLtMatmul(ltHandle, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - workSpace, - workSpaceSizeInBytes, - stream); - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (oneRunStatus != CUBLAS_STATUS_SUCCESS) { - algoStatus = oneRunStatus; - } - float time = dur.count(); - // For the moment only add successful findings - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - perfResults.algo = algo; - perfResults.time = time / repeats; - perfResults.workspaceSize = heurResult.workspaceSize; - perfResults.wavesCount = heurResult.wavesCount; - } - } - 
else { - // printf("not enough workspace! %ld\n", heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - } - else { - // printf("check fail!\n"); - } - return algoStatus; -} - -// Sample wrapper running through multiple algo and config attributes combination for INT8 gemm using cublasLt low-level -// API -template -int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout) -{ - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cudaStream_t stream = 0; - // SplitK value that we are going to try when SplitK is supported for a given algo - const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - // Let try a fixed number of combinations -#define ALGO_COMBINATIONS 50000 - int AlgoCombinations = ALGO_COMBINATIONS; - int AlgoCount = 0; - int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - int nbAlgoIds = 0; -#define ALGO_IDS 100 - int algoIdA[ALGO_IDS]; - - cudaDataType_t Atype, Btype, Ctype, scaleType; - Atype = CUDA_R_8I; - Btype = CUDA_R_8I; - - if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_32I; - scaleType = CUDA_R_32I; - } - else if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_8I; - scaleType = CUDA_R_32F; - } - else { - printf("[ERROR] of igemm is invalid\n"); - exit(-1); - } - -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasOperation_t opTranspose = CUBLAS_OP_T; - - bool use_ORDER_COL32_2R_4R4 = false; -#if (CUDART_VERSION >= 11000) - int device{-1}; - cudaGetDevice(&device); - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device); - if (props.major * 10 + props.minor >= 80) { - use_ORDER_COL32_2R_4R4 = true; - } -#endif - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - int ldbTransform; - if (use_ORDER_COL32_2R_4R4) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - - int ldcTransform = 32 * m; - -#if (CUDART_VERSION >= 11000) - status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - status = cublasLtMatmulDescCreate(&operationDesc, scaleType); -#endif - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - - // Create matrix descriptors. 
- status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - // Request AlgoId available for IGEMM - status = cublasLtMatmulAlgoGetIds( - ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - // Loop over the Algo IDs - for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) { - cublasLtMatmulAlgo_t algo; - size_t sizeWritten = 0; - /* Initialize algo structure with given Algp ID */ - status = - cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo); - if (status != CUBLAS_STATUS_SUCCESS) { - continue; - } - // Query the tiles enums supported by that algo - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten); - int nbTiles = int(sizeWritten / sizeof(int)); - int* tileA = new int[nbTiles == 0 ? 1 : nbTiles]; - if (nbTiles == 0) { - tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - nbTiles = 1; - } -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten); - int nbStages = int(sizeWritten / sizeof(int)); - std::vector stagesA(nbStages == 0 ? 
1 : nbStages); - if (nbStages == 0) { - stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - nbStages = 1; - } - else { - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten); - } -#endif - int splitkSupport, redMask, swizzlingMax, customOptionMax; - // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten); - /* Loop over the different tiles */ - for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) { -#if (CUDART_VERSION >= 11000) - /* Loop over different stages count */ - for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx])); -#endif - /* Loop over the different custom option if any */ - for (int customOption = 0; customOption <= customOptionMax; customOption++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption)); - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzlingMax; k++) { - int splitK_trial = 0; - if (splitkSupport) { - splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]); - } - // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case - // where splitK is not enabled - for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) { - /* Setup attribute of the algo to run */ - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx])); - int splitK_val = 0; - int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE; - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int)); - - if (l > 0) { // Split-K case - splitK_val = splitKSequenceA[l - 1]; - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &splitKSequenceA[l - 1], - sizeof(splitKSequenceA[l - 1])); - /* Going over all the reduction scheme */ - for (redScheme = 1; - redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations); - redScheme = redScheme << 1) { - if (redScheme & redMask) { - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &redScheme, - sizeof(redScheme)); - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - 
perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } // end if - } // end for - } - else { // Non-splitK case - /* if user preference is ok with workspace */ - if (AlgoCount < AlgoCombinations) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - } // end l - } // end k - } // end customOption -#if (CUDART_VERSION >= 11000) - } // end stagesIdx -#endif - } // end tileIdx - delete[] tileA; - } // end idx - // Sort the results per run duration - std::sort(perfResults, perfResults + AlgoCount, time_compare); - // Print timing and perf details - for (int i = 0, hasPrint = 0; i < AlgoCount; i++) { - printf("result %03d : ", i); - hasPrint = printPerfStructure(m, n, k, perfResults[i], fout, hasPrint); - } - -CLEANUP: - // Descriptors are no longer needed as all GPU work was already enqueued - if (Cdesc) { - cublasLtMatrixLayoutDestroy(Cdesc); - } - if (Bdesc) { - cublasLtMatrixLayoutDestroy(Bdesc); - } - if (Adesc) { - cublasLtMatrixLayoutDestroy(Adesc); - } - if (operationDesc) { - cublasLtMatmulDescDestroy(operationDesc); - } - return status == CUBLAS_STATUS_SUCCESS ? 0 : 1; -} - -template int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const int* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const int* beta, /* host pointer */ - int32_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const float* beta, /* host pointer */ - int8_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template -int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout) -{ - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cudaStream_t stream = 0; - // SplitK value that we are going to try when SplitK is supported for a given algo - const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - // Let try a fixed number of combinations -#define ALGO_COMBINATIONS 50000 - int AlgoCombinations = ALGO_COMBINATIONS; - int AlgoCount = 0; - int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - int nbAlgoIds = 0; -#define ALGO_IDS 100 - int algoIdA[ALGO_IDS]; - - cudaDataType_t Atype, Btype, Ctype, scaleType; - Atype = CUDA_R_8I; - Btype = CUDA_R_8I; - - if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_32I; - scaleType = CUDA_R_32I; - } - else if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_8I; - scaleType = CUDA_R_32F; - } - else { - printf("[ERROR] of igemm is invalid\n"); - exit(-1); - } - -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - 
cublasOperation_t opTranspose = CUBLAS_OP_T; - - bool use_ORDER_COL32_2R_4R4 = false; -#if (CUDART_VERSION >= 11000) - int device{-1}; - cudaGetDevice(&device); - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device); - if (props.major * 10 + props.minor >= 80) { - use_ORDER_COL32_2R_4R4 = true; - } -#endif - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - int ldbTransform; - if (use_ORDER_COL32_2R_4R4) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - - int ldcTransform = 32 * m; - - int64_t stridea, strideb, stridec; - stridea = m * k; - strideb = n * k; - stridec = m * n; - -#if (CUDART_VERSION >= 11000) - status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - status = cublasLtMatmulDescCreate(&operationDesc, scaleType); -#endif - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - - // Create matrix descriptors. - status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea)); - - status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb)); - - status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec)); - - // Request AlgoId available for IGEMM - status = cublasLtMatmulAlgoGetIds( - ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - // Loop over the Algo IDs - for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) { - cublasLtMatmulAlgo_t algo; - size_t sizeWritten = 0; - /* Initialize algo structure with given Algp ID */ - status = - cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, 
algoIdA[idx], &algo); - if (status != CUBLAS_STATUS_SUCCESS) { - continue; - } - // Query the tiles enums supported by that algo - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten); - int nbTiles = int(sizeWritten / sizeof(int)); - int* tileA = new int[nbTiles == 0 ? 1 : nbTiles]; - if (nbTiles == 0) { - tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - nbTiles = 1; - } -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten); - int nbStages = int(sizeWritten / sizeof(int)); - std::vector stagesA(nbStages == 0 ? 1 : nbStages); - if (nbStages == 0) { - stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - nbStages = 1; - } - else { - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten); - } -#endif - int splitkSupport, redMask, swizzlingMax, customOptionMax; - // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten); - /* Loop over the different tiles */ - for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) { -#if (CUDART_VERSION >= 11000) - /* Loop over different stages count */ - for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx])); -#endif - /* Loop over the different custom option if any */ - for (int customOption = 0; customOption <= customOptionMax; customOption++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption)); - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzlingMax; k++) { - int splitK_trial = 0; - if (splitkSupport) { - splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]); - } - // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case - // where splitK is not enabled - for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) { - /* Setup attribute of the algo to run */ - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx])); - int splitK_val = 0; - int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE; - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int)); - - if (l > 0) { // Split-K case - splitK_val = splitKSequenceA[l - 1]; - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &splitKSequenceA[l - 1], - sizeof(splitKSequenceA[l - 1])); - /* Going over 
all the reduction scheme */ - for (redScheme = 1; - redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations); - redScheme = redScheme << 1) { - if (redScheme & redMask) { - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &redScheme, - sizeof(redScheme)); - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } // end if - } // end for - } - else { // Non-splitK case - /* if user preference is ok with workspace */ - if (AlgoCount < AlgoCombinations) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - } // end l - } // end k - } // end customOption -#if (CUDART_VERSION >= 11000) - } // end stagesIdx -#endif - } // end tileIdx - delete[] tileA; - } // end idx - // Sort the results per run duration - std::sort(perfResults, perfResults + AlgoCount, time_compare); - // Print timing and perf details - for (int i = 0, hasPrint = 0; i < AlgoCount; i++) { - printf("result %03d : ", i); - hasPrint = printBatchPerfStructure(batchCount, m, n, k, perfResults[i], fout, hasPrint); - } - -CLEANUP: - // Descriptors are no longer needed as all GPU work was already enqueued - if (Cdesc) { - cublasLtMatrixLayoutDestroy(Cdesc); - } - if (Bdesc) { - cublasLtMatrixLayoutDestroy(Bdesc); - } - if (Adesc) { - cublasLtMatrixLayoutDestroy(Adesc); - } - if (operationDesc) { - cublasLtMatmulDescDestroy(operationDesc); - } - return status == CUBLAS_STATUS_SUCCESS ? 
0 : 1; -} - -template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const int* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const int* beta, /* host pointer */ - int32_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const float* beta, /* host pointer */ - int8_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -// initialize matrix in column-major -void matInit(int rows, int cols, int8_t* p, int ld) -{ - srand(time(NULL)); - - for (int c = 0; c < cols; c++) { - for (int r = 0; r < rows; r++) { - int index = r + c * ld; - - p[index] = rand() % 255 - 127; - } - } -} - -int batch_igemm_config(int batchCount, int m, int n, int k, FILE* fout, void* buffer) -{ - printf("batchCount %d m %d n %d k %d\n", batchCount, m, n, k); - int alpha = 1; - int beta = 0; - - int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major - int8_t* d_B = d_A + batchCount * m * k; // k * n, stored in column-major - int32_t* d_C = (int32_t*)(d_B + batchCount * k * n); // m * n, stored in column-major - - cublasLtHandle_t ltHandle; - cublasLtCreate(<Handle); - - LtBatchIgemmCustomFind(ltHandle, - batchCount, - m, - n, - k, - &alpha, /* host pointer */ - d_A, - d_B, - &beta, /* host pointer */ - d_C, - NULL, - 0, - fout); - // free memory - cublasLtDestroy(ltHandle); - return 0; -} - -int igemm_config(int m, int n, int k, FILE* fout, void* buffer) -{ - printf("batchCount %d m %d n %d k %d\n", 1, m, n, k); - int alpha = 1; - int beta = 0; - - int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major - int8_t* d_B = d_A + m * k; // k * n, stored in column-major - int32_t* d_C = (int32_t*)(d_B + k * n); // m * n, stored in column-major - - cublasLtHandle_t ltHandle; - cublasLtCreate(<Handle); - - LtIgemmCustomFind(ltHandle, - m, - n, - k, - &alpha, /* host pointer */ - d_A, - d_B, - &beta, /* host pointer */ - d_C, - NULL, - 0, - fout); - - cublasLtDestroy(ltHandle); - return 0; -} - -int generate_encoder_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend) -{ - - // ensure program running on SM >= 7.5 - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - if (!(prop.major >= 8 || (prop.major >= 7 && prop.minor >= 5))) { - printf("[ERROR] INT8 mode > 0 is only supported on device with sm >= 7.5\n "); - exit(-1); - } - printf("Device %s\n", prop.name); - - // check config - FILE* fout; - if (!isAppend) { - fout = fopen(IGEMM_CONFIG, "w+"); - fprintf( - fout, - "batch_size seq_len head_num size_per_head dataType ### batchCount m n k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n"); - } - else { - fout = fopen(IGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fout) != NULL) { - config.push_back(std::string(line)); - } - if (config.size() >= MAX_CONFIG_NUM * GEMM_NUM) { - int startIdx = config.size() - (MAX_CONFIG_NUM - 1) * GEMM_NUM; - fclose(fout); - fout = fopen(IGEMM_CONFIG, "w+"); - for (int i = startIdx; i < (int)config.size(); i++) { - fprintf(fout, "%s", config[i].c_str()); - } - } - } - - batch_size_ = batch_size; - seq_len_ = seq_len; - head_num_ = head_num; - size_per_head_ = size_per_head; - int m = batch_size * seq_len; - int n = head_num * 
size_per_head; - int k = n; - int batchCount; - - printf("***Encoder IGemm Testing Begin***\n"); - printf("\n-----------------------------\n"); - - batchCount = 3; - m = batch_size * seq_len; - k = head_num * size_per_head; - n = k; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - batch_igemm_config(batchCount, m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = seq_len; - n = seq_len; - k = size_per_head; - batchCount = batch_size * head_num; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - batch_igemm_config(batchCount, m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = seq_len; - n = size_per_head; - k = seq_len; - batchCount = batch_size * head_num; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - batch_igemm_config(batchCount, m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = head_num * size_per_head; - k = head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - n = 4 * n; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - n = k; - k = 4 * n; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config(m, n, k, fout, buffer); - } - - fclose(fout); - printf("\n-----------------------------\n"); - printf("***Encoder IGemm Testing End***\n"); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) { - do_sparse_test = true; - } - if (do_sparse_test) { - printf("***cusparseLt Gemm Testing Begin***\n"); - const int spgemm_num = 3; - FILE* fd; - int line_count = 0; - const int ites = 100; - if (!isAppend) { - fd = fopen(SPIGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPIGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPIGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - fprintf( - fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n"); - } - - int M[spgemm_num]; - int N[spgemm_num]; - int K[spgemm_num]; - // gemm1 - M[0] = batch_size * seq_len; - K[0] = head_num * size_per_head; - N[0] = K[0]; - // gemm2 - M[1] = M[0]; - K[1] = K[0]; - N[1] = 4 * N[0]; - // gemm3 - M[2] = M[0]; - K[2] = 4 * K[0]; - N[2] = N[0]; - - cusparseLtHandle_t handle; - 
CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t col_order = CUSPARSE_ORDER_COL; - cusparseOrder_t row_order = CUSPARSE_ORDER_ROW; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < spgemm_num; ++i) { - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - int8_t* d_A = (int8_t*)buffer; - int8_t* d_B = d_A + m * k; - int8_t* d_C = d_B + k * n; - int8_t* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ")); - } - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int alg = 0; alg < 4; ++alg) { - cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_8I, col_order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_8I, col_order)) - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, 
dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - exec_time /= ites; - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### 1 %d %d %d %d %f\n", - batch_size, - seq_len, - head_num, - size_per_head, - HALF_DATATYPE, - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); - } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - return 0; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_igemm_func.h b/src/turbomind/utils/gemm_test/encoder_igemm_func.h deleted file mode 100644 index 4cadeed026..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_igemm_func.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include -#include - -namespace turbomind { - -/* CAUTION : must match cublasLtMatmulTile_t */ -const char* const matmulTileName[] = {"UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8", - "8x64", "16x32", "32x16", "64x8", "32x32", "32x64", "64x32", - "32x128", "64x64", "128x32", "64x128", "128x64", "64x256", "128x128", - "256x64", "64x512", "128x256", "256x128", "512x64", "64x96", "96*64", - "96x128", "128x160", "160x128", "192x128", "128x192", "128x96", "END"}; - -int generate_encoder_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); - -int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint); - -int printBatchPerfStructure( - int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint); - -template -int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template -int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -void matInit(int rows, int cols, int8_t* p, int ld); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gemm_func.cc b/src/turbomind/utils/gemm_test/gemm_func.cc deleted file mode 100644 index 0a4645481b..0000000000 --- a/src/turbomind/utils/gemm_test/gemm_func.cc +++ /dev/null @@ -1,990 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "encoder_gemm_func.h" -#include -#include -#include - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#endif - -namespace turbomind { - -// Utility function to print customMatmulPerf_t structure -int printPerfStructure(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const customMatmulPerf_t& perf, - FILE* fout, - CublasDataType data_type, - int hasPrint, - int batch_count) -{ - int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages; - - const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo; - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL); - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); -#else - stages = 0; -#endif -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - uint16_t inner_shapeId, cluster_shapeId; - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &inner_shapeId, sizeof(inner_shapeId), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &cluster_shapeId, sizeof(cluster_shapeId), NULL); -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - uint16_t mma_shapeId, cga_shapeId, sche_mode; - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &mma_shapeId, sizeof(mma_shapeId), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &cga_shapeId, sizeof(cga_shapeId), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &sche_mode, sizeof(sche_mode), NULL); -#endif - - printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d " -#if (CUDART_VERSION >= 11000) - "stages=%d " -#endif -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "inner_shapeId=%d cluster_shapeId=%d" -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "mma_shapeId=%d cga_shapeId=%d schedule_mode=%d" -#endif - "} status %d " - "time %fms workspace=%d mathMode=%d waves=%f\n", - algoId, - tile, - matmulTileName[tile], - numSplitsK, - 
reductionScheme, - swizzle, - customOption, -#if (CUDART_VERSION >= 11000) - stages, -#endif -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - inner_shapeId, - cluster_shapeId, -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - mma_shapeId, - cga_shapeId, - sche_mode, -#endif - perf.status, - perf.time, - (int)perf.workspaceSize, - (int)perf.mathMode, - perf.wavesCount); - if (hasPrint == 0) { - fprintf(fout, - "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "%d %d " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "%d %d %d " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batch_count, - m, - n, - k, - algoId, - customOption, - tile, - numSplitsK, - swizzle, - reductionScheme, - (int)perf.workspaceSize, - stages, -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - inner_shapeId, - cluster_shapeId, -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - mma_shapeId, - cga_shapeId, - sche_mode, -#endif - perf.time); - return 1; - } - else { - return hasPrint; - } -} - -static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b) -{ - return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time)); -} - -static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU) - cublasLtMatmulDesc_t operationDesc, - const void* alpha, /* host or device pointer */ - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, /* host or device pointer */ - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t& algo, - int kernelRepeats, - void* workSpace, - size_t workSpaceSizeInBytes, - customMatmulPerf_t& perfResults, - cudaStream_t stream, - cudaEvent_t& startEvent, - cudaEvent_t& stopEvent) -{ - cublasLtMatmulHeuristicResult_t heurResult; - /* Looping over the Algo */ - int repeats = kernelRepeats; - cublasStatus_t algoStatus = - cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult); - - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - if (heurResult.workspaceSize <= workSpaceSizeInBytes) { - cudaError_t err, err1, err2, err3; - err = cudaEventRecord(startEvent, stream); - for (int loop = 0; loop < repeats; loop++) { - cublasStatus_t oneRunStatus = cublasLtMatmul(ltHandle, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - workSpace, - workSpaceSizeInBytes, - stream); - if (oneRunStatus != CUBLAS_STATUS_SUCCESS) { - algoStatus = oneRunStatus; - break; - } - } - err1 = cudaEventRecord(stopEvent, stream); - err2 = cudaEventSynchronize(stopEvent); - float time; - err3 = cudaEventElapsedTime(&time, startEvent, stopEvent); - if ((err != cudaSuccess) || (err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) { - algoStatus = CUBLAS_STATUS_INTERNAL_ERROR; - } - // For the moment only add successful findings - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - perfResults.algo = algo; - perfResults.time = time / repeats; - perfResults.workspaceSize = heurResult.workspaceSize; - perfResults.wavesCount = heurResult.wavesCount; - } - } - else { - // printf("not enough workspace! 
%ld\n", heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - } - - return algoStatus; -} - -template -int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const T* A, - const T* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD) -{ - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - cudaEvent_t startEvent; - cudaEvent_t stopEvent; - CublasDataType data_type; - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL; - - cudaStream_t stream = 0; - // SplitK value that we are going to try when SplitK is supported for a - // given algo - const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - // Let try a fixed number of combinations - int AlgoCount = 0; - int AlgoCountRestrict = 0; // workspace == 0 - const int maxNumTraversal = 50; // max number of traversal - std::vector algos(AlgoCombinations); // 0 <= workspace <= 32MB - std::vector algosRestrict(AlgoCombinations); // workspace == 0 - const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back - int nbAlgoIds = 0; // Number of algorithms actually returned by - // cublasLtMatmulAlgoGetIds function. -#define ALGO_IDS 100 // Number of algorithms requested. - int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by - // cublasLtMatmulAlgoGetIds function. - cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType; -#else - cudaDataType_t computeType; -#endif - - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - Atype = CUDA_R_32F, Btype = CUDA_R_32F, Ctype = CUDA_R_32F, Dtype = CUDA_R_32F; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - Atype = CUDA_R_16F, Btype = CUDA_R_16F, Ctype = CUDA_R_16F, Dtype = CUDA_R_16F; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - Atype = CUDA_R_16BF, Btype = CUDA_R_16BF, Ctype = CUDA_R_16BF, Dtype = CUDA_R_16BF; - } -#endif -#ifdef ENABLE_FP8 - else if (std::is_same::value) { - data_type = FP8_DATATYPE; - Atype = CUDA_R_8F_E4M3, Btype = CUDA_R_8F_E4M3, Ctype = CUDA_R_16BF; -#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE - Dtype = CUDA_R_16BF; -#else - Dtype = dtype_fp8; -#endif - } -#endif - - if (sizeof(scaleT) == sizeof(float)) { - scaleType = CUDA_R_32F; -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_32F; -#else - computeType = CUDA_R_32F; -#endif - } - else { - scaleType = CUDA_R_16F; -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_16F; -#else - computeType = CUDA_R_16F; -#endif - } - - const cublasOperation_t tA = data_type == FP8_DATATYPE ? 
CUBLAS_OP_T : CUBLAS_OP_N; - -// Create operation descriptor; see cublasLtMatmulDescAttributes_t for -// details about defaults; here we just need to set the transforms for A and -// B -#if (CUDART_VERSION >= 11000) - status = cublasLtMatmulDescCreate(&operationDesc, computeType, - scaleType); // creates a matrix multiply descriptor -#else - status = cublasLtMatmulDescCreate(&operationDesc, computeType); -#endif - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - status = cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } -#ifdef ENABLE_FP8 - if (data_type == FP8_DATATYPE) { - const int8_t fastAccuMode = 1; // enable fast imprecise accum - status = cublasLtMatmulDescSetAttribute( - operationDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - } -#endif - - // Create matrix descriptors. We are good with the details here so no need - // to set any extra attributes - if (data_type == FP8_DATATYPE) { - status = cublasLtMatrixLayoutCreate(&Adesc, Atype, k, m, k); - } - else { - status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, m); - } - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, k, n, k); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, m); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutCreate(&Ddesc, Dtype, m, n, m); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - if (batchCount > 1) { - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD))); - } - - // Create CUDA event to time the execution time of each algo - if (cudaEventCreate(&startEvent, cudaEventBlockingSync) != cudaSuccess) { - goto CLEANUP; - } - if (cudaEventCreate(&stopEvent, cudaEventBlockingSync) != cudaSuccess) { - goto CLEANUP; - } - - // Request the 100 first AlgoId available - status = cublasLtMatmulAlgoGetIds( - ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, ALGO_IDS, algoIdA, &nbAlgoIds); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - if (nbAlgoIds > ALGO_IDS) { - printf( - "Warning: the algo id count is not large enough to guarantee the best algo %d, %d\n", nbAlgoIds, ALGO_IDS); - } - - // Loop over the Algo IDs - // This loop doesn't work for fp8 gemm - for (int idx = 0; 
(idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) { - cublasLtMatmulAlgo_t algo; - size_t sizeWritten = 0; - /* Initialize algo structure with given Algp ID */ - status = - cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, algoIdA[idx], &algo); - if (status != CUBLAS_STATUS_SUCCESS) { - continue; - } - // Query the tiles enums supported by that algo - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten); - int nbTiles = int(sizeWritten / sizeof(int)); - int* tileA = new int[nbTiles == 0 ? 1 : nbTiles]; - if (nbTiles == 0) { - tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - nbTiles = 1; - } -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten); - int nbStages = int(sizeWritten / sizeof(int)); - std::vector stagesA(nbStages == 0 ? 1 : nbStages); - if (nbStages == 0) { - stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - nbStages = 1; - } - else { - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten); - } -#endif - int splitkSupport, redMask, swizzlingMax, customOptionMax; - // Retrieve Algo Capabilities attributes to be able to setup loop over - // the different combinations - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten); - - /* Loop over the different tiles */ - for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) { -#if (CUDART_VERSION >= 11000) - /* Loop over different stages count */ - for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx])); -#endif - /* Loop over the different custom option if any */ - for (int customOption = 0; customOption <= customOptionMax; customOption++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption)); - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzlingMax; k++) { - int splitK_trial = 0; - if (splitkSupport) { - splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]); - } - // Loop over the splitK value over a fixed sequence - // splitKSequenceA in addition to the case where splitK - // is not enabled - for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) { - /* Setup attribute of the algo to run */ - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx])); - int splitK_val = 0; - int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE; - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, 
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int)); - - if (l > 0) { // Split-K case - splitK_val = splitKSequenceA[l - 1]; - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &splitKSequenceA[l - 1], - sizeof(splitKSequenceA[l - 1])); - /* Going over all the reduction scheme */ - for (redScheme = 1; - redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations); - redScheme = redScheme << 1) { - if (redScheme & redMask) { - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &redScheme, - sizeof(redScheme)); - - cublasLtMatmulHeuristicResult_t heurResult; - cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( - ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult); - if (heurResult.workspaceSize > workSpaceSize) { - // printf("not enough workspace! - // %ld\n", - // heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - else if (heurResult.workspaceSize == 0) { - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algosRestrict[AlgoCountRestrict++] = algo; - } - } - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algos[AlgoCount++] = algo; - } - } // end if - } // end for - } - else { // Non-splitK case - /* if user preference is ok with workspace */ - if (AlgoCount < AlgoCombinations) { - cublasLtMatmulHeuristicResult_t heurResult; - cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( - ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult); - if (heurResult.workspaceSize > workSpaceSize) { - // printf("not enough workspace! %ld\n", - // heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not - // enough - // workspace - } - else if (heurResult.workspaceSize == 0) { - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algosRestrict[AlgoCountRestrict++] = algo; - } - } - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algos[AlgoCount++] = algo; - } - } - } - } // end l - } // end k - } // end customOption -#if (CUDART_VERSION >= 11000) - } // end stagesIdx -#endif - } // end tileIdx - delete[] tileA; - } // end idx - - printf("AlgoCount: %d\n", AlgoCount); - if (data_type == FP8_DATATYPE) { - assert(AlgoCount == 0); - } - if (AlgoCount < maxNumTraversal && data_type != FP8_DATATYPE) { - // 0 <= workspacesize <= 32MB - for (int i = 0; i < AlgoCount; i++) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algos[i], - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[i], - stream, - startEvent, - stopEvent); - perfResults[i].status = status; - // if (status == CUBLAS_STATUS_SUCCESS) AlgoCount++; - } - } - else { - // Heuristic + workspacesize==0 - AlgoCount = 0; - nbAlgoIds = 0; - cublasLtMatmulPreference_t pref; - cublasLtMatmulPreferenceCreate(&pref); - uint64_t maxWorkSpaceSize = workSpaceSize; //(32MB) - cublasLtMatmulPreferenceSetAttribute( - pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &maxWorkSpaceSize, sizeof(maxWorkSpaceSize)); - cublasLtMatmulHeuristicResult_t heuristicResultsArray[maxNumTraversal]; - - cublasLtMatmulAlgoGetHeuristic(ltHandle, - operationDesc, - Adesc, - Bdesc, - Cdesc, - Ddesc, - pref, - maxNumTraversal, - heuristicResultsArray, - &nbAlgoIds); - cublasLtMatmulPreferenceDestroy(pref); - printf("return %d and run heuristic algo\n", nbAlgoIds); - for (int i = 0; i < nbAlgoIds; i++) { - if (heuristicResultsArray[i].state == 
CUBLAS_STATUS_SUCCESS) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Ddesc, - heuristicResultsArray[i].algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream, - startEvent, - stopEvent); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - - // workspacesize==0 - printf("workspacesize==0, run %d algos\n", AlgoCountRestrict); - for (int i = 0; i < AlgoCountRestrict && i < (maxNumTraversal - nbAlgoIds); i++) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Ddesc, - algosRestrict[i], - kernelRepeats, - NULL, - 0, - perfResults[AlgoCount], - stream, - startEvent, - stopEvent); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - - // Sort the results per run duration - std::sort(perfResults, perfResults + AlgoCount, time_compare); - // Print timing and perf details - for (int i = 0, hasPrint = 1; i < AlgoCount; i++) { - printf("result %03d : ", i); - hasPrint = printPerfStructure(batch_size, - seq_len, - head_num, - size_per_head, - m, - n, - k, - perfResults[i], - fout, - data_type, - hasPrint, - batchCount); - } - -CLEANUP: - // Descriptors are no longer needed as all GPU work was already enqueued - if (Cdesc) { - cublasLtMatrixLayoutDestroy(Cdesc); - } - if (Bdesc) { - cublasLtMatrixLayoutDestroy(Bdesc); - } - if (Adesc) { - cublasLtMatrixLayoutDestroy(Adesc); - } - if (operationDesc) { - cublasLtMatmulDescDestroy(operationDesc); - } - if (startEvent) { - cudaEventDestroy(startEvent); - } - if (stopEvent) { - cudaEventDestroy(stopEvent); - } - return status == CUBLAS_STATUS_SUCCESS ? 
0 : 1; -} - -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const float* A, - const float* B, - const float* beta, /* host pointer */ - float* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); - -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const half* alpha, /* host pointer */ - const half* A, - const half* B, - const half* beta, /* host pointer */ - half* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); - -#ifdef ENABLE_BF16 -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const __nv_bfloat16* A, - const __nv_bfloat16* B, - const float* beta, /* host pointer */ - __nv_bfloat16* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); -#endif - -#ifdef ENABLE_FP8 -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const __nv_fp8_e4m3* A, - const __nv_fp8_e4m3* B, - const float* beta, /* host pointer */ - __nv_fp8_e4m3* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); -#endif - -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const half* A, - const half* B, - const float* beta, /* host pointer */ - half* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); - -size_t calGemmTestBufSizeInByte(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int int8_mode, - CublasDataType data_type) -{ - size_t buf_size_in_byte; - if (int8_mode > 0) { - int m = batch_size * seq_len; - int n = head_num * size_per_head; - int k = n; - - size_t size1 = 3 * (m * k * sizeof(int8_t) + k * n * sizeof(int8_t) + m * n * sizeof(int)); - size_t size2 = batch_size * head_num - * (seq_len * size_per_head * sizeof(int8_t) + size_per_head * seq_len * sizeof(int8_t) - + seq_len * seq_len * sizeof(int)); - size_t size3 = batch_size * head_num - * (seq_len * seq_len * sizeof(int8_t) + seq_len * size_per_head * sizeof(int8_t) - + seq_len * size_per_head * sizeof(int)); - size_t size4 = m * k * sizeof(int8_t) + k * inter_size * sizeof(int8_t) + m * inter_size * sizeof(int); - size_t size5 = m * k * sizeof(int8_t) + k * vocab_size 
* sizeof(int8_t) + m * vocab_size * sizeof(int); - buf_size_in_byte = size1 > size2 ? size1 : size2; - buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3; - buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4; - buf_size_in_byte = buf_size_in_byte > size5 ? buf_size_in_byte : size5; - } - else { - size_t m = batch_size * seq_len; - size_t n = head_num * size_per_head; - size_t k = n; - // TODO need to add bfloat16 here - int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - size_t size1 = 3 * (m * k + k * n + m * n) * wordSize; - size_t size2 = (size_t)batch_size * (size_t)head_num - * ((size_t)seq_len * (size_t)seq_len + (size_t)seq_len * (size_t)size_per_head - + (size_t)seq_len * (size_t)size_per_head) - * (size_t)wordSize; - size_t size3 = (m * k + k * inter_size + m * inter_size) * wordSize; - size_t size4 = (m * k + k * vocab_size + m * vocab_size) * wordSize; - buf_size_in_byte = size1 > size2 ? size1 : size2; - buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3; - buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4; - buf_size_in_byte += - ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0); - } - return buf_size_in_byte; -} - -size_t calGemmTestBufSizeInByteXlnet( - int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16) -{ - int M[10] = {0}; - int N[10] = {0}; - int K[10] = {0}; - int batchCount[10] = {0}; - - // gemm1 - M[0] = hidden_units; - N[0] = seq_len * batch_size; - K[0] = hidden_units; - batchCount[0] = 3; - - // gemm2 - M[1] = hidden_units; - N[1] = seq_len * 2; - K[1] = hidden_units; - batchCount[1] = 1; - - // gemm3 - M[2] = seq_len; - N[2] = seq_len; - K[2] = size_per_head; - batchCount[2] = batch_size * head_num; - - // gemm4 - M[3] = seq_len * 2; - N[3] = seq_len; - K[3] = size_per_head; - batchCount[3] = batch_size * head_num; - - // gemm5 - M[4] = 2; - N[4] = seq_len; - K[4] = size_per_head; - batchCount[4] = batch_size * head_num; - - // gemm6 - M[5] = head_num; - N[5] = seq_len; - K[5] = 2; - // gemm7 - M[6] = size_per_head; - N[6] = seq_len; - K[6] = seq_len; - batchCount[6] = batch_size * head_num; - - // gemm8 - M[7] = hidden_units; - N[7] = seq_len; - K[7] = hidden_units; - batchCount[7] = batch_size; - - // gemm9 - M[8] = inter_size; - N[8] = seq_len; - K[8] = hidden_units; - batchCount[8] = batch_size; - - // gemm10 - M[9] = hidden_units; - N[9] = seq_len; - K[9] = inter_size; - batchCount[9] = batch_size; - - size_t max_size = 0; - - for (int i = 0; i < 10; ++i) { - int m = M[i], n = N[i], k = K[i]; - size_t size = (M[i] * N[i] + M[i] * K[i] + N[i] * K[i]) * batchCount[i]; - if (size > max_size) { - max_size = size; - } - } - - int size_per_ele = 4; - if (is_fp16 == true) { - size_per_ele = 2; - } - return max_size * size_per_ele; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gemm_func.h b/src/turbomind/utils/gemm_test/gemm_func.h deleted file mode 100644 index b33ae2132b..0000000000 --- a/src/turbomind/utils/gemm_test/gemm_func.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "encoder_igemm_func.h" // TODO(bhsueh) Remove this include -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#ifdef ENABLE_BF16 -#include -#endif -#ifdef ENABLE_FP8 -#include -#endif -#ifdef __linux__ -#include -#include -#endif -#include -#include -#include - -namespace turbomind { - -// Scale Type Converter -// is_fp16_compute_type is only valid when T = half -template -struct ScaleTypeConverter { - using Type = float; -}; - -template<> -struct ScaleTypeConverter { - using Type = half; -}; - -template -int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const T* A, - const T* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8 = CUDA_R_32F, - int batchCount = 1, - int64_t strideA = 0, - int64_t strideB = 0, - int64_t strideD = 0); - -size_t calGemmTestBufSizeInByte(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int int8_mode, - CublasDataType data_type); - -size_t calGemmTestBufSizeInByteXlnet( - int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16); - -int printPerfStructure(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const customMatmulPerf_t& perf, - FILE* fout, - CublasDataType data_type, - int hasPrint, - int batch_count = 1); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gpt_gemm_func.cc b/src/turbomind/utils/gemm_test/gpt_gemm_func.cc deleted file mode 100644 index 68e665930f..0000000000 --- a/src/turbomind/utils/gemm_test/gpt_gemm_func.cc +++ /dev/null @@ -1,811 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -bool isSparseGemmAvailable(size_t m, size_t n, size_t k) -{ - return m % 8 == 0 && n % 8 == 0 && k % 8 == 0; -} - -template -void generate_gpt_gemm_config(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend) -{ - FT_CHECK(head_num % tensor_para_size == 0); - void* cublas_workspace; - void* buffer; - int workSpaceSize; -#if 0 - bool workspace_flag = std::is_same::value; -#ifdef ENABLE_FP8 - workspace_flag = workspace_flag || std::is_same::value; -#endif -#if ENABLE_BF16 - workspace_flag = workspace_flag || std::is_same::value; -#endif -#endif - // algorithms with workspace perform worse than evaluated - const bool workspace_flag = 0; - if (workspace_flag) { - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - // if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - // { - // int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - // fclose(fd); - // fd = fopen(GEMM_CONFIG, "w+"); - // fprintf(fd, "%s", config[0].c_str()); - // for (uint i = startIdx; i < config.size(); i++) { - // fprintf(fd, "%s", config[i].c_str()); - // } - // line_count = config.size() - (GEMM_NUM + 3); - // } - } - - const int hidden_units = head_num * size_per_head; - const int local_head_num = head_num / tensor_para_size; - const int local_hidden_units = local_head_num * size_per_head; - const int max_input_len_padded = (max_input_len + 15) / 16 * 16; - const int gemm_num = 11; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num]; - int64_t strideA[gemm_num]; - int64_t strideB[gemm_num]; - int64_t strideD[gemm_num]; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - // gemm 0 - M[0] = batch_size * beam_width * max_input_len; - K[0] = hidden_units; - N[0] = 3 * local_hidden_units; - batchCount[0] = 1; - strideA[0] = 0; - strideB[0] = 0; - strideD[0] = 0; - strcpy(mess[0], "context from_tensor * weightQKV"); - - // gemm 1 - M[1] = max_input_len_padded; - K[1] = size_per_head; - N[1] = max_input_len_padded; - batchCount[1] = batch_size * beam_width * local_head_num; - strideA[1] = max_input_len_padded * size_per_head; - strideB[1] = max_input_len_padded * size_per_head; - strideD[1] = max_input_len_padded * max_input_len_padded; - strcpy(mess[1], "context batch gemm Q*K^T"); - - // gemm 2 - M[2] = max_input_len_padded; - K[2] = max_input_len_padded; - N[2] = size_per_head; - batchCount[2] = batch_size * beam_width * local_head_num; - strideA[2] = max_input_len_padded * size_per_head; - strideB[2] = max_input_len_padded * max_input_len_padded; - strideD[2] 
= max_input_len_padded * size_per_head; - strcpy(mess[2], "context batch gemm QK*V^T"); - - // gemm 3 - M[3] = batch_size * beam_width * max_input_len; - K[3] = local_hidden_units; - N[3] = hidden_units; - batchCount[3] = 1; - strideA[3] = 0; - strideB[3] = 0; - strideD[3] = 0; - strcpy(mess[3], "context attr * output_kernel"); - - // gemm 4 - M[4] = batch_size * beam_width * max_input_len; - K[4] = hidden_units; - N[4] = inter_size / tensor_para_size; - batchCount[4] = 1; - strideA[4] = 0; - strideB[4] = 0; - strideD[4] = 0; - strcpy(mess[4], "context ffn gemm 1"); - - // gemm 5 - M[5] = batch_size * beam_width * max_input_len; - K[5] = inter_size / tensor_para_size; - N[5] = hidden_units; - batchCount[5] = 1; - strideA[5] = 0; - strideB[5] = 0; - strideD[5] = 0; - strcpy(mess[5], "context ffn gemm 2"); - - // gemm 6 - M[6] = batch_size * beam_width; - K[6] = hidden_units; - N[6] = 3 * local_hidden_units; - batchCount[6] = 1; - strideA[6] = 0; - strideB[6] = 0; - strideD[6] = 0; - strcpy(mess[6], "from_tensor * weightQKV"); - - // gemm 7 - M[7] = batch_size * beam_width; - K[7] = local_hidden_units; - N[7] = hidden_units; - batchCount[7] = 1; - strideA[7] = 0; - strideB[7] = 0; - strideD[7] = 0; - strcpy(mess[7], "attr * output_kernel"); - - // gemm 8 - M[8] = batch_size * beam_width; - K[8] = hidden_units; - N[8] = inter_size / tensor_para_size; - batchCount[8] = 1; - strideA[8] = 0; - strideB[8] = 0; - strideD[8] = 0; - strcpy(mess[8], "ffn gemm 1"); - - // gemm 9 - M[9] = batch_size * beam_width; - K[9] = inter_size / tensor_para_size; - N[9] = hidden_units; - batchCount[9] = 1; - strideA[9] = 0; - strideB[9] = 0; - strideD[9] = 0; - strcpy(mess[9], "ffn gemm 2"); - - // gemm 10 - M[10] = batch_size * beam_width; - K[10] = hidden_units; - N[10] = ceil(vocab_size / 8.) 
* 8 / tensor_para_size; - batchCount[10] = 1; - strideA[10] = 0; - strideB[10] = 0; - strideD[10] = 0; - strcpy(mess[10], "logits gemm"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t DType; - cudaDataType_t DType_FP8[gemm_num]; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - DType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - DType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - DType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif -#ifdef ENABLE_FP8 - else if (std::is_same::value) { - data_type = FP8_DATATYPE; - AType = CUDA_R_8F_E4M3; - BType = CUDA_R_8F_E4M3; - CType = CUDA_R_16BF; -#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE - DType = CUDA_R_16BF -#else - DType_FP8[0] = CUDA_R_8F_E4M3; - DType_FP8[1] = CUDA_R_16BF; - DType_FP8[2] = CUDA_R_8F_E4M3; - DType_FP8[3] = CUDA_R_16BF; - DType_FP8[4] = CUDA_R_16BF; - DType_FP8[5] = CUDA_R_16BF; -#ifdef FP8_MHA - DType_FP8[6] = CUDA_R_8F_E4M3; -#else - DType_FP8[6] = CUDA_R_16BF; -#endif - DType_FP8[7] = CUDA_R_16BF; - DType_FP8[8] = CUDA_R_16BF; - DType_FP8[9] = CUDA_R_16BF; -#endif - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - float alpha = (float)1.0f; - float beta = (float)0.0f; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "inner_shapeId, cluster_shapeId, " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "mma_shapeId, cga_shapeId, schedule_mode, " -#endif - "exec_time\n"); - } - - for (int i = 0; i < gemm_num; ++i) { - // tuning of context gemm and logits gemm is not working yet - if (i <= 5 || i == 10) { - continue; - } - int seq_len = i <= 5 ? 
max_input_len : 1; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i == 1) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - max_input_len, - max_input_len, - size_per_head, - &alpha, - d_B, - BType, - size_per_head, - max_input_len * size_per_head, - d_A, - AType, - size_per_head, - max_input_len * size_per_head, - &beta, - d_C, - CUDA_R_32F, // CType, - max_input_len, - max_input_len * max_input_len, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 2) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_per_head, - max_input_len, - max_input_len, - &alpha, - d_B, - BType, - size_per_head, - max_input_len * size_per_head, - d_A, - AType, - max_input_len, - max_input_len * max_input_len, - &beta, - d_C, - CType, - size_per_head, - max_input_len * size_per_head, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 10) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - k, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - sync_check_cuda_error(); - } - - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - // for fp8, compare cublaslt for all gemm kernels - if ((data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 10) || data_type == FP8_DATATYPE) { - printf("***cublasLt Gemm Testing Beign***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 10000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - // for gpt, computeType & scaleType should be FP32 - LtHgemmCustomFind(ltHandle, - batch_size * beam_width, - i == 1 || i == 2 ? 
max_input_len : 1, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS, - DType_FP8[i], - batchCount[i], - strideA[i], - strideB[i], - strideD[i]); - if (perfResults[0].time < exec_time) { - printPerfStructure(batch_size * beam_width, - seq_len, - head_num, - size_per_head, - n, - m, - k, - perfResults[0], - fd, - data_type, - 0, - batchCount[i]); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - sync_check_cuda_error(); - exec_times[i] = exec_time; - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6) && sizeof(T) == sizeof(half)) { - do_sparse_test = true; - } - if (do_sparse_test) { - printf("***cusparseLt Gemm Testing Begin***\n"); - // Only first 8 cases can be sparse - // - QKV kernel, Projection, FC1, FC2 in context or decoding. - const int spgemm_num = 8; - if (!isAppend) { - fd = fopen(SPGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - // gemm_num configs (cublas/cublasLt), first row is not included - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - // header line - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType " - "### batchCount, m, n, k, algoId, exec_time\n"); - } - - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - // let's make this optional - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < gemm_num; ++i) { - // skip qk or attn or logit gemms. - if (i == 1 || i == 2 || i == 10) { - continue; - } - - // seq_len is always 1 except context gemms. - int seq_len = i <= 5 ? 
max_input_len : 1; - - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - - if (n % 8 != 0) { - n = div_up(n, 8) * 8; // pad n to be multiple of 8 as FT does. - } - - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - T* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - - float exec_time = 99999.0f; - int fast_algo = 0; - if (isSparseGemmAvailable(m, n, k)) { - for (int alg = 0; alg < 4; ++alg) { - cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order)) - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - } - exec_time /= ites; - if (exec_time >= exec_times[i]) { - fast_algo = -1; - } - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); 
- } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - - printf("***GPT Gemm Testing End***\n"); - return; -} - -template void generate_gpt_gemm_config(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); - -template void generate_gpt_gemm_config(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); - -#ifdef ENABLE_BF16 -template void generate_gpt_gemm_config<__nv_bfloat16>(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); -#endif - -#ifdef ENABLE_FP8 -template void generate_gpt_gemm_config<__nv_fp8_e4m3>(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); -#endif - -size_t calGptGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - CublasDataType data_type) -{ - size_t buf_size_in_byte = 0; - const size_t hidden_units = head_num * size_per_head; - const size_t local_head_num = head_num / tensor_para_size; - const size_t local_hidden_units = local_head_num * size_per_head; - - // int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - // Because we always use float for some buffer, set the wordSize to float directly. - int wordSize = sizeof(float); - - size_t m = batch_size * beam_width * max_input_len; - std::vector buff_size; - // for context qkv gemm - buff_size.push_back(m * hidden_units + hidden_units * 3 * local_hidden_units + m * 3 * local_hidden_units); - // for context batch gemm - buff_size.push_back(m * local_hidden_units + m * local_hidden_units - + batch_size * beam_width * head_num * max_input_len * max_input_len); - // for context ffn gemm - buff_size.push_back(m * inter_size / tensor_para_size + hidden_units * inter_size / tensor_para_size - + m * hidden_units); - // for vocab - buff_size.push_back(m * hidden_units + hidden_units * ceil(vocab_size / 8.) * 8 / tensor_para_size - + m * ceil(vocab_size / 8.) * 8 / tensor_para_size); - - for (auto t : buff_size) { - buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t; - } - buf_size_in_byte *= wordSize; - buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE || data_type == FP8_DATATYPE) ? - CUBLAS_WORKSPACE_SIZE : - 0); - - return buf_size_in_byte; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gpt_gemm_func.h b/src/turbomind/utils/gemm_test/gpt_gemm_func.h deleted file mode 100644 index bcbe131d8b..0000000000 --- a/src/turbomind/utils/gemm_test/gpt_gemm_func.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#ifdef ENABLE_BF16 -#include -#endif -#ifdef ENABLE_FP8 -#include -#endif -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_gpt_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); - -size_t calGptGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - CublasDataType data_type); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_gemm_func.cc b/src/turbomind/utils/gemm_test/swin_gemm_func.cc deleted file mode 100644 index b43f250b03..0000000000 --- a/src/turbomind/utils/gemm_test/swin_gemm_func.cc +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/swin_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer_in, bool isAppend) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - fprintf( - fd, - "batch_size seq_len head_num size_per_head dataType ### batchCount n m k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 7; - const int NUM_OF_BASIC_LAYERS = 4; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1, 1}; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - for (int basic_layer = 0; basic_layer < NUM_OF_BASIC_LAYERS; basic_layer++) { - // gemm1 - M[0] = batch_size * seq_len; - K[0] = head_num * size_per_head; - N[0] = 3 * K[0]; - strcpy(mess[0], "from_tensor * weightQ/K/V"); - - // gemm2 - M[1] = M[0]; - K[1] = K[0]; - N[1] = K[0]; - strcpy(mess[1], "attr * output_kernel"); - - // gemm3 - M[2] = M[0]; - K[2] = K[0]; - N[2] = 4 * K[0]; - strcpy(mess[2], "attr_output * inter_kernel"); - - // gemm3 - M[3] = M[0]; - K[3] = 4 * K[0]; - N[3] = K[0]; - strcpy(mess[3], "inter_matmul * output_kernel"); - - M[4] = M[0] / 4; - K[4] = 4 * K[0]; - N[4] = 2 * K[0]; - strcpy(mess[4], "patchMerge gemm"); - - M[5] = seq_len; - N[5] = seq_len; - K[5] = size_per_head; - batchCount[5] = batch_size * head_num; - strcpy(mess[5], "attention batched Gemm1"); - - M[6] = seq_len; - N[6] = size_per_head; - K[6] = seq_len; - batchCount[6] = batch_size * head_num; - strcpy(mess[6], "attention batched Gemm2"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - 
CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - for (int i = 0; i < gemm_num; ++i) { - // if(i != 0 && i != 5) continue; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - // array of pointer for batchedGemm - T* harray[12]; - harray[0] = (T*)buffer; - harray[1] = (T*)((char*)buffer + sizeof(T) * m * k); - harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k); - harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k); - harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n); - harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n); - harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n); - harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n); - harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n); - - T** darray = 0; - check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12)); - cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice); - T** dAarray = darray; - T** dBarray = darray + 4; - T** dCarray = darray + 8; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i < 5) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else if (i == 5) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - seq_len, - seq_len, - size_per_head, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - size_per_head, - seq_len * size_per_head, - &beta, - d_C, - CType, - seq_len, - seq_len * seq_len, - batch_size * head_num, - computeType, - static_cast(algo)); - } - else if (i == 6) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_per_head, - seq_len, - seq_len, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - seq_len, - seq_len * seq_len, - &beta, - d_C, - CType, - size_per_head, - seq_len * size_per_head, - batch_size * head_num, - computeType, - static_cast(algo)); - } - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - 
start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - } - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - if (i < 5 && data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - LtHgemmCustomFind(ltHandle, - batch_size, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure( - batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0); - exec_time = perfResults[0].time; - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - exec_times[i] = exec_time; - cudaFree(darray); - } - - if (basic_layer != NUM_OF_BASIC_LAYERS - 1) { - batch_size = batch_size / 4; - head_num = head_num * 2; - } - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Encoder Gemm Testing End***\n"); - return; -} - -template void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend); -template void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend); -#ifdef ENABLE_BF16 -template void generate_swin_gemm_config<__nv_bfloat16>( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend); -#endif - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_gemm_func.h b/src/turbomind/utils/gemm_test/swin_gemm_func.h deleted file mode 100644 index 815da7b197..0000000000 --- a/src/turbomind/utils/gemm_test/swin_gemm_func.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_igemm_func.cc b/src/turbomind/utils/gemm_test/swin_igemm_func.cc deleted file mode 100644 index 08b28b1656..0000000000 --- a/src/turbomind/utils/gemm_test/swin_igemm_func.cc +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "swin_igemm_func.h" -#include - -namespace turbomind { - -static const char* showStatus(cublasStatus_t error) -{ - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - - return ""; -} - -static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b) -{ - return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time)); -} - -static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU) - cublasLtMatmulDesc_t operationDesc, - const void* alpha, /* host or device pointer */ - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, /* host or device pointer */ - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t& algo, - int kernelRepeats, - void* workSpace, - size_t workSpaceSizeInBytes, - customMatmulPerf_t& perfResults, - cudaStream_t stream) -{ - cublasLtMatmulHeuristicResult_t heurResult; - /* Looping over the Algo */ - int repeats = kernelRepeats; - cublasStatus_t algoStatus = - cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult); - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - if (heurResult.workspaceSize <= workSpaceSizeInBytes) { - cublasStatus_t oneRunStatus; - 
cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int loop = 0; loop < repeats; loop++) { - oneRunStatus = cublasLtMatmul(ltHandle, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - workSpace, - workSpaceSizeInBytes, - stream); - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (oneRunStatus != CUBLAS_STATUS_SUCCESS) { - algoStatus = oneRunStatus; - } - float time = dur.count(); - // For the moment only add successful findings - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - perfResults.algo = algo; - perfResults.time = time / repeats; - perfResults.workspaceSize = heurResult.workspaceSize; - perfResults.wavesCount = heurResult.wavesCount; - } - } - else { - // printf("not enough workspace! %ld\n", heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - } - else { - // printf("check fail!\n"); - } - return algoStatus; -} - -int igemm_config_INT8IO(int m, int n, int k, FILE* fout, void* buffer) -{ - printf("batchCount %d m %d n %d k %d\n", 1, m, n, k); - float alpha = 1.0f; - float beta = 0.0f; - - int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major - int8_t* d_B = d_A + m * k; // k * n, stored in column-major - int8_t* d_C = (int8_t*)(d_B + k * n); // m * n, stored in column-major - - cublasLtHandle_t ltHandle; - cublasLtCreate(<Handle); - - LtIgemmCustomFind(ltHandle, - m, - n, - k, - &alpha, /* host pointer */ - d_A, - d_B, - &beta, /* host pointer */ - d_C, - NULL, - 0, - fout); - - cublasLtDestroy(ltHandle); - return 0; -} - -int generate_swin_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend) -{ - - // ensure program running on SM >= 7.5 - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - if (!(prop.major >= 8 || (prop.major >= 7 && prop.minor >= 5))) { - printf("[ERROR] INT8 mode > 0 is only supported on device with sm >= 7.5\n "); - exit(-1); - } - printf("Device %s\n", prop.name); - - // check config - FILE* fout; - if (!isAppend) { - fout = fopen(IGEMM_CONFIG, "w+"); - fprintf( - fout, - "batch_size seq_len head_num size_per_head dataType ### batchCount m n k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n"); - } - else { - fout = fopen(IGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fout) != NULL) { - config.push_back(std::string(line)); - } - if (config.size() >= MAX_CONFIG_NUM * GEMM_NUM) { - int startIdx = config.size() - (MAX_CONFIG_NUM - 1) * GEMM_NUM; - fclose(fout); - fout = fopen(IGEMM_CONFIG, "w+"); - for (int i = startIdx; i < (int)config.size(); i++) { - fprintf(fout, "%s", config[i].c_str()); - } - } - } - - int m = batch_size * seq_len; - int n = head_num * size_per_head; - int k = n; - int batchCount; - const int NUM_OF_BASIC_LAYERS = 4; - - printf("***Swin IGemm Testing Begin***\n"); - - for (int basic_layer = 0; basic_layer < NUM_OF_BASIC_LAYERS; basic_layer++) { - printf("\n-----------------------------\n"); - batchCount = 1; - m = batch_size * seq_len; - k = head_num * size_per_head; - n = 3 * head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - 
printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = head_num * size_per_head; - k = head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = 4 * head_num * size_per_head; - k = head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = head_num * size_per_head; - k = 4 * head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - if (basic_layer != NUM_OF_BASIC_LAYERS - 1) { - printf("\n-----------------------------\n"); - batch_size = batch_size / 4; - head_num = head_num * 2; - m = batch_size * seq_len; - n = head_num * size_per_head; - k = 2 * head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - } - printf("\n-----------------------------\n"); - } - - fclose(fout); - printf("\n-----------------------------\n"); - printf("***Swin IGemm Testing End***\n"); - return 0; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_igemm_func.h b/src/turbomind/utils/gemm_test/swin_igemm_func.h deleted file mode 100644 index 21603dc57d..0000000000 --- a/src/turbomind/utils/gemm_test/swin_igemm_func.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/encoder_igemm_func.h" -#include -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -/* CAUTION : must match cublasLtMatmulTile_t */ -// const char* const matmulTileName[] = { -// "UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8", "8x64", "16x32", -// "32x16", "64x8", "32x32", "32x64", "64x32", "32x128", "64x64", "128x32", "64x128", -// "128x64", "64x256", "128x128", "256x64", "64x512", "128x256", "256x128", "512x64", -// }; - -int generate_swin_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/t5_gemm_func.cc b/src/turbomind/utils/gemm_test/t5_gemm_func.cc deleted file mode 100644 index 44d26a37b7..0000000000 --- a/src/turbomind/utils/gemm_test/t5_gemm_func.cc +++ /dev/null @@ -1,837 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/gemm_test/t5_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -bool isSparseGemmAvailable(size_t m, size_t n, size_t k) -{ - return m % 8 == 0 && n % 8 == 0 && k % 8 == 0; -} - -template -void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type) -{ - FT_CHECK(encoder_head_num % tensor_para_size == 0); - FT_CHECK(decoder_head_num % tensor_para_size == 0); - - void* cublas_workspace; - void* buffer; - int workSpaceSize; -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row 
is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 12; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num]; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - // gemm 0 - M[0] = batch_size * max_mem_seq_len; - K[0] = encoder_d_model; - N[0] = encoder_head_num / tensor_para_size * encoder_size_per_head; - batchCount[0] = 3; - strcpy(mess[0], "encoder from_tensor * batched gemm weightQKV"); - - // gemm 1 - M[1] = max_mem_seq_len; - K[1] = encoder_size_per_head; - N[1] = max_mem_seq_len; - batchCount[1] = batch_size * encoder_head_num / tensor_para_size; - strcpy(mess[1], "encoder batch strided gemm Q*K^T"); - - // gemm 2 - M[2] = max_mem_seq_len; - K[2] = max_mem_seq_len; - N[2] = encoder_size_per_head; - batchCount[2] = batch_size * encoder_head_num / tensor_para_size; - strcpy(mess[2], "encoder batch strided gemm QK*V^T"); - - // gemm 3 - M[3] = batch_size * max_mem_seq_len; - K[3] = encoder_head_num / tensor_para_size * encoder_size_per_head; - N[3] = encoder_d_model; - batchCount[3] = 1; - strcpy(mess[3], "encoder attr * output_kernel"); - - // gemm 4 - M[4] = batch_size * max_mem_seq_len; - K[4] = encoder_d_model; - N[4] = encoder_inter_size / tensor_para_size; - batchCount[4] = 1; - strcpy(mess[4], "encoder ffn gemm 1"); - - // gemm 5 - M[5] = batch_size * max_mem_seq_len; - K[5] = encoder_inter_size / tensor_para_size; - N[5] = encoder_d_model; - batchCount[5] = 1; - strcpy(mess[5], "encoder ffn gemm 2"); - - // gemm 6 - M[6] = batch_size * beam_width; - K[6] = decoder_d_model; - N[6] = 3 * decoder_head_num / tensor_para_size * decoder_size_per_head; - batchCount[6] = 1; - strcpy(mess[6], "from_tensor * weightQKV"); - - // gemm 7 - M[7] = batch_size * beam_width; - K[7] = decoder_head_num / tensor_para_size * decoder_size_per_head; - N[7] = decoder_d_model; - batchCount[7] = 1; - strcpy(mess[7], "attr * output_kernel"); - - // gemm 8 - M[8] = batch_size * beam_width; - K[8] = decoder_d_model; - N[8] = decoder_inter_size / tensor_para_size; - batchCount[8] = 1; - strcpy(mess[8], "ffn gemm 1"); - - // gemm 9 - M[9] = batch_size * beam_width; - K[9] = decoder_inter_size / tensor_para_size; - N[9] = decoder_d_model; - batchCount[9] = 1; - strcpy(mess[9], "ffn gemm 2"); - - // gemm 10 - size_t decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size / 1. / tensor_para_size) * tensor_para_size); - if (!std::is_same::value) { - decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size_padded / 8.) 
* 8); - } - M[10] = batch_size * beam_width; - K[10] = decoder_d_model; - N[10] = decoder_vocab_size_padded / tensor_para_size; - batchCount[10] = 1; - strcpy(mess[10], "logits gemm"); - - // gemm 11 - M[11] = batch_size * max_mem_seq_len; - K[11] = encoder_d_model; - N[11] = encoder_head_num / tensor_para_size * encoder_size_per_head; - batchCount[11] = 1; - strcpy(mess[11], "encoder from_tensor * splited qkv weight"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - float f_alpha = (float)1.0f; - float f_beta = (float)0.0f; - - half h_alpha = (half)(1.0f); - half h_beta = (half)(0.0f); - - void* alpha = computeType == CUDA_R_16F ? (void*)(&h_alpha) : (void*)(&f_alpha); - void* beta = computeType == CUDA_R_16F ? (void*)(&h_beta) : (void*)(&f_beta); - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - int seq_len = (i <= 5 || i == 11) ? max_mem_seq_len : 1; - int head_num = ((i <= 5 || i == 11) ? encoder_head_num : decoder_head_num) / tensor_para_size; - int size_per_head = (i <= 5 || i == 11) ? 
encoder_size_per_head : decoder_size_per_head; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - // array of pointer for batchedGemm - T* harray[12]; - harray[0] = (T*)buffer; - harray[1] = (T*)((char*)buffer + sizeof(T) * m * k); - harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k); - harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k); - harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n); - harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n); - harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n); - harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n); - harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n); - - T** darray = 0; - check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12)); - cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice); - T** dAarray = darray; - T** dBarray = darray + 4; - T** dCarray = darray + 8; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i == 0) { - status = cublasGemmBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - alpha, - (const void* const*)dBarray, - BType, - n, - (const void* const*)dAarray, - AType, - k, - beta, - (void* const*)dCarray, - CType, - n, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 1) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - max_mem_seq_len, - max_mem_seq_len, - encoder_size_per_head, - alpha, - d_B, - BType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - d_A, - AType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - beta, - d_C, - CType, // CType, - max_mem_seq_len, - max_mem_seq_len * max_mem_seq_len, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 2) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - encoder_size_per_head, - max_mem_seq_len, - max_mem_seq_len, - alpha, - d_B, - BType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - d_A, - AType, - max_mem_seq_len, - max_mem_seq_len * max_mem_seq_len, - beta, - d_C, - CType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 10) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - m, - k, - alpha, - d_B, - BType, - k, - d_A, - AType, - k, - beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - alpha, - d_B, - BType, - n, - d_A, - AType, - k, - beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); 
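        /* Autotuning flow for this GEMM shape (descriptive note, not part of the
         * original file): every cublas algo id in [startAlgo, endAlgo] is timed over
         * `ites` launches and the fastest becomes fast_algo; for most half/bf16 cases
         * a cublasLt search follows, and whichever wins is recorded in GEMM_CONFIG. */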
- if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - sync_check_cuda_error(); - } - - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - using scaleT = float; - - if (is_fp16_compute_type) { - using scaleT = typename ScaleTypeConverter::Type; - } - - // for fp16 and bf16, we compare cublasLt - if (data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 0 && i != 10) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - // for t5, computeType & scaleType should be FP32 - if (is_fp16_compute_type) { - using scaleT = typename ScaleTypeConverter::Type; - scaleT alpha_scale = (scaleT)1.0f; - scaleT beta_scale = (scaleT)0.0f; - - LtHgemmCustomFind(ltHandle, - m, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &(alpha_scale), - d_B, - d_A, - &(beta_scale), - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - } - else { - LtHgemmCustomFind(ltHandle, - m, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &(f_alpha), - d_B, - d_A, - &(f_beta), - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - } - - if (perfResults[0].time < exec_time) { - printPerfStructure(batch_size * (i <= 5 || i == 1 ? 1 : beam_width), - seq_len, - head_num, - size_per_head, - n, - m, - k, - perfResults[0], - fd, - data_type, - 0); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * (i <= 5 || i == 1 ? 1 : beam_width), - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * (i <= 5 || i == 1 ? 1 : beam_width), - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - sync_check_cuda_error(); - exec_times[i] = exec_time; - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6) && sizeof(T) == sizeof(half)) { - do_sparse_test = true; - } - if (do_sparse_test) { - printf("***cusparseLt Gemm Testing Begin***\n"); - // Only first 8 cases can be sparse - // - QKV kernel, Projection, FC1, FC2 in context or decoding. 
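    /* Sparse path summary (descriptive note, not part of the original file): the
     * weight operand of each remaining GEMM is pruned to 2:4 structured sparsity and
     * compressed, cuSPARSELt alg ids 0-3 are timed over `ites` launches, and the
     * result is written to SPGEMM_CONFIG with algo -1 whenever the dense cublas
     * result measured above is still faster. */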
- const int spgemm_num = 8; - if (!isAppend) { - fd = fopen(SPGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - // gemm_num configs (cublas/cublasLt), first row is not included - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - // header line - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType " - "### batchCount, m, n, k, algoId, exec_time\n"); - } - - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - // let's make this optional - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < gemm_num; ++i) { - // skip qk or attn or logit gemms. - if (i == 1 || i == 2 || i == 10) { - continue; - } - - // seq_len is always 1 except context gemms. - int seq_len = i <= 5 ? max_mem_seq_len : 1; - int head_num = (i <= 5 ? encoder_head_num : decoder_head_num) / tensor_para_size; - int size_per_head = i <= 5 ? encoder_size_per_head : decoder_size_per_head; - - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - T* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - - float exec_time = 99999.0f; - int fast_algo = 0; - if (isSparseGemmAvailable(m, n, k)) { - for (int alg = 0; alg < 4; ++alg) { - cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order)) - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be 
stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - } - exec_time /= ites; - if (exec_time >= exec_times[i]) { - fast_algo = -1; - } - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); - } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - - printf("***T5 Gemm Testing End***\n"); - return; -} - -template void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); - -template void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); - -#ifdef ENABLE_BF16 -template void generate_t5_gemm_config<__nv_bfloat16>(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); -#endif - -size_t calT5GemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - CublasDataType data_type) -{ - const size_t local_encoder_head_num = encoder_head_num / tensor_para_size; - const size_t 
local_encoder_hidden_units = local_encoder_head_num * encoder_size_per_head; - const size_t local_encoder_inter_size = encoder_inter_size / tensor_para_size; - const size_t local_decoder_head_num = decoder_head_num / tensor_para_size; - const size_t local_decoder_hidden_units = local_decoder_head_num * decoder_size_per_head; - const size_t local_decoder_inter_size = decoder_inter_size / tensor_para_size; - - size_t m = batch_size * max_mem_seq_len; - std::vector buff_size; - - // encoder qkv gemm - buff_size.push_back( - 3 * (m * encoder_d_model + encoder_d_model * local_encoder_hidden_units + m * local_encoder_hidden_units)); - // encoder batch gemm - buff_size.push_back(m * local_encoder_hidden_units + m * local_encoder_hidden_units - + batch_size * beam_width * local_encoder_head_num * max_mem_seq_len * max_mem_seq_len); - // encoder ffn gemm - buff_size.push_back(m * local_encoder_inter_size + encoder_d_model * local_encoder_inter_size - + m * encoder_d_model); - - m = batch_size * beam_width; - // decoder qkv gemm - buff_size.push_back(m * decoder_d_model + decoder_d_model * 3 * local_decoder_hidden_units - + 3 * m * local_decoder_hidden_units); - // decoder cross mem gemm - buff_size.push_back(m * max_mem_seq_len * encoder_d_model + encoder_d_model * local_decoder_hidden_units - + m * max_mem_seq_len * local_decoder_hidden_units); - // decoder ffn gemm - buff_size.push_back(m * local_decoder_inter_size + decoder_d_model * local_decoder_inter_size - + m * decoder_d_model); - // decoder vocab gemm - size_t decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size / 1. / tensor_para_size) * tensor_para_size); - if (data_type != FLOAT_DATATYPE) { - decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size_padded / 8.) * 8); - } - buff_size.push_back(m * decoder_d_model + decoder_d_model * decoder_vocab_size_padded / tensor_para_size - + m * decoder_vocab_size_padded / tensor_para_size); - - size_t buf_size_in_byte = 0; - // int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - // Because we always use float for some buffer, set the wordSize to float directly. - int wordSize = sizeof(float); - for (auto t : buff_size) { - buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t; - } - buf_size_in_byte *= wordSize; - buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0); - - return buf_size_in_byte; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/t5_gemm_func.h b/src/turbomind/utils/gemm_test/t5_gemm_func.h deleted file mode 100644 index e0883095ae..0000000000 --- a/src/turbomind/utils/gemm_test/t5_gemm_func.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); - -size_t calT5GemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - CublasDataType data_type); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/xlnet_gemm_func.cc b/src/turbomind/utils/gemm_test/xlnet_gemm_func.cc deleted file mode 100644 index 885b693c29..0000000000 --- a/src/turbomind/utils/gemm_test/xlnet_gemm_func.cc +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/xlnet_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; - -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 10; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int lda[gemm_num]; - int strideA[gemm_num]; - int ldb[gemm_num]; - int strideB[gemm_num]; - int ldc[gemm_num]; - int strideC[gemm_num]; - cublasOperation_t transa[gemm_num] = {CUBLAS_OP_N, - CUBLAS_OP_N, - CUBLAS_OP_T, - CUBLAS_OP_T, - CUBLAS_OP_T, - CUBLAS_OP_T, - CUBLAS_OP_N, - CUBLAS_OP_T, - CUBLAS_OP_N, - CUBLAS_OP_N}; - cublasOperation_t transb[gemm_num] = {CUBLAS_OP_N}; - int batchCount[gemm_num] = {1}; - char mess[gemm_num][256]; - - // gemm1 - M[0] = hidden_units_; - N[0] = seq_len * batch_size; - K[0] = hidden_units_; - lda[0] = hidden_units_; - strideA[0] = hidden_units_ * hidden_units_; - ldb[0] = hidden_units_; - strideB[0] = 0; - ldc[0] = hidden_units_; - strideC[0] = seq_len * batch_size * hidden_units_; - batchCount[0] = 3; - strcpy(mess[0], "from_tensor * weightQ/K/V"); - - // gemm2 - M[1] = hidden_units_; - N[1] = seq_len * 2; - K[1] = hidden_units_; - batchCount[1] = 1; - strcpy(mess[1], " k_head_r_"); - - // gemm3 - M[2] = seq_len; - N[2] = seq_len; - K[2] = size_per_head; - lda[2] = size_per_head; - strideA[2] = seq_len * size_per_head; - ldb[2] = size_per_head; - strideB[2] = seq_len * size_per_head; - ldc[2] = seq_len; - strideC[2] = seq_len * seq_len; - batchCount[2] = batch_size * head_num; - strcpy(mess[2], "ac"); - - // gemm4 - M[3] = seq_len * 2; - N[3] = seq_len; - K[3] = size_per_head; - lda[3] = size_per_head; - strideA[3] = seq_len * 2 * size_per_head; - ldb[3] = size_per_head; - strideB[3] = seq_len * size_per_head; - ldc[3] = seq_len * 2; - strideC[3] = seq_len * seq_len * 2; - - batchCount[3] = batch_size * head_num; - strcpy(mess[3], "bd"); - - // gemm5 - M[4] = 2; - N[4] = seq_len; - K[4] = size_per_head; - lda[4] = size_per_head; - strideA[4] = 2 * size_per_head; - ldb[4] = size_per_head; - strideB[4] 
= seq_len * size_per_head; - ldc[4] = 2; - strideC[4] = seq_len * 2; - batchCount[4] = batch_size * head_num; - strcpy(mess[4], "ef"); - - // gemm6 - M[5] = head_num; - N[5] = seq_len; - K[5] = 2; - lda[5] = 2; - strideA[5] = 2 * head_num; - ldb[5] = 2; - strideB[5] = seq_len * 2; - ldc[5] = head_num; - strideC[5] = seq_len * head_num; - - batchCount[5] = batch_size * seq_len; - strcpy(mess[5], "seg_mat"); - // gemm7 - M[6] = size_per_head; - N[6] = seq_len; - K[6] = seq_len; - lda[6] = size_per_head; - strideA[6] = seq_len * size_per_head; - ldb[6] = seq_len; - strideB[6] = seq_len * seq_len; - ldc[6] = size_per_head; - strideC[6] = seq_len * size_per_head; - - batchCount[6] = batch_size * head_num; - strcpy(mess[6], "attn_vec"); - - // gemm8 - M[7] = hidden_units_; - N[7] = seq_len * batch_size; - K[7] = hidden_units_; - lda[7] = hidden_units_; - batchCount[7] = 1; - strcpy(mess[7], "attn_out"); - - // gemm9 - M[8] = inter_size_; - N[8] = seq_len * batch_size; - K[8] = hidden_units_; - batchCount[8] = 1; - strcpy(mess[8], "output_fc1_"); - - // gemm10 - M[9] = hidden_units_; - N[9] = seq_len * batch_size; - K[9] = inter_size_; - batchCount[9] = 1; - - strcpy(mess[9], "output_fc2_"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - printf("***Xlnet Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### " - "batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, " - "workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i == 1 || i == 7 || i == 8 || i == 9) { - status = cublasGemmEx(cublas_handle, - transa[i], - transb[i], - n, - m, - k, - &alpha, - d_A, - AType, - n, - d_B, - AType, - k, - &beta, - d_C, - CType, - n, 
- computeType, - static_cast(algo)); - } - else { - status = cublasGemmStridedBatchedEx(cublas_handle, - transa[i], - transb[i], - m, - n, - k, - &alpha, - d_A, - BType, - lda[i], - strideA[i], - d_B, - AType, - ldb[i], - strideB[i], - &beta, - d_C, - CType, - ldc[i], - strideC[i], - batchCount[i], - computeType, - static_cast(algo)); - } - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } // end if diffTime - } // end status - } // end for algo - - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - if ((i == 1 || i == 7 || i == 8 || i == 9) && data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - LtHgemmCustomFind(ltHandle, - batch_size, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure( - batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0); - exec_time = perfResults[0].time; - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } // end else fp16 - } // end i - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Xlnet Gemm Testing End***\n"); - - return; -} - -template void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend); -template void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend); -#ifdef ENABLE_BF16 -template void generate_xlnet_gemm_config<__nv_bfloat16>(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend); -#endif - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/xlnet_gemm_func.h b/src/turbomind/utils/gemm_test/xlnet_gemm_func.h deleted file mode 100644 index 240805af4b..0000000000 --- a/src/turbomind/utils/gemm_test/xlnet_gemm_func.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend = true); - -} // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index e9a79ea5a1..a31bfd631d 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -15,687 +15,11 @@ */ #include "src/turbomind/macro.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_type_utils.cuh" -#include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" -#include -#include -#include namespace turbomind { -template -void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) -{ - check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); - if (is_random_initialize) { - cudaRandomUniform(*ptr, size, st); - } -} - -template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#endif -template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#endif - -template -void deviceFree(T*& ptr, cudaStream_t st) -{ - if (ptr != NULL) { - check_cuda_error(cudaFreeAsync(ptr, st)); - ptr = NULL; - } -} - -template void deviceFree(float*& ptr, cudaStream_t); -template void deviceFree(half*& ptr, cudaStream_t); -#ifdef ENABLE_BF16 -template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t); -#endif -template void deviceFree(unsigned short*& ptr, cudaStream_t); -template void deviceFree(int*& ptr, cudaStream_t); -template void deviceFree(bool*& ptr, cudaStream_t); -template void deviceFree(char*& ptr, cudaStream_t); -template void deviceFree(int8_t*& ptr, cudaStream_t); -template void deviceFree(void*& ptr, cudaStream_t); 
-#ifdef ENABLE_FP8 -template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t); -#endif - -namespace { - -template -__global__ void fill_kernel(T* devptr, size_t size, T value) -{ - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) { - devptr[i] = value; - } -} - -} // namespace - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) -{ - constexpr int threads = 512; - const int blocks = (size + threads - 1) / threads; - fill_kernel<<>>(devptr, size, value); -} - -template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); -template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream); -#endif -template void deviceFill(int* devptr, size_t size, int value, cudaStream_t stream); -template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t stream); - -template -void cudaD2Hcpy(T* tgt, const T* src, const size_t size) -{ - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost)); -} - -template void cudaD2Hcpy(float* tgt, const float* src, size_t size); -template void cudaD2Hcpy(half* tgt, const half* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaD2Hcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); -#endif -template void cudaD2Hcpy(int* tgt, const int* src, size_t size); -template void cudaD2Hcpy(bool* tgt, const bool* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaD2Hcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t size); -#endif -template void cudaD2Hcpy(unsigned long long* tgt, const unsigned long long* src, size_t size); -template void cudaD2Hcpy(unsigned int* tgt, const unsigned int* src, size_t size); -template void cudaD2Hcpy(int8_t* tgt, const int8_t* src, size_t size); - -template -void cudaH2Dcpy(T* tgt, const T* src, const size_t size) -{ - if (tgt == nullptr || src == nullptr) { - TM_LOG_ERROR("cudaH2Dcpy: dst=%p src=%p, size=%d", tgt, src, (int)(sizeof(T) * size)); - } - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice)); -} - -template void cudaH2Dcpy(float* tgt, const float* src, size_t size); -template void cudaH2Dcpy(half* tgt, const half* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaH2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); -#endif -template void cudaH2Dcpy(int* tgt, const int* src, size_t size); -template void cudaH2Dcpy(bool* tgt, const bool* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaH2Dcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t size); -#endif -template void cudaH2Dcpy(unsigned long long* tgt, const unsigned long long* src, size_t size); -template void cudaH2Dcpy(unsigned int* tgt, const unsigned int* src, size_t size); -template void cudaH2Dcpy(int8_t* tgt, const int8_t* src, size_t size); - -template -void cudaD2Dcpy(T* tgt, const T* src, const size_t size) -{ - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice)); -} - -template void cudaD2Dcpy(float* tgt, const float* src, size_t size); -template void cudaD2Dcpy(half* tgt, const half* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaD2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); -#endif -template void cudaD2Dcpy(int* tgt, const int* src, size_t size); -template void cudaD2Dcpy(bool* tgt, const bool* src, size_t size); 
-template void cudaD2Dcpy(int8_t* tgt, const int8_t* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t size); -#endif -template void cudaD2Dcpy(unsigned long long* tgt, const unsigned long long* src, size_t size); - -template -__global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (T_OUT)((float)(src[tid])); - } -} - -template -void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream) -{ - cudaCast<<<256, 256, 0, stream>>>(dst, src, size); -} - -template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeCudaCast(float* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_bfloat16* dst, float const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_bfloat16* dst, half const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(half* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeCudaCast(float* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void -invokeCudaCast(__nv_bfloat16* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(half* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_fp8_e4m3* dst, float const* const src, const size_t size, cudaStream_t stream); -template void -invokeCudaCast(__nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream); -#endif - -template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream) -{ - if (stream != NULL) { - check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream)); - } - else { - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault)); - } -} - -template void cudaAutoCpy(float* tgt, const float* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(half* tgt, const half* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void cudaAutoCpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size, cudaStream_t stream); -#endif -template void cudaAutoCpy(int* tgt, const int* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(bool* tgt, const bool* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(int8_t* tgt, const int8_t* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(uint* tgt, const uint* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(unsigned long long* tgt, const unsigned long long* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(char* tgt, const char* src, size_t size, cudaStream_t stream); - -template void cudaAutoCpy(float const** tgt, float const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(half const** tgt, half const* const* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void cudaAutoCpy(__nv_bfloat16 const** tgt, __nv_bfloat16 const* const* src, size_t size, cudaStream_t 
stream); -#endif -template void cudaAutoCpy(int const** tgt, int const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream); -template void -cudaAutoCpy(unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream); - -template -__global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((unsigned long long int)1337, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = (T)(curand_uniform(&local_state) * 0.2f - 0.1f); - } -} - -template<> -__global__ void cuda_random_uniform_kernel(int* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float)1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = curand(&local_state); - } -} - -template<> -__global__ void cuda_random_uniform_kernel(bool* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float)1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = (curand(&local_state) % 2 == 0); - } -} - -template<> -__global__ void cuda_random_uniform_kernel(char* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float)1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = curand(&local_state) % 0xFF; - } -} - -template -void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st) -{ - static int seq_offset = 0; - cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset); - seq_offset += 256 * 256; -} - -template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t); -template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t); -#ifdef ENABLE_BF16 -template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t); -#endif -template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t); -template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t); -template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t); -#ifdef ENABLE_FP8 -template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t); -#endif - -// loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or -// the product of the elements in shape is 0, this function will return an empty vector. 
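/* Typical call-site sketch for the loaders below (file name and tensor shape are
 * illustrative only, not taken from the original sources): allocate device memory,
 * then fill it from an on-disk FP32 tensor, converting to the runtime dtype.
 *
 *     half* d_w = nullptr;
 *     deviceMalloc(&d_w, 1024 * 1024, nullptr, false);   // default stream, no random init
 *     loadWeightFromBin(d_w, {1024, 1024}, "weights/layer0.qkv.bin",
 *                       FtCudaDataType::FP32);           // reads float, stores half on device
 */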
-template -std::vector loadWeightFromBinHelper(std::vector shape, std::string filename) -{ - if (shape.size() > 2) { - printf("[ERROR] shape should have less than two dims \n"); - return std::vector(); - } - - size_t dim0 = shape[0], dim1 = 1; - if (shape.size() == 2) { - dim1 = shape[1]; - } - - size_t size = dim0 * dim1; - if (size == 0) { - TM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str()); - return std::vector(); - } - - std::vector host_array(size); - std::ifstream in(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) { - TM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str()); - return std::vector(); - } - - size_t loaded_data_size = sizeof(T) * size; - in.seekg(0, in.end); - const auto file_size_in_bytes = (size_t)in.tellg(); - in.seekg(0, in.beg); - - TM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename); - in.read((char*)host_array.data(), loaded_data_size); - - if (file_size_in_bytes != loaded_data_size) { - TM_LOG_WARNING("file %s has %ld, but request %ld, loading model fails!", - filename.c_str(), - file_size_in_bytes, - loaded_data_size); - return std::vector(); - } - in.close(); - // If we succeed, return an array with values. - return host_array; -} - -std::vector loadArrayFromBin(std::vector shape, std::string filename) -{ - return loadWeightFromBinHelper(shape, filename); -} - -template -int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filename) -{ - std::vector host_array = loadWeightFromBinHelper(shape, filename); - - if (host_array.empty()) { - return 0; - } - - if (std::is_same::value == true) { - cudaH2Dcpy(ptr, (T*)host_array.data(), host_array.size()); - } - else { - T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), nullptr, false); - cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); - invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2, nullptr); - } - return 0; -} - -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(int8_t* ptr, std::vector shape, std::string filename); -#ifdef ENABLE_BF16 -template int -loadWeightFromBinFunc<__nv_bfloat16, float>(__nv_bfloat16* ptr, std::vector shape, std::string filename); -template int -loadWeightFromBinFunc<__nv_bfloat16, half>(__nv_bfloat16* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* ptr, - std::vector shape, - std::string filename); -#endif // ENABLE_BF16 -template int loadWeightFromBinFunc(int* ptr, std::vector shape, std::string filename); -#ifdef ENABLE_FP8 -template int -loadWeightFromBinFunc<__nv_fp8_e4m3, float>(__nv_fp8_e4m3* ptr, std::vector shape, std::string filename); -#endif // ENABLE_FP8 - -template -int loadWeightFromBin(T* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type) -{ - switch (model_file_type) { - case FtCudaDataType::FP32: - loadWeightFromBinFunc(ptr, shape, filename); - break; - case FtCudaDataType::FP16: - 
loadWeightFromBinFunc(ptr, shape, filename); - break; - case FtCudaDataType::INT8: - loadWeightFromBinFunc(ptr, shape, filename); - break; -#ifdef ENABLE_BF16 - case FtCudaDataType::BF16: - loadWeightFromBinFunc(ptr, shape, filename); - break; -#endif -#ifdef ENABLE_FP8 - case FtCudaDataType::FP8: - loadWeightFromBinFunc(ptr, shape, filename); - break; -#endif - default: - TM_LOG_ERROR("Does not support FtCudaDataType=%d", model_file_type); - FT_CHECK(false); - } - return 0; -} - -template<> -int loadWeightFromBin(int* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type) -{ - loadWeightFromBinFunc(ptr, shape, filename); - return 0; -} - -template int -loadWeightFromBin(float* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -template int -loadWeightFromBin(half* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -template int -loadWeightFromBin(int8_t* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -#ifdef ENABLE_BF16 -template int -loadWeightFromBin(__nv_bfloat16* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -#endif -#ifdef ENABLE_FP8 -template int -loadWeightFromBin(__nv_fp8_e4m3* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -#endif -template int -loadWeightFromBin(int* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); - -template -__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = cuda_cast(src[tid]); - } -} - -template -void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyConvert<<<256, 256, 0, stream>>>(tgt, src, size); -} - -template void invokeCudaD2DcpyConvert(int8_t* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const int8_t* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(half* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(half* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const half* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(uint* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const uint* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const half* src, const size_t size, cudaStream_t stream); - -#ifdef ENABLE_BF16 -template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t 
stream); -#endif // ENABLE_BF16 - -template -__global__ void -cudaD2DScaleCpyConvert(T_OUT* dst, const T_IN* src, const float* scale, bool invert_scale, const size_t size) -{ - const float scale_value = invert_scale ? 1.0f / scale[0] : scale[0]; - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = cuda_cast(cuda_cast(src[tid]) * scale_value); - } -} - -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream) -{ - cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size); -} - -// clang-format off -template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_BF16 -#ifdef ENABLE_FP8 -template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_FP8 -// clang-format on - -void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -template -void saveToBinary(const T* ptr, const size_t size, std::string filename) -{ - - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::vector float_ptr(size); - for (size_t i = 0; i < size; i++) { - float_ptr[i] = (float)h_ptr[i]; - } - - std::ofstream out(filename, std::ios::out | std::ios::binary); - FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - - out.write((char*)float_ptr.data(), size * sizeof(float)); -} - -template void saveToBinary(const float* ptr, const size_t size, std::string filename); -template void saveToBinary(const half* ptr, const size_t size, std::string filename); -#ifdef ENABLE_BF16 -template void saveToBinary(const __nv_bfloat16* ptr, const size_t size, std::string filename); -#endif // ENABLE_BF16 - -template<> -void saveToBinary(const int* ptr, const size_t size, std::string filename) -{ - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::ofstream out(filename, std::ios::out | std::ios::binary); - FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - out.write((char*)h_ptr.data(), size * sizeof(int)); -} - -template -__global__ void fakeCast(T_IN* input_ptr, const size_t size) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { - T_fake_type tmp_val = 
(T_fake_type)((float)input_ptr[i]); - input_ptr[i] = (T_IN)((float)tmp_val); - } -} - -template -void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream) -{ - dim3 block(256); - dim3 grid((size + 255) / 256); - fakeCast<<>>(input_ptr, size); -} - -#ifdef ENABLE_FP8 -__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (float)(src[tid]); - } -} - -void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (half)((float)(src[tid])); - } -} - -void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -#endif // ENABLE_FP8 - -template -__global__ void transpose(T_OUT* dst, T_IN* src, const int dim0, const int dim1) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x) { - const int src_col_id = tid % dim1; - const int src_row_id = tid / dim1; - dst[src_col_id * dim0 + src_row_id] = (T_OUT)(src[tid]); - } -} - -template -void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1) -{ - // copy data to workspace, and then transpose from workspace to data - cudaD2Dcpy(workspace, data, dim0 * dim1); - transpose<<<256, 256>>>(data, workspace, dim0, dim1); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose(__nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose(float* data, float* workspace, const int dim0, const int dim1); - -template -__global__ void transpose0213(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const 
int dim2, const int dim3)
-{
-    // src permutation: [0, 1, 2, 3]
-    // dst permutation: [0, 2, 1, 3]
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
-         tid += blockDim.x * gridDim.x) {
-        int tmp_idx = tid;
-        const int dim_3_idx = tmp_idx % dim3;
-        tmp_idx = (tmp_idx - dim_3_idx) / dim3;
-        const int dim_2_idx = tmp_idx % dim2;
-        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
-        const int dim_1_idx = tmp_idx % dim1;
-        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
-        const int dim_0_idx = tmp_idx % dim0;
-        dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
-    }
-}
-
-template<typename T>
-void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    // Note that this kernel is used for pre-processing and not very efficient.
-    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
-    transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose0213(
-    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-#endif // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose0213(
-    __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-#endif // ENABLE_BF16
-template void invokeInPlaceTranspose0213(
-    float* data, float* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-
 template<typename T_OUT, typename T_IN>
 __global__ void transpose102(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2)
 {
@@ -720,139 +44,19 @@ void invokeInPlaceTranspose102(
     // Note that this kernel is used for pre-processing and not very efficient.
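// Usage sketch (illustrative, not part of the patched sources): these "in-place" transpose helpers
// take a caller-provided scratch buffer with the same element count; the data is staged into the
// workspace and the kernel then writes the permuted layout back into `data`. Assuming a device
// buffer `d_data` of shape [d0, d1, d2] and an equally sized scratch buffer `d_work`:
//
//     invokeInPlaceTranspose102(d_data, d_work, d0, d1, d2, /*copy=*/true, stream);
//
// With copy=false the staging copy is skipped, so the workspace must already hold the source data.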
const size_t count = dim0 * dim1 * dim2; if (copy) { - cudaAutoCpy(workspace, data, count, stream); + check_cuda_error(cudaMemcpyAsync(workspace, data, sizeof(T) * count, cudaMemcpyDefault, stream)); } const int block = 512; const int grid = std::min((count + block - 1) / block, (size_t)8192); transpose102<<>>(data, workspace, dim0, dim1, dim2); } -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose102(__nv_fp8_e4m3* data, - __nv_fp8_e4m3* workspace, - const int dim0, - const int dim1, - const int dim2, - bool copy, - cudaStream_t stream); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose102(__nv_bfloat16* data, - __nv_bfloat16* workspace, - const int dim0, - const int dim1, - const int dim2, - bool copy, - cudaStream_t stream); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose102( - half* data, half* workspace, const int dim0, const int dim1, const int dim2, bool copy, cudaStream_t stream); -template void invokeInPlaceTranspose102( - float* data, float* workspace, const int dim0, const int dim1, const int dim2, bool copy, cudaStream_t stream); - -template -void __global__ multiplyScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { - tensor[index] = (T)(((float)tensor[index]) * scale); - } -} - -template -void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - multiplyScale<<>>(tensor, scale, size); -} - -template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif - -template -void __global__ divideScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { - tensor[index] = (T)(((float)tensor[index]) / scale); - } -} - -template -void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - divideScale<<>>(tensor, scale, size); -} - -template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_BF16 -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void -invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -#endif -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -#ifdef ENABLE_FP8 -template void 
invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -template void -invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -#endif - -size_t cuda_datatype_size(FtCudaDataType dt) -{ - static const std::unordered_map sizes{{FtCudaDataType::FP32, sizeof(float)}, - {FtCudaDataType::FP16, sizeof(half)} -#ifdef ENABLE_BF16 - , - {FtCudaDataType::BF16, sizeof(__nv_bfloat16)} -#endif - }; - - return sizes.at(dt); -} - -template -__global__ void check_range(T* buffer, size_t size, T min, T max, bool* d_within_range) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { - const T val = buffer[i]; - if (val < min || val > max) { - *d_within_range = false; - } - } -} - -template -bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream) -{ - cudaMemsetAsync(d_within_range, true, sizeof(bool), stream); - - dim3 block(256); - dim3 grid((size + 255) / 256); - check_range<<>>(buffer, size, min, max, d_within_range); - - bool result; - cudaD2Hcpy(&result, d_within_range, 1); - return result; -} - -template bool -invokeCheckRange(int* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream); +template void invokeInPlaceTranspose102(uint16_t* data, + uint16_t* workspace, + const int dim0, + const int dim1, + const int dim2, + bool copy, + cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index 03a0ef7b33..a61408281f 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -16,130 +16,12 @@ #pragma once -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "src/turbomind/utils/cuda_utils.h" +#include namespace turbomind { -template -void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false); - -template -void deviceFree(T*& ptr, cudaStream_t st); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {}); - -template -void cudaD2Hcpy(T* tgt, const T* src, const size_t size); - -template -void cudaH2Dcpy(T* tgt, const T* src, const size_t size); - -template -void cudaD2Dcpy(T* tgt, const T* src, const size_t size); - -template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {}); - -template -void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {}); - -template -int loadWeightFromBin(T* ptr, - std::vector shape, - std::string filename, - FtCudaDataType model_file_type = FtCudaDataType::FP32); - -std::vector loadArrayFromBin(std::vector shape, std::string filename); - -// template -// int loadWeightFromBinAndQuantizeForWeightOnly(int8_t* quantized_weight_ptr, -// T* scale_ptr, -// std::vector shape, -// std::string filename, -// FtCudaDataType model_file_type = FtCudaDataType::FP32); - -void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream); -#ifdef ENABLE_FP8 -void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream); -void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, 
float* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream); -#endif // ENABLE_BF16 - -template -void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream); - -template -__inline__ __host__ __device__ size_t dim2flat(const T (&idx)[n_dims], const T (&dims)[n_dims]) -{ - size_t flat_idx = 0; - for (size_t i = 0; i < n_dims; i++) { - flat_idx += idx[i]; - if (i + 1 < n_dims) - flat_idx *= dims[i + 1]; - } - return flat_idx; -} - -template -__inline__ __host__ __device__ void flat2dim(T1 flat_idx, const T2 (&dims)[n_dims], T2 (&idx)[n_dims]) -{ - for (int i = n_dims - 1; i >= 0; i--) { - idx[i] = flat_idx % dims[i]; - flat_idx /= dims[i]; - } -} - -template -void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1); - -template -void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3); - template void invokeInPlaceTranspose102( T* data, T* workspace, const int dim0, const int dim1, const int dim2, bool copy = true, cudaStream_t stream = 0); -template -void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream); - -template -void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream); - -template -void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream = 0); - -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream = 0); - -inline bool checkIfFileExist(const std::string& file_path) -{ - std::ifstream in(file_path, std::ios::in | std::ios::binary); - if (in.is_open()) { - in.close(); - return true; - } - return false; -} - -template -void saveToBinary(const T* ptr, const size_t size, std::string filename); - -template -void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream); - -size_t cuda_datatype_size(FtCudaDataType dt); - -template -bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream); - } // namespace turbomind diff --git a/src/turbomind/utils/mpi_utils.cc b/src/turbomind/utils/mpi_utils.cc deleted file mode 100644 index 737e428d04..0000000000 --- a/src/turbomind/utils/mpi_utils.cc +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/mpi_utils.h" - -namespace turbomind { -namespace mpi { - -#ifdef BUILD_MULTI_GPU -MPI_Datatype getMpiDtype(MpiType dtype) -{ - static const std::unordered_map dtype_map{ - {MPI_TYPE_BYTE, MPI_BYTE}, - {MPI_TYPE_CHAR, MPI_CHAR}, - {MPI_TYPE_INT, MPI_INT}, - {MPI_TYPE_INT64_T, MPI_INT64_T}, - {MPI_TYPE_UINT32_T, MPI_UINT32_T}, - {MPI_TYPE_UNSIGNED_LONG_LONG, MPI_UNSIGNED_LONG_LONG}, - }; - return dtype_map.at(dtype); -} -#endif - -void initialize(int* argc, char*** argv) -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Init(argc, argv)); -#endif -} - -void finalize() -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Finalize()); -#endif -} - -bool isInitialized() -{ - int mpi_initialized = 0; -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Initialized(&mpi_initialized)); -#endif - return static_cast(mpi_initialized); -} - -void initThread(int* argc, char*** argv, MpiThreadSupport required, int* provided) -{ -#ifdef BUILD_MULTI_GPU - switch (required) { - case THREAD_SINGLE: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_SINGLE, provided)); - break; - case THREAD_FUNNELED: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, provided)); - break; - case THREAD_SERIALIZED: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_SERIALIZED, provided)); - break; - case THREAD_MULTIPLE: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, provided)); - break; - default: - break; - } -#endif -} - -int getCommWorldRank() -{ - int rank = 0; -#ifdef BUILD_MULTI_GPU - MPI_Comm_rank(MPI_COMM_WORLD, &rank); -#endif - return rank; -} - -int getCommWorldSize() -{ - int world_size = 1; -#ifdef BUILD_MULTI_GPU - MPI_Comm_size(MPI_COMM_WORLD, &world_size); -#endif - return world_size; -} - -void barrier(MpiComm comm) -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Barrier(comm.group)); -#endif -} - -void barrier() -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); -#endif -} - -void bcast(void* buffer, size_t size, MpiType dtype, int root, MpiComm comm) -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Bcast(buffer, size, getMpiDtype(dtype), root, comm.group)); -#endif -} - -} // namespace mpi -} // namespace turbomind diff --git a/src/turbomind/utils/mpi_utils.h b/src/turbomind/utils/mpi_utils.h deleted file mode 100644 index 0eef1f2cc1..0000000000 --- a/src/turbomind/utils/mpi_utils.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/logger.h" - -#ifdef BUILD_MULTI_GPU -#include -#endif -#include -#include - -namespace turbomind { - -#ifdef BUILD_MULTI_GPU -#define MPICHECK(cmd) \ - do { \ - int e = cmd; \ - if (e != MPI_SUCCESS) { \ - printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) -#else -#define MPICHECK(cmd) printf("[WARNING] No MPI\n"); -#endif - -// A wrapper module of the MPI library. -namespace mpi { - -// A wrapper of MPI data type. 
MPI_TYPE_{data_type} -enum MpiType -{ - MPI_TYPE_BYTE, - MPI_TYPE_CHAR, - MPI_TYPE_INT, - MPI_TYPE_INT64_T, - MPI_TYPE_UINT32_T, - MPI_TYPE_UNSIGNED_LONG_LONG, -}; - -// A wrapper of the level of MPI thread support -enum MpiThreadSupport -{ - THREAD_SINGLE, - THREAD_FUNNELED, - THREAD_SERIALIZED, - THREAD_MULTIPLE -}; - -struct MpiComm { -#ifdef BUILD_MULTI_GPU - MPI_Comm group; - MpiComm(){}; - MpiComm(MPI_Comm g): group(g){}; -#endif -}; - -#ifdef BUILD_MULTI_GPU -#define COMM_WORLD MpiComm(MPI_COMM_WORLD) -#else -#define COMM_WORLD MpiComm() -#endif - -#ifdef BUILD_MULTI_GPU -MPI_Datatype getMpiDtype(MpiType dtype); -#endif - -void initialize(int* argc, char*** argv); -void initThread(int* argc, char*** argv, MpiThreadSupport required, int* provided); -void finalize(); -bool isInitialized(); -void barrier(MpiComm comm); -void barrier(); - -int getCommWorldRank(); -int getCommWorldSize(); - -void bcast(void* buffer, size_t size, MpiType dtype, int root, MpiComm comm); - -} // namespace mpi -} // namespace turbomind diff --git a/tests/csrc/CMakeLists.txt b/tests/csrc/CMakeLists.txt index 61a9b7383d..82fae2ea3d 100644 --- a/tests/csrc/CMakeLists.txt +++ b/tests/csrc/CMakeLists.txt @@ -13,7 +13,3 @@ # limitations under the License. add_subdirectory(unittests) -if(BUILD_PYT) - add_subdirectory(gemm_dequantize) - add_subdirectory(int8_gemm) -endif() diff --git a/tests/csrc/gemm_dequantize/CMakeLists.txt b/tests/csrc/gemm_dequantize/CMakeLists.txt deleted file mode 100644 index dd02ecdc61..0000000000 --- a/tests/csrc/gemm_dequantize/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if (TORCH_VERSION VERSION_GREATER_EQUAL "1.9.0") - set(gemm_dq_test_files - th_gemm_dequantize.cc - ) - - add_definitions(-DTORCH_CUDA=1) - - set(LIB_NAME "gemm_dq_unit_ops") - add_library(${LIB_NAME} SHARED ${gemm_dq_test_files}) - set_target_properties(${LIB_NAME} PROPERTIES - CUDA_RESOLVE_DEVICE_SYMBOLS ON) - target_link_libraries(${LIB_NAME} "${TORCH_LIBRARIES}" fpA_intB_gemm logger) -else() - message("TORCH_VERSION ${TORCH_VERSION} < 1.9.0, skipping compiling th_moe_ops.cc because QUInt4x2 is supported after torch 1.9.0") -endif() diff --git a/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc b/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc deleted file mode 100644 index e00a4eceef..0000000000 --- a/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include "torch/csrc/cuda/Stream.h" -#include -#include - -#include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" -#include "src/turbomind/th_op/th_utils.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" - -#include "cutlass/numeric_types.h" - -using torch::Tensor; - -namespace torch_ext { - -namespace ft = turbomind; - -template -Tensor fused_gemm_dq_helper( - Tensor input_activations, Tensor weight, Tensor scales, const int64_t timing_iterations, float& avg_time) -{ - const at::ScalarType _st = input_activations.scalar_type(); - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - auto stream = at::cuda::getCurrentCUDAStream().stream(); - - const T* input_act_ptr = get_ptr(input_activations); - const WeightType* weight_ptr = get_ptr(weight); - const T* scales_ptr = get_ptr(scales); - - turbomind::CutlassFpAIntBGemmRunner fused_gemm_dq_runner; - const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k); - - auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); - auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); - - T* output_tensor_ptr = get_ptr(output_tensor); - char* ws_ptr = get_ptr(ws_tensor); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start, stream); - for (int64_t iter = 0; iter < timing_iterations; ++iter) { - fused_gemm_dq_runner.gemm( - input_act_ptr, weight_ptr, scales_ptr, output_tensor_ptr, m, n, k, ws_ptr, ws_bytes, stream); - } - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - float total_time_ms = 0; - cudaEventElapsedTime(&total_time_ms, start, stop); - avg_time = total_time_ms / float(timing_iterations); - - return output_tensor; -} - -Tensor -_fused_gemm_dq(Tensor input_activations, Tensor weight, Tensor scales, int64_t timing_iterations, float& avg_time) -{ - const at::ScalarType _st = input_activations.scalar_type(); - CHECK_INPUT(scales, _st); - - TORCH_CHECK(input_activations.dim() == 2, "Invalid rank for activations"); - TORCH_CHECK(weight.dim() == 2, "Invalid rank for weight"); - TORCH_CHECK(scales.dim() == 1, "Invalid rank for scales"); - - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - - TORCH_CHECK(input_activations.size(1) == weight.size(0), "dim 1 of act and dim 0 of weight must be equal"); - - // We signal int4 by having the last weight dim be half the size of the scales. - // This is because int4 elements are packed into a single byte. 
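// Illustrative shapes (hypothetical numbers, not taken from the test): an int8 weight keeps its
// logical last dim, e.g. weight [4096, 1024] (int8) with scales [1024], so the last dims match.
// For int4, two 4-bit values are packed per byte, so the stored tensor becomes [4096, 512] while
// scales stay [1024]; the `weight.size(-1) == scales.size(-1) / 2` check below is what selects
// at::ScalarType::QUInt4x2.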
- torch::ScalarType quant_type = weight.scalar_type(); - if (weight.size(-1) == scales.size(-1) / 2) { - quant_type = at::ScalarType::QUInt4x2; - } - else { - TORCH_CHECK(weight.size(-1) == scales.size(-1), - "Last dim of weight and scales must be equal for int8 " - "or last dim of scale must be 2x last dim of weight for int4."); - } - - Tensor output_tensor; - switch (_st) { - case at::ScalarType::Half: { - if (quant_type == torch::kInt8) { - output_tensor = - fused_gemm_dq_helper(input_activations, weight, scales, timing_iterations, avg_time); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_helper( - input_activations, weight, scales, timing_iterations, avg_time); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#ifdef ENABLE_BF16 - case at::ScalarType::BFloat16: { - if (quant_type == torch::kInt8) { - output_tensor = fused_gemm_dq_helper<__nv_bfloat16, uint8_t>( - input_activations, weight, scales, timing_iterations, avg_time); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_helper<__nv_bfloat16, cutlass::uint4b_t>( - input_activations, weight, scales, timing_iterations, avg_time); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#endif - default: - throw std::runtime_error("Unsupported tensor type. Got " + std::string(at::toString(_st))); - } - return output_tensor; -} - -Tensor fused_gemm_dq(Tensor input_activations, Tensor weight, Tensor scales) -{ - float dummy = 0.f; - return _fused_gemm_dq(input_activations, weight, scales, 1, dummy); -} - -Tensor -bench_cublas(Tensor input_activations, Tensor weight_dequantized, const int64_t timing_iterations, float& avg_time) -{ - using namespace turbomind; - const int m = input_activations.size(0); - const int n = weight_dequantized.size(1); - const int k = input_activations.size(1); - - const void* input_act_ptr = get_ptr(input_activations); - const void* weight_ptr = get_ptr(weight_dequantized); - - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - const at::ScalarType _st = input_activations.scalar_type(); - - TORCH_CHECK(input_activations.size(1) == weight_dequantized.size(0), - "CUBLAS_BENCH: dim 1 of act and dim 0 of weight must be equal"); - CHECK_INPUT(input_activations, _st); - CHECK_INPUT(weight_dequantized, _st); - - auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); - void* output_tensor_ptr = get_ptr(output_tensor); - - TORCH_CHECK(_st == at::ScalarType::Half || _st == at::ScalarType::BFloat16, "Input type must be float or bfloat"); - cudaDataType_t cublasType = _st == at::ScalarType::Half ? 
CUDA_R_16F : CUDA_R_16BF; - - float alpha = 1.0f; - float beta = 0.0f; - - auto stream = at::cuda::getCurrentCUDAStream().stream(); - cublasSetStream(handle, stream); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - cudaEventRecord(start, stream); - for (int64_t iter = 0; iter < timing_iterations; ++iter) { - status = cublasGemmEx(handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - weight_ptr, - cublasType, - n, - input_act_ptr, - cublasType, - k, - &beta, - output_tensor_ptr, - cublasType, - n, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT); - } - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - float total_time_ms = 0; - cudaEventElapsedTime(&total_time_ms, start, stop); - avg_time = total_time_ms / float(timing_iterations); - check_cuda_error(status); - return output_tensor; -} - -std::vector> benchmark_against_cublas_fp(Tensor input_activations, - Tensor weight_quantized, - Tensor scales, - Tensor weight_dequantized, - const int64_t timing_iterations) -{ - float cublas_time = 0.f; - float ft_time = 0.f; - Tensor cublas_result = bench_cublas(input_activations, weight_dequantized, timing_iterations, cublas_time); - Tensor ft_result = _fused_gemm_dq(input_activations, weight_quantized, scales, timing_iterations, ft_time); - - auto timing_tensor = - torch::empty({2}, torch::dtype(at::ScalarType::Float).device(torch::kCPU).requires_grad(false)); - timing_tensor[0] = cublas_time; - timing_tensor[1] = ft_time; - - // const int m = input_activations.size(0); - // const int n = weight_dequantized.size(1); - // const int k = input_activations.size(1); - // std::cout << "m, n, k" << m << ", " << n << ", " << k << std::endl; - // std::cout << "cuBLAS time (ms) " << cublas_time << std::endl; - // std::cout << "FT time (ms) " << ft_time << std::endl; - - return {{timing_tensor}, {cublas_result, ft_result}}; -} - -template -Tensor fused_gemm_dq_bias_act_helper( - Tensor input_activations, Tensor weight, Tensor scales, Tensor bias, ft::ActivationType activation_type) -{ - const at::ScalarType _st = input_activations.scalar_type(); - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - auto stream = at::cuda::getCurrentCUDAStream().stream(); - - const T* input_act_ptr = get_ptr(input_activations); - const WeightType* weight_ptr = get_ptr(weight); - const T* scales_ptr = get_ptr(scales); - const T* bias_ptr = get_ptr(bias); - - turbomind::CutlassFpAIntBGemmRunner fused_gemm_dq_runner; - const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k); - - auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); - auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); - - T* output_tensor_ptr = get_ptr(output_tensor); - char* ws_ptr = get_ptr(ws_tensor); - - fused_gemm_dq_runner.gemm_bias_act(input_act_ptr, - weight_ptr, - scales_ptr, - bias_ptr, - output_tensor_ptr, - m, - n, - k, - activation_type, - ws_ptr, - ws_bytes, - stream); - - return output_tensor; -} - -Tensor fused_gemm_dq_bias_act( - Tensor input_activations, Tensor weight, Tensor scales, Tensor bias, std::string activation_type_str) -{ - const at::ScalarType _st = input_activations.scalar_type(); - CHECK_INPUT(scales, _st); - CHECK_INPUT(bias, _st); - - TORCH_CHECK(input_activations.dim() == 2, "Invalid rank for activations"); - TORCH_CHECK(weight.dim() == 2, 
"Invalid rank for weight"); - TORCH_CHECK(scales.dim() == 1, "Invalid rank for scales"); - TORCH_CHECK(bias.dim() == 1, "Invalid rank for bias"); - - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - - TORCH_CHECK(bias.size(0) == n, "Must have 1 bias value for each output column"); - TORCH_CHECK(input_activations.size(1) == weight.size(0), "dim 1 of act and dim 0 of weight must be equal"); - - // We signal int4 by having the last weight dim be half the size of the scales. - // This is because int4 elements are packed into a single byte. - torch::ScalarType quant_type = weight.scalar_type(); - if (weight.size(-1) == scales.size(-1) / 2) { - quant_type = at::ScalarType::QUInt4x2; - } - else { - TORCH_CHECK(weight.size(-1) == scales.size(-1), - "Last dim of weight and scales must be equal for int8 " - "or last dim of scale must be 2x last dim of weight for int4."); - } - - ft::ActivationType activation_type = ft::ActivationType::InvalidType; - if (activation_type_str == "identity") { - activation_type = ft::ActivationType::Identity; - } - else { - activation_type = ft::getActivationType(activation_type_str); - } - - TORCH_CHECK(!isGatedActivation(activation_type), "Fused gated activations not supported."); - - Tensor output_tensor; - switch (_st) { - case at::ScalarType::Half: { - if (quant_type == torch::kInt8) { - output_tensor = fused_gemm_dq_bias_act_helper( - input_activations, weight, scales, bias, activation_type); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_bias_act_helper( - input_activations, weight, scales, bias, activation_type); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#ifdef ENABLE_BF16 - case at::ScalarType::BFloat16: { - if (quant_type == torch::kInt8) { - output_tensor = fused_gemm_dq_bias_act_helper<__nv_bfloat16, uint8_t>( - input_activations, weight, scales, bias, activation_type); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_bias_act_helper<__nv_bfloat16, cutlass::uint4b_t>( - input_activations, weight, scales, bias, activation_type); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#endif - default: - throw std::runtime_error("Unsupported tensor type. 
Got " + std::string(at::toString(_st))); - } - return output_tensor; -} - -TORCH_LIBRARY(gemm_dq_unit_ops, m) -{ - m.def("fused_gemm_dq", fused_gemm_dq); - m.def("benchmark_against_cublas_fp", benchmark_against_cublas_fp); - m.def("fused_gemm_dq_bias_act", fused_gemm_dq_bias_act); -} -} // namespace torch_ext diff --git a/tests/csrc/gemm_dequantize/th_gemm_dequantize.py b/tests/csrc/gemm_dequantize/th_gemm_dequantize.py deleted file mode 100644 index 0946fe3191..0000000000 --- a/tests/csrc/gemm_dequantize/th_gemm_dequantize.py +++ /dev/null @@ -1,247 +0,0 @@ -# flake8: noqa -import unittest - -import torch - - -def random_tensor(shape, dtype, device, mean=0, std=1): - return torch.empty(shape, dtype=dtype, device=device).normal_(mean, std) - - -class TestGemmDequantize(unittest.TestCase): - - def setUp(self) -> None: - torch.classes.load_library('lib/libth_transformer.so') - torch.classes.load_library('lib/libgemm_dq_unit_ops.so') - self.unpack_packed_int4s = torch.ops.turbomind.unpack_int4_packed_tensor_to_int8 - self.pack_int4s = torch.ops.turbomind.pack_int8_tensor_to_packed_int4 - self.fused_gemm_dq = torch.ops.gemm_dq_unit_ops.fused_gemm_dq - self.fused_gemm_dq_bias_act = torch.ops.gemm_dq_unit_ops.fused_gemm_dq_bias_act - self.bench = torch.ops.gemm_dq_unit_ops.benchmark_against_cublas_fp - self.preprocess_weights_for_mixed_gemm = torch.ops.turbomind.preprocess_weights_for_mixed_gemm - - self.symmetric_quantizer = torch.ops.turbomind._symmetric_quantize_last_axis_of_batched_matrix - - torch.manual_seed(734876213) - - def dequantize_test_helper(self, weight_type, quant_type): - assert quant_type == torch.int8 or quant_type == torch.quint4x2 - - lower_bound = -128 if quant_type == torch.int8 else -8 - upper_bound = 127 if quant_type == torch.int8 else 7 - - m, n, k = 64, 128, 64 - weights = torch.randint(lower_bound, upper_bound, [k, n], dtype=torch.int8, device='cpu') - - packed_weight = self.pack_int4s(weights) if quant_type == torch.quint4x2 else weights - cuda_weights = self.preprocess_weights_for_mixed_gemm(packed_weight, quant_type).to('cuda') - weights = weights.to('cuda') - - act = torch.eye(m, dtype=weight_type, device='cuda') - scales = torch.ones([n], dtype=weight_type, device='cuda') - - actual = self.fused_gemm_dq(act, cuda_weights, scales) - torch.testing.assert_close(actual, weights, atol=0, rtol=0, check_dtype=False) - - def test_fp16_int8_dequantize(self): - self.dequantize_test_helper(torch.float16, torch.int8) - - def test_bf16_int8_dequantize(self): - self.dequantize_test_helper(torch.bfloat16, torch.int8) - - def test_fp16_int4_dequantize(self): - self.dequantize_test_helper(torch.float16, torch.quint4x2) - - def test_bf16_int4_dequantize(self): - self.dequantize_test_helper(torch.bfloat16, torch.quint4x2) - - def apply_act(self, inp, act_str): - if act_str == 'identity': - return inp - elif act_str == 'silu': - return torch.nn.SiLU()(inp) - elif act_str == 'relu': - return torch.nn.ReLU()(inp) - elif act_str == 'gelu': - return torch.nn.GELU(approximate='tanh')(inp) - else: - assert False, 'Unsupported activation' - - def gemm_dequant_test_helper(self, - compute_type, - weight_dtype, - gemm_ms, - gemm_ns, - gemm_ks, - rtol, - atol, - act_str='only_gemm', - benchmark=False): - assert weight_dtype == torch.int8 or weight_dtype == torch.quint4x2, 'Weight must be quantized' - - for gemm_k in gemm_ks: - for gemm_n in gemm_ns: - torch_weights_cpu = random_tensor((gemm_k, gemm_n), dtype=compute_type, device='cpu', mean=0, std=0.002) - ref_torch_weights, 
processed_torch_weights, torch_weight_scales = self.symmetric_quantizer( - torch_weights_cpu, weight_dtype) - ref_torch_weights = self.unpack_packed_int4s( - ref_torch_weights) if weight_dtype == torch.quint4x2 else ref_torch_weights - ref_torch_weights = ref_torch_weights.to('cuda') - processed_torch_weights = processed_torch_weights.to('cuda') - torch_weight_scales = torch_weight_scales.to('cuda') - torch_biases = random_tensor((gemm_n), dtype=compute_type, device='cuda', mean=0, std=0.1) - - for num_rows in gemm_ms: - torch_activations = torch.randn(size=(num_rows, gemm_k), dtype=compute_type, device='cuda') - - scales_unsqueezed = torch_weight_scales.unsqueeze(0) - casted_weights = ref_torch_weights.to(torch_activations.dtype) - dequantized_weights = torch.multiply(casted_weights, scales_unsqueezed) - if benchmark: - assert act_str == 'only_gemm', 'Benchmarks against cublas must use just GEMM.' - torch.cuda.profiler.start() - times, results = self.bench(torch_activations, processed_torch_weights, torch_weight_scales, - dequantized_weights, 200) - torch.cuda.profiler.stop() - times = times[0] - cublas_time = times[0].item() - ft_time = times[1].item() - ft_speedup = cublas_time / ft_time - print('{},{},{},{},{},{}'.format(num_rows, gemm_n, gemm_k, cublas_time, ft_time, ft_speedup)) - reference_result = results[0] - ft_result = results[1] - else: - if act_str == 'only_gemm': - reference_result = torch.matmul(torch_activations, dequantized_weights) - ft_result = self.fused_gemm_dq(torch_activations, processed_torch_weights, - torch_weight_scales) - else: - reference_result = torch.matmul(torch_activations, dequantized_weights) - reference_result += torch_biases.unsqueeze(0) - reference_result = self.apply_act(reference_result, act_str) - - ft_result = self.fused_gemm_dq_bias_act(torch_activations, processed_torch_weights, - torch_weight_scales, torch_biases, act_str) - - msg = 'FC1 Failed on m={}, n={}, k={}'.format(num_rows, gemm_n, gemm_k) - torch.testing.assert_close(ft_result, - reference_result, - rtol=rtol, - atol=atol, - msg=msg, - check_dtype=False) - - def test_fp16_int8_gemm(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.001, - atol=0.002) - - def test_fp16_int4_gemm(self): - self.gemm_dequant_test_helper(torch.float16, - torch.quint4x2, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.001, - atol=0.002) - - def test_bf16_int8_gemm(self): - self.gemm_dequant_test_helper(torch.bfloat16, - torch.int8, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.01, - atol=0.01) - - def test_bf16_int4_gemm(self): - self.gemm_dequant_test_helper(torch.bfloat16, - torch.quint4x2, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.01, - atol=0.01) - - def test_fp16_int8_gemm_bias(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='identity') - - def test_fp16_int8_gemm_bias_relu(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='relu') - - def test_fp16_int8_gemm_bias_gelu(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - 
gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='gelu') - - def test_fp16_int8_gemm_bias_silu(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='silu') - - def bench_helper(self, act_type, quant_type, rtol, atol): - # Warm, using bfloat here since it seems to reliably use cublas. - x = random_tensor([20480, 20480], torch.bfloat16, device='cuda') - warm_iters = 30 - for iter in range(warm_iters): - res = x @ x - - m_shapes = torch.arange(0, 12) - m_shapes = 2**m_shapes - - self.gemm_dequant_test_helper(act_type, - quant_type, - gemm_ms=[128], - gemm_ns=[1536], - gemm_ks=[12288], - rtol=rtol, - atol=atol, - benchmark=True) - - @unittest.skip("This is a benchmark so don't run by default") - def test_fp16_int8_cublas(self): - self.bench_helper(torch.float16, torch.int8, 1e-3, 0.002) - - @unittest.skip("This is a benchmark so don't run by default") - def test_bf16_int8_cublas(self): - self.bench_helper(torch.bfloat16, torch.int8, 1e-2, 1e-2) - - @unittest.skip("This is a benchmark so don't run by default") - def test_fp16_int4_cublas(self): - self.bench_helper(torch.float16, torch.quint4x2, 1e-3, 0.002) - - @unittest.skip("This is a benchmark so don't run by default") - def test_bf16_int4_cublas(self): - self.bench_helper(torch.bfloat16, torch.quint4x2, 1e-2, 1e-2) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/csrc/int8_gemm/CMakeLists.txt b/tests/csrc/int8_gemm/CMakeLists.txt deleted file mode 100644 index fe8b14455a..0000000000 --- a/tests/csrc/int8_gemm/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set(int8_test_files - int8_gemm_test.cu -) - -add_definitions(-DTORCH_CUDA=1) - -set(EXE_NAME "int8_gemm_test") -add_executable(${EXE_NAME} ${int8_test_files}) -set_target_properties(${EXE_NAME} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(${EXE_NAME} PUBLIC "${TORCH_LIBRARIES}" int8_gemm tensor logger) diff --git a/tests/csrc/int8_gemm/int8_gemm_test.cu b/tests/csrc/int8_gemm/int8_gemm_test.cu deleted file mode 100644 index 0dc10b214d..0000000000 --- a/tests/csrc/int8_gemm/int8_gemm_test.cu +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -#include "torch/csrc/cuda/Stream.h" -#include -#include - -#include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" -#include "src/turbomind/th_op/th_utils.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/logger.h" - -#include "cutlass/numeric_types.h" - -using torch::Tensor; -using torch_ext::get_ptr; - -namespace ft = turbomind; - -template -void int8_gemm_test(const int m, - const int n, - const int k, - const at::ScalarType output_data_type, - const QuantMode quant_mode, - const int iters) -{ - const bool per_token_quant = - quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant; - const bool per_channel_quant = - quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant; - const int row_scale_size = per_token_quant ? m : 1; - const int col_scale_size = per_channel_quant ? n : 1; - - const at::ScalarType at_int32 = at::ScalarType::Int; - const at::ScalarType at_int8 = at::ScalarType::Char; - const at::ScalarType at_fp16 = at::ScalarType::Half; - const at::ScalarType at_bf16 = at::ScalarType::BFloat16; - const at::ScalarType at_fp32 = at::ScalarType::Float; - - using std::chrono::high_resolution_clock; - using std::chrono::duration_cast; - using std::chrono::microseconds; - - torch::manual_seed(0); - - auto x = torch::randint(-128, 128, {m, k}, torch::dtype(at_int32).requires_grad(false)); - auto w = torch::randint(-128, 128, {k, n}, torch::dtype(at_int32).requires_grad(false)); - - ft::FT_CHECK(torch::allclose(x, x.to(at_int8).to(at_int32))); - ft::FT_CHECK(torch::allclose(w, w.to(at_int8).to(at_int32))); - - auto y = torch::matmul(x, w); - - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)k}, get_ptr(x)}.saveNpy("x.npy"); - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr(w)}.saveNpy("w.npy"); - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr(y)}.saveNpy("y.npy"); - - auto x_gpu = x.to(at_int8).to(torch::kCUDA); - auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous(); - auto w_gpu = w.to(at_int8).to(torch::kCUDA); - auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false)); - auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false)); - - auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) - * torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32)); - auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) - * torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32)); - - auto alpha_row_torch = alpha_row_cultass.expand({m, 1}); - auto alpha_col_torch = alpha_col_cutlass.expand({1, n}); - - // std::cout << alpha_row << std::endl; - auto alpha_row_gpu = alpha_row_cultass.to(torch::kCUDA); - auto alpha_col_gpu = alpha_col_cutlass.to(torch::kCUDA); - - auto alpha_row_col_scale_gpu = torch::matmul(alpha_row_torch, alpha_col_torch).to(torch::kCUDA); - - ft::CutlassInt8GemmRunner cutlass_runner_half; - - auto stream = at::cuda::getCurrentCUDAStream().stream(); - // warm_up - cutlass_runner_half.gemm(get_ptr(x_gpu), - get_ptr(w_T_gpu), - quant_mode, - get_ptr(alpha_col_gpu), - get_ptr(alpha_row_gpu), - get_ptr(y_gpu), - m, - n, - k, - nullptr, - 0, - stream); - - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, 
{(size_t)m, (size_t)k}, get_ptr(x_gpu)}.saveNpy("x_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr(w_T_gpu)}.saveNpy("w_T_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr(w_gpu)}.saveNpy("w_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr(y_gpu)}.saveNpy("y_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr(y_gpu_int32)}.saveNpy( - "y_gpu_int32.npy"); - - ft::check_cuda_error(cudaStreamSynchronize(stream)); - auto start = high_resolution_clock::now(); - - for (int i = 0; i < iters; ++i) { - cutlass_runner_half.gemm(get_ptr(x_gpu), - get_ptr(w_T_gpu), - quant_mode, - get_ptr(alpha_col_gpu), - get_ptr(alpha_row_gpu), - get_ptr(y_gpu), - m, - n, - k, - nullptr, - 0, - stream); - } - - ft::check_cuda_error(cudaStreamSynchronize(stream)); - auto end = high_resolution_clock::now(); - - auto duration = duration_cast(end - start); - - if (torch::allclose( - (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) { - TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms"); - } - else { - TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms"); - // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * - // alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl; - } -} - -int main(int argc, char** argv) -{ - if (argc != 7) { - TM_LOG_ERROR( - "arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters."); - return 0; - } - - const int m = atoi(argv[1]); - const int n = atoi(argv[2]); - const int k = atoi(argv[3]); - const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? 
at::ScalarType::Half : at::ScalarType::BFloat16;
-    const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
-    if (quant_mode == QuantMode::PerChannelQuant) {
-        printf("per channel quant \n");
-    }
-    const int iters = atoi(argv[6]);
-
-    if (output_data_type == at::ScalarType::Half) {
-        int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
-    }
-    else {
-#if ENABLE_BF16
-        int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
-#endif
-    }
-
-    return 0;
-}
diff --git a/tests/csrc/unittests/CMakeLists.txt b/tests/csrc/unittests/CMakeLists.txt
index 01f926de60..454f9476f5 100644
--- a/tests/csrc/unittests/CMakeLists.txt
+++ b/tests/csrc/unittests/CMakeLists.txt
@@ -36,7 +36,6 @@ add_executable(unittest
     test_penalty_kernels.cu
     test_sampling_kernels.cu
     test_sampling_layer.cu
-    test_tensor.cu
 )
 
 # automatic discovery of unit tests
@@ -64,11 +63,7 @@ target_link_libraries(
     # Libs for test_sampling_layer
     unittest PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cublasMMWrapper memory_utils
-    DynamicDecodeLayer tensor cuda_utils logger
+    DynamicDecodeLayer cuda_utils logger
 )
 
 target_link_libraries( # Libs for test_tensor
-    unittest PUBLIC tensor cuda_utils logger)
-
-remove_definitions(-DTORCH_CUDA=1)
-add_executable(test_gemm test_gemm.cu)
-target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
+    unittest PUBLIC cuda_utils logger)
diff --git a/tests/csrc/unittests/test_gemm.cu b/tests/csrc/unittests/test_gemm.cu
deleted file mode 100644
index be7fed531d..0000000000
--- a/tests/csrc/unittests/test_gemm.cu
+++ /dev/null
@@ -1,1023 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "src/turbomind/layers/DenseWeight.h"
-#include "src/turbomind/utils/allocator.h"
-#include "src/turbomind/utils/cublasMMWrapper.h"
-#include "src/turbomind/utils/cuda_utils.h"
-#include "src/turbomind/utils/gemm.h"
-#include "src/turbomind/utils/logger.h"
-#include "src/turbomind/utils/memory_utils.h"
-
-using namespace turbomind;
-
-// Can be replaced by the function provided by a test framework
-
-class TestFailureError: public std::exception {
-private:
-    std::string msg_;
-
-public:
-    explicit TestFailureError() = default;
-    explicit TestFailureError(std::string name, std::string msg = "")
-    {
-        msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
-    }
-    const char* what() const throw()
-    {
-        return msg_.c_str();
-    }
-};
-
-#define EXPECT_TRUE(cond) \
-    do { \
-        if (!(cond)) { \
-            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
-            throw TestFailureError(__func__); \
-        } \
-    } while (false)
-
-#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
-    do { \
-        bool is_ok = checkResult<dtype, ctype>(name, out, ref); \
-        if (!is_ok) { \
-            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
-            throw TestFailureError(__func__); \
-        } \
-    } while (false)
-
-////////////////////////////////////////////////////////////////////////////////////
-
-// TensorWrapper is to handle a tensor object as well as its memory buffer,
-// because tensor.data is const we cannot set values.
-class TensorWrapper {
-private:
-    IAllocator* allocator;
-
-public:
-    std::vector<size_t> shape;
-    DataType type;
-    Tensor* tensor;
-    void* data;
-
-    TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false)
-    {
-        this->allocator = allocator;
-        this->type = dtype;
-        this->shape = shape;
-
-        size_t tensor_memsize = this->memsize();
-        this->data = this->allocator->malloc(tensor_memsize, false);
-        if (zero_init) {
-            check_cuda_error(cudaMemset(data, 0x0, tensor_memsize));
-        }
-        else {
-            setRandomValues();
-        }
-        this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data);
-    }
-
-    TensorWrapper(TensorWrapper const& other):
-        allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
-    {
-        TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
-    }
-    ~TensorWrapper()
-    {
-        delete tensor;
-        allocator->free((void**)(&data));
-    }
-
-    void setInvalidValues()
-    {
-        size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half);
-        size_t tensor_size = type_size * tensor->size();
-        // Fill by a random number to guarantee invalid values
-        check_cuda_error(cudaMemset(data, 0xdc, tensor_size));
-    }
-
-    void setRandomValues()
-    {
-        // random initialization
-        size_t num_elements = this->size();
-        switch (this->type) {
-            case TYPE_FP32:
-                cudaRandomUniform((float*)data, num_elements);
-                break;
-            case TYPE_FP16:
-                cudaRandomUniform((half*)data, num_elements);
-                break;
-            default:
-                // Will be added more if needed.
-                throw std::runtime_error("Not supported data type");
-        }
-    }
-
-    size_t size()
-    {
-        size_t n_elements = 1;
-        for (size_t s : this->shape) {
-            n_elements *= s;
-        }
-        return n_elements;
-    }
-
-    size_t memsize()
-    {
-        size_t type_size = 0;
-        switch (this->type) {
-            case TYPE_FP32:
-                type_size = sizeof(float);
-                break;
-            case TYPE_FP16:
-                type_size = sizeof(half);
-                break;
-            default:
-                throw std::runtime_error("Not supported data type.");
-        }
-        return type_size * this->size();
-    }
-};
-
-template<typename T, DataType computeType>
-void computeReference(GemmOp transa,
-                      GemmOp transb,
-                      TensorWrapper& C,
-                      TensorWrapper& A,
-                      TensorWrapper& B,
-                      float alpha = 1.0f,
-                      float beta = 0.0f)
-{
-    size_t m = C.shape[0];
-    size_t n = C.shape[1];
-    size_t k = A.shape[1];
-
-    size_t lda = (transa == GEMM_OP_N) ? k : m;
-    size_t ldb = (transb == GEMM_OP_N) ? n : k;
-    size_t ldc = n;
-
-    cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-    cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-    cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-    cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-
-    cublasHandle_t cublas_handle;
-    check_cuda_error(cublasCreate(&cublas_handle));
-
-    half h_alpha = (half)alpha;
-    half h_beta = (half)beta;
-    const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha;
-    const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta;
-
-    check_cuda_error(cublasGemmEx(cublas_handle,
-                                  getCublasOperation(transb),
-                                  getCublasOperation(transa),
-                                  n,
-                                  m,
-                                  k,
-                                  _alpha,
-                                  (const void*)B.data,
-                                  btype,
-                                  ldb,
-                                  (const void*)A.data,
-                                  atype,
-                                  lda,
-                                  _beta,
-                                  (void*)C.data,
-                                  ctype,
-                                  ldc,
-                                  compute_type,
-                                  CUBLAS_GEMM_DEFAULT));
-    check_cuda_error(cublasDestroy(cublas_handle));
-    cudaDeviceSynchronize();
-}
-
-bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
-{
-    // Params: a = value to compare and b = reference
-    // This function follows implementation of numpy.isclose(), which checks
-    // abs(a - b) <= (atol + rtol * abs(b)).
-    // Note that the inequality above is asymmetric where b is considered as
-    // a reference value. To account into both absolute/relative errors, it
-    // uses absolute tolerance and relative tolerance at the same time. The
-    // default values of atol and rtol borrowed from numpy.isclose(). For the
-    // case of nan value, the result will be true.
-    if (isnan(a) && isnan(b)) {
-        return true;
-    }
-    return fabs(a - b) <= (atol + rtol * fabs(b));
-}
-
-template<typename T>
-bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol)
-{
-    assert(out.type == ref.type);
-
-    size_t out_size = out.size();
-    size_t ref_size = ref.size();
-    T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size));
-    T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size));
-
-    cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost);
-    cudaDeviceSynchronize();
-
-    size_t failures = 0;
-    for (size_t i = 0; i < out_size; ++i) {
-        // The values for the output and the reference.
-        float a = (float)h_out[i];
-        float b = (float)h_ref[i];
-
-        bool ok = almostEqual(a, b, atol, rtol);
-        // Print the error.
-        if (!ok && failures < 4) {
-            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
-            TM_LOG_ERROR(">> found......: %10.6f", a);
-            TM_LOG_ERROR(">> expected...: %10.6f", b);
-            TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
-            TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
-        }
-
-        // Update the number of failures.
-        failures += ok ? 0 : 1;
-    }
-
-    // Allow not matched up to 1% elements.
-    size_t tol_failures = (size_t)(0.01 * out_size);
-    TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
-                name.c_str(),
-                failures <= tol_failures ? "OK" : "FAILED",
-                100. * failures / out_size,
-                atol,
-                rtol);
-    return failures <= tol_failures;
-}
-
-template<typename T, DataType computeType>
-bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref)
-{
-    float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
-    float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
-    bool is_ok = false;
-    if (sizeof(T) == 4) {
-        is_ok = _checkResult<float>(name, out, ref, atol, rtol);
-    }
-    else {
-        is_ok = _checkResult<half>(name, out, ref, atol, rtol);
-    }
-    return is_ok;
-}
-
-template<typename T, DataType computeType>
-bool checkResult(TensorWrapper& out, TensorWrapper& ref)
-{
-    return checkResult<T, computeType>("", out, ref);
-}
-
-template<typename T>
-std::string toString()
-{
-    std::string str = "dtype=";
-    str += std::is_same<T, float>::value ? "FP32" : "FP16";
-    return str;
-}
-
-template<typename T, DataType ctype>
-std::string toString()
-{
-    std::string str = "dtype=";
-    str += std::is_same<T, float>::value ? "FP32" : "FP16";
-    str += ", compute_type=";
-    str += (ctype == TYPE_FP32) ? "FP32" : "FP16";
-    return str;
-}
-
-std::string toString(GemmOp op)
-{
-    return op == GEMM_OP_N ?
"N" : "T"; -} - -struct GemmOpPair { - GemmOp transa; - GemmOp transb; -}; - -static const std::vector op_pairs{ - {GEMM_OP_N, GEMM_OP_N}, {GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_T, GEMM_OP_N}, {GEMM_OP_T, GEMM_OP_T}}; - -static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, size_t m, size_t n, size_t k) -{ - return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]", - func_name, - getGemmOpString(transa).c_str(), - getGemmOpString(transb).c_str(), - m, - n, - k); -} - -static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, size_t m, size_t n, size_t k) -{ - return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k); -} - -/////////////////////////////////// Unittests ////////////////////////////////////////// - -template -void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) -{ - TM_LOG_INFO( - "Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString().c_str()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - Allocator allocator(getDevice()); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - computeReference(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor); - - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_tensor.data, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - DenseWeight{(const T*)b_tensor.data, nullptr, nullptr}, - c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - } - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO( - "Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t cuda_dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(cuda_dtype, cuda_dtype, cuda_dtype, cuda_ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - - // Switch A/B because Gemm expects column major layout as cublas does. - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), - getCublasOperation(op_pair.transa), - n, - m, - k, - b_tensor.data, - ldb, - a_tensor.data, - lda, - expected.data, - ldc); - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_tensor.data, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - DenseWeight{(const T*)b_tensor.data, nullptr, nullptr}, - c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - } - - delete cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]", - m, - n, - k, - toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - // batch of in/out tensors - DataType a_type = getTensorType(); - DataType b_type = getTensorType(); - DataType c_type = getTensorType(); - std::vector a_tensors; - std::vector b_tensors; - std::vector c_tensors; - std::vector expecteds; - const size_t batch_size = 3; - for (size_t i = 0; i < batch_size; ++i) { - a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false)); - b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false)); - c_tensors.push_back(new TensorWrapper(&allocator, c_type, {m, n}, true)); - expecteds.push_back(new TensorWrapper(&allocator, c_type, {m, n}, true)); - } - - const T* hA[]{(const T*)a_tensors[0]->data, - (const T*)a_tensors[1]->data, - (const T*)a_tensors[2]->data, - nullptr, // for memory alignment. - (const T*)b_tensors[0]->data, - (const T*)b_tensors[1]->data, - (const T*)b_tensors[2]->data, - nullptr, // for memory alignment. - (const T*)c_tensors[0]->data, - (const T*)c_tensors[1]->data, - (const T*)c_tensors[2]->data, - nullptr, // for memory alignment. 
- (const T*)expecteds[0]->data, - (const T*)expecteds[1]->data, - (const T*)expecteds[2]->data}; - - T** batch_tensor_ptrs = reinterpret_cast(allocator.malloc(sizeof(T*) * 16, false)); - check_cuda_error(cudaMemcpyAsync((void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream)); - const void* const* batch_a = reinterpret_cast(batch_tensor_ptrs); - const void* const* batch_b = reinterpret_cast(batch_tensor_ptrs + 4); - void* const* batch_c = reinterpret_cast(batch_tensor_ptrs + 8); - void* const* batch_expected = reinterpret_cast(batch_tensor_ptrs + 12); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(dtype, dtype, dtype, ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_type, b_type, c_type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - - // Switch A/B because Gemm expects column major layout as cublas does. - cublas_wrapper.batchedGemm(getCublasOperation(op_pair.transb), // N - getCublasOperation(op_pair.transa), // T - n, - m, - k, - (const void* const*)batch_b, - ldb, - (const void* const*)batch_a, - lda, - (void* const*)batch_expected, - ldc, - batch_size); - - gemm->batchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - batch_a, - a_type, - lda, - batch_b, - b_type, - ldb, - batch_c, - c_type, - ldc, - batch_size); - for (size_t i = 0; i < batch_size; ++i) { - EXPECT_ALMOST_EQUAL( - tc_name + " api1 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]); - } - - for (size_t i = 0; i < batch_size; ++i) { - c_tensors[i]->setInvalidValues(); - } - gemm->batchedGemm( - op_pair.transa, op_pair.transb, m, n, k, batch_a, lda, batch_b, ldb, batch_c, ldc, batch_size); - for (size_t i = 0; i < batch_size; ++i) { - EXPECT_ALMOST_EQUAL( - tc_name + " api2 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]); - } - - for (size_t i = 0; i < batch_size; ++i) { - c_tensors[i]->setInvalidValues(); - } - gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, batch_a, batch_b, batch_c, batch_size); - for (size_t i = 0; i < batch_size; ++i) { - EXPECT_ALMOST_EQUAL( - tc_name + " api3 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]); - } - } - a_tensors.clear(); - b_tensors.clear(); - c_tensors.clear(); - expecteds.clear(); - delete cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, 
n=%ld, k=%ld, %s]", - batch_size, - m, - n, - k, - toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - DataType data_type = getTensorType(); - TensorWrapper a_tensor(&allocator, data_type, {batch_size, m, k}, false); - TensorWrapper b_tensor(&allocator, data_type, {batch_size, k, n}, false); - TensorWrapper c_tensor(&allocator, data_type, {batch_size, m, n}, true); - TensorWrapper expected(&allocator, data_type, {batch_size, m, n}, true); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(dtype, dtype, dtype, ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - - // Switch A/B because Gemm expects column major layout as cublas does. - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - - int64_t stridea = m * k; - int64_t strideb = k * n; - int64_t stridec = m * n; - - float alpha = 1.0f; - float beta = 0.0f; - - cublas_wrapper.stridedBatchedGemm(getCublasOperation(op_pair.transb), - getCublasOperation(op_pair.transa), - n, - m, - k, - alpha, - b_tensor.data, - getCublasDataType(b_tensor.type), - ldb, - strideb, - a_tensor.data, - getCublasDataType(a_tensor.type), - lda, - stridea, - beta, - expected.data, - getCublasDataType(expected.type), - ldc, - stridec, - batch_size, - getCublasDataType(computeType)); - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - stridea, - b_tensor.data, - b_tensor.type, - ldb, - strideb, - c_tensor.data, - c_tensor.type, - ldc, - stridec, - batch_size, - computeType, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - lda, - stridea, - b_tensor.data, - ldb, - strideb, - c_tensor.data, - ldc, - stridec, - batch_size, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - stridea, - b_tensor.data, - strideb, - c_tensor.data, - stridec, - batch_size, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - b_tensor.data, - c_tensor.data, - batch_size, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - } - - delete 
cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} - -#ifdef SPARSITY_ENABLED -// The current SpGemm only supports TYPE_FP16 for T, computeType, -// but let us keep these template variables for later use. -template -void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) -{ - TM_LOG_INFO( - "Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString().c_str()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - Allocator allocator(getDevice()); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - std::shared_ptr gemm = createGemm(&allocator, stream, true, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - // A/B will be switched in SpGemm. - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - - b_tensor.setRandomValues(); - pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - computeReference(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor); - - void* b_compressed; - compressMatrixB( - &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_compressed, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - DenseWeight{(const T*)b_tensor.data, nullptr, (const T*)b_compressed}, - c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - - allocator.free((void**)(&b_compressed)); - } - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", - m, - n, - k, - toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - 
check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t cu_dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t cu_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(cu_dtype, cu_dtype, cu_dtype, cu_ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, true, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - - b_tensor.setRandomValues(); - pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - - // Switch A/B because Gemm expects column major layout as cublas does. - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), - getCublasOperation(op_pair.transa), - n, - m, - k, - b_tensor.data, - ldb, - a_tensor.data, - lda, - expected.data, - ldc); - - void* b_compressed; - compressMatrixB( - &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_compressed, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - } - - delete cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} -#endif - -int main(int argc, char* argv[]) -{ - // testGemmCreate(); - using testcase_t = std::tuple; - - std::vector testcases = { - {16, 32, 64}, {255, 255, 255}, {1041, 2047, 9999}, {1041, 1, 9999}, {1041, 999, 1}}; - - // Computation correctness tests - for (testcase_t& tc : testcases) { - size_t m = std::get<0>(tc); - size_t n = std::get<1>(tc); - size_t k = std::get<2>(tc); - - testGemmCorrectnessMatmul(m, n, k); - testGemmCorrectnessMatmul(m, n, k); - testGemmCorrectnessMatmul(m, n, k); - - testGemmConsistencyMatmul(m, n, k); - testGemmConsistencyMatmul(m, n, k); - testGemmConsistencyMatmul(m, n, k); - - testGemmConsistencyBatchedMatmul(m, n, k); - testGemmConsistencyBatchedMatmul(m, n, k); - testGemmConsistencyBatchedMatmul(m, n, k); - - testGemmConsistencyStridedBatchedMatmul(7, m, n, k); - testGemmConsistencyStridedBatchedMatmul(7, m, n, k); - testGemmConsistencyStridedBatchedMatmul(7, m, n, k); - } - -#ifdef SPARSITY_ENABLED - // Reset for SpGemm test. 
- testcases.clear(); - testcases.insert(testcases.end(), - {{8, 32, 32}, // minimum possible example. - {8, 32, 64}, - {64, 64, 64}, - {16, 32, 64}, - {1024, 32, 1024}, - {1024, 1024, 32}, - {16, 1024, 1024}, - {1024, 1024, 1024}}); - - for (testcase_t& tc : testcases) { - size_t m = std::get<0>(tc); - size_t n = std::get<1>(tc); - size_t k = std::get<2>(tc); - testSpGemmCorrectnessMatmul(m, n, k); - testSpGemmConsistencyMatmul(m, n, k); - } -#endif - TM_LOG_INFO("Test done"); - return 0; -} diff --git a/tests/csrc/unittests/test_int8.cu b/tests/csrc/unittests/test_int8.cu deleted file mode 100644 index 6831c56ea1..0000000000 --- a/tests/csrc/unittests/test_int8.cu +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "src/turbomind/kernels/transpose_int8_kernels.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/memory_utils.h" - -#include -#include -#include - -#include "gtest_utils.h" - -using namespace turbomind; - -class Int8TestSuite: public FtTestBase { - -public: - void SetUp() override - { - FtTestBase::SetUp(); - } - void TearDown() override - { - FtTestBase::TearDown(); - } - -protected: - using FtTestBase::stream; - using FtTestBase::allocator; - - struct cudaDeviceProp prop; - - void testTransposition(); -}; - -void fill_tensor_random(Tensor a) -{ - const size_t num_elems = a.size(); - std::vector host_values(num_elems); - std::uniform_int_distribution int8_random(-128, 127); - std::mt19937 rng(0); - - std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); }); - cudaH2Dcpy(a.getPtr(), host_values.data(), num_elems); -} - -void reference_transpose_host(std::vector& a_t_host, const Tensor& a) -{ - std::vector a_host(a.size()); - cudaD2Hcpy(a_host.data(), a.getPtr(), a.size()); - - for (unsigned int i = 0; i < a.shape[0]; i++) { - for (unsigned int j = 0; j < a.shape[1]; j++) { - a_t_host[j * a.shape[0] + i] = a_host[i * a.shape[1] + j]; - } - } -} - -void Int8TestSuite::testTransposition() -{ - const int m = 32; - const int k = 2048; - const int n = 2048; - - int8_t *a_data, *a_t_data; - - cudaMalloc(&a_data, m * k * sizeof(int8_t)); - Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data}; - fill_tensor_random(a); - - cudaMalloc(&a_t_data, k * m * sizeof(int8_t)); - Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data}; - - std::vector a_t_host_ref(a_t.size()); - reference_transpose_host(a_t_host_ref, a); - - invokeTransposeInt8Tensor(a_t, a); - bool result = checkResult("", a_t.getPtr(), a_t_host_ref.data(), a_t.size()); - - cudaFree(a_data); - cudaFree(a_t_data); - - EXPECT_TRUE(result); -} - -TEST_F(Int8TestSuite, TranspositionCorrectness) -{ - this->testTransposition(); -} diff --git a/tests/csrc/unittests/test_tensor.cu b/tests/csrc/unittests/test_tensor.cu deleted file mode 100644 index 4211ed3409..0000000000 --- a/tests/csrc/unittests/test_tensor.cu +++ /dev/null @@ -1,256 +0,0 @@ -#include -#include -#include - -#include - -#include "src/turbomind/utils/Tensor.h" - -using namespace turbomind; - -namespace { - -#define EXPECT_EQUAL_TENSORS(t1, t2) \ - do { \ - EXPECT_TRUE(t1.where == t2.where); \ - EXPECT_TRUE(t1.type == t2.type); \ - EXPECT_TRUE(t1.shape == t2.shape); \ - EXPECT_TRUE(t1.data == t2.data); \ - } while (false) - -TEST(TensorMapTest, HasKeyCorrectness) -{ - bool* v1 = new bool(true); - float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; - Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, 
v1}; - Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2}; - - TensorMap map({{"t1", t1}, {"t2", t2}}); - EXPECT_TRUE(map.isExist("t1")); - EXPECT_TRUE(map.isExist("t2")); - EXPECT_FALSE(map.isExist("t3")); - - delete v1; - delete[] v2; -} - -TEST(TensorMapTest, InsertCorrectness) -{ - int* v1 = new int[4]{1, 10, 20, 30}; - float* v2 = new float[2]{1.0f, 2.0f}; - Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); - Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2); - - TensorMap map({{"t1", t1}}); - EXPECT_TRUE(map.size() == 1); - EXPECT_TRUE(map.isExist("t1")); - EXPECT_EQUAL_TENSORS(map.at("t1"), t1); - EXPECT_FALSE(map.isExist("t2")); -} - -TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) -{ - TensorMap map; - EXPECT_TRUE(map.size() == 0); - // forbid a none tensor. - EXPECT_THROW(map.insert("none", {}), std::runtime_error); - - // forbid a tensor having null data pointer. - Tensor none_data_tensor = Tensor(MEMORY_CPU, TYPE_INT32, {}, nullptr); - EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error); -} - -TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) -{ - int* v1 = new int[4]{1, 10, 20, 30}; - Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); - Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1); - TensorMap map({{"t1", t1}}); - EXPECT_TRUE(map.size() == 1); - // forbid a duplicated key. - EXPECT_THROW(map.insert("t1", t2), std::runtime_error); - delete[] v1; -} - -TEST(TensorMapTest, GetValCorrectness) -{ - int* v1 = new int[4]{1, 10, 20, 30}; - Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); - - TensorMap map({{"t1", t1}}); - EXPECT_TRUE(map.size() == 1); - // throw exception since the map doesn't have a key "t3". - EXPECT_THROW(map.getVal("t3"), std::runtime_error); - EXPECT_TRUE(map.getVal("t1") == 1); - EXPECT_TRUE(map.getVal("t1", 3) == 1); - - // map doesn't have t2 so return the default value 3. - EXPECT_TRUE(map.getVal("t2", 3) == 3); - - v1[0] += 1; // update value. 
- EXPECT_TRUE(map.getVal("t1") == 2); - EXPECT_TRUE(map.getVal("t1", 3) == 2); - - size_t index = 2; - EXPECT_TRUE(map.getValWithOffset("t1", index) == 20); - EXPECT_TRUE(map.getValWithOffset("t1", index, 3) == 20); - EXPECT_TRUE(map.getValWithOffset("t2", index, 3) == 3); - delete[] v1; -} - -TEST(TensorMapTest, GetTensorCorrectness) -{ - bool* t1_val = new bool(true); - float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; - Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val}; - Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val}; - - int* default_val = new int[4]{0, 1, 2, 3}; - Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val}; - - TensorMap map({{"t1", t1}, {"t2", t2}}); - EXPECT_THROW(map.at("t3"), std::runtime_error); - EXPECT_EQUAL_TENSORS(map.at("t1", default_tensor), t1); - EXPECT_EQUAL_TENSORS(map.at("t2", default_tensor), t2); - EXPECT_EQUAL_TENSORS(map.at("t3", default_tensor), default_tensor); - EXPECT_EQUAL_TENSORS(map.at("t3", {}), Tensor()); - - delete[] default_val; - delete[] t2_val; - delete[] t1_val; -} - -TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) -{ - bool* t1_val = new bool(true); - float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; - Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val}; - Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val}; - - int* default_val = new int[4]{0, 1, 2, 3}; - Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val}; - - const TensorMap map({{"t1", t1}, {"t2", t2}}); - EXPECT_THROW(map.at("t3"), std::runtime_error); - EXPECT_EQUAL_TENSORS(map.at("t1", default_tensor), t1); - EXPECT_EQUAL_TENSORS(map.at("t2", default_tensor), t2); - EXPECT_EQUAL_TENSORS(map.at("t3", default_tensor), default_tensor); - EXPECT_EQUAL_TENSORS(map.at("t3", {}), Tensor()); - - delete[] default_val; - delete[] t2_val; - delete[] t1_val; -} - -TEST(TensorTest, EmptyTensorMinMaxRaiseError) -{ - Tensor t1; - EXPECT_THROW(t1.min(), std::runtime_error); - EXPECT_THROW(t1.max(), std::runtime_error); - - Tensor t2 = Tensor{MEMORY_CPU, TYPE_INT32, {}, nullptr}; - EXPECT_THROW(t2.min(), std::runtime_error); - EXPECT_THROW(t2.max(), std::runtime_error); -} - -using TensorTypes = testing::Types; - -template -class TensorFuncTest: public testing::Test {}; - -TYPED_TEST_SUITE(TensorFuncTest, TensorTypes); - -TYPED_TEST(TensorFuncTest, MaxCorrectness) -{ - using T = TypeParam; - - size_t size = 4; - - T* v1 = new T[size]{T(1), T(2), T(3), T(4)}; - T* v2 = new T[size]{T(4), T(3), T(2), T(1)}; - T* v3 = new T[size]{T(1), T(2), T(4), T(3)}; - - Tensor t1 = Tensor(MEMORY_CPU, getTensorType(), {size}, v1); - Tensor t2 = Tensor(MEMORY_CPU, getTensorType(), {size}, v2); - Tensor t3 = Tensor(MEMORY_CPU, getTensorType(), {size}, v3); - - EXPECT_EQ(t1.max(), T(4)); - EXPECT_EQ(t2.max(), T(4)); - EXPECT_EQ(t3.max(), T(4)); - - delete[] v1; - delete[] v2; - delete[] v3; -} - -TYPED_TEST(TensorFuncTest, MinCorrectness) -{ - using T = TypeParam; - - size_t size = 4; - - T* v1 = new T[size]{T(1), T(2), T(3), T(4)}; - T* v2 = new T[size]{T(4), T(3), T(2), T(1)}; - T* v3 = new T[size]{T(1), T(2), T(4), T(3)}; - - Tensor t1 = Tensor(MEMORY_CPU, getTensorType(), {size}, v1); - Tensor t2 = Tensor(MEMORY_CPU, getTensorType(), {size}, v2); - Tensor t3 = Tensor(MEMORY_CPU, getTensorType(), {size}, v3); - - EXPECT_EQ(t1.min(), T(1)); - EXPECT_EQ(t2.min(), T(1)); - EXPECT_EQ(t3.min(), T(1)); - - delete[] v1; - delete[] v2; - delete[] v3; -} - -TYPED_TEST(TensorFuncTest, AnyCorrectness) -{ - using T = 
TypeParam; - - T* v = new T[4]{T(1), T(2), T(3), T(4)}; - Tensor t = Tensor{MEMORY_CPU, getTensorType(), {4}, v}; - EXPECT_TRUE(t.any(T(1))); - EXPECT_FALSE(t.any(T(5))); - delete[] v; -} - -TYPED_TEST(TensorFuncTest, AllCorrectness) -{ - using T = TypeParam; - - constexpr size_t size = 4; - T* v1 = new T[size]{T(1), T(1), T(1), T(1)}; - T* v2 = new T[size]{T(1), T(1), T(1), T(2)}; - Tensor t1 = Tensor{MEMORY_CPU, getTensorType(), {size}, v1}; - Tensor t2 = Tensor{MEMORY_CPU, getTensorType(), {size}, v2}; - EXPECT_TRUE(t1.all(T(1))); - EXPECT_FALSE(t2.all(T(2))); - delete[] v1; - delete[] v2; -} - -TYPED_TEST(TensorFuncTest, SliceCorrectness) -{ - using T = TypeParam; - - constexpr int size = 12; - T* v = new T[size]; - for (int i = 0; i < size; ++i) { - v[i] = i; - } - - DataType dtype = getTensorType(); - Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v); - Tensor t2 = t1.slice({2, 4}, 4); - - EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4])); - // An overflowed tensor throws an exception. - EXPECT_THROW(t1.slice({2, 4}, 5), std::runtime_error); - - delete[] v; -} - -} // end of namespace
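For reference, the core check exercised by the deleted int8 GEMM benchmark above is the per-token/per-channel dequantization convention: the kernel scales the int32 accumulator by alpha_row[i] * alpha_col[j], while the torch reference scales the int32 matmul by the outer product of the two scale vectors, and the results are compared with a numpy.isclose-style tolerance. The stand-alone CPU sketch below is illustrative only (plain C++, hypothetical names, no torch/CUDA dependency, not part of this diff); it spells out that convention on toy data.

// Illustrative CPU sketch of the dequantization check from the deleted int8 GEMM test.
// All names here are hypothetical; the real test ran the cutlass int8 kernel on GPU.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// numpy.isclose-style check: abs(a - b) <= atol + rtol * abs(b)
static bool almost_equal(float a, float b, float atol = 1e-5f, float rtol = 1e-3f)
{
    return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}

int main()
{
    const int m = 4, n = 8, k = 16;
    std::vector<int8_t> x(m * k), w(k * n);
    std::vector<float>  alpha_row(m), alpha_col(n);  // per-token / per-channel scales

    // Deterministic toy data in place of torch::randint.
    for (int i = 0; i < m * k; ++i) x[i] = static_cast<int8_t>((i * 7) % 255 - 127);
    for (int i = 0; i < k * n; ++i) w[i] = static_cast<int8_t>((i * 5) % 255 - 127);
    for (int i = 0; i < m; ++i) alpha_row[i] = 0.01f * (1 + i % 9);
    for (int j = 0; j < n; ++j) alpha_col[j] = 0.01f * (1 + j % 9);

    // Outer product of the scale vectors, analogous to the reference's
    // matmul(alpha_row, alpha_col) scale matrix.
    std::vector<float> scale(m * n);
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
            scale[i * n + j] = alpha_row[i] * alpha_col[j];

    bool ok = true;
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            int32_t acc = 0;  // int8 x int8 products accumulated in int32
            for (int p = 0; p < k; ++p)
                acc += int32_t(x[i * k + p]) * int32_t(w[p * n + j]);
            // Kernel-side convention: scale the accumulator by the two 1-D scale vectors.
            float y_kernel = float(acc) * alpha_row[i] * alpha_col[j];
            // Reference-side convention: scale the int32 matmul by the outer-product matrix.
            float y_ref = float(acc) * scale[i * n + j];
            ok = ok && almost_equal(y_kernel, y_ref);
        }
    }
    std::printf("%s\n", ok ? "SUCCESS" : "FAILED");
    return ok ? 0 : 1;
}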