diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml index d3339ac15f..078eb7e4af 100644 --- a/.github/workflows/windows-x64-gpu.yml +++ b/.github/workflows/windows-x64-gpu.yml @@ -50,11 +50,11 @@ jobs: INPUT_CUDA_VERSION: ${{ matrix.cudaver }} - name: Build wheel run: | - $env:BUILD_TEST="ON" + $env:BUILD_TEST="OFF" mkdir build cd build ..\builder\windows\generate.ps1 - cmake --build . --config Release -- /m /v:q + cmake --build . --config Release -- /m /v:n if (-Not $?) { echo "build failed" exit 1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 34d04f7a06..5754f00dbd 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,13 +15,16 @@ cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # for PyTorch extensions, version should be greater than 3.13 project(TurboMind LANGUAGES CXX CUDA) -find_package(CUDA 10.2 REQUIRED) +if (MSVC) + # use standard conformant preprocessor + add_compile_options($<$:/Zc:preprocessor>) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/Zc:preprocessor") +endif () find_package(CUDAToolkit REQUIRED) -if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11") +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11") add_definitions("-DENABLE_BF16") - message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag") endif() set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) @@ -29,8 +32,11 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) option(BUILD_MULTI_GPU "Build multi-gpu support" ON) option(BUILD_PY_FFI "Build python ffi" ON) option(BUILD_TEST "Build tests" OFF) +option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) +option(BUILD_FAST_MATH "Build in fast math mode" ON) include(FetchContent) + if (BUILD_TEST) FetchContent_Declare( repo-cutlass @@ -45,6 +51,14 @@ if (BUILD_TEST) set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include) set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include) + + + FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG v3.8.0 + ) + FetchContent_MakeAvailable(Catch2) endif() FetchContent_Declare( @@ -56,10 +70,6 @@ set(YAML_BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library of yaml-cpp") FetchContent_MakeAvailable(yaml-cpp) -option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) - -option(BUILD_FAST_MATH "Build in fast math mode" ON) - # the environment variable # ASAN_OPTIONS=protect_shadow_gap=0,intercept_tls_get_addr=0 # must be set at runtime @@ -112,13 +122,13 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") # -Xptxas -v # TODO: build for sm_72 & sm_87 on aarch64 platform (Jetson devices) if (NOT CMAKE_CUDA_ARCHITECTURES) set(CMAKE_CUDA_ARCHITECTURES 70-real 75-real) - if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11") + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11") list(APPEND CMAKE_CUDA_ARCHITECTURES 80-real) endif () - if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.1") + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.1") list(APPEND CMAKE_CUDA_ARCHITECTURES 86-real) endif () - if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8") + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.8") list(APPEND CMAKE_CUDA_ARCHITECTURES 89-real 90-real) endif () if (MSVC) @@ -132,19 +142,23 @@ set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") set(CMAKE_CXX_FLAGS_DEBUG 
"${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") # set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage") -set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall -DCUDA_PTX_FP8_F2FP_ENABLED") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall") set(CMAKE_CXX_STANDARD "${CXX_STD}") set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD} -DCUDA_PTX_FP8_F2FP_ENABLED") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD}") + +string(REPLACE "-O2" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") +string(REPLACE "-O2" "" CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}") +string(REPLACE "-O2" "" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") +string(REPLACE "-O2" "" CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") -# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose") -set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 -DCUDA_PTX_FP8_F2FP_ENABLED") -set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO} -Xcompiler -O3 -DCUDA_PTX_FP8_F2FP_ENABLED") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3") +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -O3") +set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${CMAKE_CUDA_FLAGS_RELWITHDEBINFO} -O3") if(BUILD_FAST_MATH) set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} --use_fast_math") @@ -207,13 +221,11 @@ link_directories( ${COMMON_LIB_DIRS} ) -# add_subdirectory(3rdparty) add_subdirectory(src) -# add_subdirectory(examples) -if(BUILD_TEST) - add_subdirectory(tests/csrc) -endif() +# if(BUILD_TEST) +# add_subdirectory(tests/csrc) +# endif() # install python api if (BUILD_PY_FFI) diff --git a/builder/windows/generate.ps1 b/builder/windows/generate.ps1 index 96dbbc70bd..0c133b37d0 100644 --- a/builder/windows/generate.ps1 +++ b/builder/windows/generate.ps1 @@ -3,6 +3,5 @@ cmake .. 
-A x64 -T "v142,cuda=$env:CUDA_PATH" ` -DCMAKE_INSTALL_PREFIX=install ` -DBUILD_PY_FFI=ON ` -DBUILD_MULTI_GPU=OFF ` - -DCMAKE_CUDA_FLAGS="-lineinfo" ` - -DUSE_NVTX=ON ` + -DUSE_NVTX=OFF ` -DBUILD_TEST="$env:BUILD_TEST" diff --git a/builder/windows/setup_cuda.ps1 b/builder/windows/setup_cuda.ps1 index b573198ce2..5615aba84a 100644 --- a/builder/windows/setup_cuda.ps1 +++ b/builder/windows/setup_cuda.ps1 @@ -24,6 +24,8 @@ if ($CUDA_VERSION_FULL -eq "12.1.0") { $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_531.14_windows.exe" } elseif ($CUDA_VERSION_FULL -eq "11.8.0") { $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe" +} elseif ($CUDA_VERSION_FULL -eq "12.5.0") { + $downloadUrl = "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.85_windows.exe" } else { Write-Output "Unsupported CUDA version specified" exit 1 @@ -84,6 +86,8 @@ $msBuildExtensions = (Get-ChildItem "$src\visual_studio_integration\CUDAVisualS } } +$CUDA_FLAGS="-allow-unsupported-compiler -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH=1" + # Add to Github env Write-Output "Setting environment variables for GitHub Actions..." @@ -97,7 +101,7 @@ Write-Output "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)=$dst" >> $env:GITHUB_ENV Write-Output "CUDA_PATH_VX_Y=CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)" >> $env:GITHUB_ENV Write-Output "CudaToolkitDir=$dst" >> $env:GITHUB_ENV Write-Output "CMAKE_CUDA_COMPILER=$dst\bin\nvcc.exe" >> $env:GITHUB_ENV -Write-Output "NVCC_APPEND_FLAGS=-allow-unsupported-compiler" >> $env:GITHUB_ENV +Write-Output "NVCC_APPEND_FLAGS=$CUDA_FLAGS" >> $env:GITHUB_ENV Write-Output "CUDA_VERSION=$CUDA_VERSION_FULL" >> $env:GITHUB_ENV Write-Output "Setup completed." 
diff --git a/lmdeploy/turbomind/deploy/module.py b/lmdeploy/turbomind/deploy/module.py index 7b2bc5db6f..53e2f6b7e1 100644 --- a/lmdeploy/turbomind/deploy/module.py +++ b/lmdeploy/turbomind/deploy/module.py @@ -319,7 +319,8 @@ def pad_weight(tensor: torch.Tensor, tp: int): if output_weight is not None: tp = self.model.attn_tp_size output_weight = pad_weight(output_weight, tp=tp) - self.model.save_split(output_weight, 'output.weight', split_dim=0, split_num=tp) + # transpose + self.model.save_split(output_weight.t(), 'output.weight', split_dim=1, split_num=tp) class Transformer: diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 8d43923109..3ff1dc1436 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -241,7 +241,7 @@ def _from_hf(self, model_source: ModelSource, model_path: str, engine_config: Tu model_comm = _tm.AbstractTransformerModel.create_llama_model(model_dir='', config=yaml.safe_dump(self.config_dict), - data_type=self.config.model_config.weight_type) + weight_type=self.config.model_config.weight_type) # create empty weight self._create_weight(model_comm) @@ -275,7 +275,7 @@ def _from_workspace(self, model_path: str, engine_config: TurbomindEngineConfig) weight_dir = osp.join(model_path, 'triton_models', 'weights') model_comm = _tm.AbstractTransformerModel.create_llama_model(model_dir=weight_dir, config=yaml.safe_dump(self.config_dict), - data_type=self.config.weight_type) + weight_type=self.config.weight_type) # create weight and load params self._create_weight(model_comm) diff --git a/src/turbomind/CMakeLists.txt b/src/turbomind/CMakeLists.txt index b4f1033e67..df86f40ea6 100644 --- a/src/turbomind/CMakeLists.txt +++ b/src/turbomind/CMakeLists.txt @@ -13,6 +13,7 @@ # limitations under the License. 
add_subdirectory(utils) +add_subdirectory(core) add_subdirectory(kernels) add_subdirectory(layers) add_subdirectory(comm) diff --git a/src/turbomind/comm/CMakeLists.txt b/src/turbomind/comm/CMakeLists.txt index 43a2dacf21..6e5c772c46 100644 --- a/src/turbomind/comm/CMakeLists.txt +++ b/src/turbomind/comm/CMakeLists.txt @@ -3,10 +3,11 @@ cmake_minimum_required(VERSION 3.8) add_library(host_comm STATIC host_comm.cc thread_comm.cc) +target_link_libraries(host_comm PRIVATE core logger) set_property(TARGET host_comm PROPERTY POSITION_INDEPENDENT_CODE ON) add_library(device_comm STATIC device_comm.cc) -target_link_libraries(device_comm PRIVATE logger) +target_link_libraries(device_comm PRIVATE core logger) set_property(TARGET device_comm PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET device_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) @@ -21,7 +22,7 @@ if (BUILD_MULTI_GPU) if (BUILD_TEST) add_executable(test_comm test_comm.cu) - target_link_libraries(test_comm PRIVATE device_comm host_comm pthread nvtx_utils) + target_link_libraries(test_comm PRIVATE device_comm host_comm core pthread nvtx_utils) target_compile_options(test_comm PRIVATE -O3 -march=native -mtune=native) endif () endif () diff --git a/src/turbomind/comm/cuda_ipc/CMakeLists.txt b/src/turbomind/comm/cuda_ipc/CMakeLists.txt index 948d75c94e..7cc07c11db 100644 --- a/src/turbomind/comm/cuda_ipc/CMakeLists.txt +++ b/src/turbomind/comm/cuda_ipc/CMakeLists.txt @@ -12,6 +12,8 @@ add_library(cuda_ipc_comm STATIC target_link_libraries(cuda_ipc_comm PRIVATE rms_norm host_comm + core + cuda_utils CUDA::cuda_driver logger) diff --git a/src/turbomind/comm/cuda_ipc/allgather.cu b/src/turbomind/comm/cuda_ipc/allgather.cu index 94d0ebe1f9..f71bae395c 100644 --- a/src/turbomind/comm/cuda_ipc/allgather.cu +++ b/src/turbomind/comm/cuda_ipc/allgather.cu @@ -4,7 +4,6 @@ #include "src/turbomind/comm/cuda_ipc/device_semaphore.h" #include "src/turbomind/kernels/core/meta.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind::comm { @@ -51,7 +50,7 @@ __global__ void __launch_bounds__(1024, 1) Allgather_Simple_Pull(T* void CudaIpcCommImpl::AllGather( const void* sendbuff, void* recvbuff, size_t sendcount, DataType type, int group, cudaStream_t stream) { - const size_t bytesize = get_elem_size(type) * sendcount; + const size_t bytesize = turbomind::byte_size(type) * sendcount; const int peers = this->n_ranks(group) - 1; const int rank = this->rank(group); @@ -165,9 +164,9 @@ void CudaIpcCommImpl::AllGather2D(const void* sendbuff, int group, cudaStream_t stream) { - const size_t byte_width = get_elem_size(type) * width; - const size_t byte_pitch = get_elem_size(type) * pitch; - const size_t byte_stride = get_elem_size(type) * stride; + const size_t byte_width = byte_size(type, width); + const size_t byte_pitch = byte_size(type, pitch); + const size_t byte_stride = byte_size(type, stride); void* base{}; size_t offset{}; diff --git a/src/turbomind/comm/cuda_ipc/allreduce.cu b/src/turbomind/comm/cuda_ipc/allreduce.cu index 8461252a66..631aa1f212 100644 --- a/src/turbomind/comm/cuda_ipc/allreduce.cu +++ b/src/turbomind/comm/cuda_ipc/allreduce.cu @@ -6,9 +6,9 @@ #include "src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h" #include "src/turbomind/comm/cuda_ipc/device_semaphore.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/meta.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" @@ 
-423,14 +423,7 @@ void CudaIpcCommImpl::AllReduceSum( } }; - switch (type) { - case DataType::TYPE_FP16: - return invoke(half{}); - case DataType::TYPE_BF16: - return invoke(nv_bfloat16{}); - default: - throw std::runtime_error("not implemented"); - } + TM_DISPATCH_PRIMARY_DTYPES(type, invoke); } } // namespace turbomind::comm diff --git a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu index 0d229c58f0..7c0dde00af 100644 --- a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu +++ b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.cu @@ -1,8 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. #include -#include -#include +#include #include #include diff --git a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h index ba820bfc7a..f985f12d25 100644 --- a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h +++ b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h @@ -10,7 +10,6 @@ #include "src/turbomind/kernels/core/array.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind::comm { diff --git a/src/turbomind/comm/cuda_ipc/fused_allreduce.cu b/src/turbomind/comm/cuda_ipc/fused_allreduce.cu index 4948065e04..23e84cfbdf 100644 --- a/src/turbomind/comm/cuda_ipc/fused_allreduce.cu +++ b/src/turbomind/comm/cuda_ipc/fused_allreduce.cu @@ -8,13 +8,13 @@ #include "src/turbomind/comm/cuda_ipc/device_semaphore.h" #include "src/turbomind/comm/cuda_ipc/group_sum.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/meta.h" #include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind::comm { @@ -424,7 +424,7 @@ void CudaIpcCommImpl::AllreduceResidualBiasRMSnorm(void* hidden, cudaStream_t stream) { - const size_t elemsize = get_elem_size(dtype); + const size_t elemsize = byte_size(dtype); const size_t bytesize = elemsize * token_num * dim; const int n_ranks = this->n_ranks(group); @@ -504,19 +504,10 @@ void CudaIpcCommImpl::AllreduceResidualBiasRMSnorm(void* hidden, return false; // > 1024 vdim }; - auto dispatch = [&] { - switch (dtype) { - case DataType::TYPE_FP16: - return dispatch_D(half{}); - case DataType::TYPE_BF16: - return dispatch_D(nv_bfloat16{}); - default: - return false; - } - }; + auto dispatch = [&]() -> bool { TM_DISPATCH_PRIMARY_DTYPES_RET(dtype, dispatch_D); }; if (bytesize > (1 << 19)) { - if (auto success = dispatch()) { + if (dispatch()) { return; } } diff --git a/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu b/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu index 3340000777..a57172e60e 100644 --- a/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu +++ b/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu @@ -5,6 +5,7 @@ #include "src/turbomind/comm/cuda_ipc/group_sum.h" #include "src/turbomind/comm/cuda_ipc/mscclpp.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/meta.h" @@ -279,18 +280,11 @@ void CudaIpcCommImpl::AllreduceResidualBiasRMSnormEx(void* hidden, return false; // > 1024 vdim }; - auto dispatch = [&] { - switch (dtype) { - case DataType::TYPE_FP16: - return dispatch_D(half{}); - case DataType::TYPE_BF16: - return dispatch_D(nv_bfloat16{}); - default: - return false; - } + auto dispatch = [&]() -> bool { // + 
TM_DISPATCH_PRIMARY_DTYPES_RET(dtype, dispatch_D); }; - FT_CHECK(dispatch()); + TM_CHECK(dispatch()); } } // namespace turbomind::comm diff --git a/src/turbomind/comm/device_comm.cc b/src/turbomind/comm/device_comm.cc index 8e35d9d22c..8217d9c298 100644 --- a/src/turbomind/comm/device_comm.cc +++ b/src/turbomind/comm/device_comm.cc @@ -25,7 +25,7 @@ DeviceComm CreateDeviceCommunicator(const std::string& backend, int n_ranks, int } #endif - FT_CHECK_WITH_INFO(0, fmtstr("Unknown communication backend: %s", backend.c_str())); + TM_CHECK(0) << "Unknown communication backend: " << backend; return {}; } diff --git a/src/turbomind/comm/device_comm.h b/src/turbomind/comm/device_comm.h index 52045cbb03..d68ebdc4da 100644 --- a/src/turbomind/comm/device_comm.h +++ b/src/turbomind/comm/device_comm.h @@ -9,7 +9,6 @@ #include #include "src/turbomind/comm/host_comm.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind::comm { diff --git a/src/turbomind/comm/host_comm.h b/src/turbomind/comm/host_comm.h index 5cf35d7b28..b036142264 100644 --- a/src/turbomind/comm/host_comm.h +++ b/src/turbomind/comm/host_comm.h @@ -6,8 +6,9 @@ #include #include #include +#include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/data_type.h" namespace turbomind::comm { @@ -79,12 +80,12 @@ template void Broadcast(HostCommImpl* comm, T* data, int n, int root) { if constexpr (std::is_trivially_copyable_v) { - comm->Broadcast((char*)data, sizeof(T) * n, TYPE_INT8, root, detail::copy_fn); + comm->Broadcast(data, sizeof(T) * n, data_type_v, root, detail::copy_fn); } else { if (comm->is_same_process()) { /// TODO: Constness should be considered - comm->Broadcast(data, n, TYPE_INVALID, root, detail::copy_fn); + comm->Broadcast(data, n, kNull, root, detail::copy_fn); } else { throw std::runtime_error("not implemented"); @@ -96,12 +97,12 @@ template void AllGather(HostCommImpl* comm, T* data, int n) { if constexpr (std::is_trivially_copyable_v) { - comm->AllGather(data, sizeof(T) * n, TYPE_INT8, detail::copy_fn); + comm->AllGather(data, sizeof(T) * n, data_type_v, detail::copy_fn); } else { if (comm->is_same_process()) { /// TODO: Constness should be considered - comm->AllGather(data, n, TYPE_INVALID, detail::copy_fn); + comm->AllGather(data, n, kNull, detail::copy_fn); } else { /// serialize data @@ -113,7 +114,7 @@ void AllGather(HostCommImpl* comm, T* data, int n) template void AllReduce(HostCommImpl* comm, T* data, int n, RedOp red_op) { - comm->AllReduce(data, n, getTensorType(), red_op); + comm->AllReduce(data, n, data_type_v, red_op); } ////////////////////////////////////////////////////////////////////////////////// diff --git a/src/turbomind/comm/nccl/CMakeLists.txt b/src/turbomind/comm/nccl/CMakeLists.txt index 4a6e8d71a7..ceddbfc3d3 100644 --- a/src/turbomind/comm/nccl/CMakeLists.txt +++ b/src/turbomind/comm/nccl/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.8) add_library(nccl_comm STATIC nccl.cu) -target_link_libraries(nccl_comm PRIVATE rms_norm ${NCCL_LIBRARIES} logger) +target_link_libraries(nccl_comm PRIVATE rms_norm core ${NCCL_LIBRARIES} logger) target_include_directories(nccl_comm PRIVATE ${NCCL_INCLUDE_DIRS}) set_property(TARGET nccl_comm PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/src/turbomind/comm/nccl/nccl.cu b/src/turbomind/comm/nccl/nccl.cu index 5a02d5b51e..804dfaaa46 100644 --- a/src/turbomind/comm/nccl/nccl.cu +++ b/src/turbomind/comm/nccl/nccl.cu @@ -10,7 +10,6 @@ #include "src/turbomind/comm/device_comm.h" #include 
"src/turbomind/comm/host_comm.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/string_utils.h" @@ -33,16 +32,16 @@ namespace turbomind::comm { -static inline ncclDataType_t getNcclDataType(DataType type) +static inline ncclDataType_t to_nccl_dtype(DataType type) { switch (type) { - case DataType::TYPE_FP32: + case kFloat32: return ncclFloat; - case DataType::TYPE_FP16: + case kFloat16: return ncclHalf; - case DataType::TYPE_BF16: + case kBfloat16: return ncclBfloat16; - case DataType::TYPE_UINT8: + case kUint8: return ncclUint8; default: throw std::runtime_error("not supported"); @@ -166,7 +165,7 @@ public: const void* sendbuff, void* recvbuff, size_t count, DataType type, int group, cudaStream_t stream) override { NCCLCHECK(ncclGroupStart()); - NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, getNcclDataType(type), ncclSum, groups_.at(group), stream)); + NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, to_nccl_dtype(type), ncclSum, groups_.at(group), stream)); NCCLCHECK(ncclGroupEnd()); } @@ -174,7 +173,7 @@ public: const void* sendbuff, void* recvbuff, size_t sendcount, DataType type, int group, cudaStream_t stream) override { NCCLCHECK(ncclGroupStart()); - NCCLCHECK(ncclAllGather(sendbuff, recvbuff, sendcount, getNcclDataType(type), groups_.at(group), stream)); + NCCLCHECK(ncclAllGather(sendbuff, recvbuff, sendcount, to_nccl_dtype(type), groups_.at(group), stream)); NCCLCHECK(ncclGroupEnd()); } @@ -182,8 +181,8 @@ public: const void* sendbuff, void* recvbuff, size_t recvcount, DataType type, int group, cudaStream_t stream) override { NCCLCHECK(ncclGroupStart()); - NCCLCHECK(ncclReduceScatter( - sendbuff, recvbuff, recvcount, getNcclDataType(type), ncclSum, groups_.at(group), stream)); + NCCLCHECK( + ncclReduceScatter(sendbuff, recvbuff, recvcount, to_nccl_dtype(type), ncclSum, groups_.at(group), stream)); NCCLCHECK(ncclGroupEnd()); } @@ -198,7 +197,7 @@ public: int group, cudaStream_t stream) override { - const auto elem_size = get_elem_size(dtype); + const auto elem_size = byte_size(dtype); auto rms_norm = [&](int64_t first, int64_t count) { invokeResidualBiasRMSNorm((char*)hidden + elem_size * first * dim, @@ -241,8 +240,8 @@ public: const int* local_token_nums, cudaStream_t stream) override { - const size_t elem_size = get_elem_size(type); - const ncclDataType_t nccl_type = getNcclDataType(type); + const size_t elem_size = byte_size(type); + const ncclDataType_t nccl_type = to_nccl_dtype(type); FT_CHECK(group0 == 0 || group1 == 0); diff --git a/src/turbomind/comm/test_comm.cu b/src/turbomind/comm/test_comm.cu index 9bbc52d26b..3b2eac954b 100644 --- a/src/turbomind/comm/test_comm.cu +++ b/src/turbomind/comm/test_comm.cu @@ -17,11 +17,10 @@ #include "src/turbomind/comm/device_comm.h" #include "src/turbomind/comm/host_comm.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" using namespace turbomind::comm; -using turbomind::getTensorType; +using turbomind::data_type_v; using turbomind::check; using turbomind::myAssert; using std::vector; @@ -164,7 +163,7 @@ struct TestComm { tp = device_num; } - std::tie(h_comm_, d_comm_, h_split_, d_split_) = Init(device_num, 4, "cudaipc"); + std::tie(h_comm_, d_comm_, h_split_, d_split_) = Init(device_num, 4, "cuda-ipc"); warmup_ = warmup; iters_ = iters; @@ -186,7 +185,7 @@ struct TestComm { template void TestAllReduce(size_t dim, int group = 0) { - const auto dtype = getTensorType(); + const auto dtype 
= data_type_v; const int tp_size = d_comm_[0]->n_ranks(group); const int dp_size = d_comm_.size() / tp_size; @@ -325,7 +324,7 @@ struct TestComm { } } - const auto dtype = getTensorType(); + const auto dtype = data_type_v; const int tp_size = d_comm_[0]->n_ranks(group); const int dp_size = d_comm_.size() / tp_size; @@ -497,7 +496,7 @@ struct TestComm { template void TestAllGather(size_t dim, int group) { - const auto dtype = getTensorType(); + const auto dtype = data_type_v; const int tp_size = d_comm_[0]->n_ranks(group); const int dp_size = d_comm_.size() / tp_size; @@ -621,7 +620,7 @@ struct TestComm { const int inner_tp = std::gcd(tp_size_0, tp_size_1); - const auto dtype = getTensorType(); + const auto dtype = data_type_v; std::mt19937 gen{}; std::uniform_int_distribution dist{0, 31}; // 5 mantissa bits diff --git a/src/turbomind/comm/thread_comm.cc b/src/turbomind/comm/thread_comm.cc index cb8dd66e9c..017d83abb0 100644 --- a/src/turbomind/comm/thread_comm.cc +++ b/src/turbomind/comm/thread_comm.cc @@ -7,12 +7,11 @@ #include #include #include +#include #include "src/turbomind/comm/host_comm.h" - -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_utils.h" - +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/data_type.h" namespace turbomind::comm { struct ThreadCommImpl: public HostCommImpl { @@ -71,8 +70,8 @@ struct ThreadCommImpl: public HostCommImpl { std::shared_ptr Split(int color, int key) override { - FT_CHECK(color >= 0); - FT_CHECK(g2l_[rank_] >= 0); + TM_CHECK(color >= 0); + TM_CHECK(g2l_[rank_] >= 0); // `g2l_[rank_]` imposes proper ordering when keys are equal auto vec = comm::AllGather(this, std::make_tuple(color, key, g2l_[rank_])); @@ -124,7 +123,7 @@ struct ThreadCommImpl: public HostCommImpl { void Broadcast(void* data, int count, DataType dtype, int root, copy_fn copy) override { - FT_CHECK(copy); + TM_CHECK(copy); if (n_ranks() == 1) { return; } @@ -158,7 +157,7 @@ struct ThreadCommImpl: public HostCommImpl { void AllGather(void* data, int count, DataType dtype, copy_fn copy) override { - FT_CHECK(copy); + TM_CHECK(copy); if (n_ranks() == 1) { return; } @@ -226,13 +225,13 @@ struct ThreadCommImpl: public HostCommImpl { }; auto dispatch = [&]() -> reduce_fn { switch (dtype) { - case DataType::TYPE_INT32: + case kInt32: return dispatch_op(int32_t{}); - case DataType::TYPE_INT64: + case kInt64: return dispatch_op(int64_t{}); - case DataType::TYPE_UINT32: + case kUint32: return dispatch_op(uint32_t{}); - case DataType::TYPE_UINT64: + case kUint64: return dispatch_op(uint64_t{}); default: return {}; @@ -250,7 +249,7 @@ struct ThreadCommImpl: public HostCommImpl { void AllReduce(void* data, int count, DataType dtype, RedOp red_op) override { const auto reduce = get_reduce(dtype, red_op); - const auto elem_size = get_elem_size(dtype); + const auto elem_size = byte_size(dtype); if (n_ranks() == 1) { return; } @@ -292,7 +291,7 @@ class ThreadGroupId: public HostGroupId { void Export(std::ostream& os) override { - FT_CHECK((bool)internal_); // `Initialize` must come befor `Export` + TM_CHECK((bool)internal_); // `Initialize` must come befor `Export` const void* ptr = this; os.write((const char*)&ptr, sizeof(ptr)); @@ -304,7 +303,7 @@ class ThreadGroupId: public HostGroupId { is.read((char*)&ptr, sizeof(ptr)); internal_ = reinterpret_cast(ptr)->internal_; - FT_CHECK((bool)internal_); + TM_CHECK((bool)internal_); } HostComm CreateCommunicator(int n_ranks, int rank) override @@ -313,12 +312,12 @@ class ThreadGroupId: public HostGroupId { 
internal_->state = std::make_shared(n_ranks); }; - FT_CHECK((bool)internal_); + TM_CHECK((bool)internal_); // One of the rank initialize the shared state std::call_once(internal_->flag, init_shared_state); - FT_CHECK((bool)internal_->state); + TM_CHECK((bool)internal_->state); auto impl = std::make_shared(n_ranks, internal_->state, rank); diff --git a/src/turbomind/core/CMakeLists.txt b/src/turbomind/core/CMakeLists.txt new file mode 100644 index 0000000000..9a0c9ff5ba --- /dev/null +++ b/src/turbomind/core/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +cmake_minimum_required(VERSION 3.8) + +add_library(core STATIC + check.cc + allocator.cc + stream.cc + context.cc + buffer.cc + layout.cc + tensor.cc + tensor.cu + module.cc) + +target_link_libraries(core PUBLIC cuda_utils CUDA::cudart CUDA::cuda_driver) + +set_property(TARGET core PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET core PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +target_compile_options(core PRIVATE $<$:-Xptxas=-v>) + +if (BUILD_TEST) + add_executable(test_core test_core.cc) + target_link_libraries(test_core PRIVATE core logger Catch2::Catch2WithMain) +endif () diff --git a/src/turbomind/core/allocator.cc b/src/turbomind/core/allocator.cc new file mode 100644 index 0000000000..5471acdf3a --- /dev/null +++ b/src/turbomind/core/allocator.cc @@ -0,0 +1,159 @@ + +#include +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/check.h" + +namespace turbomind::core { + +AllocatorImpl::~AllocatorImpl() = default; + +Stream AllocatorImpl::stream() const noexcept +{ + return Stream{}; +} + +class CudaMemPoolAllocator: public AllocatorImpl { +public: + CudaMemPoolAllocator(Stream stream, bool use_default_pool): + pool_{}, stream_{stream}, device_{kDEVICE}, use_default_pool_{use_default_pool} + { + check_cuda_error(cudaGetDevice(&device_.id)); + if (use_default_pool_) { + check_cuda_error(cudaDeviceGetDefaultMemPool(&pool_, device_.id)); + } + else { + cudaMemPoolProps props{}; + props.allocType = cudaMemAllocationTypePinned; + props.handleTypes = cudaMemHandleTypeNone; + props.location.type = cudaMemLocationTypeDevice; + props.location.id = device_.id; + check_cuda_error(cudaMemPoolCreate(&pool_, &props)); + cuuint64_t thres = (cuuint64_t)-1; + check_cuda_error(cudaMemPoolSetAttribute(pool_, cudaMemPoolAttrReleaseThreshold, &thres)); + } + } + + ~CudaMemPoolAllocator() override + { + if (!use_default_pool_) { + check_cuda_error(cudaMemPoolDestroy(pool_)); + } + pool_ = {}; + } + + void* allocate(ssize_t size) override + { + void* ptr{}; + check_cuda_error(cudaMallocFromPoolAsync(&ptr, size, pool_, stream_.handle())); + return ptr; + } + + void deallocate(void* p, ssize_t) override + { + check_cuda_error(cudaFreeAsync(p, stream_.handle())); + } + + Device device() const noexcept override + { + return device_; + } + + Stream stream() const noexcept override + { + return stream_; + } + + void trim(size_t bytes_to_keep) + { + check_cuda_error(cudaMemPoolTrimTo(pool_, bytes_to_keep)); + } + +private: + cudaMemPool_t pool_; + Stream stream_; + Device device_; + bool use_default_pool_; +}; + +class CudaAllocator: public AllocatorImpl { +public: + void* allocate(ssize_t size) override + { + void* ptr{}; + check_cuda_error(cudaMalloc(&ptr, size)); + return ptr; + } + + void deallocate(void* p, ssize_t) override + { + check_cuda_error(cudaFree(p)); + } + + Device device() const noexcept override + { + return kDEVICE; + } +}; + +class CudaHostAllocator: public 
AllocatorImpl { +public: + void* allocate(ssize_t size) override + { + void* ptr{}; + check_cuda_error(cudaHostAlloc(&ptr, size, cudaHostAllocDefault)); + return ptr; + } + + void deallocate(void* p, ssize_t) override + { + check_cuda_error(cudaFreeHost(p)); + } + + Device device() const noexcept override + { + return kCPUpinned; + } +}; + +class HostAllocator: public AllocatorImpl { +public: + void* allocate(ssize_t size) override + { + return ::operator new(size); + } + + void deallocate(void* p, ssize_t) override + { + ::operator delete(p); + } + + Device device() const noexcept override + { + return kCPU; + } +}; + +Allocator::Allocator(DeviceType type) +{ + impl_ = [&]() -> shared_ptr { + switch (type) { + case kCPU: + return std::make_shared(); + case kDEVICE: + return std::make_shared(); + case kCPUpinned: + return std::make_shared(); + } + return {}; + }(); + TM_CHECK_NOTNULL(impl_); +} + +Allocator::Allocator(Stream stream, bool use_default_pool) +{ + impl_ = std::make_shared(std::move(stream), use_default_pool); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/allocator.h b/src/turbomind/core/allocator.h new file mode 100644 index 0000000000..bbc3ffb2d5 --- /dev/null +++ b/src/turbomind/core/allocator.h @@ -0,0 +1,244 @@ +#pragma once + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" +#include "src/turbomind/core/stream.h" + +#include "src/turbomind/kernels/core/math.h" + +namespace turbomind { + +enum class DeviceType : int +{ + kCPU, + kCPUpinned, + kDEVICE +}; + +inline constexpr DeviceType kCPU = DeviceType::kCPU; +inline constexpr DeviceType kCPUpinned = DeviceType::kCPUpinned; +inline constexpr DeviceType kDEVICE = DeviceType::kDEVICE; + +constexpr const char* to_string(DeviceType device) +{ + switch (device) { + case kCPU: + return "cpu"; + case kCPUpinned: + return "cpu_pinned"; + case kDEVICE: + return "device"; + } + return ""; +} + +inline std::ostream& operator<<(std::ostream& os, DeviceType device) +{ + return os << to_string(device); +} + +} // namespace turbomind + +namespace turbomind::core { + +struct Device { + DeviceType type; + int id; + Device(): Device{kCPU} {} + Device(DeviceType type_): type{type_}, id{-1} {} + Device(DeviceType type_, int device_): type{type_}, id{device_} {} + friend bool operator==(const Device& a, const Device& b) + { + return a.type == b.type && a.id == b.id; + } + friend bool operator!=(const Device& a, const Device& b) + { + return !(a == b); + } +}; + +class AllocatorImpl { +public: + virtual ~AllocatorImpl(); + + virtual void* allocate(ssize_t size) = 0; + + virtual void deallocate(void* p, ssize_t size) = 0; + + // Returns invalid stream by default + virtual Stream stream() const noexcept; + + virtual Device device() const noexcept = 0; +}; + +class Allocator { +public: + Allocator() = default; + + explicit Allocator(DeviceType type); + + Allocator(Stream stream, bool use_default_pool); + + Allocator(shared_ptr impl): impl_{std::move(impl)} {}; + + AllocatorImpl* operator->() const + { + TM_CHECK_NOTNULL(impl_); + return impl_.get(); + } + + explicit operator bool() const noexcept + { + return static_cast(impl_); + } + + friend bool operator==(const Allocator& a, const Allocator& b) + { + return a.impl_ == b.impl_; + } + + friend bool operator!=(const Allocator& a, const Allocator& b) + { + return !(a == b); + } + + template + shared_ptr adapt(Args&&... 
args) const + { + return {std::make_shared(impl_, ((Args &&) args)...)}; + } + +private: + shared_ptr impl_; +}; + +class StackAllocatorImpl: public AllocatorImpl { +public: + static constexpr ssize_t kAlignment = 256; + + explicit StackAllocatorImpl(shared_ptr underlying_impl): underlying_impl_{std::move(underlying_impl)} + { + } + + ~StackAllocatorImpl() override + { + if (cached_beg_) { + underlying_impl_->deallocate(cached_beg_, cached_end_ - cached_beg_); + } + } + + void* allocate(ssize_t size) override + { + size = round_up(size, kAlignment); + + void* p{}; + if (cached_ptr_ + size <= cached_end_) { + p = cached_ptr_; + cached_ptr_ += size; + } + else { + TM_CHECK(!cached_beg_); + p = underlying_impl_->allocate(size); + } + + // TM_LOG_ERROR("allocate %p, %ld", p, size); + + size_ += size; + ++num_; + max_size_ = std::max(size_, max_size_); + num_ = std::max(num_, max_num_); + return p; + } + + void deallocate(void* p, ssize_t size) override + { + size = round_up(size, kAlignment); + + // TM_LOG_ERROR("deallocate %p, %p, %ld", p, cached_ptr_, size); + + if ((char*)p + size == cached_ptr_) { + cached_ptr_ -= size; + } + else { + TM_CHECK(!cached_beg_); + underlying_impl_->deallocate(p, size); + } + size_ -= size; + --num_; + } + + Stream stream() const noexcept override + { + return underlying_impl_->stream(); + } + + Device device() const noexcept override + { + return underlying_impl_->device(); + } + + void iter() + { + TM_CHECK_EQ((void*)cached_beg_, (void*)cached_ptr_); + auto excpected = max_size_ + kAlignment * max_num_; + if (cached_end_ - cached_beg_ < excpected) { + if (cached_beg_) { + underlying_impl_->deallocate(cached_beg_, cached_end_ - cached_beg_); + } + cached_ptr_ = cached_beg_ = (char*)underlying_impl_->allocate(excpected); + cached_end_ = cached_beg_ + excpected; + } + size_ = num_ = max_size_ = max_num_ = 0; + } + +private: + ssize_t size_{}; + ssize_t num_{}; + ssize_t max_size_{}; + ssize_t max_num_{}; + + char* cached_beg_{}; + char* cached_end_{}; + char* cached_ptr_{}; + + std::shared_ptr underlying_impl_; +}; + +class SimpleAllocator: public AllocatorImpl { +public: + template + static Allocator Create(Alloc&& alloc, Dealloc&& dealloc, Device device) + { + return Allocator{std::make_shared((Alloc &&) alloc, (Dealloc &&) dealloc, device)}; + } + + template + SimpleAllocator(Alloc&& alloc, Dealloc&& dealloc, Device device): + alloc_{std::move(alloc)}, dealloc_{std ::move(dealloc)}, device_{device} + { + } + + void* allocate(ssize_t size) override + { + return alloc_(size); + }; + + void deallocate(void* p, ssize_t size) override + { + return dealloc_(p, size); + } + + Device device() const noexcept override + { + return device_; + } + +private: + std::function alloc_; + std::function dealloc_; + Device device_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/buffer.cc b/src/turbomind/core/buffer.cc new file mode 100644 index 0000000000..6971e63482 --- /dev/null +++ b/src/turbomind/core/buffer.cc @@ -0,0 +1,89 @@ + +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/stream.h" +namespace turbomind::core { + +Buffer Buffer::view(DataType dtype) const +{ + auto b = *this; + if (dtype == dtype_) { + return b; + } + b.dtype_ = dtype; + b.size_ = numel(dtype, byte_size()); + if (base_) { + b.base_ = numel(dtype, turbomind::byte_size(dtype_, base_)); + } + return b; +} + +Buffer Buffer::slice(ssize_t base, 
ssize_t size) const +{ + TM_CHECK_LE(base + size, size_); + auto b = *this; + b.base_ += base; + if (size == -1) { + b.size_ -= base; + } + else { + b.size_ = size; + } + return b; +} + +std::ostream& operator<<(std::ostream& os, const Buffer& b) +{ + os << b.dtype() << "[" << b.size() << "]@" << b.data_; + if (b.base_) { + os << "+" << b.base_; + } + return os; +} + +void Copy(const Buffer& a, ssize_t n, Ref b_, const Stream& stream) +{ + auto& b = b_.get(); + TM_CHECK_EQ(a.dtype(), b.dtype()); + TM_CHECK_LE(n, a.size()); + TM_CHECK_LE(n, b.size()); + check_cuda_error( + cudaMemcpyAsync(b.raw_data(), a.raw_data(), byte_size(a.dtype(), n), cudaMemcpyDefault, stream.handle())); +} + +void Copy(const Buffer& a, ssize_t n, Ref b_) +{ + Copy(a, n, b_, Context::stream()); +} + +void Copy(const Buffer& a, Ref b_, const Stream& stream) +{ + TM_CHECK_EQ(a.size(), b_.get().size()); + Copy(a, a.size(), b_, stream); +} + +void Copy(const Buffer& a, Ref b_) +{ + Copy(a, b_, Context::stream()); +} + +void* Copy(const void* a, ssize_t n, void* b, const Stream& stream) +{ + check_cuda_error(cudaMemcpyAsync(b, a, n, cudaMemcpyDefault, stream.handle())); + return (char*)b + n; +} + +void Clear(Ref b_, const Stream& stream) +{ + auto& b = b_.get(); + check_cuda_error(cudaMemsetAsync(b.raw_data(), 0, b.byte_size(), stream.handle())); +} + +void Clear(Ref b_) +{ + Clear(b_, Context::stream()); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/buffer.h b/src/turbomind/core/buffer.h new file mode 100644 index 0000000000..48263facf8 --- /dev/null +++ b/src/turbomind/core/buffer.h @@ -0,0 +1,343 @@ +#pragma once + +#include + +#include +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/data_type.h" + +namespace turbomind::core { + +class Buffer { +public: + Buffer(): data_{}, base_{}, size_{}, device_{}, dtype_{} {} + + // Typed empty buffer + explicit Buffer(DataType dtype): Buffer() + { + dtype_ = dtype; + } + + // Reference into `data` buffer + template + Buffer(T* data, ssize_t size, Device device): + data_{data, [](auto) {}}, base_{}, size_{size}, device_{device}, dtype_{data_type_v} + { + } + + Buffer(void* data, ssize_t size, DataType dtype, Device device): + data_{data, [](auto) {}}, base_{}, size_{size}, device_{device}, dtype_{dtype} + { + } + + // Share ownership of `data` + Buffer(shared_ptr data, ssize_t size, DataType dtype, Device device): + data_{std::move(data)}, base_{}, size_{size}, device_{device}, dtype_{dtype} + { + } + + // Create from the allocator + Buffer(ssize_t size, DataType dtype, Allocator& alloc): + base_{}, size_{size}, device_{alloc->device()}, dtype_{dtype} + { + auto bytes = turbomind::byte_size(dtype, size); + data_ = {alloc->allocate(bytes), [=](auto p) { alloc->deallocate(p, bytes); }}; + } + + Buffer(ssize_t size, DataType dtype, Device device): Buffer{size, dtype, Context::alloc(device)} {} + + template + T* data() + { + TM_CHECK_EQ(data_type_v, dtype_); + return (T*)((char*)TM_CHECK_NOTNULL(data_).get() + turbomind::byte_size(base_)); + } + + template + const T* data() const + { + return const_cast(this)->data(); + } + + void* raw_data(ssize_t offset = 0) + { + return (char*)TM_CHECK_NOTNULL(data_).get() + turbomind::byte_size(dtype_, base_ + offset); + } + + const void* raw_data(ssize_t offset = 0) const + { + return const_cast(this)->raw_data(offset); + } + + template + T* data_or(T* other) noexcept 
+ { + if constexpr (std::is_void_v) { + return data_ ? (T*)raw_data() : other; + } + else { + return data_ ? data() : other; + } + } + + template + const T* data_or(const T* other) const noexcept + { + return const_cast(this)->data_or(other); + } + + DataType dtype() const + { + return dtype_; + } + + Device device() const + { + return device_; + } + + ssize_t size() const + { + return size_; + } + + ssize_t byte_size() const + { + return turbomind::byte_size(dtype_, size_); + } + + explicit operator bool() const noexcept + { + return static_cast(data_); + } + + Buffer view(DataType dtype) const; + + template + Buffer view() const + { + return view(data_type_v); + } + + Buffer slice(ssize_t base, ssize_t size) const; + + Buffer borrow() const + { + return Buffer{const_cast(raw_data()), size_, dtype_, device_}; + } + + friend bool operator==(const Buffer& a, const Buffer& b); + + friend bool operator!=(const Buffer& a, const Buffer& b); + + friend std::ostream& operator<<(std::ostream& os, const Buffer& b); + +protected: + auto as_tuple() const + { + return std::tie(data_, base_, size_, dtype_, device_); + } + + shared_ptr data_; + ssize_t base_; + ssize_t size_; + Device device_; + DataType dtype_; +}; + +inline bool operator==(const Buffer& a, const Buffer& b) +{ + return a.as_tuple() == b.as_tuple(); +} + +inline bool operator!=(const Buffer& a, const Buffer& b) +{ + return !(a == b); +} + +/////////////////////////////////////////////////////////// +// fill + +void Fill(Buffer& b, const void* v); + +void Fill(Buffer&& b, const void* v); + +void Fill(Buffer& b, const void* v, const Stream& stream); + +void Fill(Buffer&& b, const void* v, const Stream& stream); + +template +struct Buffer_: public Buffer { + + Buffer_(): Buffer{data_type_v} {} + + Buffer_(T* data, ssize_t size, Device device): Buffer{data, size, device} {} + + Buffer_(shared_ptr data, ssize_t size, Device device): Buffer{std::move(data), size, data_type_v, device} + { + } + + Buffer_(ssize_t size, Allocator& alloc): Buffer{size, data_type_v, alloc} {} + + Buffer_(ssize_t size, Device device): Buffer{size, data_type_v, device} {} + + Buffer_(const Buffer_&) = default; + Buffer_& operator=(const Buffer_&) = default; + + Buffer_(Buffer_&&) noexcept = default; + Buffer_& operator=(Buffer_&&) noexcept = default; + + Buffer_(const Buffer& b) + { + *static_cast(this) = ensure_dtype(b); + } + Buffer_(Buffer&& b) noexcept + { + *static_cast(this) = ensure_dtype(std::move(b)); + } + + T* data_or(T* other) + { + return data_ ? 
data() : other; + } + + const T* data_or(const T* other) const + { + return const_cast(this)->data_or(other); + } + + void* raw_data(ssize_t offset = 0) + { + return (char*)TM_CHECK_NOTNULL(data_).get() + turbomind::byte_size(base_ + offset); + } + + const void* raw_data(ssize_t offset = 0) const + { + return const_cast(this)->raw_data(offset); + } + + T* data() + { + return static_cast(raw_data()); + } + + const T* data() const + { + return static_cast(raw_data()); + } + + T* begin() + { + return data(); + } + + const T* begin() const + { + return data(); + } + + T* end() + { + return begin() + size(); + } + + const T* end() const + { + return begin() + size(); + } + + T& operator[](ssize_t i) + { + return data()[i]; + } + + const T& operator[](ssize_t i) const + { + return data()[i]; + } + + T& at(ssize_t i) + { + TM_CHECK_LT(i, size()); + return data()[i]; + } + + T& at(ssize_t i) const + { + TM_CHECK_LT(i, size()); + return data()[i]; + } + + constexpr DataType dtype() const noexcept + { + return data_type_v; + } + +private: + template + static decltype(auto) ensure_dtype(U&& u) noexcept + { + TM_CHECK_EQ(u.dtype(), data_type_v); + return (U &&) u; + } +}; + +template +class Ref { +public: + Ref(T& x): ref_{x} {} + Ref(T&& x): ref_{x} {} + + operator T&() + { + return ref_; + } + + T& get() + { + return ref_; + } + +private: + T& ref_; +}; + +void Copy(const Buffer& a, ssize_t n, Ref b_, const Stream& stream); + +void Copy(const Buffer& a, ssize_t n, Ref b_); + +void Copy(const Buffer& a, Ref b_, const Stream& stream); + +void Copy(const Buffer& a, Ref b_); + +// Static type checking +template +inline void Copy_(const Buffer_& a, ssize_t n, Buffer_& b_) +{ + Copy((const Buffer&)a, n, (Buffer&)b_); +} + +void* Copy(const void* a, ssize_t n, void* b, const Stream& stream); + +template +inline T* Copy(const T* a, ssize_t n, T* b, const Stream& stream) +{ + return (T*)Copy((const void*)a, sizeof(T) * n, (void*)b, stream); +} + +template +inline T* Copy(const T* a, ssize_t n, T* b) +{ + return Copy(a, n, b, Context::stream()); +} + +void Clear(Ref b_, const Stream& stream); + +void Clear(Ref b_); + +} // namespace turbomind::core diff --git a/src/turbomind/core/check.cc b/src/turbomind/core/check.cc new file mode 100644 index 0000000000..47ad9a2ec7 --- /dev/null +++ b/src/turbomind/core/check.cc @@ -0,0 +1,90 @@ + +#include +#include +#include +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/utils/logger.h" + +namespace turbomind::core { + +namespace { + +std::string StripSrcPrefix(const char* file) +{ + static const char* flag = std::getenv("TM_SRC_FULL_PATH"); + if (flag) { + return file; + } + + std::filesystem::path path{file}; + std::filesystem::path ret{path}; // return the original path if anchor is not found + + constexpr auto anchor = "turbomind"; + + bool found = false; + + for (const auto& x : path) { + if (x == anchor) { + found = true; + ret.clear(); + } + else if (found) { + ret /= x; + } + } + + return ret.string(); +} + +} // namespace + +CheckOpStringBuilder::CheckOpStringBuilder() +{ + oss_ = new std::ostringstream; +} + +std::ostream* CheckOpStringBuilder::ForVal1() +{ + (*oss_) << "("; + return oss_; +} +std::ostream* CheckOpStringBuilder::ForVal2() +{ + (*oss_) << " vs. 
"; + return oss_; +} +std::string* CheckOpStringBuilder::NewString() +{ + (*oss_) << ")"; + return new std::string{oss_->str()}; +} + +CheckErrorStream::CheckErrorStream(const char* file, int line, const char* expr) +{ + oss_ = new std::ostringstream{}; + *oss_ << StripSrcPrefix(file) << "(" << line << "): Check failed: " << expr << " "; +} + +CheckErrorStream::CheckErrorStream(const char* file, int line, const char* expr, std::string* str): + CheckErrorStream{file, line, expr} +{ + *oss_ << *str << " "; +} + +void CheckErrorStream::Report() +{ + // ! Be aware of `%` in expr + std::cerr << "[TM][FATAL] " << oss_->str() << "\n"; + std::abort(); +} + +void ReportNullError(const char* file, int line, const char* expr) +{ + // ! Be aware of `%` in expr + std::cerr << "[TM][FATAL] " << StripSrcPrefix(file) << "(" << line << "): '" << expr << "' Must be non NULL\n"; + std::abort(); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/check.h b/src/turbomind/core/check.h new file mode 100644 index 0000000000..33b275251f --- /dev/null +++ b/src/turbomind/core/check.h @@ -0,0 +1,143 @@ + +// Inspired by + +#pragma once + +#include + +namespace turbomind::core { + +#if defined(_MSC_VER) && !defined(__clang__) +#define TM_LIKELY(expr) (expr) +#define TM_UNLIKELY(expr) (expr) +#define TM_NOINLINE +#define TM_UNREACHABLE __assume(0) +#else +#define TM_LIKELY(expr) (__builtin_expect(bool(expr), 1)) +#define TM_UNLIKELY(expr) (__builtin_expect(bool(expr), 0)) +#define TM_NOINLINE __attribute__((noinline)) +#define TM_UNREACHABLE __builtin_unreachable() +#endif + +#define TM_DISABLE_CHECK_STREAM 0 +#define TM_DISABLE_CHECK_OP 0 + +class CheckErrorStream { +public: + CheckErrorStream(const char* file, int line, const char* expr); + + CheckErrorStream(const char* file, int line, const char* expr, std::string* str); + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 4722) // MSVC warns dtor never return +#endif + ~CheckErrorStream() + { + Report(); + } +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(pop) +#endif + + template + CheckErrorStream& operator<<(const T& msg) + { +#if TM_DISABLE_CHECK_STREAM +#else + *oss_ << msg; +#endif + return *this; + } + +private: + [[noreturn]] void Report(); + + std::ostringstream* oss_; +}; + +class CheckOpStringBuilder { +public: + CheckOpStringBuilder(); + std::ostream* ForVal1(); + std::ostream* ForVal2(); + std::string* NewString(); + +private: + std::ostringstream* oss_; +}; + +template +std::string* MakeCheckOpString(const T1& v1, const T2& v2) TM_NOINLINE; + +template +std::string* MakeCheckOpString(const T1& v1, const T2& v2) +{ + CheckOpStringBuilder builder; + *builder.ForVal1() << v1; + *builder.ForVal2() << v2; + return builder.NewString(); +} + +#define DEFINE_CHECK_OP_IMPL(name, op) \ + template \ + inline std::pair name##Impl(const T1& v1, const T2& v2) \ + { \ + if (TM_LIKELY(v1 op v2)) \ + return {false, nullptr}; \ + else \ + return {true, MakeCheckOpString(v1, v2)}; \ + } + +DEFINE_CHECK_OP_IMPL(Check_EQ, ==); +DEFINE_CHECK_OP_IMPL(Check_NE, !=); +DEFINE_CHECK_OP_IMPL(Check_LE, <=); +DEFINE_CHECK_OP_IMPL(Check_LT, <); +DEFINE_CHECK_OP_IMPL(Check_GE, >=); +DEFINE_CHECK_OP_IMPL(Check_GT, >); + +#undef DEFINE_CHECK_OP_IMPL + +// clang-format off +#define TM_CHECK(e) \ + if (TM_UNLIKELY(!(e))) turbomind::core::CheckErrorStream(__FILE__, __LINE__, #e) + +#define TM_CHECK_OP(name, op, a, b) \ + if (auto&& [__p, __s] = turbomind::core::Check##name##Impl(a, b); __p) \ + 
turbomind::core::CheckErrorStream(__FILE__, __LINE__, #a " " #op " " #b, __s) +// clang-format on + +#if TM_DISABLE_CHECK_OP + +#define TM_CHECK_EQ(a, b) TM_CHECK(a == b) +#define TM_CHECK_NE(a, b) TM_CHECK(a != b) +#define TM_CHECK_LE(a, b) TM_CHECK(a <= b) +#define TM_CHECK_LT(a, b) TM_CHECK(a < b) +#define TM_CHECK_GE(a, b) TM_CHECK(a >= b) +#define TM_CHECK_GT(a, b) TM_CHECK(a > b) + +#else + +#define TM_CHECK_EQ(a, b) TM_CHECK_OP(_EQ, ==, a, b) +#define TM_CHECK_NE(a, b) TM_CHECK_OP(_NE, !=, a, b) +#define TM_CHECK_LE(a, b) TM_CHECK_OP(_LE, <=, a, b) +#define TM_CHECK_LT(a, b) TM_CHECK_OP(_LT, <, a, b) +#define TM_CHECK_GE(a, b) TM_CHECK_OP(_GE, >=, a, b) +#define TM_CHECK_GT(a, b) TM_CHECK_OP(_GT, >, a, b) + +#endif + +[[noreturn]] void ReportNullError(const char* file, int line, const char* expr); + +template +decltype(auto) EnsureNotNull(const char* file, int line, const char* expr, T&& p) +{ + if (TM_UNLIKELY(p == nullptr)) { + ReportNullError(file, line, expr); + } + return (T &&) p; +} + +#define TM_CHECK_NOTNULL(p) ::turbomind::core::EnsureNotNull(__FILE__, __LINE__, #p, (p)) + +} // namespace turbomind::core diff --git a/src/turbomind/core/common.h b/src/turbomind/core/common.h new file mode 100644 index 0000000000..d3d4de6000 --- /dev/null +++ b/src/turbomind/core/common.h @@ -0,0 +1,24 @@ + +#pragma once + +#include +#include +#include + +/// TODO: remove this dependency +#include "src/turbomind/utils/cuda_utils.h" + +namespace turbomind::core { + +class Allocator; +class Buffer; +class Stream; +class Event; +class Context; + +using std::shared_ptr; +using std::vector; + +using ssize_t = std::ptrdiff_t; + +} // namespace turbomind::core diff --git a/src/turbomind/core/context.cc b/src/turbomind/core/context.cc new file mode 100644 index 0000000000..41589fb9e4 --- /dev/null +++ b/src/turbomind/core/context.cc @@ -0,0 +1,144 @@ + +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/context.h" + +namespace turbomind::core { + +namespace { + +struct ContextStorage { + enum + { + stream_bit = 1, + host_alloc_bit = 2, + device_alloc_bit = 4, + pinned_alloc_bit = 8, + }; + + std::stack stream_; + std::stack host_alloc_; + std::stack device_alloc_; + std::stack pinned_alloc_; + std::stack mask_; + + ContextStorage() + { + push(Allocator{kCPU}); + } + + void push(const Stream& stream) + { + int mask{}; + if (stream) { + stream_.push(stream); + mask = stream_bit; + } + mask_.push(mask); + } + + void push(const Allocator& alloc) + { + int mask{}; + if (alloc) { + const auto type = alloc->device().type; + if (type == kCPU) { + mask = host_alloc_bit; + host_alloc_.push(alloc); + } + else if (type == kDEVICE) { + mask = device_alloc_bit; + device_alloc_.push(alloc); + } + else if (type == kCPUpinned) { + mask = pinned_alloc_bit; + pinned_alloc_.push(alloc); + } + } + mask_.push(mask); + } + + void pop() + { + if (mask_.top() & stream_bit) { + stream_.pop(); + } + if (mask_.top() & host_alloc_bit) { + host_alloc_.pop(); + } + if (mask_.top() & device_alloc_bit) { + device_alloc_.pop(); + } + if (mask_.top() & pinned_alloc_bit) { + pinned_alloc_.pop(); + } + mask_.pop(); + } + + static ContextStorage& instance() + { + thread_local ContextStorage inst{}; + return inst; + } +}; + +} // namespace + +void Context::push(const Stream& stream) +{ + ContextStorage::instance().push(stream); +} + +void Context::push(const Allocator& alloc) +{ + ContextStorage::instance().push(alloc); +} + +void Context::pop() +{ + ContextStorage::instance().pop(); +} + +Stream& 
Context::stream() +{ + auto& stream_ = ContextStorage::instance().stream_; + TM_CHECK(!stream_.empty()) << "No STREAM available in current context"; + return stream_.top(); +} + +Allocator& Context::host_alloc() +{ + auto& host_alloc_ = ContextStorage::instance().host_alloc_; + TM_CHECK(!host_alloc_.empty()) << "No HOST memory allocator available in current context"; + return host_alloc_.top(); +} + +Allocator& Context::device_alloc() +{ + auto& device_alloc_ = ContextStorage::instance().device_alloc_; + TM_CHECK(!device_alloc_.empty()) << "No DEVICE memory allocator available in current context"; + return device_alloc_.top(); +} + +Allocator& Context::pinned_alloc() +{ + auto& pinned_alloc_ = ContextStorage::instance().pinned_alloc_; + TM_CHECK(!pinned_alloc_.empty()) << "No PINNED memory allocator available in current context"; + return pinned_alloc_.top(); +} + +Allocator& Context::alloc(Device device) +{ + switch (device.type) { + case kDEVICE: + return device_alloc(); + case kCPU: + return host_alloc(); + case kCPUpinned: + return pinned_alloc(); + } + TM_UNREACHABLE; +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/context.h b/src/turbomind/core/context.h new file mode 100644 index 0000000000..ec8abe6f1e --- /dev/null +++ b/src/turbomind/core/context.h @@ -0,0 +1,43 @@ +#pragma once + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/common.h" +#include "src/turbomind/core/stream.h" + +namespace turbomind::core { + +class Context { +public: + static Stream& stream(); + static Allocator& host_alloc(); + static Allocator& device_alloc(); + static Allocator& pinned_alloc(); + static Allocator& alloc(Device device); + +private: + friend class ContextGuard; + static void push(const Stream& stream); + static void push(const Allocator& alloc); + static void pop(); +}; + +class ContextGuard { +public: + template + explicit ContextGuard(Args&&... 
args): n_{} + { + (Context::push((Args &&) args), ...); + n_ = sizeof...(Args); + } + ~ContextGuard() + { + for (int i = 0; i < n_; ++i) { + Context::pop(); + } + } + +private: + int n_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/core.h b/src/turbomind/core/core.h new file mode 100644 index 0000000000..a58daba3d6 --- /dev/null +++ b/src/turbomind/core/core.h @@ -0,0 +1,26 @@ +#pragma once + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/layout.h" +#include "src/turbomind/core/stream.h" +#include "src/turbomind/core/tensor.h" + +namespace turbomind { + +using core::ssize_t; +using core::Buffer; +using core::Buffer_; +using core::Tensor; +using core::Tensor_; +using core::TensorMap; +using core::Ref; +using core::Layout; +using core::Allocator; +using core::Stream; +using core::Event; + +} // namespace turbomind diff --git a/src/turbomind/core/cuda_data_type.h b/src/turbomind/core/cuda_data_type.h new file mode 100644 index 0000000000..e3227a9056 --- /dev/null +++ b/src/turbomind/core/cuda_data_type.h @@ -0,0 +1,59 @@ +#include +#include + +#include +#include +#include + +#include "src/turbomind/core/data_type.h" + +namespace turbomind { + +// clang-format off + +constexpr cudaDataType to_cuda_dtype(DataType type) +{ + switch (type) { + case kUint8: return CUDA_R_8U; + case kUint16: return CUDA_R_16U; + case kUint32: return CUDA_R_32U; + case kUint64: return CUDA_R_64U; + case kInt8: return CUDA_R_8I; + case kInt16: return CUDA_R_16I; + case kInt32: return CUDA_R_32I; + case kInt64: return CUDA_R_64I; + case kFloat16: return CUDA_R_16F; + case kFloat32: return CUDA_R_32F; + case kFloat64: return CUDA_R_64F; + case kBfloat16: return CUDA_R_16BF; + case kFloat8_e4m3: return CUDA_R_8F_E4M3; + case kFloat8_e5m2: return CUDA_R_8F_E5M2; + default: + throw std::runtime_error("Not supported " + std::string{to_string(type)}); + } +} + +constexpr DataType from_cuda_dtype(cudaDataType type) { + switch (type) { + case CUDA_R_8U: return kUint8; + case CUDA_R_16U: return kUint16; + case CUDA_R_32U: return kUint32; + case CUDA_R_64U: return kUint64; + case CUDA_R_8I: return kInt8; + case CUDA_R_16I: return kInt16; + case CUDA_R_32I: return kInt32; + case CUDA_R_64I: return kInt64; + case CUDA_R_16F: return kFloat16; + case CUDA_R_32F: return kFloat32; + case CUDA_R_64F: return kFloat64; + case CUDA_R_16BF: return kBfloat16; + case CUDA_R_8F_E4M3: return kFloat8_e4m3; + case CUDA_R_8F_E5M2: return kFloat8_e5m2; + default: + throw std::runtime_error("Not supported " + std::string{std::to_string(type)}); + } +} + +// clang-format on + +} // namespace turbomind diff --git a/src/turbomind/core/data_type.h b/src/turbomind/core/data_type.h new file mode 100644 index 0000000000..a6a42079cf --- /dev/null +++ b/src/turbomind/core/data_type.h @@ -0,0 +1,318 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+ +#pragma once + +#include "src/turbomind/core/check.h" + +#include +#include +#include + +// forward declarations for CUDA floating point types +struct __half; +struct __nv_bfloat16; +struct __nv_fp8_e4m3; +struct __nv_fp8_e5m2; + +namespace turbomind { + +// clang-format off + +struct uint2_t {}; +struct uint4_t {}; +struct uint6_t {}; + +template +struct int_constant: std::integral_constant {}; + +template +struct bitsof_t: int_constant {}; + +template <> struct bitsof_t: int_constant<2> {}; +template <> struct bitsof_t: int_constant<4> {}; +template <> struct bitsof_t: int_constant<6> {}; + +template +inline constexpr bitsof_t bitsof{}; + +using half_t = __half; +using bfloat16_t = __nv_bfloat16; +using fp8_e4m3_t = __nv_fp8_e4m3; +using fp8_e5m2_t = __nv_fp8_e5m2; + +constexpr int encode_data_type(bool sign, int exponent, int mantissa) { + return ((sign << 16) | (exponent << 8) | mantissa); +} + +enum class DataType: int { + kNull = 0, + kBool = 1, + kUint8 = encode_data_type(0, 0, 8), + kUint16 = encode_data_type(0, 0, 16), + kUint32 = encode_data_type(0, 0, 32), + kUint64 = encode_data_type(0, 0, 64), + kInt8 = encode_data_type(1, 0, 8), + kInt16 = encode_data_type(1, 0, 16), + kInt32 = encode_data_type(1, 0, 32), + kInt64 = encode_data_type(1, 0, 64), + kFloat16 = encode_data_type(1, 5, 10), + kFloat32 = encode_data_type(1, 8, 23), + kFloat64 = encode_data_type(1, 11, 52), + kBfloat16 = encode_data_type(1, 8, 7), + kFloat8_e4m3 = encode_data_type(1, 4, 3), + kFloat8_e5m2 = encode_data_type(1, 5, 2), + kUint2 = encode_data_type(0, 0, 2), + kUint4 = encode_data_type(0, 0, 4), + kUint6 = encode_data_type(0, 0, 6), + kUint = kUint32, + kInt = kInt32, + kFloat = kFloat32, + kHalf = kFloat16, + kDouble = kFloat64, +}; + +inline constexpr DataType kNull = DataType::kNull; +inline constexpr DataType kBool = DataType::kBool; +inline constexpr DataType kUint8 = DataType::kUint8; +inline constexpr DataType kUint16 = DataType::kUint16; +inline constexpr DataType kUint32 = DataType::kUint32; +inline constexpr DataType kUint64 = DataType::kUint64; +inline constexpr DataType kInt8 = DataType::kInt8; +inline constexpr DataType kInt16 = DataType::kInt16; +inline constexpr DataType kInt32 = DataType::kInt32; +inline constexpr DataType kInt64 = DataType::kInt64; +inline constexpr DataType kFloat16 = DataType::kFloat16; +inline constexpr DataType kFloat32 = DataType::kFloat32; +inline constexpr DataType kFloat64 = DataType::kFloat64; +inline constexpr DataType kBfloat16 = DataType::kBfloat16; +inline constexpr DataType kFloat8_e4m3 = DataType::kFloat8_e4m3; +inline constexpr DataType kFloat8_e5m2 = DataType::kFloat8_e5m2; +inline constexpr DataType kUint2 = DataType::kUint2; +inline constexpr DataType kUint4 = DataType::kUint4; +inline constexpr DataType kUint6 = DataType::kUint6; +inline constexpr DataType kUint = DataType::kUint; +inline constexpr DataType kInt = DataType::kInt; +inline constexpr DataType kHalf = DataType::kHalf; +inline constexpr DataType kFloat = DataType::kFloat; +inline constexpr DataType kDouble = DataType::kDouble; + +template +struct to_data_type; + +template +struct from_data_type; + +#define CVT_DATA_TYPE(D, T) \ + template <> struct to_data_type { static constexpr auto value = DataType::D; }; \ + template <> struct from_data_type { using type = T; } + +CVT_DATA_TYPE(kNull, void); + +CVT_DATA_TYPE(kBool, bool); +CVT_DATA_TYPE( kUint8, uint8_t); +CVT_DATA_TYPE(kUint16, uint16_t); +CVT_DATA_TYPE(kUint32, uint32_t); +CVT_DATA_TYPE(kUint64, uint64_t); + +CVT_DATA_TYPE( kInt8, 
int8_t); // NOTE: `int8_t` is `signed char` and is different from `char` +CVT_DATA_TYPE(kInt16, int16_t); +CVT_DATA_TYPE(kInt32, int32_t); +CVT_DATA_TYPE(kInt64, int64_t); + +CVT_DATA_TYPE(kFloat16, half_t); +CVT_DATA_TYPE(kFloat32, float); +CVT_DATA_TYPE(kFloat64, double); +CVT_DATA_TYPE(kBfloat16, bfloat16_t); +CVT_DATA_TYPE(kFloat8_e4m3, fp8_e4m3_t); +CVT_DATA_TYPE(kFloat8_e5m2, fp8_e5m2_t); + +CVT_DATA_TYPE(kUint2, uint2_t); +CVT_DATA_TYPE(kUint4, uint4_t); +CVT_DATA_TYPE(kUint6, uint6_t); + +#undef CVT_DATA_TYPE + +template +inline constexpr auto data_type_v = to_data_type>::value; + +template +using data_type_t = typename from_data_type::type; + +constexpr std::ptrdiff_t byte_size(DataType type, std::ptrdiff_t size = 1) { + switch (type) { + case kNull: return 0; + case kBool: + case kUint8: + case kInt8: + case kFloat8_e4m3: + case kFloat8_e5m2: + return size; + case kUint16: + case kInt16: + case kFloat16: + case kBfloat16: + return size * 2; + case kUint32: + case kInt32: + case kFloat32: + return size * 4; + case kUint64: + case kInt64: + case kFloat64: + return size * 8; + case kUint2: return size * 2 / 8; + case kUint4: return size * 4 / 8; + case kUint6: return size * 6 / 8; + } + return 0; +} + +template +constexpr std::ptrdiff_t byte_size(std::ptrdiff_t size = 1) { return byte_size(data_type_v, size); } + +constexpr std::ptrdiff_t numel(DataType type, std::ptrdiff_t size = 1) { + switch (type) { + case kNull: return 0; + case kBool: + case kUint8: + case kInt8: + case kFloat8_e4m3: + case kFloat8_e5m2: + return size; + case kUint16: + case kInt16: + case kFloat16: + case kBfloat16: + return size / 2; + case kUint32: + case kInt32: + case kFloat32: + return size / 4; + case kUint64: + case kInt64: + case kFloat64: + return size / 8; + case kUint2: return size * 8 / 2; + case kUint4: return size * 8 / 4; + case kUint6: return size * 8 / 6; + } + return 0; +} + +template +constexpr std::ptrdiff_t numel(std::ptrdiff_t size) { return numel(data_type_v, size); } + +constexpr const char* to_string(DataType type) { + switch (type) { + case kNull: return "nil"; + case kBool: return "bool"; + case kUint8: return "u8"; + case kUint16: return "u16"; + case kUint32: return "u32"; + case kUint64: return "u64"; + case kInt8: return "i8"; + case kInt16: return "i16"; + case kInt32: return "i32"; + case kInt64: return "i64"; + case kFloat16: return "f16"; + case kFloat32: return "f32"; + case kFloat64: return "f64"; + case kBfloat16: return "bf16"; + case kFloat8_e4m3: return "f8_e4m3"; + case kFloat8_e5m2: return "f8_e5m2"; + case kUint2: return "u2"; + case kUint4: return "u4"; + case kUint6: return "u8"; + default: + return "unknown"; + } + return ""; +} + +inline std::ostream& operator<<(std::ostream& os, DataType type) { + os << to_string(type); + return os; +} + +/// TODO: mapping with DLPack + +// clang-format on + +#define TM_PP_NARGS(...) TM_PP_NARGS_IMPL(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#define TM_PP_NARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N + +#define TM_PP_CAT(a, b) a##b +#define TM_PP_STR(x) #x + +#define TM_PP_DISPATCH_N(macro, ...) 
TM_PP_DISPATCH_N_IMPL(macro, TM_PP_NARGS(__VA_ARGS__)) +#define TM_PP_DISPATCH_N_IMPL(macro, x) TM_PP_CAT(macro, x) + +#define TM_PP_INVOKE_1(macro, f, _0) macro(f, _0) + +#define TM_PP_INVOKE_2(macro, f, _0, _1) \ + macro(f, _0); \ + macro(f, _1) + +#define TM_PP_INVOKE_3(macro, f, _0, _1, _2) \ + macro(f, _0); \ + macro(f, _1); \ + macro(f, _2) + +#define TM_PP_INVOKE_4(macro, f, _0, _1, _2, _3) \ + macro(f, _0); \ + macro(f, _1); \ + macro(f, _2); \ + macro(f, _3) + +#define TM_PP_INVOKE_5(macro, f, _0, _1, _2, _3, _4) \ + macro(f, _0); \ + macro(f, _1); \ + macro(f, _2); \ + macro(f, _3); \ + macro(f, _4) + +#define TM_DISPATCH_DTYPE_RET_CASE(f, t) \ + case ::turbomind::data_type_v: \ + return f(t{}); + +#define TM_DISPATCH_DTYPE_CASE(f, t) \ + case ::turbomind::data_type_v: \ + f(t{}); \ + break + +// clang-format off +#define TM_DISPATCH_DTYPES_RET(var, f, ...) \ + switch (var) { \ + TM_PP_DISPATCH_N(TM_PP_INVOKE_, __VA_ARGS__)(TM_DISPATCH_DTYPE_RET_CASE, f, __VA_ARGS__); \ + default: \ + TM_CHECK(0) << "unsupported type: " << to_string(var); \ + return {}; \ + } + +#define TM_DISPATCH_DTYPES(var, f, ...) \ + switch (var) { \ + TM_PP_DISPATCH_N(TM_PP_INVOKE_, __VA_ARGS__)(TM_DISPATCH_DTYPE_CASE, f, __VA_ARGS__); \ + default: \ + TM_CHECK(0) << "unsupported type: " << to_string(var); \ + } +// clang-format on + +#define TM_PRIMARY_DTYPES_0 ::turbomind::half_t + +#if ENABLE_BF16 +#define TM_PRIMARY_DTYPES_1 TM_PRIMARY_DTYPES_0, ::turbomind::bfloat16_t +#else +#define TM_PRIMARY_DTYPES_1 TM_PRIMARY_DTYPES_0 +#endif + +#if ENABLE_FP32 +#define TM_PRIMARY_DTYPES TM_PRIMARY_DTYPES_1, float +#else +#define TM_PRIMARY_DTYPES TM_PRIMARY_DTYPES_1 +#endif + +#define TM_DISPATCH_PRIMARY_DTYPES(var, func) TM_DISPATCH_DTYPES(var, func, TM_PRIMARY_DTYPES) + +#define TM_DISPATCH_PRIMARY_DTYPES_RET(var, func) TM_DISPATCH_DTYPES_RET(var, func, TM_PRIMARY_DTYPES) + +} // namespace turbomind diff --git a/src/turbomind/core/layout.cc b/src/turbomind/core/layout.cc new file mode 100644 index 0000000000..995f2a1fbf --- /dev/null +++ b/src/turbomind/core/layout.cc @@ -0,0 +1,153 @@ + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/layout.h" + +namespace turbomind::core { + +Layout::Layout(std::vector shape): shape_{std::move(shape)} +{ + TM_CHECK(shape_.size()); + stride_.resize(shape_.size()); + size_ = 1; + for (int i = shape_.size() - 1; i >= 0; --i) { + stride_[i] = size_; + size_ *= shape_[i]; + } +} + +Layout::Layout(vector shape, vector stride): shape_{std::move(shape)}, stride_{std::move(stride)} +{ + TM_CHECK(shape_.size()); + TM_CHECK_EQ(shape_.size(), stride_.size()); + + size_ = std::accumulate(shape_.begin(), shape_.end(), ssize_t{1}, std::multiplies<>{}); + + TM_CHECK_GE(size_, 0); +} + +ssize_t Layout::cosize() const noexcept +{ + if (rank() == 0) { + return 0; + } + ssize_t value{1}; + for (size_t i = 0; i < shape_.size(); ++i) { + value += (shape_[i] - 1) * stride_[i]; + } + return value; +} + +Layout Layout::coalesce() const noexcept +{ + vector shape{shape_.front()}; + vector stride{stride_.front()}; + + for (size_t i = 1; i < shape_.size(); ++i) { + if (shape_[i] == 1) { + continue; + } + else if (shape.back() == 1) { + shape.back() = shape_[i]; + stride.back() = stride_[i]; + } + else if (stride.back() == shape_[i] * stride_[i]) { + stride.back() = stride_[i]; + shape.back() *= shape_[i]; + } + else { + shape.push_back(shape_[i]); + stride.push_back(stride_[i]); + } + } + + return Layout{shape, stride}; +} + +Layout Layout::view(vector shape) const +{ + 
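+    // Behavior sketch (matches the expectations in test_core.cc): a single -1 entry
+    // acts as a wildcard and is resolved from the total size. For a contiguous
+    // (20,50) layout:
+    //   view({25, -1})   -> (25,40):(40,1)
+    //   view({5, -1, 5}) -> (5,40,5):(200,5,1)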
if (shape == shape_) { + return *this; + } + + TM_CHECK(!shape.empty()); + + // size check & wildcard resolution + auto wildcard = std::find(shape.begin(), shape.end(), -1); + if (wildcard != shape.end()) { + TM_CHECK(std::find(wildcard + 1, shape.end(), -1) == shape.end()); + *wildcard = 1; + } + auto new_size = std::accumulate(shape.begin(), shape.end(), ssize_t{1}, std::multiplies<>{}); + if (wildcard != shape.end()) { + TM_CHECK(size_ % new_size == 0) << size_ << " % " << new_size; + *wildcard = size_ / new_size; + } + else { + TM_CHECK_EQ(size_, new_size); + } + + if (is_contiguous()) { + return Layout{shape}; + } + + const Layout c = coalesce(); // merge contiguous dimensions + + ssize_t p = c.rank(); + ssize_t s = 1; + ssize_t d = 0; + + vector stride(shape.size()); + + for (int i = shape.size() - 1; i >= 0; --i) { + if (shape[i] == 1) { + stride[i] = 0; + } + else { + if (s == 1) { + --p; + s = c.shape().at(p); + d = c.stride().at(p); + } + TM_CHECK_EQ(s % shape[i], 0); // crossing non-contiguous dimensions + stride[i] = d; + d *= shape[i]; + s /= shape[i]; + } + } + return Layout{std::move(shape), std::move(stride)}; +} + +std::pair Layout::slice(const vector& base, vector shape) const +{ + TM_CHECK_EQ(base.size(), shape.size()); + TM_CHECK_EQ(shape_.size(), shape.size()); + ssize_t offset = 0; + for (size_t i = 0; i < shape.size(); ++i) { + const auto space = shape_[i] - base[i]; + TM_CHECK_GE(space, 0); + if (shape[i] == -1) { + shape[i] = space; + } + TM_CHECK_LE(shape[i], space); + offset += base[i] * stride_[i]; + } + return {Layout{std::move(shape), stride_}, offset}; +} + +std::ostream& operator<<(std::ostream& os, const Layout& x) +{ + os << "("; + for (int i = 0; i < x.rank(); ++i) { + os << (i ? "," : "") << x.shape_[i]; + } + os << "):("; + for (int i = 0; i < x.rank(); ++i) { + os << (i ? "," : "") << x.stride_[i]; + } + os << ")"; + return os; +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/layout.h b/src/turbomind/core/layout.h new file mode 100644 index 0000000000..2806d87c73 --- /dev/null +++ b/src/turbomind/core/layout.h @@ -0,0 +1,156 @@ + +#pragma once + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" + +namespace turbomind::core { + +class Layout { +public: + Layout(): size_{0} {} + + /* implicit */ Layout(vector shape); + + /* implicit */ Layout(std::initializer_list shape): Layout(vector(shape)) {} + + Layout(vector shape, vector stride); + + ssize_t size() const noexcept + { + return size_; + } + + ssize_t cosize() const noexcept; + + ssize_t rank() const noexcept + { + return shape_.size(); + } + + auto& shape() const noexcept + { + return shape_; + } + + auto shape(int i) const + { + return shape_.at(wrap(i)); + } + + template + auto shapes(Is... is) const + { + return std::make_tuple(shape(is)...); + } + + auto& stride() const noexcept + { + return stride_; + } + + auto stride(int i) const + { + return stride_.at(wrap(i)); + } + + template + auto strides(Is... 
is) + { + return std::make_tuple(stride(is)...); + } + + bool is_contiguous() const noexcept + { + if (stride_.back() != 1) { + return false; + } + if (size() != cosize()) { + return false; + } + for (int i = 0; i < rank() - 1; ++i) { + // TODO: skip when shape == 1 + if (stride_[i] < stride_[i + 1]) { + return false; + } + } + return true; + } + + Layout permute(const vector& dims) + { + TM_CHECK((int)dims.size() == rank()); + auto a = *this; + for (int i = 0; i < rank(); ++i) { + a.shape_[i] = shape_[dims[i]]; + a.stride_[i] = stride_[dims[i]]; + } + return a; + } + + ssize_t offset(const vector& idxs) const + { + TM_CHECK((int)idxs.size() < rank()); + ssize_t val = 0; + for (size_t i = 0; i < idxs.size(); ++i) { + TM_CHECK_LT(idxs[i], shape_[i]); + val += idxs[i] * stride_[i]; + } + return val; + } + + ssize_t offset(ssize_t idx0) const + { + TM_CHECK(rank()); + TM_CHECK_LT(idx0, shape_[0]); + return stride_[0] * idx0; + } + + Layout coalesce() const noexcept; + + Layout view(vector shape) const; + + std::pair slice(const vector& base, vector shape) const; + + Layout squeeze(int dim) const + { + if (rank() == 1 || shape(dim) != 1) { + return *this; + } + Layout a; + a.shape_.reserve(rank() - 1); + a.stride_.reserve(rank() - 1); + for (int i = 0; i < rank(); ++i) { + if (i != dim) { + a.shape_.push_back(shape_[i]); + a.stride_.push_back(stride_[i]); + } + } + a.size_ = size_; + return a; + } + + friend std::ostream& operator<<(std::ostream& os, const Layout& x); + +private: + int wrap(int dim) const noexcept + { + return dim < 0 ? dim + shape_.size() : dim; + } + +private: + vector shape_; + vector stride_; + ssize_t size_; +}; + +inline std::string to_string(const Layout& x) +{ + std::stringstream ss; + ss << x; + return ss.str(); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/module.cc b/src/turbomind/core/module.cc new file mode 100644 index 0000000000..92b08ba24c --- /dev/null +++ b/src/turbomind/core/module.cc @@ -0,0 +1,78 @@ + +#include "src/turbomind/core/module.h" +#include "src/turbomind/core/check.h" +#include + +namespace turbomind::core { + +Module::Module(): parent_{} {} + +Module::~Module() +{ + if (parent_) { + parent_->remove_module(*this); + parent_ = {}; + } +} + +void Module::register_module(std::string name, Module& module, std::optional index) +{ + module.parent_ = this; + if (index) { + name += "."; + name += std::to_string(*index); + } + // std::cout << "register Module " << name << " " << &module << ", parent " << this << "\n"; + modules_.emplace_back(std::move(name), &module); +} + +void Module::register_parameter(std::string name, Tensor& param) +{ + // std::cout << "register Parameter " << name << " " << ¶m << " " << param.layout() << "\n"; + params_.emplace_back(std::move(name), ¶m); +} + +void Module::remove_module(Module& module) +{ + for (auto it = modules_.begin(); it != modules_.end(); ++it) { + if (it->second == &module) { + // std::cout << "erase " << it->first << " " << &module << " from " << this << "\n"; + modules_.erase(it); + return; + } + } + TM_CHECK(0) << "module " << &module << " not found"; +} + +void Module::remove_parameter(Tensor& param) +{ + for (auto it = params_.begin(); it != params_.end(); ++it) { + if (it->second == ¶m) { + params_.erase(it); + return; + } + } + TM_CHECK(0) << "param " << ¶m << " not found"; +} + +TensorMap Module::get_parameters() const +{ + TensorMap m; + get_parameters_impl({}, m); + return m; +} + +void Module::get_parameters_impl(std::string prefix, TensorMap& m) const +{ + if 
(!prefix.empty()) { + prefix += "."; + } + for (const auto& [k, v] : params_) { + m.emplace(prefix + k, *v); + } + for (const auto& [k, v] : modules_) { + v->get_parameters_impl(prefix + k, m); + } +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/module.h b/src/turbomind/core/module.h new file mode 100644 index 0000000000..f48939a84b --- /dev/null +++ b/src/turbomind/core/module.h @@ -0,0 +1,36 @@ + +#include "src/turbomind/core/tensor.h" + +namespace turbomind::core { + +class Module { +public: + virtual ~Module(); + + Module(); + + Module(const Module&) = delete; + Module& operator=(const Module&) = delete; + + Module(Module&&) noexcept = delete; + Module& operator=(Module&&) noexcept = delete; + + void register_module(std::string name, Module& module, std::optional index = {}); + void register_parameter(std::string name, Tensor& param); + + void remove_module(Module& module); + void remove_parameter(Tensor& param); + + TensorMap get_parameters() const; + +private: + void get_parameters_impl(std::string prefix, TensorMap& m) const; + +protected: + Module* parent_; + + std::vector> modules_; + std::vector> params_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/stream.cc b/src/turbomind/core/stream.cc new file mode 100644 index 0000000000..d63326133c --- /dev/null +++ b/src/turbomind/core/stream.cc @@ -0,0 +1,19 @@ + +#include "src/turbomind/core/stream.h" +#include + +namespace turbomind::core { + +Stream Stream::create(int priority) +{ + Stream stream; + stream.impl_ = std::make_shared(priority); + return stream; +} + +void StreamImpl::Wait(const Event& event) +{ + check_cuda_error(cudaStreamWaitEvent(stream_, event)); +} + +} // namespace turbomind::core diff --git a/src/turbomind/core/stream.h b/src/turbomind/core/stream.h new file mode 100644 index 0000000000..9727dd5b64 --- /dev/null +++ b/src/turbomind/core/stream.h @@ -0,0 +1,160 @@ +#pragma once + +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/common.h" + +namespace turbomind::core { + +class StreamImpl { +public: + StreamImpl(int priority): stream_{} + { + check_cuda_error(cudaStreamCreateWithPriority(&stream_, cudaStreamNonBlocking, priority)); + } + + ~StreamImpl() + { + if (auto ec = cudaStreamDestroy(stream_); ec != cudaSuccess) { + TM_LOG_ERROR(cudaGetErrorString(ec)); + } + stream_ = {}; + } + + void Sync() + { + check_cuda_error(cudaStreamSynchronize(stream_)); + } + + void Wait(const Event& event); + + cudaStream_t handle() const + { + return stream_; + } + +public: + cudaStream_t stream_; +}; + +class Stream { +public: + Stream() = default; + + static Stream create(int priority = 0); + + void Sync() + { + impl_->Sync(); + } + + void Wait(const Event& event) + { + impl_->Wait(event); + } + + cudaStream_t handle() const + { + return TM_CHECK_NOTNULL(impl_)->handle(); + } + + explicit operator cudaStream_t() const + { + return handle(); + } + + explicit operator bool() const noexcept + { + return static_cast(impl_); + } + + friend bool operator==(const Stream& a, const Stream& b) + { + return a.impl_ == b.impl_; + } + + friend bool operator!=(const Stream& a, const Stream& b) + { + return !(a == b); + } + + friend std::ostream& operator<<(std::ostream& os, const Stream& s) + { + os << s.impl_; + return os; + } + +private: + shared_ptr impl_; +}; + +class EventImpl { +public: + explicit EventImpl(unsigned flags) + { + check_cuda_error(cudaEventCreateWithFlags(&event_, flags)); + } + + ~EventImpl() + { + if (auto ec = cudaEventDestroy(event_); ec != 
cudaSuccess) { + TM_LOG_ERROR(cudaGetErrorString(ec)); + } + } + + void Record(const Stream& stream) + { + check_cuda_error(cudaEventRecord(event_, stream.handle())); + } + + void Sync() const + { + check_cuda_error(cudaEventSynchronize(event_)); + } + + cudaEvent_t handle() const + { + return event_; + } + +private: + cudaEvent_t event_; +}; + +class Event { +public: + Event() = default; + + static Event create(bool timing = false) + { + Event e{}; + e.impl_ = std::make_shared(timing ? 0 : cudaEventDisableTiming); + return e; + } + + void Record(const Stream& stream) + { + TM_CHECK_NOTNULL(impl_)->Record(stream); + } + + void Sync() const + { + TM_CHECK_NOTNULL(impl_)->Sync(); + } + + operator cudaEvent_t() const + { + return TM_CHECK_NOTNULL(impl_)->handle(); + } + + explicit operator bool() const noexcept + { + return static_cast(impl_); + } + +private: + shared_ptr impl_; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/tensor.cc b/src/turbomind/core/tensor.cc new file mode 100644 index 0000000000..959d04ce6c --- /dev/null +++ b/src/turbomind/core/tensor.cc @@ -0,0 +1,142 @@ + +#include "src/turbomind/core/tensor.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/stream.h" + +namespace turbomind::core { + +std::ostream& operator<<(std::ostream& os, const Tensor& t) +{ + os << t.dtype() << "[" << t.layout() << "]@" << t.buffer_.data_or((void*)nullptr); + return os; +} + +Tensor& TensorMap::at(const std::string& key) +{ + auto it = find(key); + TM_CHECK(it != end()) << get_out_of_range_msg(key); + return it->second; +} + +std::string TensorMap::get_out_of_range_msg(const std::string& key) const +{ + std::ostringstream oss; + oss << "Cannot find a tensor of name '" << key << "' in the tensor map (keys: "; + auto sep = ""; + for (const auto& [k, _] : *this) { + oss << std::exchange(sep, ", ") << k; + } + oss << ")"; + return oss.str(); +} + +Tensor* TensorMap::try_(const std::string& key) +{ + auto it = find(key); + if (it != end()) { + return &it->second; + } + return nullptr; +} + +#if 0 + +void Copy(const Tensor& src, Tensor& dst, Stream& stream) +{ + TM_CHECK(src.dtype() == dst.dtype()); + TM_CHECK(src.shape() == dst.shape()); + + const DataType dtype = src.dtype(); + + auto trivial = [&] { + const ssize_t bytesize = get_byte_size(dtype, src.size()); + check_cuda_error(cudaMemcpyAsync(dst.raw_data(), src.raw_data(), bytesize, cudaMemcpyDefault, stream.handle())); + }; + + if (src.layout().is_contiguous() && dst.layout().is_contiguous()) { + return trivial(); + } + + auto a = src.layout(); + auto b = dst.layout(); + + vector idxs(a.rank()); + std::iota(idxs.begin(), idxs.end(), 0); + std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { // + return a.stride()[j] < a.stride()[i]; + }); + + // innermost dim is not contiguous + if (a.stride(idxs.back()) > 1 || b.stride(idxs.back()) > 1) { + return GenericCopy(src, dst, stream); + } + + a = a.reorder(idxs); + b = b.reorder(idxs); + + // trivial after reorder (e.g. 
transposed matrices) + if (a.is_contiguous() && b.is_contiguous()) { + return trivial(); + } + + a = a.coalesce(); + b = b.coalesce(); + + int rank = std::max(a.rank(), b.rank()); + + if (rank > 3) { + return GenericCopy(src, dst, stream); + } + + if (a.rank() < rank) { + a = a.view(b.shape()); + } + else if (b.rank() < rank) { + b = b.view(b.shape()); + } + + if (rank == 2) { + check_cuda_error(cudaMemcpy2DAsync(dst.raw_data(), + get_byte_size(dtype, b.stride(0)), + src.raw_data(), + get_byte_size(dtype, a.stride(0)), + get_byte_size(dtype, a.shape(1)), + a.shape(0), + cudaMemcpyDefault, + stream.handle())); + return; + } + + auto [a0, a1] = a.strides(0, 1); + auto [b0, b1] = b.strides(0, 1); + + // make sure the underlying space is actually a cube [x % (y * z) == 0] + if (rank == 3 && a0 % a1 == 0 && b0 % b1 == 0) { + const auto xsz_a = get_byte_size(dtype, a.stride(1)); + const auto xsz_b = get_byte_size(dtype, b.stride(1)); + const auto ysz_a = a0 / a1; + const auto ysz_b = b0 / b1; + + cudaMemcpy3DParms param{}; + param.srcPtr = make_cudaPitchedPtr((void*)src.raw_data(), xsz_a, xsz_a, ysz_a); + param.dstPtr = make_cudaPitchedPtr((void*)dst.raw_data(), xsz_b, xsz_b, ysz_b); + param.extent = make_cudaExtent(get_byte_size(dtype, a.shape(2)), a.shape(1), a.shape(0)); + param.kind = cudaMemcpyDefault; + + if (auto ec = cudaMemcpy3DAsync(¶m, stream.handle()); ec == cudaSuccess) { + TM_LOG_WARNING(cudaGetErrorString(ec)); + return; + } + } + + return GenericCopy(src, dst, stream); +} + +void Copy(const Tensor& src, Tensor&& dst, Stream& stream) +{ + return Copy(src, dst, stream); +} + +#endif + +} // namespace turbomind::core diff --git a/src/turbomind/core/tensor.cu b/src/turbomind/core/tensor.cu new file mode 100644 index 0000000000..8b6c0f724c --- /dev/null +++ b/src/turbomind/core/tensor.cu @@ -0,0 +1,201 @@ + + +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/tensor.h" +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/kernels/core/math.h" +#include "src/turbomind/kernels/core/meta.h" + +namespace turbomind::core { + +#if 0 + +namespace kernel { + +// This is going to be slow for transposing the innermost dim +template +__global__ void GenericCopy(const T* a, + T* b, + Array stride_a, + Array stride_b, + Array shape, + int ndim, + int64_t size) +{ + Index idx = threadIdx.x + (Index)blockIdx.x * blockDim.x; + + if (idx >= size) { + return; + } + + Array coord; + PRAGMA_UNROLL + for (int i = 0; i < D; ++i) { + if (i < ndim) { + auto div = idx / shape[i]; + auto mod = idx % shape[i]; + coord[i] = mod; + idx = div; + } + } + + int64_t idx_a = 0; + int64_t idx_b = 0; + + PRAGMA_UNROLL + for (int i = 0; i < D; ++i) { + if (i < ndim) { + idx_a += coord[i] * stride_a[i]; + idx_b += coord[i] * stride_b[i]; + } + } + + b[idx_b] = a[idx_a]; +} + +} // namespace kernel + +void GenericCopy(const Tensor& src, Tensor& dst, Stream& stream) +{ + auto a = src.layout(); + auto b = dst.layout(); + + // Sort strides ascending + vector idxs(a.rank()); + std::iota(idxs.begin(), idxs.end(), 0); + std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { // + return a.stride()[i] < a.stride()[j]; + }); + + a = a.permute(idxs); + b = b.permute(idxs); + + a = a.coalesce(); + b = b.coalesce(); + + int rank = std::max(a.rank(), b.rank()); + + if (a.rank() < rank) { + a = a.view(b.shape()); + } + else if (b.rank() < rank) { + b = b.view(b.shape()); + } + + const DataType dtype = src.dtype(); + + int64_t alignment = 16; + + auto align = [&](auto v) { alignment = 
std::gcd(alignment, v); }; + + if (a.stride(0) > 1 || b.stride(0) > 1) { + alignment = get_byte_size(dtype); + } + + align(get_byte_size(dtype, a.shape(0))); + + auto data_a = src.raw_data(); + auto data_b = dst.raw_data(); + + align(reinterpret_cast(data_a)); + align(reinterpret_cast(data_b)); + + for (int i = 1; i < rank; ++i) { + align(get_byte_size(dtype, a.stride(i))); + align(get_byte_size(dtype, b.stride(i))); + } + + const auto vec_size = get_elem_num(alignment, dtype); + + const auto size = a.size() / vec_size; + + int device{}; + check_cuda_error(cudaGetDevice(&device)); + int sm_num{}; + check_cuda_error(cudaDeviceGetAttribute(&sm_num, cudaDevAttrMultiProcessorCount, device)); + + auto invoke = [&](auto vec_t, auto index_t, auto d) { + using T = decltype(vec_t); + using Index = decltype(index_t); + constexpr int D = d.value; + + Array shape; + std::fill(shape.begin() + rank, shape.end(), 1); + std::copy_n(a.shape().data(), rank, shape.data()); + + Array stride_a{}; + Array stride_b{}; + std::copy_n(a.stride().data(), rank, stride_a.data()); + std::copy_n(b.stride().data(), rank, stride_b.data()); + + if (vec_size > 1) { + shape[0] /= vec_size; + for (int i = 0; i < rank; ++i) { + stride_a[i] /= vec_size; + stride_b[i] /= vec_size; + } + } + + auto func = kernel::GenericCopy; + + int min_waves = INT_MAX; + int block_size = 0; + int grid_size = 0; + + for (int threads = 256; threads <= 1024; threads *= 2) { + int blocks = cdiv(size, block_size); + int n_active{}; + check_cuda_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_active, func, block_size, 0)); + int waves = cdiv(blocks, n_active * sm_num); + if (waves < min_waves) { + min_waves = waves; + block_size = threads; + grid_size = blocks; + } + } + + func<<>>( + (const T*)data_a, (T*)data_b, stride_a, stride_b, shape, rank, a.size()); + }; + + auto invoke_d = [&](auto vec_t, auto idx_t) { + if (rank <= 2) { + invoke(vec_t, idx_t, constant<2>{}); + } + else if (rank <= 4) { + invoke(vec_t, idx_t, constant<4>{}); + } + else if (rank <= 8) { + invoke(vec_t, idx_t, constant<8>{}); + } + else { + throw std::runtime_error("not implemented"); + } + }; + + auto invoke_i = [&](auto vec_t) { + if (size < INT_MAX) { + invoke_d(vec_t, int{}); + } + else { + invoke_d(vec_t, int64_t{}); + } + }; + + switch (alignment) { + case 16: + return invoke_i(uint4{}); + case 8: + return invoke_i(uint2{}); + case 4: + return invoke_i(uint{}); + case 2: + return invoke_i(ushort{}); + default: + return invoke_i(char{}); + } +} + +#endif + +} // namespace turbomind::core diff --git a/src/turbomind/core/tensor.h b/src/turbomind/core/tensor.h new file mode 100644 index 0000000000..3721327748 --- /dev/null +++ b/src/turbomind/core/tensor.h @@ -0,0 +1,316 @@ +#pragma once + +#include +#include +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/layout.h" + +namespace turbomind::core { + +class Tensor { +public: + Tensor() = default; + + Tensor(Layout layout, DataType dtype, Device device): Tensor{layout, dtype, Context::alloc(device)} {} + + Tensor(Layout layout, DataType dtype, Allocator& alloc): layout_{std::move(layout)} + { + buffer_ = Buffer(layout_.cosize(), dtype, alloc); + } + + Tensor(Buffer buffer, Layout layout): layout_{std::move(layout)}, buffer_{std::move(buffer)} + { + TM_CHECK_LE(layout_.cosize(), buffer_.size()); + } + + Tensor(Buffer buffer): layout_{buffer.size()}, buffer_{buffer} {} + + Tensor(void* data, Layout layout, 
DataType dtype, Device device): + Tensor{Buffer{data, layout.cosize(), dtype, device}, layout} + { + } + + Tensor(std::shared_ptr data, Layout layout, DataType dtype, Device device): + Tensor{Buffer{data, layout.cosize(), dtype, device}, layout} + { + } + + template + Tensor(T* data, Layout layout, Device device): Tensor{Buffer{data, layout.cosize(), device}, layout} + { + } + + static Tensor empty_like(const Tensor& tensor, std::optional device = {}) + { + return Tensor{tensor.layout_, tensor.dtype(), device ? *device : tensor.device()}; + } + + Buffer& buffer() noexcept + { + return buffer_; + } + + const Buffer& buffer() const noexcept + { + return buffer_; + } + + DataType dtype() const + { + return buffer_.dtype(); + } + + Device device() const + { + return buffer_.device(); + } + + ssize_t size() const noexcept + { + return layout_.size(); + } + + ssize_t byte_size() const noexcept + { + return turbomind::byte_size(dtype(), size()); + } + + explicit operator bool() const noexcept + { + return static_cast(buffer_); + } + + template + T* data() + { + return buffer_.data(); + } + + template + const T* data() const + { + return const_cast(this)->data(); + } + + void* raw_data() + { + return buffer_.raw_data(); + } + + const void* raw_data() const + { + return const_cast(this)->raw_data(); + } + + template + T* data_or(T* other) + { + return buffer_.data_or(other); + } + + template + const T* data_or(T* other) const + { + return buffer_.data_or(other); + } + + Tensor view(std::vector shape) const + { + return Tensor{buffer_, layout_.view(std::move(shape))}; + } + + auto& layout() const noexcept + { + return layout_; + } + + auto& shape() const noexcept + { + return layout_.shape(); + } + + auto shape(int i) const + { + return layout_.shape(i); + } + + template + auto shapes(Is&&... 
is) const + { + return layout_.shapes(((Is &&) is)...); + } + + auto& stride() const noexcept + { + return layout_.stride(); + } + + auto stride(int i) const + { + return layout_.stride(i); + } + + bool is_contiguous() const noexcept + { + return layout().is_contiguous(); + } + + Tensor slice(std::vector base, std::vector shape) const + { + auto&& [layout, offset] = layout_.slice(base, std::move(shape)); + const auto cosize = layout.cosize(); + return Tensor{buffer_.slice(offset, cosize), std::move(layout)}; + } + + // The outermost dimension + Tensor slice(ssize_t base, ssize_t size = 1) const + { + vector bases(shape().size()); + bases.front() = base; + vector sizes{this->shape()}; + sizes.front() = size; + return slice(bases, sizes); + } + + Tensor borrow() const + { + return Tensor{buffer_.borrow(), layout_}; + } + + Tensor squeeze(int dim) const + { + return Tensor{buffer_, layout_.squeeze(dim)}; + } + + int ndim() const noexcept + { + return layout_.rank(); + } + + friend std::ostream& operator<<(std::ostream& os, const Tensor& t); + +private: + Layout layout_; + Buffer buffer_; +}; + +#if 0 +void Copy(const Tensor& src, Tensor& dst, Stream& stream); + +void Copy(const Tensor& src, Tensor&& dst, Stream& stream); + +// Launch a kernel to perform the complicated copying +void GenericCopy(const Tensor& src, Tensor& dst, Stream& stream); + +Tensor Reshape(const Tensor& t, vector shape); + +Tensor Transpoe(const Tensor& t, int dim0, int dim1); + +Tensor Permute(const Tensor& t, vector dims); + +Tensor Contiguous(const Tensor& t); +#endif + +template +struct Tensor_: public Tensor { + Tensor_() = default; + + Tensor_(Layout layout, Device device): Tensor{std::move(layout), data_type_v, device} {} + + Tensor_(Layout layout, Allocator& alloc): Tensor{std::move(layout), data_type_v, alloc} {} + + Tensor_(Buffer buffer, Layout layout): Tensor{ensure_dtype(std::move(buffer)), std::move(layout)} {} + + Tensor_(T* data, Layout layout, Device device): Tensor{data, std::move(layout), device} {} + + Tensor_(shared_ptr data, Layout layout, Device device): + Tensor{Buffer{std::move(data), layout.cosize(), data_type_v, device}, layout} + { + } + + Tensor_(const Tensor_&) = default; + Tensor_& operator=(const Tensor_&) = default; + + Tensor_(Tensor_&&) noexcept = default; + Tensor_& operator=(Tensor_&&) noexcept = default; + + Tensor_(const Tensor& other) + { + *static_cast(this) = ensure_dtype(other); + } + Tensor_(Tensor&& other) noexcept + { + *static_cast(this) = ensure_dtype(std::move(other)); + } + + ssize_t offset(const vector& idxs) + { + return layout().offset(idxs); + } + + T* data() noexcept + { + return Tensor::data(); + } + + const T* data() const noexcept + { + return Tensor::data(); + } + + T* data_or(T* other) + { + return Tensor::data_or(other); + } + + const T* data_or(T* other) const + { + return Tensor::data_or(other); + } + + constexpr DataType dtype() const noexcept + { + return data_type_v; + } + +private: + template + static decltype(auto) ensure_dtype(U&& u) + { + TM_CHECK_EQ(u.dtype(), data_type_v); + return (U &&) u; + } +}; + +class TensorMap: public std::unordered_map { +public: + using std::unordered_map::unordered_map; + + Tensor& at(const std::string& key); + + const Tensor& at(const std::string& key) const + { + return const_cast(this)->at(key); + } + + Tensor* try_(const std::string& key); + + const Tensor* try_(const std::string& key) const + { + return const_cast(this)->try_(key); + } + + bool contains(const std::string& key) const + { + return find(key) != end(); + } 
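+    // Usage sketch (hypothetical values; mirrors how ModelRequest fills its maps):
+    //   TensorMap m;
+    //   m.emplace("output_ids", Tensor{{max_seq_len}, kInt32, kCPU});
+    //   Tensor& ids = m.at("output_ids");          // fails the TM_CHECK if the key is absent
+    //   if (Tensor* logits = m.try_("logits")) { /* optional output present */ }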
+ +private: + std::string get_out_of_range_msg(const std::string& key) const; +}; + +} // namespace turbomind::core diff --git a/src/turbomind/core/test_core.cc b/src/turbomind/core/test_core.cc new file mode 100644 index 0000000000..f0abac9b44 --- /dev/null +++ b/src/turbomind/core/test_core.cc @@ -0,0 +1,282 @@ + +#include + +#include "src/turbomind/core/core.h" + +#include "catch2/catch_test_macros.hpp" + +using namespace turbomind; + +TEST_CASE("test check", "[check]") +{ + int zero = 0; + + TM_CHECK(!zero); + + TM_CHECK_EQ(42, 42) << "Ok"; + TM_CHECK_NE(42, 24) << "Ok"; + TM_CHECK_GE(50, 42) << "Ok"; + TM_CHECK_GT(50, 42) << "Ok"; + TM_CHECK_LE(42, 50) << "Ok"; + TM_CHECK_LT(42, 50) << "Ok"; + + if (0) { + TM_CHECK(zero); + TM_CHECK_EQ(42, 43) << "Not " + << "Ok"; + } + + int x = 42; + auto p = TM_CHECK_NOTNULL(&x); + REQUIRE(p == &x); + + if (0) { + int* y{}; + TM_CHECK_NOTNULL(y); + TM_CHECK_NOTNULL(std::shared_ptr{}); + } + + auto y = TM_CHECK_NOTNULL(std::make_shared(42)); + REQUIRE(*y == 42); + + TM_CHECK(y); +} + +TEST_CASE("test allocator", "[allocator]") +{ + + using core::Allocator; + using core::Stream; + + Allocator a; + REQUIRE(!a); + + Allocator b{kCPU}; + REQUIRE(b); + REQUIRE(a != b); + REQUIRE(b->device() == kCPU); + Stream s{}; + REQUIRE(!b->stream()); + + // std::vector v(1 << 20); + // std::iota(v.begin(), v.end(), 0); + + // auto p = (int*)b->allocate(sizeof(int) * v.size()); + // std::iota(p, p + v.size(), 0); + + // REQUIRE(v == std::vector(p, p + v.size())); +} + +TEST_CASE("test context", "[context]") +{ + using core::Context; + using core::ContextGuard; + using core::Stream; + using core::Allocator; + + Stream s0 = Stream::create(); + + ContextGuard g0{s0, Allocator{kCPU}}; + + REQUIRE(Context::stream()); + REQUIRE(Context::stream() == s0); + + auto a0 = Context::host_alloc(); + + { + Allocator a1(Context::stream(), false); // device allocator + REQUIRE(a1->device().type == kDEVICE); + + ContextGuard g1{a1}; + + REQUIRE(Context::stream() == s0); + REQUIRE(Context::device_alloc() == a1); + REQUIRE(Context::host_alloc() == a0); + + { + ContextGuard g2{Stream::create(), Allocator(kDEVICE)}; + REQUIRE(Context::device_alloc() != a1); + REQUIRE(Context::stream() != s0); + } + + REQUIRE(Context::stream() == s0); + REQUIRE(Context::device_alloc() == a1); + } + + REQUIRE(Context::stream() == s0); +} + +TEST_CASE("test basic buffer", "[buffer]") +{ + using core::Buffer; + using core::Buffer_; + using core::Allocator; + + Buffer a; + REQUIRE(!a); + + Buffer b; + REQUIRE(!b); + REQUIRE(a == b); + + std::vector v{0, 1, 2, 3, 4, 5, 6, 7}; + + SECTION("reference into v") + { + b = Buffer(v.data(), v.size(), kCPU); + REQUIRE(b.data() == v.data()); + REQUIRE(b.raw_data() == v.data()); + } + SECTION("shared ownership") + { + auto x = std::shared_ptr(new int[v.size()]); + std::copy(v.begin(), v.end(), x.get()); + b = Buffer(x, v.size(), data_type_v, kCPU); + REQUIRE(b.data() == x.get()); + REQUIRE(b.raw_data() == x.get()); + } + SECTION("allocation") + { + Allocator alloc{kCPU}; + b = Buffer(v.size(), data_type_v, alloc); + std::copy(v.begin(), v.end(), b.data()); + } + + REQUIRE(b); + REQUIRE(b.size() == v.size()); + REQUIRE(b.dtype() == data_type_v); + REQUIRE(b.byte_size() == sizeof(int) * v.size()); + auto c = b; + REQUIRE(c == b); + REQUIRE(b == c); + REQUIRE(a != b); + REQUIRE(b != a); + REQUIRE(std::vector(b.data(), b.data() + b.size()) == v); + + auto s = b.slice(3, 2); + REQUIRE(s.size() == 2); + REQUIRE(s.raw_data() == b.data() + 3); + + Buffer_ x; + Buffer_ y = 
Buffer{data_type_v}; + + Buffer z = Buffer_(1024, kCPU); + + x = z; + + for (int i = 0; i < z.size(); ++i) { + x[i] = i; + } + + std::vector ref(1024); + std::iota(ref.begin(), ref.end(), 0); + REQUIRE(std::vector(x.begin(), x.end()) == ref); + + Buffer e; + REQUIRE(!e.data_or((void*)0)); + REQUIRE(!e.data_or(nullptr)); +} + +TEST_CASE("test buffer view", "[buffer]") +{ + using core::Buffer; + + std::vector v{0, 1, 2, 3, 4, 5, 6, 7}; + + Buffer b(v.data(), v.size(), kCPU); + + auto c = b.slice(2, 4); + REQUIRE(c.size() == 4); + REQUIRE(c.raw_data() == b.data() + 2); + + std::cout << c << std::endl; + + auto d = c.view(); + + REQUIRE(d.size() == c.size() * 2); + REQUIRE(d.raw_data() == c.raw_data()); +} + +TEST_CASE("test layout", "[layout]") +{ + using core::Layout; + + Layout a; // default ctor + REQUIRE(a.size() == 0); + REQUIRE(a.cosize() == 0); + + Layout b({20, 50}); + REQUIRE(b.size() == 1000); + REQUIRE(b.cosize() == b.size()); + REQUIRE(to_string(b) == "(20,50):(50,1)"); + + Layout c = b.coalesce(); + REQUIRE(c.size() == b.size()); + REQUIRE(c.cosize() == b.cosize()); + REQUIRE(to_string(c) == "(1000):(1)"); + + Layout v = b.view({50, 20}); + REQUIRE(v.size() == b.size()); + REQUIRE(v.cosize() == b.cosize()); + REQUIRE(to_string(v) == "(50,20):(20,1)"); + + v = b.view({25, -1}); + REQUIRE(to_string(v) == "(25,40):(40,1)"); + + v = b.view({5, -1, 5}); + REQUIRE(to_string(v) == "(5,40,5):(200,5,1)"); + + v = b.view({-1, 20, 10, 1}); + REQUIRE(to_string(v) == "(5,20,10,1):(200,10,1,1)"); + + REQUIRE(to_string(v.coalesce()) == "(1000):(1)"); + + auto [s, offset] = b.slice({10, 20}, {-1, -1}); + REQUIRE(to_string(s) == "(10,30):(50,1)"); + REQUIRE(offset == 520); + + v = s.view({2, -1, 3, 10}); + std::cout << v << std::endl; + + std::cout << v.coalesce() << std::endl; + + // v = s.view({30, 10}); + // std::cout << v << std::endl; +} + +TEST_CASE("test tensor", "[tensor]") +{ + using core::Tensor; + using core::Tensor_; + using core::Allocator; + + Tensor a; + REQUIRE(!a); + + Tensor_ b{{10, 20}, kCPU}; + Tensor_ c = b.slice(0, 5); + + std::cout << b << std::endl; + + REQUIRE(c.shape() == std::vector{5, 20}); + REQUIRE(c.data() == b.data()); + + auto d = b.view({2, -1, 10}); + REQUIRE(d.shape() == std::vector{2, 10, 10}); + + // this is typed + Tensor_ x = Tensor_{}; + // while being empty + REQUIRE(!x); + + if (0) { + // empty Tensor has invalid type + Tensor_ x = Tensor{}; + } + a = {}; + x = {}; + + Tensor y = core::Buffer{100, kInt32, kCPU}; + REQUIRE(y.ndim() == 1); + REQUIRE(y.shape(0) == 100); +} diff --git a/src/turbomind/engine/CMakeLists.txt b/src/turbomind/engine/CMakeLists.txt index 1d68116cf6..6836d98155 100644 --- a/src/turbomind/engine/CMakeLists.txt +++ b/src/turbomind/engine/CMakeLists.txt @@ -3,5 +3,6 @@ cmake_minimum_required(VERSION 3.8) add_library(engine STATIC gateway.cc request_queue.cc model_request.cc) +target_link_libraries(engine PRIVATE core) set_property(TARGET engine PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET engine PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/engine/model_request.cc b/src/turbomind/engine/model_request.cc index dc27305139..29986405d6 100644 --- a/src/turbomind/engine/model_request.cc +++ b/src/turbomind/engine/model_request.cc @@ -3,57 +3,15 @@ #include #include #include -#include #include -#include #include -#include #include "src/turbomind/engine/model_request.h" #include "src/turbomind/engine/request.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/constant.h" -#include 
"src/turbomind/utils/cuda_utils.h" namespace turbomind { -static ManagedTensor create(DataType dtype, MemoryType where, const std::vector& size, int64_t& byte_size) -{ - byte_size = std::accumulate(size.begin(), size.end(), Tensor::getTypeSize(dtype), std::multiplies<>{}); - void* data{}; - - if (where == MEMORY_GPU) { - check_cuda_error(cudaMallocAsync(&data, byte_size, nullptr)); - } - else { - data = std::malloc(byte_size); - } - - ManagedTensor ret; - ret.tensor = Tensor{where, dtype, std::vector(size.begin(), size.end()), data}; - ret.data_holder.reset((void*)nullptr, [data, where](auto) { - // std::cerr << "turbomind tensor deallocate" << std::endl; - if (where == MEMORY_GPU) { - /// TODO: guard device id - check_cuda_error(cudaFreeAsync(data, nullptr)); - } - else { - std::free(data); - } - }); - return ret; -} - -template -static T get(const std::unordered_map& m, const std::string& key, T fallback = {}) -{ - auto it = m.find(key); - if (it != m.end()) { - return it->second->getVal(); - } - return fallback; -} - ModelRequest::ModelRequest(Gateway* gateway, DataType data_type, int session_len, int vocab_size, int hidden_dim): gateway_{gateway}, data_type_{data_type}, @@ -85,27 +43,25 @@ void ModelRequest::End(std::function cb, uint64_t session_id) auto ModelRequest::Forward(InputParam param, std::function cb) -> OutputParam { - inputs_ = std::make_shared(); - outputs_ = std::make_shared(); + inputs_ = std::make_shared(); + outputs_ = std::make_shared(); auto add = [](auto& dest, auto key, auto dtype, auto where, auto shape, auto&&... dims) { - std::vector shape_; + Layout shape_; if constexpr (std::is_integral_v) { shape_ = {shape, dims...}; } else { shape_ = {shape.cbegin(), shape.cend()}; } - int64_t byte_size{}; - auto it = dest->emplace(key, create(dtype, where, shape_, byte_size)).first; - return std::make_pair(it->second->data, byte_size); + dest->emplace(key, Tensor{shape_, dtype, where}); }; auto& inputs = *param.tensors; - FT_CHECK(inputs.at("input_ids")->shape.size() == 1); + TM_CHECK_EQ(inputs.at("input_ids").ndim(), 1); - const int input_len = inputs.at("input_ids")->shape[0]; + const int input_len = inputs.at("input_ids").shape(0); const int output_len = param.gen_cfg.max_new_tokens; // Max possible length of a sequence, this depends on `history_len` which isn't available here, so `session_len` @@ -119,32 +75,32 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output inputs_->emplace(k, v); } - add(outputs_, "output_ids", TYPE_INT32, MEMORY_CPU, max_seq_len); - add(outputs_, "sequence_length", TYPE_INT32, MEMORY_CPU, 1); + add(outputs_, "output_ids", data_type_v, kCPU, max_seq_len); + add(outputs_, "sequence_length", data_type_v, kCPU, 1); if (param.gen_cfg.output_logits) { const int len = param.gen_cfg.output_logits == GenerationConfig::kAll ? max_in_out_len : max_out_len; - add(outputs_, "logits", data_type_, MEMORY_CPU, len, vocab_size_); + add(outputs_, "logits", data_type_, kCPU, len, vocab_size_); } if (param.gen_cfg.output_last_hidden_state) { const int len = param.gen_cfg.output_last_hidden_state == GenerationConfig::kAll ? 
max_in_out_len : max_out_len; - add(outputs_, "last_hidden_state", data_type_, MEMORY_CPU, len, hidden_dim_); + add(outputs_, "last_hidden_state", data_type_, kCPU, len, hidden_dim_); } if (param.gen_cfg.output_logprobs) { - add(outputs_, "logprob_vals", data_type_, MEMORY_CPU, max_out_len, kMaxLogProb); - add(outputs_, "logprob_indexes", TYPE_INT32, MEMORY_CPU, max_out_len, kMaxLogProb); - add(outputs_, "logprob_nums", TYPE_INT32, MEMORY_CPU, max_out_len); + add(outputs_, "logprob_vals", data_type_, kCPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_indexes", data_type_v, kCPU, max_out_len, kMaxLogProb); + add(outputs_, "logprob_nums", data_type_v, kCPU, max_out_len); } auto r = std::make_shared(); for (const auto& [k, v] : *inputs_) { - r->inputs.insert(k, *v); + r->inputs.emplace(k, v); } for (const auto& [k, v] : *outputs_) { - r->outputs.insert(k, *v); + r->outputs.emplace(k, v); } auto state = std::make_shared(); @@ -160,8 +116,8 @@ auto ModelRequest::Forward(InputParam param, std::function cb) -> Output r->forward_cb = std::move(cb); r->state = state; - r->output_ids = *outputs_->at("output_ids"); - r->sequence_length = *outputs_->at("sequence_length"); + r->output_ids = outputs_->at("output_ids"); + r->sequence_length = outputs_->at("sequence_length"); // Keep a weak reference for canceling the request request_ = r; diff --git a/src/turbomind/engine/model_request.h b/src/turbomind/engine/model_request.h index aea889e856..b788c0434f 100644 --- a/src/turbomind/engine/model_request.h +++ b/src/turbomind/engine/model_request.h @@ -4,8 +4,8 @@ #include +#include "src/turbomind/core/core.h" #include "src/turbomind/engine/gateway.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { @@ -21,10 +21,8 @@ class ModelRequest { // Reset the channel to uninitailized state, calls `notify` when done void End(std::function cb, uint64_t session_id); - using TensorMap_ = std::unordered_map; - struct InputParam { - std::shared_ptr tensors; + std::shared_ptr tensors; SessionParam session; GenerationConfig gen_cfg; @@ -33,7 +31,7 @@ class ModelRequest { }; struct OutputParam { - std::shared_ptr tensors; + std::shared_ptr tensors; std::shared_ptr state; }; @@ -52,8 +50,8 @@ class ModelRequest { std::weak_ptr request_; - std::shared_ptr inputs_; // owned by caller - std::shared_ptr outputs_; // owned by `this` + std::shared_ptr inputs_; + std::shared_ptr outputs_; }; } // namespace turbomind diff --git a/src/turbomind/engine/request.h b/src/turbomind/engine/request.h index 28f2943b54..31276c004a 100644 --- a/src/turbomind/engine/request.h +++ b/src/turbomind/engine/request.h @@ -10,7 +10,7 @@ #include #include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/core.h" namespace turbomind { @@ -122,8 +122,8 @@ struct Request { TensorMap inputs; TensorMap outputs; // fast path for accessing common output buffers - Tensor output_ids; - Tensor sequence_length; + Tensor_ output_ids; + Tensor_ sequence_length; std::function end_cb; diff --git a/src/turbomind/kernels/activation_kernels.cu b/src/turbomind/kernels/activation_kernels.cu index ec5292976f..77373a090c 100644 --- a/src/turbomind/kernels/activation_kernels.cu +++ b/src/turbomind/kernels/activation_kernels.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/activation_kernels.h" #include "src/turbomind/kernels/core/array.h" #include "src/turbomind/kernels/core/array_ops.h" @@ -171,157 +173,6 @@ struct IdentityActivation { } }; -// clang-format off -template class Activation, typename T, typename BT> -__global__ void generic_activation(T* out, - const BT* __restrict bias, - const T* __restrict gated_weights, - const BT* __restrict gated_bias, - const int* __restrict ia3_tasks, - const T* __restrict ia3_weights, - const int int8_mode, - const float* __restrict activation_in, - const float* __restrict activation_out, - const int* __restrict padding_offset, - const int seq_len, - int m, - int n) -{ - constexpr size_t packed_elems = num_elems::value; - - const bool with_bias = bias != nullptr; - const bool with_gate = gated_weights != nullptr; - // const bool with_ia3 = ia3_tasks != nullptr; - - using Act_T = typename Activation::return_type; - using Float_T = typename packed_as::type; - using Packed_Int8_t = typename packed_as::type; - - for (int64_t id = blockIdx.x * blockDim.x + threadIdx.x; id < 1LL * m * n; id += blockDim.x * gridDim.x) { - T val; - if (int8_mode == 2) { - // val = cuda_cast(cuda_cast(reinterpret_cast(out)[id]) * activation_in[0]); - } - else { - val = out[id]; - } - - T gated_val; - if (with_gate) { - gated_val = gated_weights[id]; - } - - // if (with_bias) { - // const T reg_bias = static_cast(bias[id % n]); - // val = val + reg_bias; - - // if (with_gate) { - // const T reg_gated_bias = static_cast(gated_bias[id % n]); - // gated_val = gated_val + reg_gated_bias; - // } - // } - - if (with_gate) { - val = cuda_cast(Activation::apply(val) * cuda_cast(gated_val)); - } - else { - // val = cuda_cast(Activation::apply(val)); - } - - // if (with_ia3) { - // const int word_id = id / n; - // const int offset = padding_offset == nullptr ? 
0 : padding_offset[word_id]; - // const int batch_id = (word_id + offset) / seq_len; - // const int task = ia3_tasks[batch_id]; - // val = val * ia3_weights[task * n + (id % n)]; - // } - - if (int8_mode != 2) { - out[id] = val; - } - else { - // reinterpret_cast(out)[id] = - // cuda_cast(cuda_cast(val) * activation_out[0]); - } - } -} -// clang-format on - -template class Activation, typename T, typename BT> -void invokeGenericActivation(T* out, - const BT* bias, - const T* gated_weights, - const BT* gated_bias, - const int* ia3_tasks, - const T* ia3_weights, - const int m, - const int n, - const int int8_mode, - const float* activation_in, - const float* activation_out, - const int* padding_offset, - const int seq_len, - cudaStream_t stream) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - TM_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len); - using PT = typename packed_type::type; - constexpr int packed_elems = num_elems::value; - using PBT = typename packed_as::type; - - const int n_threads = 512; - - dim3 block, grid; - if (n / 4 / packed_elems <= n_threads) { - block.x = n / 4 / packed_elems; - grid.x = m; - } - else { - block.x = n_threads; - grid.x = ceil(1LL * m * n / double(n_threads)); - } - TM_LOG_DEBUG("%d %d", grid.x, block.x); - sync_check_cuda_error(); - generic_activation<<>>(reinterpret_cast(out), - reinterpret_cast(bias), - reinterpret_cast(gated_weights), - reinterpret_cast(gated_bias), - ia3_tasks, - reinterpret_cast(ia3_weights), - int8_mode, - activation_in, - activation_out, - padding_offset, - seq_len, - m, - n / packed_elems); - sync_check_cuda_error(); -} - -#define INSTANTIATE_GENERIC_ACTIVATION(Activation, T, BT) \ - template void invokeGenericActivation(T * out, \ - const BT* bias, \ - const T* gated_weights, \ - const BT* gated_bias, \ - const int* ia3_tasks, \ - const T* ia3_weights, \ - const int m, \ - const int n, \ - const int int8_mode, \ - const float* activation_in, \ - const float* activation_out, \ - const int* padding_offset, \ - const int seq_len, \ - cudaStream_t stream); - -INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half); -#ifdef ENABLE_FP32 -INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, float, float); -#endif -#ifdef ENABLE_BF16 -INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, __nv_bfloat16, __nv_bfloat16); -#endif - // `output` may be an alias of `inter_buf` template class Activation, typename T> __global__ void activation_kernel(T* inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims) @@ -367,16 +218,33 @@ void invokeGenericActivation_v2( <<>>(inter_buf, gate_buf, stride, token_num, dims); } -#define INSTANTIATE_ACTIVATION(Activation, T) \ - template void invokeGenericActivation_v2( \ - T * inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims, cudaStream_t stream) +template class Activation> +void invokeGenericActivation_v3(Ref inter_, const Tensor& gate, cudaStream_t stream) +{ + auto& inter = inter_.get(); + TM_CHECK_EQ(inter.ndim(), 2); + TM_CHECK_EQ(gate.ndim(), 2); + TM_CHECK_EQ(inter.stride(0), gate.stride(0)); -INSTANTIATE_ACTIVATION(SiluActivation, half); -#ifdef ENABLE_FP32 -INSTANTIATE_ACTIVATION(SiluActivation, float); -#endif -#ifdef ENABLE_BF16 -INSTANTIATE_ACTIVATION(SiluActivation, __nv_bfloat16); -#endif + TM_CHECK(inter.shape() == gate.shape()); + + auto invoke = [&](auto t) { + using T = decltype(t); + + const auto [num, dim] = inter.shapes(0, 1); + + constexpr int kVecSize = 4; + constexpr int block = 512; + + const dim3 grid(num, cdiv((int)dim, 
block * kVecSize)); + + activation_kernel + <<>>(inter.data(), gate.data(), inter.stride(0), num, dim); + }; + + TM_DISPATCH_PRIMARY_DTYPES(inter.dtype(), invoke); +} + +template void invokeGenericActivation_v3(Ref inter_, const Tensor& gate, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/kernels/activation_kernels.h b/src/turbomind/kernels/activation_kernels.h index 1197ee4806..935203cf1e 100644 --- a/src/turbomind/kernels/activation_kernels.h +++ b/src/turbomind/kernels/activation_kernels.h @@ -16,10 +16,9 @@ #pragma once -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include #include -#include + +#include "src/turbomind/core/core.h" namespace turbomind { @@ -30,85 +29,7 @@ template struct SiluActivation; template struct IdentityActivation; // clang-format on -template class Activation, typename T, typename BT> -void invokeGenericActivation(T* out, - const BT* bias, - const T* gated_weights, - const BT* gated_bias, - const int* ia3_tasks, - const T* ia3_weights, - const int m, - const int n, - const int int8_mode, - const float* activation_in, - const float* activation_out, - const int* padding_offset, - const int seq_len, - cudaStream_t stream); - -template class Activation, typename T, typename BT> -void invokeGenericActivation(T* out, - const BT* bias, - const T* gated_weights, - const BT* gated_bias, - const int* ia3_tasks, - const T* ia3_weights, - const int m, - const int n, - const int int8_mode, - const float* activation_in, - const float* activation_out, - cudaStream_t stream) -{ - invokeGenericActivation(out, - bias, - gated_weights, - gated_bias, - ia3_tasks, - ia3_weights, - m, - n, - int8_mode, - activation_in, - activation_out, - (const int*)nullptr, - 0, - stream); -} - -template class Activation, typename T> -void invokeGenericActivation_v2( - T* inter_buf, const T* __restrict__ gate_buf, int64_t stride, int token_num, int dims, cudaStream_t stream); - -template -void invokeAddBiasGeluV2(T* out, - const T* bias, - const int* ia3_tasks, - const T* ia3_weights, - const int* padding_offset, - const int seq_len, - const int m, - const int n, - cudaStream_t stream); - -template -void invokeAddBias(T* out, T const* bias, const int m, const int n, cudaStream_t stream) -{ - invokeGenericActivation( - out, bias, nullptr, nullptr, nullptr, nullptr, m, n, 0, nullptr, nullptr, stream); -} - -template -void invokeAddBiasGeluV2( - T* out, const T* bias, const int* ia3_tasks, const T* ia3_weights, const int m, const int n, cudaStream_t stream) -{ - invokeAddBiasGeluV2(out, bias, ia3_tasks, ia3_weights, nullptr, 0, m, n, stream); -} - -template -void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream); - -template -void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream); +template class Activation> +void invokeGenericActivation_v3(Ref inter_, const Tensor& gate, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/kernels/attention/CMakeLists.txt b/src/turbomind/kernels/attention/CMakeLists.txt index 32de38981a..e1c92cf83c 100644 --- a/src/turbomind/kernels/attention/CMakeLists.txt +++ b/src/turbomind/kernels/attention/CMakeLists.txt @@ -63,7 +63,6 @@ if (BUILD_TEST) Llama unfused_attention_kernels logger - tensor cublas) add_executable(test_quant test_quant.cu test_utils.cu) diff --git a/src/turbomind/kernels/attention/attention.cu b/src/turbomind/kernels/attention/attention.cu index e7642584c2..8dcd409474 100644 --- a/src/turbomind/kernels/attention/attention.cu +++ 
b/src/turbomind/kernels/attention/attention.cu @@ -4,6 +4,7 @@ #include "attention_config.h" #include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/decoding.cu b/src/turbomind/kernels/attention/decoding.cu index 67bd81e45b..d7b0821b5d 100644 --- a/src/turbomind/kernels/attention/decoding.cu +++ b/src/turbomind/kernels/attention/decoding.cu @@ -1,11 +1,13 @@ // Copyright (c) OpenMMLab. All rights reserved. +#include +#include + #include "decoding.h" #include "decoding_config.h" #include "src/turbomind/kernels/attention/arch.h" #include "src/turbomind/models/llama/llama_utils.h" -#include -#include +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu index f4b7fd4296..adb697e8c4 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.cu +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.cu @@ -1,5 +1,7 @@ // Copyright (c) OpenMMLab. All rights reserved. +#include + #include "src/turbomind/kernels/attention/block.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" #include "src/turbomind/kernels/attention/quantization.h" @@ -7,7 +9,7 @@ #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/thread_map.h" #include "src/turbomind/models/llama/llama_utils.h" -#include +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/kv_cache_utils_v2.h b/src/turbomind/kernels/attention/kv_cache_utils_v2.h index 8a34f58759..01525f5596 100644 --- a/src/turbomind/kernels/attention/kv_cache_utils_v2.h +++ b/src/turbomind/kernels/attention/kv_cache_utils_v2.h @@ -2,8 +2,8 @@ #pragma once +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/attention/attention_params.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { diff --git a/src/turbomind/kernels/attention/quantization.h b/src/turbomind/kernels/attention/quantization.h index 02f49d0089..8f8dd4a92f 100644 --- a/src/turbomind/kernels/attention/quantization.h +++ b/src/turbomind/kernels/attention/quantization.h @@ -694,6 +694,7 @@ struct ConvertKvCache { } }; +#if 0 inline __device__ Array cvt_bf16x4_e4m3(const Array& v) { #if TURBOMIND_ARCH_SM80 @@ -743,6 +744,7 @@ struct ConvertKvCache { } } }; +#endif template inline __device__ void StoreQuantParam(T* dst, Array src) diff --git a/src/turbomind/kernels/attention/reference.h b/src/turbomind/kernels/attention/reference.h index 9958ddd3ad..7c55c6d9df 100644 --- a/src/turbomind/kernels/attention/reference.h +++ b/src/turbomind/kernels/attention/reference.h @@ -2,12 +2,14 @@ #pragma once -#include "src/turbomind/kernels/flash_attention/flash_attention.h" -#include "src/turbomind/kernels/unfused_attention_kernels.h" -#include "src/turbomind/utils/cublasMMWrapper.h" +#include #include + #include +#include "src/turbomind/kernels/flash_attention/flash_attention.h" +#include "src/turbomind/kernels/unfused_attention_kernels.h" + namespace turbomind { template diff --git a/src/turbomind/kernels/ban_bad_words.cu b/src/turbomind/kernels/ban_bad_words.cu index 376432116f..3cc133c688 100644 --- a/src/turbomind/kernels/ban_bad_words.cu +++ b/src/turbomind/kernels/ban_bad_words.cu @@ -15,11 +15,40 @@ */ #include "src/turbomind/kernels/ban_bad_words.h" -#include 
"src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/utils/cuda_utils.h" +#include +// #include "src/turbomind/kernels/reduce_kernel_utils.cuh" +// #include "src/turbomind/utils/cuda_utils.h" +#include +#include namespace turbomind { +template +__device__ inline T getMaxValue(); + +template<> +__device__ inline float getMaxValue() +{ + return FLT_MAX; +} + +template<> +__device__ inline half getMaxValue() +{ + return __ushort_as_half((unsigned short)0x7BFFU); +} + +#ifdef ENABLE_BF16 +template<> +__device__ inline __nv_bfloat16 getMaxValue<__nv_bfloat16>() +{ +#if __CUDA_ARCH__ >= 800 + return __ushort_as_bfloat16((unsigned short)0x7F7FU); +#endif + return {}; +} +#endif + template __global__ void ban_bad_words(T* logits, const int* output_ids_buf, @@ -117,7 +146,6 @@ void invokeBanBadWords(T* logits, id_offset, vocab_size_padded, step); - sync_check_cuda_error(); } #define INSTANTIATE_INVOKE_BAN_BAD_WORDS(T) \ diff --git a/src/turbomind/kernels/ban_bad_words.h b/src/turbomind/kernels/ban_bad_words.h index 05bdc00849..af2c21158a 100644 --- a/src/turbomind/kernels/ban_bad_words.h +++ b/src/turbomind/kernels/ban_bad_words.h @@ -16,7 +16,6 @@ #pragma once -#include #include namespace turbomind { diff --git a/src/turbomind/kernels/core/data_type.h b/src/turbomind/kernels/core/data_type.h index f57d1a2714..0c438bade0 100644 --- a/src/turbomind/kernels/core/data_type.h +++ b/src/turbomind/kernels/core/data_type.h @@ -2,67 +2,16 @@ #pragma once -#include -#include - #include #if ENABLE_BF16 #include #endif -namespace turbomind { - -struct uint1_t { -}; -struct uint2_t { -}; -struct uint3_t { -}; -struct uint4_t { -}; -struct uint5_t { -}; -struct uint6_t { -}; - -template -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; // 2 + 1 - -template<> -struct bitsof_t: std::integral_constant { -}; - -template<> -struct bitsof_t: std::integral_constant { -}; // 4 + 1 - -template<> -struct bitsof_t: std::integral_constant { -}; // 4 + 2 +#include -template -inline constexpr bitsof_t bitsof{}; +#include "src/turbomind/core/data_type.h" -struct fp8 { - char v; -}; -struct fp8_e4m3: fp8 { -}; -struct fp8_e5m2: fp8 { -}; +namespace turbomind { namespace detail { diff --git a/src/turbomind/kernels/gemm/CMakeLists.txt b/src/turbomind/kernels/gemm/CMakeLists.txt index 4e398e9e25..a9ff849e9e 100644 --- a/src/turbomind/kernels/gemm/CMakeLists.txt +++ b/src/turbomind/kernels/gemm/CMakeLists.txt @@ -47,10 +47,10 @@ if (BUILD_TEST) # test/test_utils.cu test/quantization.cu test/reference.cu) - target_link_libraries(gemm_test PRIVATE gemm2 cublas) + target_link_libraries(gemm_test PRIVATE gemm2 core cublas) add_executable(test_moe_utils test/test_moe_utils.cu test/test_utils.cu) - target_link_libraries(test_moe_utils PRIVATE gemm2 cublas) + target_link_libraries(test_moe_utils PRIVATE gemm2 core cublas) if (NOT MSVC) FetchContent_Declare( @@ -60,6 +60,7 @@ if (BUILD_TEST) ) set(NVBench_ENABLE_EXAMPLES OFF) + set(NVBench_ENABLE_TESTING OFF) set(BUILD_SHARED_LIBS OFF) FetchContent_MakeAvailable(repo-nvbench) @@ -69,6 +70,6 @@ if (BUILD_TEST) # test/test_utils.cu test/quantization.cu test/reference.cu) - target_link_libraries(gemm_bench PRIVATE gemm2 nvbench::nvbench cublas) + target_link_libraries(gemm_bench PRIVATE gemm2 core nvbench::nvbench cublas) endif () endif () diff --git a/src/turbomind/kernels/gemm/context.cu 
b/src/turbomind/kernels/gemm/context.cu index 1b1ea1a2c3..4aca585673 100644 --- a/src/turbomind/kernels/gemm/context.cu +++ b/src/turbomind/kernels/gemm/context.cu @@ -188,10 +188,10 @@ std::vector StaticGemmContext::Populate(const Kernel& kernel, const const int64_t mma_cost = wave_mma_cost * waves; // IO has less severe quantization effect - const int64_t mio_cost_a = get_size(desc.type_a, tiled_shape_n * m * split_ceil_k) * splits; - const int64_t mio_cost_b = get_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * splits; + const int64_t mio_cost_a = byte_size(desc.type_a, tiled_shape_n * m * split_ceil_k) * splits; + const int64_t mio_cost_b = byte_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * splits; /// TODO: read type from `desc_.accum` when added - const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)m * n) * (splits - 1) * 2; + const int64_t mio_cost_c = byte_size(desc.type_c, (int64_t)m * n) * (splits - 1) * 2; const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; // std::cout << name() << " " << splits << " " << waves << " " << (float)mio_cost << " " << (float)mma_cost @@ -435,10 +435,10 @@ std::vector MoeGemmContext::Populate(const Kernel& kernel, const Pop const int64_t mma_cost = wave_mma_cost * waves; // IO has less severe quantization effect - const int64_t mio_cost_a = get_size(desc.type_a, tiled_shape_n * batch_size * split_ceil_k) * num * splits; - const int64_t mio_cost_b = get_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * num * splits; + const int64_t mio_cost_a = byte_size(desc.type_a, tiled_shape_n * batch_size * split_ceil_k) * num * splits; + const int64_t mio_cost_b = byte_size(desc.type_b, tiled_shape_m * n * split_ceil_k) * num * splits; /// TODO: read type from `desc_.accum` when added - const int64_t mio_cost_c = get_size(DataType::F32, (int64_t)batch_size * n) * num * (splits - 1) * 2; + const int64_t mio_cost_c = byte_size(desc.type_c, (int64_t)batch_size * n) * num * (splits - 1) * 2; const int64_t mio_cost = mio_cost_a + mio_cost_b + mio_cost_c; LaunchSpec spec{}; diff --git a/src/turbomind/kernels/gemm/convert_v2.cu b/src/turbomind/kernels/gemm/convert_v2.cu index e58bfc9b95..a718c50410 100644 --- a/src/turbomind/kernels/gemm/convert_v2.cu +++ b/src/turbomind/kernels/gemm/convert_v2.cu @@ -157,12 +157,12 @@ int Convert(const void* S, // auto dispatch_3 = [&](auto mma, auto operand, auto order) -> bool { if constexpr (is_AB(operand)) { switch (Ddesc.type) { - case DataType::F16: - case DataType::BF16: + case kFloat16: + case kBfloat16: return dispatch_4(mma, operand, order, type_c, type_c); - case DataType::U8: + case kUint8: return dispatch_4(mma, operand, order, type_c, type_c); - case DataType::U4: + case kUint4: return dispatch_4(mma, operand, order, type_c, type_c); default: return false; @@ -170,7 +170,7 @@ int Convert(const void* S, // } else { // UV: U16, U32 switch (Ddesc.type) { - case DataType::U32: + case kUint32: return dispatch_4(mma, operand, order, type_c, type_c); default: return false; @@ -228,11 +228,11 @@ std::tuple get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool force_simt) { if (is_fused_moe) { - if (dtype == DataType::BF16 && sm >= 80) { + if (dtype == kBfloat16 && sm >= 80) { return {kColMajor, HMMA_16816 | OPERAND_B | 1, {}, {}}; } - if (dtype == DataType::F16) { + if (dtype == kFloat16) { if (sm >= 80) { return {kColMajor, HMMA_16816 | OPERAND_B | 1, {}, {}}; } @@ -243,7 +243,7 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for return 
{kColMajor, HMMA_884 | OPERAND_B | 1, {}, {}}; } } - else if (dtype == DataType::U4) { + else if (dtype == kUint4) { if (sm >= 80) { return {kColMajor, HMMA_16816 | OPERAND_B | 2, kRowMajor, HMMA_16816 | OPERAND_V | 1}; } @@ -256,7 +256,7 @@ get_weight_and_scales_layout(DataType dtype, bool is_fused_moe, int sm, bool for } } else { - if (dtype == DataType::U4) { + if (dtype == kUint4) { if (force_simt) { return {kColMajor, HMMA_SIMT | OPERAND_B | 1, kRowMajor, HMMA_SIMT | OPERAND_V | 1}; } diff --git a/src/turbomind/kernels/gemm/kernel_impl.h b/src/turbomind/kernels/gemm/kernel_impl.h index 3980e1d222..760f29fc55 100644 --- a/src/turbomind/kernels/gemm/kernel_impl.h +++ b/src/turbomind/kernels/gemm/kernel_impl.h @@ -39,9 +39,9 @@ class KernelImpl: public Kernel { desc_.order_b = transpose(OpB::kOrder); desc_.order_c = Gemm::kOrderC; - desc_.type_a = get_data_type_v; - desc_.type_b = get_data_type_v; - desc_.type_c = get_data_type_v; + desc_.type_a = data_type_v; + desc_.type_b = data_type_v; + desc_.type_c = data_type_v; using IterA = typename OpA::GmemIter; using IterB = typename OpB::GmemIter; @@ -127,9 +127,9 @@ class KernelImpl: public Kernel { MatrixLayout Adesc = _Adesc; - const int m = Ddesc.rows; - const int n = Ddesc.cols; - const int k = Adesc.cols; + [[maybe_unused]] const int m = Ddesc.rows; + [[maybe_unused]] const int n = Ddesc.cols; + [[maybe_unused]] const int k = Adesc.cols; auto transpose = [](MatrixLayout x) { std::swap(x.rows, x.cols); diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.cu b/src/turbomind/kernels/gemm/moe_utils_v2.cu index 3309933dbf..4d3f87b3f6 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.cu +++ b/src/turbomind/kernels/gemm/moe_utils_v2.cu @@ -11,6 +11,7 @@ #include #include +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/math.h" @@ -690,20 +691,21 @@ __global__ void MoeGatherKernel(T* dst, // [e*n, d] } } -template -void invokeMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) +void invokeMoeDispatch(Ref out_, const Tensor& src, const int* f2n, int expert_per_token, cudaStream_t st) { + using T = uint16_t; + TM_CHECK_EQ(byte_size(src.dtype()), byte_size()); + auto& out = out_.get(); + auto [num, dim] = src.shapes(0, 1); constexpr int threads = 256; constexpr int vec_size = 16 / sizeof(T); - MoeGatherKernel<<>>( // - dst, - src, + MoeGatherKernel<<>>( // + (T*)out.raw_data(), + (const T*)src.raw_data(), f2n, - dims / vec_size); + dim / vec_size); } -template void invokeMoeGather(uint16_t*, const uint16_t*, const int*, int, int, int, cudaStream_t); - template __global__ void MoeReduceKernel(T* dst, // [ n, d] const T* src, // [e*n, d] @@ -819,12 +821,36 @@ void invokeMoeReduce(T* dst, } } -template void -invokeMoeReduce(half*, const half*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); -#ifdef ENABLE_BF16 -template void invokeMoeReduce( - nv_bfloat16*, const nv_bfloat16*, const float*, const int*, const float*, int, int, int, float, cudaStream_t); -#endif +void invokeMoeCombine(Ref out_, + const Tensor& src, + const float* scales, + const int* en2f, + const float* dst_scales, + int experts_per_token, + float dst_scale, + cudaStream_t st) +{ + auto& out = out_.get(); + + const int tokens = out.shape(0); + TM_CHECK_EQ(src.shape(0), tokens * experts_per_token); + + auto invoke = [&](auto t) { + using T = decltype(t); + return 
invokeMoeReduce(out.data(), + src.data(), + scales, + en2f, + dst_scales, + tokens, + experts_per_token, + src.shape(1), + dst_scale, + st); + }; + + TM_DISPATCH_PRIMARY_DTYPES(src.dtype(), invoke); +} std::vector SampleUniform(int token_num, int expert_num, int exp_per_tok, std::mt19937& g) { diff --git a/src/turbomind/kernels/gemm/moe_utils_v2.h b/src/turbomind/kernels/gemm/moe_utils_v2.h index 4a603a07b3..618d097d11 100644 --- a/src/turbomind/kernels/gemm/moe_utils_v2.h +++ b/src/turbomind/kernels/gemm/moe_utils_v2.h @@ -5,6 +5,8 @@ #include #include +#include "src/turbomind/core/core.h" + namespace turbomind { constexpr int kMoeGateMaxTiles = 16; @@ -26,38 +28,20 @@ void invokeMoeGate_V2(int* f2n, float routed_scale, cudaStream_t st); -template -void invokeMoeGather( - T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st); - -template -inline void -dispatchMoeGather(T* dst, const T* src, const int* f2n, int tokens, int experts_per_token, int dims, cudaStream_t st) -{ - const auto invoke = [&](auto type) { - using V = decltype(type); - invokeMoeGather((V*)dst, (const V*)src, f2n, tokens, experts_per_token, dims, st); - }; - - if constexpr (sizeof(T) == 2) { - invoke(uint16_t{}); - } - else { /// TODO: dispatch for more types - static_assert(sizeof(T) != sizeof(T), "Not implemented"); - } -} - -template -void invokeMoeReduce(T* dst, - const T* src, - const float* scales, - const int* en2f, - const float* dst_scales, - int tokens, - int experts_per_token, - int dims, - float dst_scale, - cudaStream_t st); +void invokeMoeDispatch(Ref out_, // + const Tensor& src, + const int* f2n, + int expert_per_token, + cudaStream_t st); + +void invokeMoeCombine(Ref out_, + const Tensor& src, + const float* scales, + const int* en2f, + const float* dst_scales, + int experts_per_token, + float dst_scale, + cudaStream_t st); void invokeMoeSoftmaxMaskTopKGroups( float* logits, int token_num, int expert_num, int group_size, int top_k, cudaStream_t st); diff --git a/src/turbomind/kernels/gemm/test/reference.cu b/src/turbomind/kernels/gemm/test/reference.cu index d1f7f34f64..ab7a1951bd 100644 --- a/src/turbomind/kernels/gemm/test/reference.cu +++ b/src/turbomind/kernels/gemm/test/reference.cu @@ -25,9 +25,9 @@ MatrixLayout transpose(MatrixLayout x) cudaDataType to_cuda_dtype(DataType dtype) { switch (dtype) { - case DataType::F16: + case DataType::kFloat16: return CUDA_R_16F; - case DataType::BF16: + case DataType::kBfloat16: return CUDA_R_16BF; default: CHECK("unsupported data type" && 0); diff --git a/src/turbomind/kernels/gemm/test/testbed.h b/src/turbomind/kernels/gemm/test/testbed.h index 4747644f9a..c296ae95c1 100644 --- a/src/turbomind/kernels/gemm/test/testbed.h +++ b/src/turbomind/kernels/gemm/test/testbed.h @@ -2,6 +2,20 @@ #pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "src/turbomind/core/core.h" + #include "src/turbomind/kernels/core/array.h" #include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/core/math.h" @@ -16,16 +30,6 @@ #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/kernels/gemm/utils.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include namespace turbomind::gemm { @@ -110,9 +114,9 @@ class Testbed { b_.resize(n * k * E); c_.resize(m * n); - a_desc_ = MatrixLayout{get_data_type_v, order_a, m, k, mk2cs(m, k).x, 0}; 
- b_desc_ = MatrixLayout{get_data_type_v, order_b, k, n, _kn2cs(k, n).x, 0}; - c_desc_ = MatrixLayout{get_data_type_v, order_c, m, n, mk2cs(m, n).x, 0}; + a_desc_ = MatrixLayout{data_type_v, order_a, m, k, mk2cs(m, k).x, 0}; + b_desc_ = MatrixLayout{data_type_v, order_b, k, n, _kn2cs(k, n).x, 0}; + c_desc_ = MatrixLayout{data_type_v, order_c, m, n, mk2cs(m, n).x, 0}; c_f_.resize(c_.size()); c_ref_.resize(c_.size()); @@ -151,7 +155,7 @@ class Testbed { if constexpr (is_quant_a) { static_assert(pack_a && pack_u); Quantize(a_, m, k, order_a, g, a_f_, a_q_, u_, stream); - u_pack_desc_ = u_desc_ = {DataType::U32, kColMajor, m, ceil_div(k, g), m}; + u_pack_desc_ = u_desc_ = {kUint32, kColMajor, m, ceil_div(k, g), m}; u_pack_desc_.pack = pack_u; u_pack_.resize(u_.size()); CHECK(!Convert(u_.data().get(), u_desc_, u_pack_.data().get(), u_pack_desc_, stream_)); @@ -172,7 +176,7 @@ class Testbed { Quantize(b_, n * E, k, _order_b, g, b_f_, b_q_, v_, stream); quant_b_ = {QuantType::kDefault, g}; - v_pack_desc_ = v_desc_ = {DataType::U32, kRowMajor, ceil_div(k, g), n, int(n * E)}; + v_pack_desc_ = v_desc_ = {kUint32, kRowMajor, ceil_div(k, g), n, int(n * E)}; v_pack_desc_.pack = pack_v; v_pack_.resize(v_.size()); auto v_src_data = (uint32_t*)v_.data().get(); @@ -194,7 +198,7 @@ class Testbed { } if constexpr (pack_a) { - a_pack_desc_.type = get_data_type_v; + a_pack_desc_.type = data_type_v; a_pack_desc_.pack = pack_a; const auto a_data = is_quant_a ? (void*)a_q_.data().get() : (void*)a_.data().get(); CHECK(!Convert(a_data, a_desc_, a_pack_.data().get(), a_pack_desc_, stream_)); @@ -206,7 +210,7 @@ class Testbed { if constexpr (pack_b) { // CHECK(experts == 0); - b_pack_desc_.type = get_data_type_v; + b_pack_desc_.type = data_type_v; b_pack_desc_.pack = pack_b; // clang-format off auto b_src_data = [&] { @@ -367,8 +371,11 @@ class Testbed { c_e_ref_.resize(c_e_.size()); for (int i = 0; i < 10; ++i) { - dispatchMoeGather( - a_e_.data().get(), a_f_.data().get(), moe_f2n_.data().get(), batch_size_, top_e, input_dims_, stream_); + invokeMoeDispatch(Tensor{a_e_.data().get(), {top_e * batch_size_, input_dims_}, kDEVICE}, + Tensor{a_f_.data().get(), {batch_size_, input_dims_}, kDEVICE}, + moe_f2n_.data().get(), + top_e, + stream_); } a_pack_desc_.num = b_pack_desc_.num = c_desc_.num = experts_; @@ -510,27 +517,23 @@ class Testbed { Compare(c_.data().get(), c_ref_.data().get(), dims, dims, bsz, 0); } else { - invokeMoeReduce(c_.data().get(), - c_e_.data().get(), - moe_scales_.data().get(), - moe_en2f_.data().get(), - nullptr, - batch_size_, - expert_ids_.size() / batch_size_, - output_dims_, - 0.f, - stream_); - - invokeMoeReduce(c_ref_.data().get(), - c_e_ref_.data().get(), - moe_scales_.data().get(), - moe_en2f_.data().get(), - nullptr, - batch_size_, - expert_ids_.size() / batch_size_, - output_dims_, - 0.f, - stream_); + invokeMoeCombine(Tensor{c_.data().get(), {batch_size_, output_dims_}, kDEVICE}, + Tensor{c_e_.data().get(), {(int)expert_ids_.size(), output_dims_}, kDEVICE}, + moe_scales_.data().get(), + moe_en2f_.data().get(), + nullptr, + expert_ids_.size() / batch_size_, + 0.f, + stream_); + + invokeMoeCombine(Tensor{c_ref_.data().get(), {batch_size_, output_dims_}, kDEVICE}, + Tensor{c_e_ref_.data().get(), {(int)expert_ids_.size(), output_dims_}, kDEVICE}, + moe_scales_.data().get(), + moe_en2f_.data().get(), + nullptr, + expert_ids_.size() / batch_size_, + 0.f, + stream_); cudaDeviceSynchronize(); @@ -586,13 +589,14 @@ class Testbed { int64_t get_global_memory_reads() { if (experts_ == 0) { - return 
get_size(a_pack_desc_) + get_size(b_pack_desc_) + get_size(u_pack_desc_) + get_size(v_pack_desc_); + return byte_size(a_pack_desc_) + byte_size(b_pack_desc_) + byte_size(u_pack_desc_) + + byte_size(v_pack_desc_); } else { - size_t size = get_size(a_pack_desc_) + get_size(u_pack_desc_); + size_t size = byte_size(a_pack_desc_) + byte_size(u_pack_desc_); const int nnz = std::accumulate(moe_cnt_.begin(), moe_cnt_.end(), 0, [](auto a, auto x) { return a + (x > 0); }); - size += nnz * (get_size(b_pack_desc_) + get_size(v_pack_desc_)); + size += nnz * (byte_size(b_pack_desc_) + byte_size(v_pack_desc_)); return size; } } @@ -600,13 +604,13 @@ class Testbed { int64_t get_ref_global_memory_reads() { if (experts_ == 0) { - return get_size(a_desc_) + get_size(b_desc_); + return byte_size(a_desc_) + byte_size(b_desc_); } else { - size_t size = get_size(a_desc_); + size_t size = byte_size(a_desc_); const int nnz = std::accumulate(moe_cnt_.begin(), moe_cnt_.end(), 0, [](auto a, auto x) { return a + (x > 0); }); - size += nnz * get_size(b_desc_); + size += nnz * byte_size(b_desc_); return size; } } diff --git a/src/turbomind/kernels/gemm/types.h b/src/turbomind/kernels/gemm/types.h index 94a31e9452..00c4c87efd 100644 --- a/src/turbomind/kernels/gemm/types.h +++ b/src/turbomind/kernels/gemm/types.h @@ -98,126 +98,6 @@ enum class Epilogue : int kGatedSilu = 0x2, }; -enum class DataType : int -{ - U4, - U8, - U16, - U32, - U64, - F8_E4M3, - F8_E5M2, - F16, - F32, - BF16, - TF32, -}; - -inline const char* to_string(DataType data_type) -{ - switch (data_type) { - case DataType::U4: - return "u4"; - case DataType::U8: - return "u8"; - case DataType::F16: - return "f16"; - case DataType::F32: - return "f32"; - case DataType::BF16: - return "bf16"; - case DataType::TF32: - return "tf32"; - default: - return "unknown"; - } -} - -inline int64_t get_size(DataType type, int64_t size) -{ - if (!size) { - return 0; - } - switch (type) { - case DataType::U64: - return size * 8; - case DataType::F32: - case DataType::U32: - return size * 4; - case DataType::BF16: - case DataType::F16: - case DataType::U16: - return size * 2; - case DataType::U8: - case DataType::F8_E4M3: - case DataType::F8_E5M2: - return size; - case DataType::U4: - return size / 2; - default: - // std::cerr << to_string(type) << "\n"; - return -1; - } -} - -template -struct get_data_type { -}; - -template<> -struct get_data_type { - static constexpr auto value = DataType::F16; -}; - -#if ENABLE_BF16 -template<> -struct get_data_type { - static constexpr auto value = DataType::BF16; -}; -#endif - -template<> -struct get_data_type { - static constexpr auto value = DataType::U4; -}; - -template<> -struct get_data_type { - static constexpr auto value = DataType::U8; -}; - -template -inline constexpr auto get_data_type_v = get_data_type::value; - -template -struct get_dtype { -}; - -template<> -struct get_dtype { - using type = half; -}; - -template<> -struct get_dtype { - using type = uint4_t; -}; - -template<> -struct get_dtype { - using type = uint8_t; -}; - -template<> -struct get_dtype { - using type = uint16_t; -}; - -template<> -struct get_dtype { - using type = uint32_t; -}; - struct QuantDesc { QuantType type; int group_size; @@ -273,9 +153,9 @@ struct MatrixLayout { int* idxs; }; -inline int64_t get_size(const MatrixLayout& m) +inline int64_t byte_size(const MatrixLayout& m) { - return get_size(m.type, (int64_t)m.rows * m.cols); + return byte_size(m.type, (int64_t)m.rows * m.cols); } inline Striding get_mode(const MatrixLayout& m) diff --git 
a/src/turbomind/kernels/gpt_kernels.cu b/src/turbomind/kernels/gpt_kernels.cu index 1d22d21b15..ed465bf078 100644 --- a/src/turbomind/kernels/gpt_kernels.cu +++ b/src/turbomind/kernels/gpt_kernels.cu @@ -14,194 +14,66 @@ * limitations under the License. */ -#include "src/turbomind/utils/cuda_fp8_utils.h" -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#elif (CUDART_VERSION >= 11000) #include -#else -#include "3rdparty/cub/cub.cuh" -#endif + +#include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/utils/memory_utils.h" namespace turbomind { -// PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts -template -__global__ void start_id_embedding_position_lookups_kernel(T* from_tensor, - int* output_ids, - const T* embedding_table, - const T* pos_table, - pPromptTuningParam prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int64_t hidden_units) +template +__global__ void +embeddingLookupKernel(T* dst, int dst_stride, const T* src, int src_stride, const int* ids, int num, int dim) { - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * length * hidden_units; - index += blockDim.x * gridDim.x) { - // transpose the input_ids [batch, length] (part of [batch, max_length]) to output_ids [length, batch] - if (OUTPUT_ID && index < batch_size * max_length) { - // for p/prompt_tuning (have prompt templates like [input1, prompt1, input2, prompt2]) - // we have to process it to like [input1, input2, prompt1, prompt2], and then remove the prompts during post - // processing - if (PROMPT_SRC > 0) { - if (index < batch_size) { - int no_prompt_output_seq_id = 0; -#pragma unroll 1 - for (int seq_id = 0; seq_id < max_length; seq_id++) { - int current_input_id = input_ids[index * max_length + seq_id]; - if (current_input_id < prompt_param.p_prompt_tuning_id_start) { - output_ids[no_prompt_output_seq_id * batch_size + index] = current_input_id; - no_prompt_output_seq_id++; - } - } - } - } - else { - const int seq_id = index % max_length; - const int batch_id = index / max_length; - if (seq_id < length) { - output_ids[seq_id * batch_size + batch_id] = input_ids[index]; - } - } - } + const int ti = blockIdx.x; - // embedding lookup from word ids [batch, length] (part of [batch, max_length]) and [vocab, hidden] to generate - // embedding [batch, length, hidden] - const int word_index = index / hidden_units; - const int word_index_row = word_index / length; // batch_id - const int word_index_col = word_index % length; - const int real_word_index = word_index_row * max_length + word_index_col; - const int step = start_step + word_index % length; - const int col_index = index % hidden_units; - const int input_id = input_ids == nullptr ? 
real_word_index : input_ids[real_word_index]; - const int prompt_id = input_id - prompt_param.p_prompt_tuning_id_start; - T embedding = (T)0.0f; - if (PROMPT_SRC > 0 && prompt_id >= 0) { - if (PROMPT_SRC == 1) { - // from loaded prompt embedding tables - embedding = - prompt_param.p_prompt_tuning_batch_weights[word_index_row][prompt_id * hidden_units + col_index]; - } - else { - // from request prompt embedding - embedding = - prompt_param - .request_prompt_embedding[word_index_row * prompt_param.request_prompt_max_length * hidden_units - + prompt_id * hidden_units + col_index]; - } - } - else { - embedding = embedding_table[input_id * hidden_units + col_index]; - } - T pos_embed = pos_table == nullptr ? (T)0.f : pos_table[(step - 1) * hidden_units + col_index]; - from_tensor[index] = embedding + pos_embed; + const int64_t idx = ids[ti]; + + src += idx * src_stride; + dst += ti * dst_stride; + + for (int di = threadIdx.x * vec_size; di < dim; di += blockDim.x * vec_size) { + Array vec; + Ldg(vec, &src[di]); + Store(&dst[di], vec); } } -#define WORD_POS_EMBEDDING_LOOPUP_KERNEL(OUTPUT_ID, PROMPT_SRC) \ - start_id_embedding_position_lookups_kernel<<>>(from_tensor, \ - output_ids, \ - embedding_table, \ - pos_table, \ - prompt_param, \ - input_ids, \ - start_step, \ - length, \ - max_length, \ - batch_size, \ - hidden_units); - -template -void invokeInputIdsEmbeddingLookupPosEncoding(T* from_tensor, - int* output_ids, - const T* embedding_table, // can also be inputs_embeds - const T* pos_table, - pPromptTuningParam prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream) +void invokeEmbeddingLookup(Ref out_, + const Buffer_& token_ids, + const Tensor& embedding_table, + cudaStream_t st) { - dim3 grid(min(batch_size * length, 65536)); - dim3 block(min(hidden_units, 512)); - const bool has_output_ids = output_ids != nullptr; - FT_CHECK(!(has_output_ids && input_ids == nullptr)); - - if (has_output_ids) { - if (prompt_param.use_request_p_prompt_embedding) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(true, 2); - } - else if (prompt_param.p_prompt_tuning_batch_weights != nullptr) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(true, 1); - } - else { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(true, 0); - } - } - else { - if (prompt_param.use_request_p_prompt_embedding) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(false, 2); - } - else if (prompt_param.p_prompt_tuning_batch_weights != nullptr) { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(false, 1); - } - else { - WORD_POS_EMBEDDING_LOOPUP_KERNEL(false, 0); - } + auto& out = out_.get(); + + TM_CHECK_EQ(out.shape(0), token_ids.size()); + TM_CHECK_EQ(out.shape(1), embedding_table.shape(1)); + + int num, dim; + std::tie(num, dim) = out.shapes(0, 1); + + auto invoke = [&](auto t) { + using T = decltype(t); + constexpr int vec_size = sizeof(uint4) / sizeof(T); + TM_CHECK(dim % vec_size == 0) << dim << " " << vec_size; + const int threads = std::min(dim / vec_size, 1024); + const int blocks = num; + embeddingLookupKernel<<>>((T*)out.raw_data(), + out.stride(0), + (const T*)embedding_table.raw_data(), + embedding_table.stride(0), + token_ids.data(), + num, + dim); + }; + + if (byte_size(out.dtype()) == byte_size()) { + return invoke(uint16_t{}); } + TM_CHECK(0) << "not implemented"; } -#ifdef ENABLE_FP32 -template void invokeInputIdsEmbeddingLookupPosEncoding(float* from_tensor, - int* output_ids, - const float* embedding_table, - const float* pos_table, - pPromptTuningParam 
prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream); -#endif - -template void invokeInputIdsEmbeddingLookupPosEncoding(half* from_tensor, - int* output_ids, - const half* embedding_table, - const half* pos_table, - pPromptTuningParam prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream); - -#ifdef ENABLE_BF16 -template void invokeInputIdsEmbeddingLookupPosEncoding(__nv_bfloat16* from_tensor, - int* output_ids, - const __nv_bfloat16* embedding_table, - const __nv_bfloat16* pos_table, - pPromptTuningParam<__nv_bfloat16> prompt_param, - const int* input_ids, - const int start_step, - const int length, - const int max_length, - const int batch_size, - const int hidden_units, - cudaStream_t stream); -#endif - // TODO Add half2 implementation template __global__ void transposeAxis01(T* out, T* in, const int dim0, const int dim1, const int dim2) diff --git a/src/turbomind/kernels/gpt_kernels.h b/src/turbomind/kernels/gpt_kernels.h index a351473332..f2ce314ba0 100644 --- a/src/turbomind/kernels/gpt_kernels.h +++ b/src/turbomind/kernels/gpt_kernels.h @@ -20,7 +20,7 @@ #include #include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/core.h" #include "src/turbomind/utils/memory_utils.h" namespace turbomind { @@ -130,20 +130,6 @@ void invokeFindContextDups(int* shared_contexts, const size_t input_seq_len, cudaStream_t stream = 0); -template -void handleOptArg(TensorMap* input_tensors, const std::string& arg_name, T* d_ptr, T default_value, size_t size) -{ - if (input_tensors->isExist(arg_name)) { - FT_CHECK(input_tensors->at(arg_name).size() == size); - cudaH2Dcpy(d_ptr, input_tensors->at(arg_name).getPtr(), size); - } - else { - deviceFill(d_ptr, size, default_value); - } -} - -void setSeqLimitLen(uint32_t* seq_len_d, Tensor seq_len, int limit_len_offset, int batch_size); - template void invokeCompactInputs(T* compact_input, T* compact_attention_mask, @@ -253,4 +239,9 @@ void invokeTranspose2D(T* dst, const T* src, int rows, int cols, cudaStream_t st } } +void invokeEmbeddingLookup(Ref out_, + const Buffer_& token_ids, + const Tensor& embedding_table, + cudaStream_t st); + } // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.cu b/src/turbomind/kernels/norm/rms_norm.cu index 428725b62d..ee826c4105 100644 --- a/src/turbomind/kernels/norm/rms_norm.cu +++ b/src/turbomind/kernels/norm/rms_norm.cu @@ -4,26 +4,28 @@ #include "cub/block/block_reduce.cuh" +#include "src/turbomind/core/data_type.h" #include "src/turbomind/kernels/core/array_ops.h" #include "src/turbomind/kernels/core/common.h" #include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/core/meta.h" #include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { +namespace kernel { + template -__global__ void RMSNormKernel(T* dst, - int dst_ld, - const T* src, - int src_ld, - const T* __restrict__ weights, - int dims, - int num, - float eps, - float inv_dims) +__global__ void RMSNorm(T* dst, + int dst_ld, + const T* src, + int src_ld, + const T* __restrict__ weights, + int dims, + int num, + float eps, + float inv_dims) { const int ti = blockIdx.x; const int di = threadIdx.x * vec_size; @@ -80,60 +82,54 @@ __global__ void RMSNormKernel(T* dst, } } -template -void invokeRMSNorm( - T* dst, 
int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st) +} // namespace kernel + +void invokeRMSNorm(Tensor& out, const Tensor& x, const Tensor& w, float eps, cudaStream_t st) { - if (num == 0) { + TM_CHECK(x.ndim() == 2); + TM_CHECK(out.shape() == x.shape()); + TM_CHECK(out.dtype() == x.dtype()); + TM_CHECK(w.dtype() == x.dtype() && w.shape(-1) == x.shape(-1)); + + if (x.size() == 0) { return; } - constexpr int vec_size = 16 / sizeof(T); + auto invoke = [&](auto t) { + using T = decltype(t); + + const auto [num, dim] = x.shapes(0, 1); + + constexpr int vec_size = 16 / sizeof(T); + + constexpr int threads = 512; + const int blocks = num; - constexpr int threads = 512; - const int blocks = num; - - RMSNormKernel<<>>(dst, // - dst_ld, - src, - src_ld, - weights, - dims, - num, - eps, - 1.f / dims); + kernel::RMSNorm<<>>((T*)out.raw_data(), // + out.stride(0), + (const T*)x.raw_data(), + x.stride(0), + (const T*)w.raw_data(), + dim, + num, + eps, + 1.f / dim); + }; + + TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke); } -template void invokeRMSNorm(half* dst, - int dst_ld, - const half* src, - int src_ld, - const half* weights, - int dims, - int num, - float eps, - cudaStream_t st); -#if ENABLE_BF16 -template void invokeRMSNorm(nv_bfloat16* dst, - int dst_ld, - const nv_bfloat16* src, - int src_ld, - const nv_bfloat16* weights, - int dims, - int num, - float eps, - cudaStream_t st); -#endif +namespace kernel { template -__global__ void QkRMSNormKernel(T* data, // - int ld, - const T* weight, - int dim, - int n, - int token_num, - float eps, - float inv_dim) +__global__ void RMSNormQK(T* data, // + int ld, + const T* weight, + int dim, + int n, + int token_num, + float eps, + float inv_dim) { static_assert((max_dim & (max_dim - 1)) == 0); @@ -183,6 +179,8 @@ __global__ void QkRMSNormKernel(T* data, // } } +} // namespace kernel + void invokeQkRMSNorm(void* data, int ld, const void* weight, @@ -193,12 +191,16 @@ void invokeQkRMSNorm(void* data, float eps, cudaStream_t stream) { - auto invoke = [&](auto t, auto max_dim_t) { + + constexpr constant<128> max_dim{}; + TM_CHECK_LE(head_dim, max_dim); + + auto invoke = [&](auto t) { using T = decltype(t); - constexpr int vec_size = sizeof(uint4) / sizeof(T); - constexpr int max_dim = max_dim_t.value; - constexpr int thr_per_qk = max_dim / vec_size; + constexpr int vec_size = sizeof(uint4) / sizeof(T); + // Captured constexpr may not be constant to MSVC + constexpr int thr_per_qk = max_dim.value / vec_size; FT_CHECK(head_dim % vec_size == 0); @@ -206,21 +208,45 @@ void invokeQkRMSNorm(void* data, const int block_dim = 512; const int grid_dim = cdiv(threads, block_dim); - QkRMSNormKernel<<>>( + kernel::RMSNormQK<<>>( (T*)data, ld, (const T*)weight, head_dim, n, token_num, eps, 1.f / head_dim); }; + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); +} + +void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st) +{ + TM_CHECK(x.ndim() == 3); + + int token_num, head_num, head_dim; + std::tie(token_num, head_num, head_dim) = x.shapes(0, 1, 2); + + TM_CHECK(x.stride(1) == head_dim); + + auto data = x.raw_data(); + auto stride = x.stride(0); + constexpr constant<128> max_dim{}; - FT_CHECK(head_dim <= max_dim); - - switch (dtype) { - case TYPE_FP16: - return invoke(half{}, max_dim); - case TYPE_BF16: - return invoke(nv_bfloat16{}, max_dim); - default: - throw std::runtime_error("not implemented"); - } + TM_CHECK_LE(head_dim, max_dim); + + auto invoke = [&](auto t) { + using T = decltype(t); + + constexpr 
int vec_size = sizeof(uint4) / sizeof(T); + constexpr int thr_per_qk = max_dim.value / vec_size; + + TM_CHECK(head_dim % vec_size == 0); + + const int threads = token_num * head_num * thr_per_qk; + const int block_dim = 512; + const int grid_dim = cdiv(threads, block_dim); + + kernel::RMSNormQK<<>>( + (T*)data, stride, (const T*)w.raw_data(), head_dim, head_num, token_num, eps, 1.f / head_dim); + }; + + TM_DISPATCH_PRIMARY_DTYPES(x.dtype(), invoke); } // r' <- r + (h + b) @@ -368,14 +394,8 @@ void invokeResidualBiasRMSNorm(void* hidden_states, eps, 1.f / dims); }; - switch (dtype) { - case DataType::TYPE_FP16: - return invoke(half{}); - case DataType::TYPE_BF16: - return invoke(nv_bfloat16{}); - default: - FT_CHECK(0); - } + + TM_DISPATCH_PRIMARY_DTYPES(dtype, invoke); } } // namespace turbomind diff --git a/src/turbomind/kernels/norm/rms_norm.h b/src/turbomind/kernels/norm/rms_norm.h index 562be1aea6..4027d83260 100644 --- a/src/turbomind/kernels/norm/rms_norm.h +++ b/src/turbomind/kernels/norm/rms_norm.h @@ -2,29 +2,13 @@ #include -#include "src/turbomind/utils/Tensor.h" +#include "src/turbomind/core/core.h" namespace turbomind { -template -void invokeRMSNorm( - T* dst, int dst_ld, const T* src, int src_ld, const T* weights, int dims, int num, float eps, cudaStream_t st); +void invokeRMSNorm(Tensor& out, const Tensor& x, const Tensor& w, float eps, cudaStream_t st); -template -void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, float eps, cudaStream_t st) -{ - invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st); -} - -void invokeQkRMSNorm(void* data, - int ld, - const void* weight, - DataType dtype, - int head_dim, - int n, - int token_num, - float eps, - cudaStream_t stream); +void invokeRMSNormQK(Tensor& x, const Tensor& w, float eps, cudaStream_t st); template void invokeBiasResidualRMSNorm( diff --git a/src/turbomind/kernels/sampling_topk_kernels.cu b/src/turbomind/kernels/sampling_topk_kernels.cu index d52d112765..a3834ebce3 100644 --- a/src/turbomind/kernels/sampling_topk_kernels.cu +++ b/src/turbomind/kernels/sampling_topk_kernels.cu @@ -55,14 +55,15 @@ __global__ void curandBatchInitialize(curandState_t* states, const int size, con } } -void invokeCurandBatchInitialize(curandState_t* states, - const size_t batch_size, - const unsigned long long* random_seeds, - cudaStream_t stream) +void invokeCurandBatchInitialize(curandState_t* states, + const size_t batch_size, + const uint64_t* random_seeds, + cudaStream_t stream) { dim3 block(256); dim3 grid((int)(ceil(batch_size * 1.0 / 256))); - curandBatchInitialize<<>>(states, batch_size, random_seeds); + static_assert(sizeof(uint64_t) == sizeof(unsigned long long)); + curandBatchInitialize<<>>(states, batch_size, (unsigned long long*)random_seeds); } template diff --git a/src/turbomind/kernels/sampling_topk_kernels.h b/src/turbomind/kernels/sampling_topk_kernels.h index cb357bc1c9..c0c60b4f82 100644 --- a/src/turbomind/kernels/sampling_topk_kernels.h +++ b/src/turbomind/kernels/sampling_topk_kernels.h @@ -48,10 +48,10 @@ void invokeCurandInitialize(curandState_t* state, unsigned long long random_seed, cudaStream_t stream); -void invokeCurandBatchInitialize(curandState_t* states, - const size_t batch_size, - const unsigned long long* random_seeds, - cudaStream_t stream); +void invokeCurandBatchInitialize(curandState_t* states, + const size_t batch_size, + const uint64_t* random_seeds, + cudaStream_t stream); struct TopKSortFilterParams { void* workspace; diff --git 
a/src/turbomind/kernels/stop_criteria_kernels.cu b/src/turbomind/kernels/stop_criteria_kernels.cu index 06452535b4..b31dd9216d 100644 --- a/src/turbomind/kernels/stop_criteria_kernels.cu +++ b/src/turbomind/kernels/stop_criteria_kernels.cu @@ -104,58 +104,32 @@ void invokeStopWordsCriterion(const int* output_ids, sync_check_cuda_error(); } -__global__ void length_criterion(bool* finished, - bool* should_stop, - int* finished_sum, - const uint32_t* sequence_limit_length, - int batch_size, - int beam_width, - int step) +__global__ void length_criterion(bool* finished, // + const int* sequence_limit_length, + int batch_size, + int beam_width, + int step) { - int thread_finished_count = 0; for (int index = threadIdx.x; index < batch_size * beam_width; index += blockDim.x) { const int batch_idx = index / beam_width; - finished[index] |= step >= sequence_limit_length[batch_idx]; - thread_finished_count += finished[index] ? 1 : 0; - } - int block_finished_count = 0; - if (blockDim.x <= 32) { - block_finished_count = warpReduceSum(thread_finished_count); - } - else { - block_finished_count = blockReduceSum(thread_finished_count); - } - __syncthreads(); - - if (threadIdx.x == 0 && should_stop) { - finished_sum[0] = block_finished_count; } } -void invokeLengthCriterion(bool* finished, - bool* should_stop, - int* h_pinned_finished_sum_, - const uint32_t* sequence_limit_length, - int batch_size, - int beam_width, - int step, - cudaStream_t stream) +void invokeLengthCriterion(bool* finished, // + const int* sequence_limit_length, + int batch_size, + int beam_width, + int step, + cudaStream_t stream) { // Check if we have attained the sequence length limit. If so, stop the sequence. // In addition, check if all sequences are stopped and return the result in should_stop TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); dim3 block(std::min(512, batch_size * beam_width)); dim3 grid{1}; - h_pinned_finished_sum_[0] = -1; - - length_criterion<<>>( - finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step); - if (should_stop) { - check_cuda_error(cudaStreamSynchronize(stream)); - *should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width; - } + length_criterion<<>>(finished, sequence_limit_length, batch_size, beam_width, step); } } // namespace turbomind diff --git a/src/turbomind/kernels/stop_criteria_kernels.h b/src/turbomind/kernels/stop_criteria_kernels.h index e403c947cb..2a83fbb6fd 100644 --- a/src/turbomind/kernels/stop_criteria_kernels.h +++ b/src/turbomind/kernels/stop_criteria_kernels.h @@ -30,13 +30,11 @@ void invokeStopWordsCriterion(const int* output_ids, int step, cudaStream_t stream); -void invokeLengthCriterion(bool* finished, - bool* should_stop, - int* finished_sum, - const uint32_t* sequence_limit_length, - int batch_size, - int beam_width, - int step, - cudaStream_t stream); +void invokeLengthCriterion(bool* finished, // + const int* sequence_limit_length, + int batch_size, + int beam_width, + int step, + cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/kernels/unfused_attention_kernels.cu b/src/turbomind/kernels/unfused_attention_kernels.cu index 7f733a6dfc..a5c36dd148 100644 --- a/src/turbomind/kernels/unfused_attention_kernels.cu +++ b/src/turbomind/kernels/unfused_attention_kernels.cu @@ -531,6 +531,45 @@ void invokeMaskedSoftmax(MaskedSoftmaxParam<__nv_bfloat16, __nv_bfloat16>& param #undef LAUNCH_MAKSED_SOFTMAX #undef LAUNCH_MAKSED_SOFTMAX_ +// clang-format off +template struct packed_type; +template <> struct 
packed_type<float> { using type = float; }; // we don't need to pack float by default +template <> struct packed_type<half> { using type = half2; }; + +#ifdef ENABLE_BF16 +template<> +struct packed_type<__nv_bfloat16> { + using type = __nv_bfloat162; +}; +#endif + +template <typename T> struct num_elems; +template <> struct num_elems<float> { static constexpr int value = 1; }; +template <> struct num_elems<float2> { static constexpr int value = 2; }; +template <> struct num_elems<float4> { static constexpr int value = 4; }; +template <> struct num_elems<half> { static constexpr int value = 1; }; +template <> struct num_elems<half2> { static constexpr int value = 2; }; +#ifdef ENABLE_BF16 +template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; +template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; +#endif + +template <typename T, int num> struct packed_as; +template <typename T> struct packed_as<T, 1> { using type = T; }; +template<> struct packed_as<half, 2> { using type = half2; }; +template<> struct packed_as<float, 2> { using type = float2; }; +template<> struct packed_as<int8_t, 2> { using type = int16_t; }; +template<> struct packed_as<int32_t, 2> { using type = int2; }; +template<> struct packed_as<half2, 1> { using type = half; }; +#ifdef ENABLE_BF16 +template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; +template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; +#endif + +inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } +inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } +// clang-format on + template<typename T> __global__ void transpose_remove_padding(const T* src, T* dst, diff --git a/src/turbomind/kernels/unfused_attention_kernels.h b/src/turbomind/kernels/unfused_attention_kernels.h index 758fe7fba0..7df6a421e5 100644 --- a/src/turbomind/kernels/unfused_attention_kernels.h +++ b/src/turbomind/kernels/unfused_attention_kernels.h @@ -15,8 +15,6 @@ */ #pragma once -#include "src/turbomind/utils/Tensor.h" - namespace turbomind { template<typename T> @@ -142,7 +140,4 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf, const float qk_scale, cudaStream_t stream); -template<typename T> -void invokeTransposeAttentions(Tensor& attentions_out, const Tensor& attentions_in, cudaStream_t stream = 0); - } // namespace turbomind diff --git a/src/turbomind/layers/BaseDynamicDecodeLayer.h b/src/turbomind/layers/BaseDynamicDecodeLayer.h new file mode 100644 index 0000000000..a3e14407ff --- /dev/null +++ b/src/turbomind/layers/BaseDynamicDecodeLayer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include + +#include "src/turbomind/core/core.h" +#include "src/turbomind/engine/request.h" + +namespace turbomind { + +class BaseDynamicDecodeLayer { +public: + struct BaseParam { + int max_batch_size; + int vocab_size; + int vocab_size_padded; + cudaStream_t stream; + const cudaDeviceProp* device_prop; + }; + + virtual ~BaseDynamicDecodeLayer() = default; + + explicit BaseDynamicDecodeLayer(const BaseParam& param) + { + max_batch_size_ = param.max_batch_size; + vocab_size_ = param.vocab_size; + vocab_size_padded_ = param.vocab_size_padded; + stream_ = param.stream; + device_prop_ = param.device_prop; + }; + + virtual void Setup(const std::vector& rs, const TensorMap& args) = 0; + + virtual void Forward(TensorMap& args) = 0; + +protected: + int max_batch_size_; + int vocab_size_; + int vocab_size_padded_; + cudaStream_t stream_; + const cudaDeviceProp* device_prop_; +}; + +} // namespace turbomind diff --git a/src/turbomind/layers/BaseLayer.h b/src/turbomind/layers/BaseLayer.h deleted file mode 100644 index fcb0ef37cc..0000000000 --- a/src/turbomind/layers/BaseLayer.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" - -namespace turbomind { - -class BaseLayer { -public: - BaseLayer(cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop = nullptr, - bool sparse = false): - stream_(stream), - cublas_wrapper_(cublas_wrapper), - allocator_(allocator), - cuda_device_prop_(cuda_device_prop), - is_free_buffer_after_forward_(is_free_buffer_after_forward), - sparse_(sparse){}; - virtual ~BaseLayer() = default; - - virtual cudaStream_t getStream() - { - return stream_; - } - - virtual void setStream(cudaStream_t stream) - { - stream_ = stream; - } - -protected: - virtual void allocateBuffer() = 0; - virtual void freeBuffer() = 0; - - // device environments - cudaStream_t stream_; - cublasMMWrapper* cublas_wrapper_; - IAllocator* allocator_; - cudaDeviceProp* cuda_device_prop_ = nullptr; - - bool is_free_buffer_after_forward_; - bool is_allocate_buffer_ = false; // TODO (bhsueh) to be deprecated - bool sparse_; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/CMakeLists.txt b/src/turbomind/layers/CMakeLists.txt index ae308d0fd8..975ee77ec7 100644 --- a/src/turbomind/layers/CMakeLists.txt +++ b/src/turbomind/layers/CMakeLists.txt @@ -22,4 +22,4 @@ set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart LogitsProcessorLayer SamplingLayer StopCriteriaLayer - gpt_kernels tensor nvtx_utils) + gpt_kernels nvtx_utils) diff --git a/src/turbomind/layers/DenseWeight.h b/src/turbomind/layers/DenseWeight.h deleted file mode 100644 index ba27764d38..0000000000 --- a/src/turbomind/layers/DenseWeight.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "stdlib.h" -#include - -namespace turbomind { - -// Note that the int8 mode of BERT and GPT are different. -// For int8 mode = 2 on GPT: -// scale (gemm input scale): quantize input of GEMM (float/half) in the int8 range. Namely, int8_x = scale * x -// scale_inter: (gemm output scale) / (gemm input scale * gemm weight scale) -// scale_out: 1 / (gemm output scale), dequantize activation from int8 range to float/half. 
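// [Editorial sketch, not part of the original DenseWeight.h] A minimal worked example of
// how the three scales documented above compose for int8_mode == 2. Only the formulas for
// scale_inter and scale_out come from the comment; the concrete values, names and the
// helper function are made-up assumptions for illustration. Assumes <cmath> and <cstdint>.
inline float int8_mode2_scale_example()
{
    const float x = 0.5f, w = -1.25f;       // fp activation and weight
    const float scale_a = 127.f / 4.f;      // gemm input scale  (assumed amax = 4)
    const float scale_w = 127.f / 2.f;      // gemm weight scale (assumed amax = 2)
    const float scale_o = 127.f / 8.f;      // gemm output scale (assumed amax = 8)

    const int8_t  qx  = (int8_t)std::roundf(scale_a * x);  // int8_x = scale * x
    const int8_t  qw  = (int8_t)std::roundf(scale_w * w);
    const int32_t acc = (int32_t)qx * (int32_t)qw;          // int8 GEMM accumulates in int32

    const float scale_inter = scale_o / (scale_a * scale_w);  // (out scale) / (in scale * weight scale)
    const float scale_out   = 1.f / scale_o;                   // 1 / (gemm output scale)

    const int8_t qy = (int8_t)std::roundf(acc * scale_inter);  // requantized into the int8 range
    return qy * scale_out;                                      // dequantized, approximately x * w = -0.625
}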
-template -struct DenseWeight { - const T1* kernel = nullptr; - const T2* bias = nullptr; - const T1* fp8_bias = nullptr; - const T1* sp_kernel = nullptr; - // for int8 kernel - const int8_t* int8_kernel = nullptr; - const float* scale = nullptr; - const T2* weight_only_quant_scale = nullptr; - const T2* moe_scale = nullptr; - const float* scale_inter = nullptr; - const float* scale_out = nullptr; - - // FP8 scales - // scale = AMAX(tensor) / FP8_MAX - // During GEMM, A (original) = A_scaled (fp8) * "scale of A" - const float* input_scale = nullptr; // a scalar - const float* input_scale_inv = nullptr; // a scalar - const float* weight_scale = nullptr; // a scalar or a vector - const float* weight_scale_inv = nullptr; // a scalar or a vector - const float* output_scale = nullptr; // a scalar - const float* output_scale_inv = nullptr; // a scalar - // host pointer of scales, all are scalars - const float* input_h_scale = nullptr; - const float* input_h_scale_inv = nullptr; - const float* weight_h_scale = nullptr; - const float* weight_h_scale_inv = nullptr; - const float* output_h_scale = nullptr; - const float* output_h_scale_inv = nullptr; - - // TODO(bhsueh) check do we need this param - const float* per_channel_scale_min = - nullptr; // = min(weight_scale), used to adjust the scaling of per channel scaling - - bool fuse_gemm_bias = false; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/DynamicDecodeBaseLayer.h b/src/turbomind/layers/DynamicDecodeBaseLayer.h deleted file mode 100644 index 132197269a..0000000000 --- a/src/turbomind/layers/DynamicDecodeBaseLayer.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include "src/turbomind/layers/BaseLayer.h" - -namespace turbomind { - -struct DynamicDecodeCommonArgs { - size_t vocab_size; - size_t vocab_size_padded; -}; - -class DynamicDecodeBaseLayer: public BaseLayer { -protected: - DynamicDecodeCommonArgs args_; - - virtual void allocateBuffer() = 0; - virtual void freeBuffer() = 0; - -public: - DynamicDecodeBaseLayer(cudaStream_t stream, - IAllocator* allocator, - bool is_free_buffer_after_forward, - DynamicDecodeCommonArgs args): - BaseLayer(stream, nullptr, allocator, is_free_buffer_after_forward, nullptr), args_(args){}; - ~DynamicDecodeBaseLayer() = default; - - virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0; - - virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/DynamicDecodeLayer.cc b/src/turbomind/layers/DynamicDecodeLayer.cc index 7d1f1b5ed3..748c0e7184 100644 --- a/src/turbomind/layers/DynamicDecodeLayer.cc +++ b/src/turbomind/layers/DynamicDecodeLayer.cc @@ -15,103 +15,44 @@ */ #include "src/turbomind/layers/DynamicDecodeLayer.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" #include "src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h" #include "src/turbomind/layers/sampling_layers/SamplingLayer.h" #include "src/turbomind/layers/sampling_layers/StopCriteriaLayer.h" #include "src/turbomind/macro.h" -#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template -void DynamicDecodeLayer::allocateBuffer() -{ -} - -template -void DynamicDecodeLayer::freeBuffer() -{ -} - -template -void DynamicDecodeLayer::initialize() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - DynamicDecodeCommonArgs args{vocab_size_, vocab_size_padded_}; - layers_.emplace_back(new LogitsProcessorLayer(stream_, allocator_, is_free_buffer_after_forward_, args)); - layers_.emplace_back(new SamplingLayer(stream_, allocator_, is_free_buffer_after_forward_, args)); - layers_.emplace_back(new StopCriteriaLayer(stream_, allocator_, is_free_buffer_after_forward_, args)); -} - -template -DynamicDecodeLayer::DynamicDecodeLayer(size_t vocab_size, - size_t vocab_size_padded, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward), - vocab_size_(vocab_size), - vocab_size_padded_(vocab_size_padded), - cuda_device_prop_(cuda_device_prop) +DynamicDecodeLayer::DynamicDecodeLayer(DataType dtype, + int max_batch_size, + int vocab_size, + int vocab_size_padded, + cudaStream_t stream, + const cudaDeviceProp* device_prop) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - initialize(); -} - -template -DynamicDecodeLayer::~DynamicDecodeLayer() -{ + auto dispatch = [&](auto t) { + using T = decltype(t); + BaseDynamicDecodeLayer::BaseParam param{max_batch_size, vocab_size, vocab_size_padded, stream, device_prop}; + layers_.emplace_back(new LogitsProcessorLayer{param}); + layers_.emplace_back(new SamplingLayer{param}); + layers_.emplace_back(new StopCriteriaLayer{param}); + }; + TM_DISPATCH_PRIMARY_DTYPES(dtype, dispatch); } -template -DynamicDecodeLayer::DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer): - BaseLayer(dynamic_decode_layer), - vocab_size_(dynamic_decode_layer.vocab_size_), - vocab_size_padded_(dynamic_decode_layer.vocab_size_padded_), - 
cuda_device_prop_(dynamic_decode_layer.cuda_device_prop_) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - initialize(); -} +DynamicDecodeLayer::~DynamicDecodeLayer() {} -template -void DynamicDecodeLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) +void DynamicDecodeLayer::Setup(const std::vector& rs, const TensorMap& args) { - /** - * @brief Set up the dynamic decode layer for given input runtime arguments. - * - * runtime_args: - * \param runtime_top_k [batch_size] on cpu, optional. - * \param runtime_top_p [batch_size] on cpu, optional - * \param temperature [batch_size] on cpu, optional - * \param repetition_penalty [batch_size] on cpu, optional - * \param min_length [batch_size], optional - * \param context_length [batch_size], optional - * \param prompt_length [batch_size], optional - */ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - FT_CHECK_WITH_INFO(beam_width == 1, "only support beam_width=1"); for (const auto& layer : layers_) { - layer->setup(batch_size, beam_width, runtime_args); + layer->Setup(rs, args); } } -template -void DynamicDecodeLayer::forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - TensorMap input_map(*input_tensors); - TensorMap output_map(*output_tensors); - forward(&output_map, &input_map); -} - -template -void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void DynamicDecodeLayer::Forward(TensorMap& args) { /** * @brief @@ -140,25 +81,9 @@ void DynamicDecodeLayer::forward(TensorMap* output_tensors, TensorMap* input_ * \param sampled_nums [batch_size, 1], optional */ - const int ite = (int)input_tensors->at("ite").getVal(); - const size_t batch_size = input_tensors->at("logits").shape[0]; - const size_t local_batch_size = (size_t)input_tensors->at("local_batch_size").getVal(); - - FT_CHECK(ite == 0); - FT_CHECK(local_batch_size == batch_size); - FT_CHECK(input_tensors->at("logits").shape.size() == 3); - for (const auto& layer : layers_) { - layer->forward(output_tensors, input_tensors); + layer->Forward(args); } } -#ifdef ENABLE_FP32 -template class DynamicDecodeLayer; -#endif -template class DynamicDecodeLayer; -#ifdef ENABLE_BF16 -template class DynamicDecodeLayer<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/layers/DynamicDecodeLayer.h b/src/turbomind/layers/DynamicDecodeLayer.h index 152a5c30a5..c527ff8e0f 100644 --- a/src/turbomind/layers/DynamicDecodeLayer.h +++ b/src/turbomind/layers/DynamicDecodeLayer.h @@ -16,43 +16,33 @@ #pragma once -#include -#include +#include +#include -#include "src/turbomind/layers/BaseLayer.h" -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" +#include "src/turbomind/engine/request.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" -namespace turbomind { - -template -class DynamicDecodeLayer: public BaseLayer { -protected: - void allocateBuffer() override; - void freeBuffer() override; - void initialize(); - - size_t vocab_size_; - size_t vocab_size_padded_; - cudaDeviceProp* cuda_device_prop_; +#include "src/turbomind/core/tensor.h" - std::vector> layers_; +namespace turbomind { +class DynamicDecodeLayer { public: - DynamicDecodeLayer(size_t vocab_size, - size_t vocab_size_padded, - cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - cudaDeviceProp* cuda_device_prop); + DynamicDecodeLayer(DataType data_type, + int max_batch_size, + int vocab_size, + int vocab_size_padded, + 
cudaStream_t stream, + const cudaDeviceProp* device_prop); ~DynamicDecodeLayer(); - DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer); - void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args); - void forward(TensorMap* output_tensors, TensorMap* input_tensors); - void forward(std::unordered_map* output_tensors, - const std::unordered_map* input_tensors); + void Setup(const std::vector& rs, const TensorMap& args); + + void Forward(TensorMap& args); + +private: + std::vector> layers_; }; } // namespace turbomind diff --git a/src/turbomind/layers/attention_layers/AttentionWeight.h b/src/turbomind/layers/attention_layers/AttentionWeight.h deleted file mode 100644 index 46d7bf3e89..0000000000 --- a/src/turbomind/layers/attention_layers/AttentionWeight.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/layers/DenseWeight.h" - -namespace turbomind { - -template -struct AttentionWeight { - DenseWeight query_weight; - DenseWeight key_weight; - DenseWeight value_weight; - DenseWeight attention_output_weight; - DenseWeight ia3_key_weight; - DenseWeight ia3_value_weight; -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/attention_layers/BaseAttentionLayer.h b/src/turbomind/layers/attention_layers/BaseAttentionLayer.h deleted file mode 100644 index db9972ab65..0000000000 --- a/src/turbomind/layers/attention_layers/BaseAttentionLayer.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -// #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" -#include "src/turbomind/layers/BaseLayer.h" -#include "src/turbomind/layers/attention_layers/AttentionWeight.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "src/turbomind/utils/memory_utils.h" - -namespace turbomind { - -enum class AttentionType -{ - UNFUSED_MHA, - UNFUSED_PADDED_MHA, - FUSED_MHA, - FUSED_PADDED_MHA -}; - -/* NOTE: -1. only swin-style relative position bias is supported currently -2. 
gpt-style (causal-mask) models support any-sequence-length fmha, so we don't need to call isValidSeqLen at run-time -3. bert/vit can also support any-seq-length fmha -*/ -template -AttentionType getAttentionType(size_t size_per_head, - const int sm, - const bool remove_padding, - const int max_seq_len, - const bool is_fuse = true, - const bool with_swin_relative_position_bias = false, - const bool causal_mask = false) -{ - - if (std::is_same::value && is_fuse) { - // Bert/Vit - if (!causal_mask) { - if (!with_swin_relative_position_bias - && (((sm == kSM_70 || sm == kSM_72) && size_per_head == 64) - || ((sm == kSM_75 || sm == kSM_80 || sm == kSM_86) - && (size_per_head == 64 || size_per_head == 32)))) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - else if (with_swin_relative_position_bias && (sm == kSM_75 || sm == kSM_80 || sm == kSM_86) - && max_seq_len <= 256 && size_per_head == 32) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - } - // GPT and its variants - else { - // FMHA_ENABLE only affects gpt-style models (causal-mask) - char* fused_qkv = std::getenv("FMHA_ENABLE"); - if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") { - if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89) - && (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80 - || size_per_head == 128 || size_per_head == 144 || size_per_head == 160 - || size_per_head == 256)) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; - } - } - } - } -#ifdef ENABLE_FP8 - else if (std::is_same::value && is_fuse) { - if (!causal_mask) { - if ((sm == kSM_89 || sm == kSM_90) && max_seq_len < 512 && is_fuse && size_per_head == 64) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - else { - return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; - } - } - } -#endif - - return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; -} - -template -AttentionType getAttentionTypeINT8( - size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len, const int int8_mode) -{ - if ((int8_mode == 1 || int8_mode == 2) - && (((sm == kSM_80 || sm == kSM_86) && (size_per_head == 64 || size_per_head == 32) && max_seq_len <= 512) - || (sm == kSM_75 - && ((size_per_head == 64 && max_seq_len <= 384) || (size_per_head == 32 && max_seq_len <= 512))))) { - return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA; - } - else { - return remove_padding ? 
AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA; - } -} - -inline bool isFusedMHA(AttentionType attention_type) -{ - return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::FUSED_PADDED_MHA; -} - -inline bool isUnPaddedMHA(AttentionType attention_type) -{ - return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::UNFUSED_MHA; -} - -inline bool isPaddedMHA(AttentionType attention_type) -{ - return attention_type == AttentionType::FUSED_PADDED_MHA || attention_type == AttentionType::UNFUSED_PADDED_MHA; -} - -inline AttentionType getUnfusedAttentionType(AttentionType attention_type) -{ - if (attention_type == AttentionType::FUSED_MHA) { - return AttentionType::UNFUSED_MHA; - } - else if (attention_type == AttentionType::FUSED_PADDED_MHA) { - return AttentionType::UNFUSED_PADDED_MHA; - } - return attention_type; -} - -template -class BaseAttentionLayer: public BaseLayer { - -public: - virtual void - forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight* attention_weights) = 0; - - BaseAttentionLayer(cudaStream_t stream, - cublasMMWrapper* cublas_wrapper, - IAllocator* allocator, - bool is_free_buffer_after_forward, - bool sparse = false): - BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) - { - } - virtual ~BaseAttentionLayer() = default; - virtual bool isValidSeqLen(const size_t seq_len) - { - return true; - } -}; - -} // namespace turbomind diff --git a/src/turbomind/layers/attention_layers/CMakeLists.txt b/src/turbomind/layers/attention_layers/CMakeLists.txt deleted file mode 100644 index 0d1a96fef3..0000000000 --- a/src/turbomind/layers/attention_layers/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -cmake_minimum_required(VERSION 3.8) diff --git a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc index 1194ad16f1..1839284f03 100644 --- a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc +++ b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.cc @@ -14,140 +14,108 @@ * limitations under the License. 
*/ -#include "src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h" +#include +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/engine/request.h" #include "src/turbomind/kernels/ban_bad_words.h" +#include "src/turbomind/kernels/penalty_types.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h" -#include "src/turbomind/utils/memory_utils.h" +#include "src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h" +#include "src/turbomind/layers/sampling_layers/utils.h" namespace turbomind { #define ALL_OF(p_, sz_, dt_, v_) (std::all_of(p_, p_ + sz_, [&](dt_ b) { return b == v_; })) +namespace { + template -void init_host_buffer(TensorMap* runtime_args, const std::string& key, size_t size, T* dst, T default_value) +void init_host_buffer(const TensorMap& map, const std::string& key, size_t size, T* dst, T default_value) { - const Tensor src = runtime_args->isExist(key) ? runtime_args->at(key) : Tensor(); - const size_t src_size = src.size(); - if (src_size > size) { - TM_LOG_ERROR("runtime_args %s has invalid size %ld vs batch_size %ld", key.c_str(), src_size, size); - } - if (src_size > 0) { - std::copy_n(src.getPtr(), size, dst); + Tensor empty{}; + const Tensor& src = map.contains(key) ? map.at(key) : empty; + + if (src) { + if (size_t sz = src.size(); sz > size) { + TM_LOG_ERROR("runtime_args %s has invalid size %ld vs batch_size %ld", key.c_str(), sz, size); + } + std::copy_n(src.data(), size, dst); } else { std::fill_n(dst, size, default_value); } } -template -void LogitsProcessorLayer::allocateBuffer() -{ - FT_CHECK(false); -} - -template -void LogitsProcessorLayer::allocateBuffer(const size_t batch_size) -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - repetition_penalty_buf_ = - reinterpret_cast(allocator_->reMalloc(repetition_penalty_buf_, sizeof(float) * batch_size, false)); - min_lengths_buf_ = reinterpret_cast(allocator_->reMalloc(min_lengths_buf_, sizeof(int) * batch_size, false)); - temperature_buf_ = - reinterpret_cast(allocator_->reMalloc(temperature_buf_, sizeof(float) * batch_size, false)); - - repetition_penalty_.resize(batch_size); - min_lengths_.resize(batch_size); - context_length_.resize(batch_size); - prompt_length_.resize(batch_size); - temperature_.resize(batch_size); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} +} // namespace template -void LogitsProcessorLayer::freeBuffer() +LogitsProcessorLayer::LogitsProcessorLayer(const BaseParam& param): BaseDynamicDecodeLayer{param} { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - repetition_penalty_ = {}; - min_lengths_ = {}; - context_length_ = {}; - prompt_length_ = {}; - temperature_ = {}; - allocator_->free((void**)&repetition_penalty_workspace_); - allocator_->free((void**)&repetition_penalty_buf_); - allocator_->free((void**)&min_lengths_buf_); - allocator_->free((void**)&temperature_buf_); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + repetition_penalty_ = {max_batch_size_, kCPUpinned}; + min_lengths_ = {max_batch_size_, kCPUpinned}; + temperature_ = {max_batch_size_, kCPUpinned}; + bad_words_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kCPUpinned}; + end_ids_ = {max_batch_size_ * kMaxEndIdsSize, kCPUpinned}; + + repetition_penalty_buf_ = {max_batch_size_, kDEVICE}; + min_lengths_buf_ = {max_batch_size_, kDEVICE}; + temperature_buf_ = {max_batch_size_, kDEVICE}; + bad_words_buf_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kDEVICE}; + end_ids_buf_ = {max_batch_size_ * kMaxEndIdsSize, kDEVICE}; } template 
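// ---- Illustrative sketch (not part of the patch) ----------------------------
// init_host_buffer() above implements "copy the per-request value if the
// caller supplied one, otherwise fill a default".  A plain-STL restatement of
// the same idea with hypothetical types (no TurboMind TensorMap involved):
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

namespace sketch {

using ArgMap = std::unordered_map<std::string, std::vector<float>>;

inline void fill_or_copy(const ArgMap& args, const std::string& key,
                         size_t batch_size, float* dst, float default_value)
{
    auto it = args.find(key);
    if (it != args.end() && !it->second.empty()) {
        // copy at most batch_size entries from the provided per-request values
        std::copy_n(it->second.data(), std::min(it->second.size(), batch_size), dst);
    }
    else {
        std::fill_n(dst, batch_size, default_value);  // fall back to the default
    }
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------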
-LogitsProcessorLayer::~LogitsProcessorLayer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - freeBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -void LogitsProcessorLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void LogitsProcessorLayer::Forward(TensorMap& args) { // apply repetition penalty -> ban bad words -> min length penalty -> temperature penalty // the order is same with transformers TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - FT_CHECK(input_tensors->at("logits").shape.size() == 3); + Tensor_ output_ids = args.at("output_ids"); + Tensor_ logits = args.at("logits"); + + const auto bsz = logits.shape(0); - const int batch_size = output_tensors->at("output_ids").shape[1]; - const int step = input_tensors->at("step").getVal(); - const int max_input_length = input_tensors->at("max_input_length").getVal(); - T* logits = input_tensors->at("logits").getPtr(); + const int step = *args.at("step").data(); + const int max_input_length = *args.at("max_input_length").data(); // repetition penalty if (step > 1 && repetition_penalty_type_ != RepetitionPenaltyType::None) { - float default_value = getDefaultPenaltyValue(repetition_penalty_type_); - if (!ALL_OF(repetition_penalty_.begin(), batch_size, float, default_value)) { - repetition_penalty_workspace_ = reinterpret_cast(allocator_->reMalloc( - repetition_penalty_workspace_, batch_size * step * (sizeof(int) + sizeof(float)), false)); - invokeBatchApplyRepetitionPenalty( - logits, - repetition_penalty_buf_, - repetition_penalty_workspace_, - output_tensors->at("output_ids").getPtr(), - batch_size, - batch_size, - args_.vocab_size_padded, - input_tensors->at("input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {}, nullptr}).getPtr(), - max_input_length, - step, - repetition_penalty_type_, - stream_); - sync_check_cuda_error(); - } + Buffer_ workspace(bsz * step * (sizeof(int) + sizeof(float)), kDEVICE); + invokeBatchApplyRepetitionPenalty(logits.data(), + repetition_penalty_buf_.data(), + (int*)workspace.data(), + output_ids.data(), + bsz, + bsz, + vocab_size_padded_, + args.at("init_context_length").data(), + max_input_length, + step, + repetition_penalty_type_, + stream_); + sync_check_cuda_error(); } // ban bad words - if (input_tensors->isExist("bad_words_list")) { - const Tensor bad_words = input_tensors->at("bad_words_list"); - FT_CHECK(bad_words.shape.size() == 3); - const size_t bad_words_len = bad_words.shape[2]; - invokeBanBadWords(logits, - output_tensors->at("output_ids").getPtr(), + if (auto& bad_words = bad_words_ten_) { + TM_CHECK_EQ(bad_words.ndim(), 3); + const auto bad_words_len = bad_words.shape(2); + invokeBanBadWords(logits.data(), + output_ids.data(), nullptr, - batch_size, - batch_size, + bsz, + bsz, 1, - bad_words.getPtr(), + bad_words.data(), false, bad_words_len, 0, - args_.vocab_size_padded, + vocab_size_padded_, step, stream_); @@ -155,72 +123,113 @@ void LogitsProcessorLayer::forward(TensorMap* output_tensors, TensorMap* inpu } // min length - { - const int num_generated_tokens = step - max_input_length; - const int* min_lengths = min_lengths_.data(); - std::vector index(batch_size); - std::iota(index.begin(), index.end(), 0); - const bool invoke_min_length_penalty = std::any_of(index.begin(), index.end(), [&](int i) { - return min_lengths[i] > context_length_[i] + num_generated_tokens; - }); - if (invoke_min_length_penalty && input_tensors->isExist("end_ids")) { - const Tensor end_ids = input_tensors->at("end_ids"); - FT_CHECK(end_ids.shape.size() == 2); - 
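// ---- Illustrative sketch (not part of the patch) ----------------------------
// Reference semantics of the multiplicative repetition penalty applied above
// by invokeBatchApplyRepetitionPenalty, shown for a single sequence on the
// CPU.  This is the standard CTRL-style rule, not the actual kernel.
#include <cstddef>
#include <unordered_set>
#include <vector>

namespace sketch {

inline void apply_repetition_penalty(std::vector<float>&     logits,
                                     const std::vector<int>& generated_ids,
                                     float                   penalty)  // > 1.0 discourages repeats
{
    std::unordered_set<int> seen(generated_ids.begin(), generated_ids.end());
    for (int id : seen) {
        if (id < 0 || static_cast<size_t>(id) >= logits.size()) {
            continue;  // ignore ids outside the (padded) vocabulary
        }
        float& x = logits[id];
        // Positive logits are shrunk, negative logits are pushed further down,
        // so a previously generated token always becomes less likely.
        x = x > 0.f ? x / penalty : x * penalty;
    }
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------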
invokeMinLengthPenalty(logits, - min_lengths_buf_, - output_tensors->getPtr("sequence_length"), - args_.vocab_size_padded, - batch_size, - input_tensors->getPtr("end_ids"), - end_ids.shape[1], + if (end_ids_ten_) { + TM_CHECK_EQ(end_ids_ten_.ndim(), 2); + auto enable = [&] { + const int num_generated_tokens = step - max_input_length; + auto context_len = args.at("context_length").data(); + for (int i = 0; i < bsz; ++i) { + if (min_lengths_[i] > context_len[i] + num_generated_tokens) { + return true; + } + } + return false; + }(); + if (enable) { + invokeMinLengthPenalty(logits.data(), + min_lengths_buf_.data(), + args.at("sequence_length").data(), + vocab_size_padded_, + bsz, + end_ids_ten_.data(), + end_ids_ten_.shape(1), stream_); sync_check_cuda_error(); } } // temperature - { - if (!ALL_OF(temperature_.begin(), batch_size, float, 1.f)) { - invokeBatchApplyTemperaturePenalty_v2( - logits, (T*)nullptr, temperature_buf_, batch_size, args_.vocab_size, args_.vocab_size_padded, stream_); - sync_check_cuda_error(); - } + if (!ALL_OF(temperature_.begin(), bsz, float, 1.f)) { + invokeBatchApplyTemperaturePenalty_v2(logits.data(), // + (T*)nullptr, + temperature_buf_.data(), + bsz, + vocab_size_, + vocab_size_padded_, + stream_); + sync_check_cuda_error(); } TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } template -void LogitsProcessorLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) +void LogitsProcessorLayer::Setup(const std::vector& rs, const TensorMap& args) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - allocateBuffer(batch_size); - - // repetition_penalty - if (runtime_args->isExist("repetition_penalty")) { - init_host_buffer(runtime_args, "repetition_penalty", batch_size, repetition_penalty_.data(), 1.f); - repetition_penalty_type_ = RepetitionPenaltyType::Multiplicative; - } + const int bsz = rs.size(); - // temperature - init_host_buffer(runtime_args, "temperature", batch_size, temperature_.data(), 1.f); + const auto prompt_length = args.at("prompt_length").data(); - // min_length - init_host_buffer(runtime_args, "min_length", batch_size, min_lengths_.data(), 0); - init_host_buffer(runtime_args, "context_length", batch_size, context_length_.data(), 0); - init_host_buffer(runtime_args, "prompt_length", batch_size, prompt_length_.data(), 0); + repetition_penalty_type_ = RepetitionPenaltyType::None; - // invokeMinLengthPenalty if min_length > context_length - prompt_length + num_generated_tokens - std::transform( - min_lengths_.begin(), min_lengths_.end(), prompt_length_.begin(), min_lengths_.begin(), std::plus()); + for (int i = 0; i < bsz; ++i) { + auto& c = rs[i]->gen_cfg; + // repetition_penalty + repetition_penalty_[i] = c.repetition_penalty; + if (repetition_penalty_[i] != 1.f) { + repetition_penalty_type_ = RepetitionPenaltyType::Multiplicative; + } + // temperature + temperature_[i] = c.temperature; + // min_length + min_lengths_[i] = c.min_new_tokens + prompt_length[i]; + } - cudaAutoCpy(temperature_buf_, temperature_.data(), batch_size, stream_); - cudaAutoCpy(repetition_penalty_buf_, repetition_penalty_.data(), batch_size, stream_); - cudaAutoCpy(min_lengths_buf_, min_lengths_.data(), batch_size, stream_); + Copy_(temperature_, bsz, temperature_buf_); + Copy_(repetition_penalty_, bsz, repetition_penalty_buf_); + Copy_(min_lengths_, bsz, min_lengths_buf_); sync_check_cuda_error(); + init_stop_bad_words(&GenerationConfig::bad_ids, // + "bad_words", + rs, + bad_words_.data(), + bad_words_buf_.data(), + bad_words_ten_); + + { // end ids for 
min length + end_ids_ten_ = {}; + int max_length = 0; + for (int i = 0; i < bsz; ++i) { + max_length = std::max(max_length, (int)rs[i]->gen_cfg.eos_ids.size()); + } + if (max_length) { + max_length = std::min(max_length, kMaxEndIdsSize); + int* h_end_ids = end_ids_.data(); + std::fill(h_end_ids, h_end_ids + std::min(kMaxEndIdsSize, max_length) * bsz, -1); + for (int i = 0; i < bsz; ++i) { + const auto& eos_ids = rs[i]->gen_cfg.eos_ids; + if (eos_ids.size() == 0) { + continue; + } + if (TM_UNLIKELY(eos_ids.size() > kMaxEndIdsSize)) { + TM_LOG_WARNING("[InitializeSampling] [%ld] eos length (%d) exceeds %d, truncated to %d", + (long)rs[i]->id, + (int)eos_ids.size(), + kMaxEndIdsSize, + kMaxEndIdsSize); + } + std::copy_n(eos_ids.begin(), std::min((int)eos_ids.size(), kMaxEndIdsSize), h_end_ids); + h_end_ids += max_length; + } + Copy(end_ids_, bsz * max_length, end_ids_buf_); + end_ids_ten_ = {end_ids_buf_.data(), {bsz, max_length}, kDEVICE}; + } + } + TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } diff --git a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h index 23f108b829..1e56dabd64 100644 --- a/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h +++ b/src/turbomind/layers/sampling_layers/LogitsProcessorLayer.h @@ -16,47 +16,45 @@ #pragma once +#include + #include "src/turbomind/kernels/penalty_types.h" -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" #include "src/turbomind/macro.h" -#include + +#include "src/turbomind/engine/request.h" namespace turbomind { template -class LogitsProcessorLayer: public DynamicDecodeBaseLayer { +class LogitsProcessorLayer: public BaseDynamicDecodeLayer { public: - using DynamicDecodeBaseLayer::DynamicDecodeBaseLayer; - using DynamicDecodeBaseLayer::args_; + explicit LogitsProcessorLayer(const BaseParam& param); - void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override; + void Setup(const std::vector& rs, const TensorMap& args) override; - void forward(TensorMap* output_tensors, TensorMap* input_tensors) override; - - ~LogitsProcessorLayer(); + void Forward(TensorMap& args) override; private: - void allocateBuffer() override; - - void allocateBuffer(const size_t batch_size); - - void freeBuffer() override; - // repetition penalty type RepetitionPenaltyType repetition_penalty_type_ = RepetitionPenaltyType::None; // host buffer - std::vector repetition_penalty_; - std::vector min_lengths_; - std::vector temperature_; - std::vector context_length_; - std::vector prompt_length_; + Buffer_ repetition_penalty_; + Buffer_ min_lengths_; + Buffer_ temperature_; + Buffer_ bad_words_; + Buffer_ end_ids_; // device buffer - int* repetition_penalty_workspace_ = nullptr; - float* repetition_penalty_buf_ = nullptr; - int* min_lengths_buf_ = nullptr; - float* temperature_buf_ = nullptr; + Buffer_ repetition_penalty_buf_; + Buffer_ min_lengths_buf_; + Buffer_ temperature_buf_; + Buffer_ bad_words_buf_; + Buffer_ end_ids_buf_; + + Tensor_ bad_words_ten_; + Tensor_ end_ids_ten_; }; } // namespace turbomind diff --git a/src/turbomind/layers/sampling_layers/SamplingLayer.cc b/src/turbomind/layers/sampling_layers/SamplingLayer.cc index 315226f7e3..04d051d10d 100644 --- a/src/turbomind/layers/sampling_layers/SamplingLayer.cc +++ b/src/turbomind/layers/sampling_layers/SamplingLayer.cc @@ -15,135 +15,34 @@ */ #include "src/turbomind/layers/sampling_layers/SamplingLayer.h" +#include 
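// ---- Illustrative sketch (not part of the patch) ----------------------------
// Reference semantics of the min-length penalty invoked earlier in Forward():
// while a request has produced fewer than its required number of new tokens,
// the logits of its EOS ids are forced to -inf so sampling cannot terminate
// the sequence.  CPU illustration only; the real work is done on the GPU by
// invokeMinLengthPenalty.
#include <limits>
#include <vector>

namespace sketch {

inline void apply_min_length_penalty(std::vector<float>&     logits,
                                     const std::vector<int>& eos_ids,
                                     int                     generated_len,  // tokens produced so far
                                     int                     min_new_tokens)
{
    if (generated_len >= min_new_tokens) {
        return;  // long enough, EOS is allowed again
    }
    for (int eos : eos_ids) {
        if (eos >= 0 && static_cast<size_t>(eos) < logits.size()) {
            logits[eos] = -std::numeric_limits<float>::infinity();
        }
    }
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------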
"src/turbomind/core/check.h" +#include "src/turbomind/core/tensor.h" #include "src/turbomind/kernels/sampling_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topp_kernels.h" -#include "src/turbomind/utils/memory_utils.h" +#include "src/turbomind/utils/logger.h" namespace turbomind { -void set_runtime_args(int batch_size, - int top_k, - int* top_ks, - int top_ks_size, - int* runtime_top_k, - float top_p, - float* top_ps, - int top_ps_size, - float* runtime_top_p, - float min_p, - float* min_ps, - int min_ps_size, - float* runtime_min_p) -{ - for (int i = 0; i < batch_size; i++) { - int topk = top_ks_size > 1 ? top_ks[i] : top_k; - float topp = top_ps_size > 1 ? top_ps[i] : top_p; - float minp = min_ps_size > 1 ? min_ps[i] : min_p; - - if (topk == 0 && topp == 0.f) { - topk = 1; - } - - if (topk < 0 || topk > 1024) { - TM_LOG_WARNING("topk (%d) is out of range [0, 1024]", topk); - topk = std::max(0, std::min(topk, 1024)); - } - if (topp < 0.f || topp > 1.f) { - TM_LOG_WARNING("topp (%f) is out of range [0.0, 1.0f]", topp); - topp = std::max(0.f, std::min(topp, 1.f)); - } - if (minp < 0.f || minp > 1.f) { - TM_LOG_WARNING("minp (%f) is out of range [0.0, 1.0f]", minp); - minp = std::max(0.f, std::min(minp, 1.f)); - } - runtime_top_k[i] = topk; - runtime_top_p[i] = topp; - runtime_min_p[i] = minp; - } -} - template -void SamplingLayer::allocateBuffer() +SamplingLayer::SamplingLayer(const BaseParam& param): BaseDynamicDecodeLayer{param} { - FT_CHECK(false); + top_k_ = {max_batch_size_, kCPUpinned}; + top_p_ = {max_batch_size_, kCPUpinned}; + min_p_ = {max_batch_size_, kCPUpinned}; + kept_ = {max_batch_size_, kCPUpinned}; + + // constant array + std::fill_n(kept_.data(), max_batch_size_, vocab_size_); + + top_k_buf_ = {max_batch_size_, kDEVICE}; + top_p_buf_ = {max_batch_size_, kDEVICE}; + min_p_buf_ = {max_batch_size_, kDEVICE}; + kept_buf_ = {max_batch_size_, kDEVICE}; } template -void SamplingLayer::allocateBuffer(const size_t batch_size) -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - runtime_top_k_buf_ = - reinterpret_cast(allocator_->reMalloc(runtime_top_k_buf_, sizeof(int) * batch_size, false)); - runtime_top_p_buf_ = - reinterpret_cast(allocator_->reMalloc(runtime_top_p_buf_, sizeof(float) * batch_size, false)); - runtime_min_p_buf_ = - reinterpret_cast(allocator_->reMalloc(runtime_min_p_buf_, sizeof(float) * batch_size, false)); - - indices_ = reinterpret_cast( - allocator_->reMalloc(indices_, batch_size * sizeof(int) * args_.vocab_size_padded, false)); - kept_ = reinterpret_cast(allocator_->reMalloc(kept_, batch_size * sizeof(int), false)); - - { - // topk buffer - TopKSortFilterParams params{}; - params.batch_size = batch_size; - params.max_top_k = max_topk_; - invokeTopKSortFilter(params, stream_); - topk_ws_size_ = params.workspace_size; - topk_ws_ = allocator_->reMalloc(topk_ws_, topk_ws_size_, false); - } - - { - // topp buffer - TopPSortParams params{}; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; - invokeTopPSort(params, stream_); - topp_ws_size_ = params.workspace_size; - topp_ws_ = allocator_->reMalloc(topp_ws_, topp_ws_size_, false); - } - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -void SamplingLayer::freeBuffer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - kept_n_ = {}; - runtime_top_k_ = {}; - runtime_top_p_ = {}; - runtime_min_p_ = {}; - - allocator_->free((void**)&runtime_top_k_buf_); - 
allocator_->free((void**)&runtime_top_p_buf_); - allocator_->free((void**)&runtime_min_p_buf_); - allocator_->free((void**)&topk_ws_); - allocator_->free((void**)&topp_ws_); - - allocator_->free((void**)&indices_); - allocator_->free((void**)&kept_); - logits_ = nullptr; - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -SamplingLayer::~SamplingLayer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - freeBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -void SamplingLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void SamplingLayer::Forward(TensorMap& args) { // step1: // - use topk / topp_minp kernel to sort and filter the scores @@ -153,82 +52,82 @@ void SamplingLayer::forward(TensorMap* output_tensors, TensorMap* input_tenso TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - Tensor logits = input_tensors->at("logits"); - const int batch_size = logits.shape[0]; - const int step = input_tensors->at("step").getVal(); - logits_ = logits.getPtr(); + Tensor_ logits = args.at("logits"); + + const auto bsz = logits.shape(0); + + const int step = *args.at("step").data(); - cudaAutoCpy(kept_, kept_n_.data(), batch_size, stream_); + core::Copy(kept_.data(), bsz, kept_buf_.data()); // use topk sort if some request use topk filter if (max_topk_ > 0) { // TODO: top_k >= 64 is much slower than torch.topk() TopKSortFilterParams params{}; - params.workspace = topk_ws_; - params.workspace_size = topk_ws_size_; - params.logits = logits_; - params.sorted_logits = logits_; - params.sorted_indices = indices_; - params.kept = kept_; - params.top_ks = runtime_top_k_buf_; + params.workspace = topk_ws_.data(); + params.workspace_size = topk_ws_.size(); + params.logits = logits.data(); + params.sorted_logits = logits.data(); + params.sorted_indices = indices_.data(); + params.kept = kept_buf_.data(); + params.top_ks = top_k_buf_.data(); params.max_top_k = max_topk_; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; invokeTopKSortFilter(params, stream_); } // use topp sort if some request skip topk filter if (min_topk_ == 0) { - invokeSoftmax(logits_, args_.vocab_size_padded, args_.vocab_size, batch_size, kept_, stream_); + invokeSoftmax(logits.data(), vocab_size_padded_, vocab_size_, bsz, kept_buf_.data(), stream_); TopPSortParams params{}; - params.workspace = topp_ws_; - params.workspace_size = topp_ws_size_; - params.logits = logits_; - params.sorted_logits = logits_; - params.sorted_indices = indices_; - params.kept = kept_; - params.top_ks = runtime_top_k_buf_; - params.top_ps = runtime_top_p_buf_; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; + params.workspace = topp_ws_.data(); + params.workspace_size = topp_ws_.size(); + params.logits = logits.data(); + params.sorted_logits = logits.data(); + params.sorted_indices = indices_.data(); + params.kept = kept_buf_.data(); + params.top_ks = top_k_buf_.data(); + params.top_ps = top_p_buf_.data(); + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; invokeTopPSort(params, stream_); } // apply topp minp filter if (max_minp_ != 0.f || min_topp_ != 1.f) { TopPMinPFilterParams params{}; - params.sorted_logits = logits_; - params.sorted_indices = indices_; - params.kept = kept_; 
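// ---- Illustrative sketch (not part of the patch) ----------------------------
// What the chain of kernels in Forward() computes for one request, in plain
// C++: sort the distribution, keep the top-k entries, then shrink the kept
// prefix further with top-p (nucleus) and min-p.  The returned count plays
// the same role as the kept_ buffer: how many sorted candidates survive for
// the final sampling step.  Host reference only, not the CUDA kernels.
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

namespace sketch {

// probs: already-normalized probabilities over the vocabulary.
// Returns indices of the surviving candidates, most probable first.
inline std::vector<int>
filter_top_k_top_p_min_p(const std::vector<float>& probs, int top_k, float top_p, float min_p)
{
    std::vector<int> order(probs.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&](int a, int b) { return probs[a] > probs[b]; });

    size_t kept = order.size();
    if (top_k > 0) {
        kept = std::min<size_t>(kept, top_k);  // top-k: keep the k best
    }
    if (top_p > 0.f && top_p < 1.f) {
        float  cum = 0.f;
        size_t i   = 0;
        while (i < kept) {  // top-p: smallest prefix whose mass reaches top_p
            cum += probs[order[i]];
            ++i;
            if (cum >= top_p) {
                break;
            }
        }
        kept = i;
    }
    if (min_p > 0.f && kept > 0) {
        const float threshold = min_p * probs[order[0]];  // min-p: relative to the best candidate
        while (kept > 1 && probs[order[kept - 1]] < threshold) {
            --kept;
        }
    }
    order.resize(kept);
    return order;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------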
- params.top_ps = runtime_top_p_buf_; - params.min_ps = runtime_min_p_buf_; - params.batch_size = batch_size; - params.vocab_size = args_.vocab_size; - params.vocab_size_padded = args_.vocab_size_padded; + params.sorted_logits = logits.data(); + params.sorted_indices = indices_.data(); + params.kept = kept_buf_.data(); + params.top_ps = top_p_buf_.data(); + params.min_ps = min_p_buf_.data(); + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; invokeTopPMinPFilter(params, stream_); } // sample { SamplingParams params{}; - params.logits = logits.getPtr(); - params.stride = args_.vocab_size_padded; - params.indices = indices_; - params.kept = kept_; - params.curandstate = output_tensors->at("curand_state").getPtr(); - params.batch_size = batch_size; - params.output_ids = output_tensors->at("output_ids").getPtrWithOffset(step * batch_size); - params.sequence_length = - output_tensors->at("sequence_length", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - params.sampled_logprobs = - output_tensors->at("sampled_logprobs", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - params.sampled_indexes = - output_tensors->at("sampled_indexes", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); - params.sampled_nums = - output_tensors->at("sampled_nums", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + params.logits = logits.data(); + params.stride = vocab_size_padded_; + params.indices = indices_.data(); + params.kept = kept_buf_.data(); + params.curandstate = (curandState_t*)args.at("curand_state").raw_data(); + params.batch_size = bsz; + params.output_ids = args.at("output_ids").data() + step * bsz; + params.sequence_length = args.at("sequence_length").data(); + + if (auto sampled_logprobs = args.try_("sampled_logprobs")) { + params.sampled_logprobs = sampled_logprobs->data(); + params.sampled_indexes = args.at("sampled_indexes").data(); + params.sampled_nums = args.at("sampled_nums").data(); + } invokeSampling(params, stream_); sync_check_cuda_error(); @@ -238,51 +137,45 @@ void SamplingLayer::forward(TensorMap* output_tensors, TensorMap* input_tenso } template -void SamplingLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) +void SamplingLayer::Setup(const std::vector& rs, const TensorMap&) { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - const Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ? runtime_args->at("runtime_top_k") : Tensor(); - const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor(); - const Tensor runtime_min_p = runtime_args->isExist("runtime_min_p") ? runtime_args->at("runtime_min_p") : Tensor(); + const auto bsz = rs.size(); - kept_n_.resize(batch_size); - runtime_top_k_.resize(batch_size); - runtime_top_p_.resize(batch_size); - runtime_min_p_.resize(batch_size); - - int top_k = runtime_top_k.size() > 0 ? runtime_top_k.getVal() : 0; - float top_p = runtime_top_p.size() > 0 ? runtime_top_p.getVal() : 0.0f; - float min_p = runtime_min_p.size() > 0 ? 
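// ---- Illustrative sketch (not part of the patch) ----------------------------
// The final sampling step, conceptually: renormalize the surviving candidates
// and draw one of them.  The real kernel additionally writes the token into
// output_ids at offset step * batch_size, bumps sequence_length and can emit
// per-token logprobs; this CPU sketch only shows the draw itself (names are
// illustrative, the curand state is replaced by a host RNG).
#include <random>
#include <vector>

namespace sketch {

inline int sample_from_kept(const std::vector<float>& probs,     // full distribution
                            const std::vector<int>&   kept_ids,  // surviving candidates
                            std::mt19937&             rng)
{
    std::vector<float> weights;
    weights.reserve(kept_ids.size());
    for (int id : kept_ids) {
        weights.push_back(probs[id]);  // discrete_distribution renormalizes for us
    }
    std::discrete_distribution<int> dist(weights.begin(), weights.end());
    return kept_ids[dist(rng)];
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------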
runtime_min_p.getVal() : 0.0f; - set_runtime_args(batch_size, - top_k, - runtime_top_k.getPtr(), - runtime_top_k.size(), - runtime_top_k_.data(), - top_p, - runtime_top_p.getPtr(), - runtime_top_p.size(), - runtime_top_p_.data(), - min_p, - runtime_min_p.getPtr(), - runtime_min_p.size(), - runtime_min_p_.data()); + for (int i = 0; i < bsz; ++i) { + top_k_[i] = rs[i]->gen_cfg.top_k; + top_p_[i] = rs[i]->gen_cfg.top_p; + min_p_[i] = rs[i]->gen_cfg.min_p; + } - max_topk_ = *std::max_element(runtime_top_k_.begin(), runtime_top_k_.end()); - min_topk_ = *std::min_element(runtime_top_k_.begin(), runtime_top_k_.end()); - min_topp_ = *std::min_element(runtime_top_p_.begin(), runtime_top_p_.end()); - max_minp_ = *std::max_element(runtime_min_p_.begin(), runtime_min_p_.end()); + max_topk_ = *std::max_element(top_k_.begin(), top_k_.end()); + min_topk_ = *std::min_element(top_k_.begin(), top_k_.end()); + min_topp_ = *std::min_element(top_p_.begin(), top_p_.end()); + max_minp_ = *std::max_element(min_p_.begin(), min_p_.end()); - allocateBuffer(batch_size); + indices_ = Buffer_(bsz * vocab_size_padded_, kDEVICE); - // kept - std::fill_n(kept_n_.data(), batch_size, args_.vocab_size); + { + // topk buffer + TopKSortFilterParams params{}; + params.batch_size = bsz; + params.max_top_k = max_topk_; + invokeTopKSortFilter(params, stream_); + topk_ws_ = {(ssize_t)params.workspace_size, kDEVICE}; + } - cudaAutoCpy(runtime_top_k_buf_, runtime_top_k_.data(), batch_size, stream_); - cudaAutoCpy(runtime_top_p_buf_, runtime_top_p_.data(), batch_size, stream_); - cudaAutoCpy(runtime_min_p_buf_, runtime_min_p_.data(), batch_size, stream_); + { + // topp buffer + TopPSortParams params{}; + params.batch_size = bsz; + params.vocab_size = vocab_size_; + params.vocab_size_padded = vocab_size_padded_; + invokeTopPSort(params, stream_); + topp_ws_ = {(ssize_t)params.workspace_size, kDEVICE}; + } - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + core::Copy(top_k_.data(), bsz, top_k_buf_.data()); + core::Copy(top_p_.data(), bsz, top_p_buf_.data()); + core::Copy(min_p_.data(), bsz, min_p_buf_.data()); } #ifdef ENABLE_FP32 diff --git a/src/turbomind/layers/sampling_layers/SamplingLayer.h b/src/turbomind/layers/sampling_layers/SamplingLayer.h index 0de3088248..55696767fb 100644 --- a/src/turbomind/layers/sampling_layers/SamplingLayer.h +++ b/src/turbomind/layers/sampling_layers/SamplingLayer.h @@ -15,55 +15,47 @@ */ #pragma once -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" -#include "src/turbomind/macro.h" #include +#include "src/turbomind/core/tensor.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" +#include "src/turbomind/macro.h" + +#include "src/turbomind/engine/request.h" + namespace turbomind { template -class SamplingLayer: public DynamicDecodeBaseLayer { +class SamplingLayer: public BaseDynamicDecodeLayer { public: - using DynamicDecodeBaseLayer::DynamicDecodeBaseLayer; - using DynamicDecodeBaseLayer::args_; - - void setup(const size_t batch_size, const size_t beam_width, TensorMap* params) override; + explicit SamplingLayer(const BaseParam& param); - void forward(TensorMap* output_tensors, TensorMap* input_tensors) override; + void Setup(const std::vector& rs, const TensorMap&) override; - ~SamplingLayer(); + void Forward(TensorMap& args) override; private: - void allocateBuffer() override; - - void freeBuffer() override; - - void allocateBuffer(const size_t batch_size); - // host buffer - std::vector kept_n_; - std::vector runtime_top_k_; - std::vector runtime_top_p_; - std::vector 
runtime_min_p_; - int max_topk_; - int min_topk_; - float min_topp_; - float max_minp_; + Buffer_ kept_; + Buffer_ top_k_; + Buffer_ top_p_; + Buffer_ min_p_; - // device buffer - int* runtime_top_k_buf_{}; - float* runtime_top_p_buf_{}; - float* runtime_min_p_buf_{}; + int max_topk_; + int min_topk_; + float min_topp_; + float max_minp_; - void* topk_ws_{}; - size_t topk_ws_size_; + // device buffer + Buffer_ top_k_buf_; + Buffer_ top_p_buf_; + Buffer_ min_p_buf_; - void* topp_ws_{}; - size_t topp_ws_size_; + Buffer_ topk_ws_; + Buffer_ topp_ws_; - T* logits_{}; // sorted logits - int* indices_{}; // sorted indices - int* kept_{}; // kept sample + Buffer_ indices_; // sorted indices + Buffer_ kept_buf_; // kept sample }; } // namespace turbomind diff --git a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc index 5d40d85dce..b4e49cc5a6 100644 --- a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc +++ b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.cc @@ -16,92 +16,66 @@ #include "src/turbomind/layers/sampling_layers/StopCriteriaLayer.h" #include "src/turbomind/kernels/stop_criteria_kernels.h" -#include "src/turbomind/utils/memory_utils.h" +#include "src/turbomind/layers/sampling_layers/utils.h" +#include "src/turbomind/macro.h" namespace turbomind { template -void StopCriteriaLayer::allocateBuffer() +StopCriteriaLayer::StopCriteriaLayer(const BaseParam& param): BaseDynamicDecodeLayer{param} { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - h_pinned_finished_sum_ = (int*)allocator_->reMalloc(h_pinned_finished_sum_, sizeof(int), true, true); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + stop_words_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kCPUpinned}; + stop_words_buf_ = {max_batch_size_ * 2 * kMaxStopBadWordsLen, kDEVICE}; } template -void StopCriteriaLayer::freeBuffer() +void StopCriteriaLayer::Setup(const std::vector& rs, const TensorMap&) { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - allocator_->free((void**)(&h_pinned_finished_sum_), true); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - -template -StopCriteriaLayer::~StopCriteriaLayer() -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - freeBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + init_stop_bad_words(&GenerationConfig::stop_ids, // + "stop_words", + rs, + stop_words_.data(), + stop_words_buf_.data(), + stop_words_ten_); } template -void StopCriteriaLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) +void StopCriteriaLayer::Forward(TensorMap& args) { TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - const size_t batch_size = input_tensors->at("logits").shape[0]; - const int step = input_tensors->at("step").getVal(); + const int batch_size = args.at("logits").shape(0); + const int step = *args.at("step").data(); - if (input_tensors->isExist("stop_words_list")) { - const Tensor stop_words_list = input_tensors->at("stop_words_list"); - FT_CHECK(stop_words_list.shape.size() == 3); // [batch, 2, len] - size_t stop_words_len = stop_words_list.shape[2]; - invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr(), + if (auto& stop_words = stop_words_ten_) { + TM_CHECK_EQ(stop_words.ndim(), 3); // [batch, 2, len] + size_t stop_words_len = stop_words.shape(2); + invokeStopWordsCriterion(args.at("output_ids").data(), nullptr, - stop_words_list.getPtr(), - output_tensors->at("finished").getPtr(), + stop_words.data(), + args.at("finished").data(), 0, stop_words_len, batch_size, 1, 
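// ---- Illustrative sketch (not part of the patch) ----------------------------
// invokeStopWordsCriterion above marks a request as finished when the tail of
// its generated ids matches one of its stop sequences.  A single-sequence CPU
// restatement of that check (illustrative names only):
#include <algorithm>
#include <cstddef>
#include <vector>

namespace sketch {

inline bool hit_stop_word(const std::vector<int>&              output_ids,  // tokens so far
                          const std::vector<std::vector<int>>& stop_words)
{
    for (const auto& stop : stop_words) {
        if (stop.empty() || stop.size() > output_ids.size()) {
            continue;
        }
        // compare the last stop.size() tokens with the stop sequence
        if (std::equal(stop.begin(), stop.end(),
                       output_ids.end() - static_cast<std::ptrdiff_t>(stop.size()))) {
            return true;
        }
    }
    return false;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------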
step, stream_); - sync_check_cuda_error(); } - if (input_tensors->isExist("sequence_limit_length")) { - invokeLengthCriterion(output_tensors->at("finished").getPtr(), - output_tensors->getPtr("should_stop", nullptr), - h_pinned_finished_sum_, - input_tensors->at("sequence_limit_length").getPtr(), + if (auto seq_lim_len = args.try_("sequence_limit_length")) { + invokeLengthCriterion(args.at("finished").data(), // + seq_lim_len->data(), batch_size, 1, step, stream_); - sync_check_cuda_error(); } TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -template -void StopCriteriaLayer::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) -{ - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - - allocateBuffer(); - - TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} - #ifdef ENABLE_FP32 template class StopCriteriaLayer; #endif diff --git a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h index b70ed2e69e..b9f6ee1b9a 100644 --- a/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h +++ b/src/turbomind/layers/sampling_layers/StopCriteriaLayer.h @@ -16,29 +16,25 @@ #pragma once -#include "src/turbomind/layers/DynamicDecodeBaseLayer.h" -#include "src/turbomind/macro.h" +#include "src/turbomind/layers/BaseDynamicDecodeLayer.h" + +#include "src/turbomind/engine/request.h" namespace turbomind { template -class StopCriteriaLayer: public DynamicDecodeBaseLayer { +class StopCriteriaLayer: public BaseDynamicDecodeLayer { public: - using DynamicDecodeBaseLayer::DynamicDecodeBaseLayer; - - void setup(const size_t batch_size, const size_t beam_width, TensorMap* params) override; + explicit StopCriteriaLayer(const BaseParam& param); - void forward(TensorMap* output_tensors, TensorMap* input_tensors) override; + void Setup(const std::vector& rs, const TensorMap&) override; - ~StopCriteriaLayer(); + void Forward(TensorMap& args) override; private: - void allocateBuffer() override; - - void freeBuffer() override; - - // host buffer - int* h_pinned_finished_sum_{}; + Buffer_ stop_words_; + Buffer_ stop_words_buf_; + Tensor_ stop_words_ten_; }; } // namespace turbomind diff --git a/src/turbomind/layers/sampling_layers/utils.h b/src/turbomind/layers/sampling_layers/utils.h new file mode 100644 index 0000000000..c36c910dac --- /dev/null +++ b/src/turbomind/layers/sampling_layers/utils.h @@ -0,0 +1,72 @@ + +#include +#include + +#include "src/turbomind/core/core.h" + +namespace turbomind { + +constexpr int kMaxStopBadWordsLen = 32; +constexpr int kMaxEndIdsSize = 32; + +namespace { + +template +void init_stop_bad_words(G getter, const char* key, const Rs& rs, T* h_buf, T* d_buf, Tensor_& out) +{ + const int bsz = rs.size(); + int max_length = 0; + + std::vector> copy_tokens(bsz); + std::vector> copy_offsets(bsz); + for (int i = 0; i < bsz; ++i) { + const auto& [token_ids, offsets] = std::invoke(getter, rs[i]->gen_cfg); + if (offsets.size() == 0 || token_ids.size() == 0) { + continue; + } + FT_CHECK(offsets.back() == token_ids.size()); + if (offsets.back() <= kMaxStopBadWordsLen) { + copy_tokens[i] = std::make_pair(token_ids.data(), (int)token_ids.size()); + copy_offsets[i] = std::make_pair(offsets.data(), (int)offsets.size()); + max_length = std::max(max_length, (int)token_ids.size()); + } + else { + auto trunc_offset_size = + std::upper_bound(offsets.begin(), + offsets.begin() + std::min(kMaxStopBadWordsLen, (int)offsets.size()), + kMaxStopBadWordsLen) + - offsets.begin(); + TM_LOG_WARNING("[InitializeSampling] [%ld] %s length 
(%d) exceeds %d, truncated to %d", + rs[i]->id, + key, + offsets.back(), + kMaxStopBadWordsLen, + trunc_offset_size); + if (trunc_offset_size > 0) { + int trunc_token_size = offsets[trunc_offset_size - 1]; + copy_tokens[i] = std::make_pair(token_ids.data(), trunc_token_size); + copy_offsets[i] = std::make_pair(offsets.data(), trunc_offset_size); + max_length = std::max(max_length, trunc_token_size); + } + } + } + if (!max_length) { + return; + } + std::fill_n(h_buf, bsz * 2 * max_length, -1); + for (int i = 0; i < bsz; ++i) { + if (copy_tokens[i].first != nullptr) { + std::copy_n(copy_tokens[i].first, copy_tokens[i].second, h_buf + i * 2 * max_length); + } + if (copy_offsets[i].first != nullptr) { + std::copy_n(copy_offsets[i].first, copy_offsets[i].second, h_buf + i * 2 * max_length + max_length); + } + } + core::Copy(h_buf, bsz * 2 * max_length, d_buf); + // Construct a tensor from the device buffer + out = {d_buf, {bsz, 2, max_length}, kDEVICE}; +}; + +} // namespace + +} // namespace turbomind diff --git a/src/turbomind/models/llama/BlockManager.cc b/src/turbomind/models/llama/BlockManager.cc index 2744b71b55..d04634a287 100644 --- a/src/turbomind/models/llama/BlockManager.cc +++ b/src/turbomind/models/llama/BlockManager.cc @@ -28,7 +28,7 @@ size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic& value) } BlockManager::BlockManager( - size_t block_size, double block_count, int chunk_size, IAllocator* allocator, GetFreeMemSize get_free_size): + size_t block_size, double block_count, int chunk_size, core::Allocator allocator, GetFreeMemSize get_free_size): block_size_(block_size), allocator_(allocator) { if (block_count < 1.) { @@ -66,7 +66,7 @@ BlockManager::BlockManager( BlockManager::~BlockManager() { for (auto& chunk : chunks_) { - allocator_->free(&chunk); + allocator_->deallocate(chunk, block_size_); } } @@ -78,7 +78,7 @@ bool BlockManager::Malloc() return false; } - auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size); + auto ptr = (std::byte*)allocator_->allocate(block_size_ * chunk_size); if (!ptr) { return false; } @@ -285,8 +285,7 @@ std::ostream& operator<<(std::ostream& os, const BlockManager& manager) os << "free_ids: " << manager.free_ids_.size() << ", "; os << "blocks: " << manager.blocks_.size() << ", "; os << "unique_id: " << manager.unique_id_ << ", "; - os << "timestamp: " << manager.timestamp_ << ", "; - os << "allocator: " << manager.allocator_; + os << "timestamp: " << manager.timestamp_; return os; } diff --git a/src/turbomind/models/llama/BlockManager.h b/src/turbomind/models/llama/BlockManager.h index 70ca74475d..df1f0d3833 100644 --- a/src/turbomind/models/llama/BlockManager.h +++ b/src/turbomind/models/llama/BlockManager.h @@ -2,8 +2,8 @@ #pragma once +#include "src/turbomind/core/allocator.h" #include "src/turbomind/models/llama/Barrier.h" -#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" #include @@ -73,7 +73,7 @@ size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic& value); class BlockManager { public: explicit BlockManager( - size_t block_size, double block_count, int chunk_size, IAllocator* allocator, GetFreeMemSize get_free_size); + size_t block_size, double block_count, int chunk_size, core::Allocator allocator, GetFreeMemSize get_free_size); ~BlockManager(); @@ -141,10 +141,11 @@ class BlockManager { bool Malloc(); private: - size_t block_size_; - int max_block_count_{}; - int chunk_size_{}; - IAllocator* allocator_; + size_t block_size_; + int 
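// ---- Illustrative sketch (not part of the patch) ----------------------------
// Layout produced by init_stop_bad_words() earlier in this patch: for every
// request the stop / bad words are packed into a [batch, 2, max_len] tensor --
// row 0 holds the concatenated token ids, row 1 the cumulative end offsets,
// and unused slots are padded with -1 (the real helper also truncates to
// kMaxStopBadWordsLen).  Host-only illustration of that packing:
#include <algorithm>
#include <cstddef>
#include <vector>

namespace sketch {

// words[i] is the list of token sequences for request i.
inline std::vector<int> pack_words(const std::vector<std::vector<std::vector<int>>>& words,
                                   int& max_len)  // out: inner extent of [bsz, 2, max_len]
{
    const int bsz = static_cast<int>(words.size());
    max_len       = 0;
    for (const auto& per_req : words) {
        int total = 0;
        for (const auto& w : per_req) {
            total += static_cast<int>(w.size());
        }
        max_len = std::max(max_len, total);
    }
    std::vector<int> packed(static_cast<size_t>(bsz) * 2 * max_len, -1);
    for (int i = 0; i < bsz; ++i) {
        int* tokens  = packed.data() + static_cast<size_t>(i) * 2 * max_len;  // row 0
        int* offsets = tokens + max_len;                                      // row 1
        int  pos = 0, k = 0;
        for (const auto& w : words[i]) {
            std::copy(w.begin(), w.end(), tokens + pos);
            pos += static_cast<int>(w.size());
            offsets[k++] = pos;  // cumulative end offset of the k-th word
        }
    }
    return packed;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------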
max_block_count_{}; + int chunk_size_{}; + + core::Allocator allocator_; std::vector chunks_; diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt index 3b79254970..90c1b239fe 100644 --- a/src/turbomind/models/llama/CMakeLists.txt +++ b/src/turbomind/models/llama/CMakeLists.txt @@ -13,42 +13,30 @@ add_library(Llama STATIC BlockTrie.cc SequenceManager.cc LlamaWeight.cc + LlamaDenseWeight.cc LlamaDecoderLayerWeight.cc LlamaFfnLayer.cc moe_ffn_layer.cc unified_decoder.cc unified_attention_layer.cc llama_kernels.cu - llama_decoder_kernels.cu llama_utils.cu mla_utils.cu) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(Llama PUBLIC CUDA::cudart engine + core gemm2 + CUDA::cublas rms_norm - cublasMMWrapper DynamicDecodeLayer activation_kernels attention decoding_kernels unfused_attention_kernels gpt_kernels - tensor memory_utils cuda_utils logger anomaly_handler) - - -add_executable(llama_gemm llama_gemm.cc) -target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger) - -install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin) - -# find_package(Catch2 3 QUIET) -# if (Catch2_FOUND) -# add_executable(test_cache_manager test_cache_manager.cc) -# target_link_libraries(test_cache_manager PRIVATE Llama Catch2::Catch2WithMain) -# endif () diff --git a/src/turbomind/models/llama/LlamaBatch.cc b/src/turbomind/models/llama/LlamaBatch.cc index deb8a49da9..065200acb8 100644 --- a/src/turbomind/models/llama/LlamaBatch.cc +++ b/src/turbomind/models/llama/LlamaBatch.cc @@ -21,12 +21,17 @@ #include "src/turbomind/comm/device_comm.h" #include "src/turbomind/comm/host_comm.h" + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/buffer.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/core/tensor.h" + #include "src/turbomind/macro.h" #include "src/turbomind/engine/gateway.h" #include "src/turbomind/engine/request.h" -#include "src/turbomind/kernels/core/data_type.h" #include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/gemm/tuner/params.h" #include "src/turbomind/kernels/sampling_topk_kernels.h" @@ -39,7 +44,6 @@ #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/constant.h" #include "src/turbomind/utils/cuda_utils.h" @@ -93,8 +97,7 @@ void DropEmbeddings(const Sequence& seq) seq.input_embedding_ranges.resize(sz); } -template -void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_reqs) +void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_reqs) { NvtxScope _("disable invalid"); @@ -137,8 +140,7 @@ void LlamaBatch::DisableInvalidRequests(Requests& infer_reqs, Requests& kill_ } } -template -void LlamaBatch::FindCanceledIndices(std::vector& indices) +void LlamaBatch::FindCanceledIndices(std::vector& indices) { for (int i = 0; i < state_->size; ++i) { // current batch const auto& r = state_->requests[i]; @@ -148,8 +150,7 @@ void LlamaBatch::FindCanceledIndices(std::vector& indices) } } -template -void LlamaBatch::ProcessCancelRequests(std::vector& indices, std::vector& signals) +void LlamaBatch::ProcessCancelRequests(std::vector& indices, std::vector& signals) { int count = 0; @@ -168,8 +169,7 @@ void 
LlamaBatch::ProcessCancelRequests(std::vector& indices, std::vector } } -template -void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) +void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector& signals) { for (auto& r : kill_reqs) { if (r) { @@ -188,8 +188,7 @@ void LlamaBatch::ProcessKillRequests(const Requests& kill_reqs, std::vector -void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) +void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector& signals) { NvtxScope scope("infer_request"); auto& state = *incoming_; @@ -211,7 +210,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorinputs.at("input_ids").shape[0]; + const int input_length = r->inputs.at("input_ids").shape(0); if (input_length > session_len_) { signals.push_back([r] { UpdateState(*r, Request::kTooLong, 0); }); @@ -257,22 +256,22 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorinputs.getPtr("input_ids"); + const int* input_ids = r->inputs.at("input_ids").data(); { // `output_ids` contains all token ids of the sequences - const auto output_ids_base = state.output_ids + session_len_ * idx; + const auto output_ids_base = state.output_ids.data() + session_len_ * idx; auto d_output_ids = output_ids_base; - auto h_output_ids = r->output_ids.getPtr(); + auto h_output_ids = r->output_ids.data(); // copy history tokens if (!seq.tokens.empty()) { - d_output_ids = Copy(seq.tokens.data(), seq.tokens.size(), d_output_ids); + d_output_ids = core::Copy(seq.tokens.data(), seq.tokens.size(), d_output_ids); h_output_ids = std::copy_n(seq.tokens.data(), seq.tokens.size(), h_output_ids); } // copy input tokens if (input_length) { - d_output_ids = Copy(input_ids, input_length, d_output_ids); + d_output_ids = core::Copy(input_ids, input_length, d_output_ids); h_output_ids = std::copy_n(input_ids, input_length, h_output_ids); } @@ -283,23 +282,25 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vectorsession.start_flag && !r->inputs.isExist("input_embedding_ranges")) { + if (input_length && r->session.start_flag && !r->inputs.contains("input_embedding_ranges")) { // TODO: truncate prompt to enable prefix caching for VLM seq.prompt.resize(input_length); std::copy_n(input_ids, input_length, seq.prompt.data()); } + const int elem_size = byte_size(data_type_); + // copy input embeddings - if (r->inputs.isExist("input_embedding_ranges")) { - const auto range_tensor = r->inputs.at("input_embedding_ranges"); - const auto emb_tensor = r->inputs.at("input_embeddings"); - const int* ranges = range_tensor.getPtr(); + if (r->inputs.contains("input_embedding_ranges")) { + const auto& range_tensor = r->inputs.at("input_embedding_ranges"); + const auto& emb_tensor = r->inputs.at("input_embeddings"); + const int* ranges = range_tensor.data(); auto check_embeddings = [&](int& num_valid_embeddings) { - if (range_tensor.shape.size() != 3 || range_tensor.shape[2] % 2 != 0) { + if (range_tensor.ndim() != 3 || range_tensor.shape(2) % 2 != 0) { return false; } - int embedding_count = range_tensor.shape[1]; + int embedding_count = range_tensor.shape(1); int embedding_length = 0; int pre_end = -1; @@ -311,7 +312,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector= end || end > input_length || begin < pre_end - || embedding_length * model_->hidden_units_ * sizeof(T) > emb_tensor.shape[1]) { + || embedding_length * model_->hidden_units_ * elem_size > emb_tensor.shape(1)) { return false; 
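// ---- Illustrative sketch (not part of the patch) ----------------------------
// The check_embeddings lambda above validates the (begin, end) pairs supplied
// with "input_embedding_ranges": every range must lie inside the prompt, the
// ranges must not overlap, and the accumulated embedding bytes must fit in
// the provided buffer.  Standalone restatement with a hypothetical signature:
#include <cstddef>
#include <utility>
#include <vector>

namespace sketch {

inline bool valid_embedding_ranges(const std::vector<std::pair<int, int>>& ranges,
                                   int    input_length,
                                   size_t bytes_per_token,   // hidden_units * element size
                                   size_t embedding_bytes)   // size of the embedding buffer
{
    int    prev_end    = -1;
    size_t total_bytes = 0;
    for (const auto& [begin, end] : ranges) {
        if (begin < 0 || begin >= end || end > input_length || begin < prev_end) {
            return false;  // out of bounds or overlapping
        }
        total_bytes += static_cast<size_t>(end - begin) * bytes_per_token;
        prev_end = end;
    }
    return total_bytes <= embedding_bytes;
}

}  // namespace sketch
// ---- end of illustrative sketch ----------------------------------------------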
} pre_end = end; @@ -322,20 +323,17 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector(); + const std::byte* emb_tensor_ptr = (const std::byte*)emb_tensor.raw_data(); for (size_t i = 0; i < num_valid_embeddings; i++) { int begin = ranges[i * 2]; int end = ranges[i * 2 + 1]; - size_t count = (end - begin) * model_->hidden_units_ * sizeof(T); - seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count)); + size_t count = (end - begin) * model_->hidden_units_ * elem_size; + seq.input_embeddings.emplace_back(emb_tensor_ptr, emb_tensor_ptr + count); seq.input_embedding_ranges.emplace_back(begin + seq.tokens.size(), end + seq.tokens.size()); emb_tensor_ptr += count; } @@ -388,7 +386,7 @@ void LlamaBatch::ProcessInferRequests(const Requests& reqs, std::vector::ProcessInferRequests(const Requests& reqs, std::vector -int LlamaBatch::AdjustMaxInputCount(GenerationState& g, - const std::vector& sequences, - const std::vector& context_length) +int LlamaBatch::AdjustMaxInputCount(GenerationState& g, + const std::vector& sequences, + const std::vector& context_length) { int input_count = 0; for (int i = 0; i < sequences.size(); ++i) { @@ -448,8 +448,7 @@ int LlamaBatch::AdjustMaxInputCount(GenerationState& g, return input_count; } -template -void LlamaBatch::Initialize(GenerationState& g) +void LlamaBatch::Initialize(GenerationState& g) { NvtxScope scope("initialize"); std::vector sequences; @@ -558,7 +557,7 @@ void LlamaBatch::Initialize(GenerationState& g) // Prepare intermediate buffers h_cu_block_counts_[0] = 0; - auto block_ptrs = h_block_ptrs_; + auto block_ptrs = h_block_ptrs_.data(); const int batch_size = state_->active_size; @@ -577,8 +576,6 @@ void LlamaBatch::Initialize(GenerationState& g) Copy(h_cu_block_counts_, batch_size + 1, cu_block_counts_); Copy(h_block_ptrs_, h_cu_block_counts_[batch_size], block_ptrs_); - // Copy(h_k_block_ptrs_, h_cu_block_counts_[batch_size], k_block_ptrs_); - // Copy(h_v_block_ptrs_, h_cu_block_counts_[batch_size], v_block_ptrs_); } const int batch_size = state_->active_size; @@ -597,7 +594,8 @@ void LlamaBatch::Initialize(GenerationState& g) } } - const int max_context_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size); + const int max_context_len = + *std::max_element(state_->h_context_length.data(), state_->h_context_length.data() + batch_size); std::vector unique_ids(batch_size); for (int i = 0; i < batch_size; ++i) { @@ -605,9 +603,9 @@ void LlamaBatch::Initialize(GenerationState& g) } // Real-time context length that will change during generation - Copy(state_->h_context_length, batch_size, context_length_buf_); - Copy(state_->h_finished, batch_size, finished_buf_); - Copy(state_->h_rope_theta, batch_size, rope_theta_); + Copy_(state_->h_context_length, batch_size, context_length_buf_); + Copy_(state_->h_finished, batch_size, finished_buf_); + Copy_(state_->h_rope_theta, batch_size, rope_theta_); bool skip_init_sampling = std::equal(g.unique_ids.begin(), // g.unique_ids.end() - g.partial, @@ -628,8 +626,7 @@ void LlamaBatch::Initialize(GenerationState& g) } } -template -void LlamaBatch::CopyState(const std::vector>& desc) +void LlamaBatch::CopyState(const std::vector>& desc) { if (desc.empty()) { return; @@ -674,8 +671,8 @@ void LlamaBatch::CopyState(const std::vectoroutput_ids, d->output_ids, session_len_}, - std::tuple{s->curand_state, d->curand_state, 1}); + std::tuple{s->output_ids.data(), d->output_ids.data(), session_len_}, + 
std::tuple{(curandState_t*)s->curand_state.data(), (curandState_t*)d->curand_state.data(), 1}); } for (const auto& [s, d, si, di] : desc) { @@ -689,258 +686,103 @@ void LlamaBatch::CopyState(const std::vector -void LlamaBatch::AllocateBuffer(size_t batch_size, size_t session_len, int cache_block_seq_len) +void LlamaBatch::AllocateBuffer(ssize_t batch_size, ssize_t session_len, int cache_block_seq_len) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); - const size_t batchxbeam = batch_size; + const ssize_t batchxbeam = batch_size; - const size_t hidden_units = model_->hidden_units_; - const size_t vocab_size = model_->vocab_size_padded_; - const size_t head_dim = model_->size_per_head_; - const size_t local_kv_head_num = model_->local_kv_head_num_; + const ssize_t hidden_units = model_->hidden_units_; + const ssize_t vocab_size = model_->vocab_size_padded_; + const ssize_t head_dim = model_->size_per_head_; + const ssize_t local_kv_head_num = model_->local_kv_head_num_; // +1 padding, BlockIterator does not use predicate - const size_t max_batch_block_count = + const ssize_t max_batch_block_count = batch_size * ((session_len + cache_block_seq_len - 1) / cache_block_seq_len) + 1; - context_decoder_input_buf_ = - (T*)allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * max_forward_token_num_ * hidden_units, false); - context_decoder_ids_buf_ = - (int*)allocator_->reMalloc(context_decoder_ids_buf_, sizeof(int) * max_forward_token_num_, false); - - decoder_input_buf_ = (T*)allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units, false); - decoder_output_buf_ = (T*)allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units, false); - - input_ids_buf_ = (int*)allocator_->reMalloc(input_ids_buf_, sizeof(int) * batchxbeam * session_len, true); - input_length_buf_ = (int*)allocator_->reMalloc(input_length_buf_, sizeof(int) * batchxbeam); - context_length_buf_ = (int*)allocator_->reMalloc(context_length_buf_, sizeof(int) * batchxbeam); - init_context_length_ = (int*)allocator_->reMalloc(init_context_length_, sizeof(int) * batchxbeam); + input_ids_buf_ = {max_forward_token_num_, kDEVICE}; - sequence_lengths_ = (int*)allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false); + decoder_output_buf_ = {{batchxbeam, hidden_units}, data_type_, kDEVICE}; - cu_block_counts_ = (int*)allocator_->reMalloc(cu_block_counts_, sizeof(int) * (batch_size + 1)); - block_ptrs_ = (uintptr_t*)allocator_->reMalloc(block_ptrs_, sizeof(uintptr_t) * max_batch_block_count); + input_length_buf_ = {batchxbeam, kDEVICE}; + context_length_buf_ = {batchxbeam, kDEVICE}; + init_context_length_ = {batchxbeam, kDEVICE}; - if (!logits_buf_) { // may be alias of local_logits_buf_ - logits_buf_ = (T*)allocator_->reMalloc(logits_buf_, sizeof(T) * batchxbeam * vocab_size, false); - } - - sampled_logprobs_ = (T*)allocator_->reMalloc(sampled_logprobs_, sizeof(T) * batchxbeam * kMaxLogProb, false); - sampled_indexes_ = - (uint32_t*)allocator_->reMalloc(sampled_indexes_, sizeof(uint32_t) * batchxbeam * kMaxLogProb, false); - sampled_nums_ = (uint32_t*)allocator_->reMalloc(sampled_nums_, sizeof(uint32_t) * batchxbeam, false); + sequence_lengths_ = {batchxbeam, kDEVICE}; - token_ids_buf_ = (int*)allocator_->reMalloc(token_ids_buf_, sizeof(int) * batchxbeam * session_len * 2, true); + cu_block_counts_ = {batch_size + 1, kDEVICE}; + block_ptrs_ = {max_batch_block_count, kDEVICE}; - finished_buf_ = (bool*)allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false); - seq_limit_len_ = 
(uint32_t*)allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false); + sampled_logprobs_ = {batchxbeam * kMaxLogProb, kDEVICE}; + sampled_indexes_ = {batchxbeam * kMaxLogProb, kDEVICE}; + sampled_nums_ = {batchxbeam, kDEVICE}; - rope_theta_ = (float*)allocator_->reMalloc(rope_theta_, sizeof(float) * batch_size, false); + token_ids_buf_ = {ssize_t(session_len * 2 * batchxbeam), kDEVICE}; - is_allocate_buffer_ = true; -} - -template -void LlamaBatch::AllocatePersistantBuffer(size_t max_batch_size, int cache_block_seq_len) -{ - d_stop_words_ = - (int*)allocator_->reMalloc(d_stop_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true); - d_bad_words_ = - (int*)allocator_->reMalloc(d_bad_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true); - h_stop_words_ = - (int*)allocator_->reMalloc(h_stop_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true, true); - h_bad_words_ = - (int*)allocator_->reMalloc(h_bad_words_, sizeof(int) * max_batch_size * 2 * kMaxStopBadWordsLen, true, true); - - h_min_length_ = (int*)allocator_->reMalloc(h_min_length_, sizeof(int) * max_batch_size, true, true); - h_runtime_top_k_ = (int*)allocator_->reMalloc(h_runtime_top_k_, sizeof(int) * max_batch_size, true, true); - h_runtime_top_p_ = (float*)allocator_->reMalloc(h_runtime_top_p_, sizeof(float) * max_batch_size, true, true); - h_runtime_min_p_ = (float*)allocator_->reMalloc(h_runtime_min_p_, sizeof(float) * max_batch_size, true, true); - h_temperature_ = (float*)allocator_->reMalloc(h_temperature_, sizeof(float) * max_batch_size, true, true); - h_repetition_penalty_ = - (float*)allocator_->reMalloc(h_repetition_penalty_, sizeof(float) * max_batch_size, true, true); - - h_random_seed_ = (unsigned long long*)allocator_->reMalloc( - h_random_seed_, sizeof(unsigned long long) * max_batch_size, true, true); - d_random_seed_ = (unsigned long long*)allocator_->reMalloc( - d_random_seed_, sizeof(unsigned long long) * max_batch_size, true, false); - - h_curand_state_ = - (curandState_t*)allocator_->reMalloc(h_curand_state_, sizeof(curandState_t) * max_batch_size, true, true); - d_curand_state_ = - (curandState_t*)allocator_->reMalloc(d_curand_state_, sizeof(curandState_t) * max_batch_size, true, false); - - d_end_ids_buf_ = (int*)allocator_->reMalloc(d_end_ids_buf_, sizeof(int) * max_batch_size * kMaxEndIdsSize, false); - h_end_ids_buf_ = - (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size * kMaxEndIdsSize, false, true); + finished_buf_ = {(int)batchxbeam, kDEVICE}; + seq_limit_len_ = {batch_size, kDEVICE}; - for (auto& s : states_) { - s.output_ids = (int*)allocator_->reMalloc(s.output_ids, sizeof(int) * max_batch_size * session_len_, true); - s.curand_state = - (curandState_t*)allocator_->reMalloc(s.curand_state, sizeof(curandState_t) * max_batch_size, true); - } + rope_theta_ = {batch_size, kDEVICE}; - const size_t max_batch_block_count = - max_batch_size * ((session_len_ + cache_block_seq_len - 1) / cache_block_seq_len); + h_random_seed_ = {batch_size, kCPUpinned}; + Clear(h_random_seed_); - { - h_input_ids_buf_ = - (int*)allocator_->reMalloc(h_input_ids_buf_, sizeof(int) * max_batch_size * session_len_, false, true); - h_input_length_buf_ = - (int*)allocator_->reMalloc(h_input_length_buf_, sizeof(int) * max_batch_size, false, true); + d_random_seed_ = {batch_size, kDEVICE}; + Clear(d_random_seed_); - h_cu_block_counts_ = - (int*)allocator_->reMalloc(h_cu_block_counts_, sizeof(int) * (max_batch_size + 1), false, true); - h_block_ptrs_ = - 
(uintptr_t*)allocator_->reMalloc(h_block_ptrs_, sizeof(uintptr_t) * max_batch_block_count, false, true); + h_curand_state_ = {{batch_size, sizeof(curandState_t)}, kCPUpinned}; + Clear(h_curand_state_.buffer()); - for (auto& s : states_) { - s.h_prompt_length = - (int*)allocator_->reMalloc(s.h_prompt_length, sizeof(int) * max_batch_size, false, true); - s.h_context_length = - (int*)allocator_->reMalloc(s.h_context_length, sizeof(int) * max_batch_size, false, true); - s.h_finished = (bool*)allocator_->reMalloc(s.h_finished, sizeof(bool) * max_batch_size * 2, false, true); - s.h_rope_theta = (float*)allocator_->reMalloc(s.h_rope_theta, sizeof(float) * max_batch_size, false, true); - } + d_curand_state_ = {{batch_size, sizeof(curandState_t)}, kDEVICE}; + Clear(d_curand_state_.buffer()); - h_seq_limit_len_ = - (uint32_t*)allocator_->reMalloc(h_seq_limit_len_, sizeof(uint32_t) * max_batch_size, false, true); + for (auto& s : states_) { + s.output_ids = {{batch_size, session_len_}, kDEVICE}; + Clear(s.output_ids.buffer()); - h_output_ids_ = - (int*)allocator_->reMalloc(h_output_ids_, sizeof(int) * max_batch_size * session_len_, false, true); + s.curand_state = {{batch_size, sizeof(curandState_t)}, kDEVICE}; + Clear(s.curand_state.buffer()); } - h_sampled_logprobs_ = - (T*)allocator_->reMalloc(h_sampled_logprobs_, sizeof(T) * max_batch_size * kMaxLogProb, false, true); - h_sampled_indexes_ = (uint32_t*)allocator_->reMalloc( - h_sampled_indexes_, sizeof(uint32_t) * max_batch_size * kMaxLogProb, false, true); - h_sampled_nums_ = (uint32_t*)allocator_->reMalloc(h_sampled_nums_, sizeof(uint32_t) * max_batch_size, false, true); + h_input_length_buf_ = {batch_size, kCPUpinned}; + h_cu_block_counts_ = {batch_size + 1, kCPUpinned}; + h_block_ptrs_ = {(ssize_t)max_batch_block_count, kCPUpinned}; - is_allocate_persistant_buffer_ = true; -} + for (auto& s : states_) { + s.h_prompt_length = {batch_size, kCPUpinned}; + s.h_context_length = {batch_size, kCPUpinned}; + s.h_finished = {batch_size * 2, kCPUpinned}; + s.h_rope_theta = {batch_size, kCPUpinned}; + } -template -void LlamaBatch::AllocCommBuffers() -{ - const size_t hidden_units = model_->hidden_units_; - const size_t vocab_size_padded = model_->vocab_size_padded_; + h_seq_limit_len_ = {batch_size, kCPUpinned}; + std::fill_n(h_seq_limit_len_.data(), batch_size, 0); - // Native comm fuses allreduce & rmsnorm in token granularity - const size_t max_fwd_token_num = ((size_t)max_forward_token_num_ + tp_size_ - 1) / tp_size_ * tp_size_; + h_output_ids_ = {batch_size * session_len_, kCPUpinned}; - // TODO: rename this to hidden_states - context_decoder_output_buf_ = - (T*)CommBufAlloc(sizeof(T) * param_.attn_dp_size * max_fwd_token_num * hidden_units, true); - - local_logits_buf_ = (T*)CommBufAlloc(sizeof(T) * max_batch_size_ * vocab_size_padded, true); - if (model_->use_allgather_2d_) { - logits_buf_ = local_logits_buf_; - } + h_sampled_logprobs_ = {batch_size * kMaxLogProb, kCPUpinned}; + h_sampled_indexes_ = {batch_size * kMaxLogProb, kCPUpinned}; + h_sampled_nums_ = {batch_size, kCPUpinned}; } -template -void LlamaBatch::FreeCommBuffers() +void LlamaBatch::AllocSymmBuffers() { - CommBufFree((void**)&context_decoder_output_buf_, true); + const ssize_t hidden_units = model_->hidden_units_; + const ssize_t vocab_size_padded = model_->vocab_size_padded_; - if (local_logits_buf_) { - if (logits_buf_ == local_logits_buf_) { - logits_buf_ = {}; - } - CommBufFree((void**)&local_logits_buf_, true); - } + // Native comm fuses allreduce & rmsnorm in token 
granularity + TM_CHECK(max_forward_token_num_ % tp_size_ == 0); - if (local_context_logits_buf_) { - if (context_logits_buf_ == local_context_logits_buf_) { - context_logits_buf_ = {}; - } - CommBufFree((void**)&local_context_logits_buf_, true); - } + symm_hidden_states_buf_ = {{max_forward_token_num_ * param_.attn_dp_size, hidden_units}, data_type_, symm_alloc_}; + symm_logits_buf_ = {{max_batch_size_, vocab_size_padded}, data_type_, symm_alloc_}; } -template -void LlamaBatch::FreeBuffer() +void LlamaBatch::FreeSymmBuffers() { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (is_allocate_buffer_) { - allocator_->free((void**)&context_decoder_input_buf_); - - allocator_->free((void**)&context_decoder_ids_buf_); - allocator_->free((void**)&lora_mask_buf_); - - allocator_->free((void**)&decoder_input_buf_); - allocator_->free((void**)&decoder_output_buf_); - - allocator_->free((void**)&input_ids_buf_); - allocator_->free((void**)&input_length_buf_); - allocator_->free((void**)&context_length_buf_); - allocator_->free((void**)&init_context_length_); - - allocator_->free((void**)&sequence_lengths_); - - allocator_->free((void**)&cu_block_counts_); - allocator_->free((void**)&block_ptrs_); - - if (logits_buf_) { - allocator_->free((void**)&logits_buf_); - } - if (context_logits_buf_) { - allocator_->free((void**)&context_logits_buf_); - } - - allocator_->free((void**)&token_ids_buf_); - - allocator_->free((void**)&d_end_ids_buf_); - allocator_->free((void**)&h_end_ids_buf_, true); - - allocator_->free((void**)&finished_buf_); - allocator_->free((void**)&seq_limit_len_); - - allocator_->free((void**)&rope_theta_); - - allocator_->free((void**)&sampled_logprobs_); - allocator_->free((void**)&sampled_indexes_); - allocator_->free((void**)&sampled_nums_); - - is_allocate_buffer_ = false; - } - - if (is_allocate_persistant_buffer_) { - - allocator_->free((void**)&d_stop_words_); - allocator_->free((void**)&h_stop_words_, true); - allocator_->free((void**)&d_bad_words_); - allocator_->free((void**)&h_bad_words_, true); - allocator_->free((void**)&d_random_seed_); - allocator_->free((void**)&h_random_seed_, true); - allocator_->free((void**)&d_curand_state_); - allocator_->free((void**)&h_curand_state_, true); - - for (auto& s : states_) { - allocator_->free((void**)&s.h_context_length, true); - allocator_->free((void**)&s.h_finished, true); - allocator_->free((void**)&s.h_rope_theta, true); - allocator_->free((void**)&s.output_ids); - allocator_->free((void**)&s.curand_state); - } - allocator_->free((void**)&h_cu_block_counts_, true); - allocator_->free((void**)&h_block_ptrs_, true); - allocator_->free((void**)&h_input_ids_buf_, true); - allocator_->free((void**)&h_input_length_buf_, true); - allocator_->free((void**)&h_seq_limit_len_, true); - - allocator_->free((void**)&h_output_ids_, true); - - allocator_->free((void**)&h_sampled_logprobs_); - allocator_->free((void**)&h_sampled_indexes_); - allocator_->free((void**)&h_sampled_nums_); - - is_allocate_persistant_buffer_ = false; - } + symm_hidden_states_buf_ = {}; + symm_logits_buf_ = {}; } -template -LlamaBatch::~LlamaBatch() +LlamaBatch::~LlamaBatch() { TM_LOG_DEBUG("~LlamaBatch()"); @@ -950,24 +792,22 @@ LlamaBatch::~LlamaBatch() cudaSetDevice(device_id_); cudaStreamSynchronize(stream_); - FreeBuffer(); - model_.reset(); sequence_manager_.reset(); context_.reset(); // This destroy all objects in context except for `stream` } -template -LlamaBatch::LlamaBatch(const EngineParam& param, - std::unique_ptr> model, // ! 
This is moved - std::unique_ptr> ctx, // ! This is moved - std::shared_ptr gateway, - int device_id, - int dp_rank): +LlamaBatch::LlamaBatch(DataType data_type, + const EngineParam& param, + std::unique_ptr model, // ! This is moved + std::unique_ptr ctx, // ! This is moved + std::shared_ptr gateway, + int device_id, + int dp_rank): param_(param), gateway_(gateway), max_batch_size_(param.max_batch_size), - max_forward_token_num_(param.max_prefill_token_num + param.max_batch_size), + max_forward_token_num_(param.max_forward_token_num), max_context_token_num_(param.max_context_token_num), num_tokens_per_iter_(param.num_tokens_per_iter), max_prefill_iters_(param.max_prefill_iters), @@ -975,11 +815,9 @@ LlamaBatch::LlamaBatch(const EngineParam& param, dp_rank_(dp_rank), tp_size_(model->tp_size_), tp_rank_(model->tp_rank_), - data_type_(getTensorType()), + data_type_(data_type), debug_(isDebug()), stream_(ctx->stream), - allocator_(ctx->allocator.get()), - cublas_wrapper_(ctx->cublas_wrapper.get()), context_(std::move(ctx)), model_(std::move(model)), comm_(context_->comm), @@ -987,14 +825,16 @@ LlamaBatch::LlamaBatch(const EngineParam& param, { const auto cache_block_seq_len = model_->attn_param_.cache_block_seq_len; + const int dbits = byte_size(data_type, 8); + const auto quant_policy = model_->param_.quant_policy; - const int elem_bits = quant_policy ? quant_policy : bitsof; + const int elem_bits = quant_policy ? quant_policy : dbits; SequenceManager::BlockConfig block_config{ (int)model_->size_per_head_, (int)model_->local_kv_head_num_, cache_block_seq_len, - elem_bits == bitsof ? 0 : bitsof, + elem_bits == dbits ? 0 : dbits, elem_bits, }; @@ -1010,7 +850,7 @@ LlamaBatch::LlamaBatch(const EngineParam& param, param.cache_chunk_size, param.enable_prefix_caching, tp_rank_, - allocator_, + core::Context::alloc(kDEVICE), get_free_size}); const size_t max_session_len = sequence_manager_->max_block_count() * cache_block_seq_len; @@ -1037,20 +877,24 @@ LlamaBatch::LlamaBatch(const EngineParam& param, back_ = &states_[1]; incoming_ = &states_[2]; - AllocCommBuffers(); + symm_alloc_ = core::SimpleAllocator::Create([this](ssize_t size) { return SymmAlloc(size, true); }, + [this](void* p, ssize_t size) { return SymmFree(p, size, true); }, + kDEVICE); + + AllocSymmBuffers(); AllocateBuffer(max_batch_size_, session_len_, cache_block_seq_len); - AllocatePersistantBuffer(max_batch_size_, cache_block_seq_len); // Wait for allocations check_cuda_error(cudaStreamSynchronize(stream_)); } -template -void LlamaBatch::InitializeSampling(const GenerationState& g) +void LlamaBatch::InitializeSampling(const GenerationState& g) { NvtxScope _("InitSampling"); + const int batch_size = state_->active_size - g.partial; + if (batch_size == 0) { return; } @@ -1063,11 +907,11 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) // note that in decoder and in output "sequence length" has different semantic // - in decoder it means length of sequence that has kv cache already computed // - in output it means length of all tokens (the last generated token does not have k/v cache computed yet) - invokePlusScalar(sequence_lengths_, -1, batch_size, stream_); + invokePlusScalar(sequence_lengths_.data(), -1, batch_size, stream_); sync_check_cuda_error(); - Clear(token_ids_buf_, batch_size * session_len_); - invokeTranspose2D(token_ids_buf_, state_->output_ids, batch_size, session_len_, stream_); + Clear(token_ids_buf_.slice(0, batch_size * session_len_)); + invokeTranspose2D(token_ids_buf_.data(), 
state_->output_ids.data(), batch_size, session_len_, stream_); sync_check_cuda_error(); // token_ids_buf_[s, b] @@ -1076,7 +920,7 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) // ABCDEFGHi -> ABCDEFGHi i // ABCDEFGh ABCDEFGh h // ABCd ABCd d - invokePadLastTokenIds(token_ids_buf_, init_context_length_, g.max_init_ctx_len, batch_size, stream_); + invokePadLastTokenIds(token_ids_buf_.data(), init_context_length_.data(), g.max_init_ctx_len, batch_size, stream_); sync_check_cuda_error(); // seq_limit_len_, will be compared to `step` instead of `sequence_length`, so padding len should be accounted for @@ -1085,213 +929,74 @@ void LlamaBatch::InitializeSampling(const GenerationState& g) } Copy(h_seq_limit_len_, batch_size, seq_limit_len_); - TensorMap inputs; - - auto member_to_tensor = [&](auto getter, auto key, auto dest, auto init) { - int count = 0; - for (int i = 0; i < batch_size; ++i) { - // `std::invoke` - dest[i] = state_->requests[i]->gen_cfg.*getter; - count += dest[i] != init; - } - if (count) { - inputs.insert(key, {MEMORY_CPU, getTensorType(), {(size_t)batch_size}, dest}); - } - }; - - using G = GenerationConfig; - member_to_tensor(&G::top_k, "runtime_top_k", h_runtime_top_k_, 0); - member_to_tensor(&G::top_p, "runtime_top_p", h_runtime_top_p_, 0); - member_to_tensor(&G::min_p, "runtime_min_p", h_runtime_min_p_, 0); - member_to_tensor(&G::temperature, "temperature", h_temperature_, 1.f); - member_to_tensor(&G::repetition_penalty, "repetition_penalty", h_repetition_penalty_, 1.f); - member_to_tensor(&G::min_new_tokens, "min_length", h_min_length_, 0); - - auto init_stop_bad_words = [&](auto getter, auto key, auto h_buf, auto d_buf) { - int max_length = 0; - std::vector> copy_tokens(batch_size); - std::vector> copy_offsets(batch_size); - for (int i = 0; i < batch_size; ++i) { - const auto& [token_ids, offsets] = std::invoke(getter, state_->requests[i]->gen_cfg); - if (offsets.size() == 0 || token_ids.size() == 0) { - continue; - } - FT_CHECK(offsets.back() == token_ids.size()); - if (offsets.back() <= kMaxStopBadWordsLen) { - copy_tokens[i] = std::make_pair(token_ids.data(), (int)token_ids.size()); - copy_offsets[i] = std::make_pair(offsets.data(), (int)offsets.size()); - max_length = std::max(max_length, (int)token_ids.size()); - } - else { - auto trunc_offset_size = - std::upper_bound(offsets.begin(), - offsets.begin() + std::min(kMaxStopBadWordsLen, (int)offsets.size()), - kMaxStopBadWordsLen) - - offsets.begin(); - TM_LOG_WARNING("[InitializeSampling] [%ld] %s length (%d) exceeds %d, truncated to %d", - state_->requests[i]->id, - key, - offsets.back(), - kMaxStopBadWordsLen, - trunc_offset_size); - if (trunc_offset_size > 0) { - int trunc_token_size = offsets[trunc_token_size - 1]; - copy_tokens[i] = std::make_pair(token_ids.data(), trunc_token_size); - copy_offsets[i] = std::make_pair(offsets.data(), trunc_offset_size); - max_length = std::max(max_length, trunc_token_size); - } - } - } - if (!max_length) { - return; - } - std::fill_n(h_buf, batch_size * 2 * max_length, -1); - for (int i = 0; i < batch_size; ++i) { - if (copy_tokens[i].first != nullptr) { - std::copy_n(copy_tokens[i].first, copy_tokens[i].second, h_buf + i * 2 * max_length); - } - if (copy_offsets[i].first != nullptr) { - std::copy_n(copy_offsets[i].first, copy_offsets[i].second, h_buf + i * 2 * max_length + max_length); - } - } - Copy(h_buf, batch_size * 2 * max_length, d_buf); - inputs.insert(key, {MEMORY_GPU, TYPE_INT32, {(size_t)batch_size, (size_t)2, (size_t)max_length}, d_buf}); - }; - 
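A note on the stop/bad-words setup being removed above: when the cumulative offsets exceed kMaxStopBadWordsLen, the list is cut at the first offset past the limit via std::upper_bound, and the surviving token count is re-read from the truncated offset array. A simplified, self-contained sketch of that truncation (names are illustrative):

    #include <algorithm>
    #include <utility>
    #include <vector>

    // `offsets` are cumulative word boundaries (offsets.back() == total token count).
    // Returns {token_count, offset_count} that fit within max_len.
    inline std::pair<int, int> truncate_words(const std::vector<int>& offsets, int max_len)
    {
        if (offsets.empty()) {
            return {0, 0};
        }
        if (offsets.back() <= max_len) {
            return {offsets.back(), (int)offsets.size()};
        }
        // First offset exceeding max_len marks the cut point.
        const auto cut          = std::upper_bound(offsets.begin(), offsets.end(), max_len);
        const int  offset_count = (int)(cut - offsets.begin());
        const int  token_count  = offset_count > 0 ? offsets[offset_count - 1] : 0;
        return {token_count, offset_count};
    }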
init_stop_bad_words(&G::stop_ids, "stop_words_list", h_stop_words_, d_stop_words_); - init_stop_bad_words(&G::bad_ids, "bad_words_list", h_bad_words_, d_bad_words_); - - // MinLengthPenalty - if (inputs.isExist("min_length")) { - inputs.insert({"prompt_length", {MEMORY_CPU, TYPE_INT32, {(size_t)batch_size}, state_->h_prompt_length}}); - inputs.insert({"context_length", {MEMORY_CPU, TYPE_INT32, {(size_t)batch_size}, state_->h_context_length}}); - } - - // init for eos - auto init_for_eos = [&] { - int max_length = 0; - for (int i = 0; i < batch_size; ++i) { - max_length = std::max(max_length, (int)state_->requests[i]->gen_cfg.eos_ids.size()); - } - if (max_length) { - max_length = std::min(max_length, kMaxEndIdsSize); - int* h_end_ids = h_end_ids_buf_; - std::fill(h_end_ids, h_end_ids + std::min(kMaxEndIdsSize, max_length) * batch_size, -1); - for (int i = 0; i < batch_size; ++i) { - const auto& eos_ids = state_->requests[i]->gen_cfg.eos_ids; - if (eos_ids.size() == 0) { - continue; - } - if (eos_ids.size() > kMaxEndIdsSize) { - TM_LOG_WARNING("[InitializeSampling] [%ld] eos length (%d) exceeds %d, truncated to %d", - (long)state_->requests[i]->id, - (int)eos_ids.size(), - kMaxEndIdsSize, - kMaxEndIdsSize); - } - std::copy_n(eos_ids.begin(), std::min((int)eos_ids.size(), kMaxEndIdsSize), h_end_ids); - h_end_ids += max_length; - } - Copy(h_end_ids_buf_, batch_size * max_length, d_end_ids_buf_); - inputs.insert("end_ids", - {MEMORY_GPU, TYPE_INT32, {(size_t)batch_size, (size_t)max_length}, d_end_ids_buf_}); - } - }; - init_for_eos(); - - inputs_ = std::move(inputs); - - { - NvtxScope setup("DynamicDecodeLayer.setup"); - model_->dynamic_decode_layer_->setup(batch_size, 1, &inputs_); + std::vector rs; + rs.reserve(batch_size); + for (int i = 0; i < batch_size; ++i) { + rs.push_back(state_->requests[i].get()); } - TensorMap outputs; - for (int i = 0; i < batch_size; i++) { - if (state_->requests[i]->gen_cfg.output_logprobs) { - outputs.insert({"sampled_logprobs", - {MEMORY_GPU, getTensorType(), {(size_t)batch_size, 1, kMaxLogProb}, sampled_logprobs_}}); - outputs.insert( - {"sampled_indexes", {MEMORY_GPU, TYPE_UINT32, {(size_t)batch_size, 1, kMaxLogProb}, sampled_indexes_}}); - outputs.insert({"sampled_nums", {MEMORY_GPU, TYPE_UINT32, {(size_t)batch_size, 1}, sampled_nums_}}); + model_->dynamic_decode_->Setup(rs, {{"prompt_length", {state_->h_prompt_length, {batch_size}}}}); - break; - } - } - outputs_ = std::move(outputs); sync_check_cuda_error(); } -template -void LlamaBatch::ComputeAndOutputLogits(T* hidden_states, int first, int last) +void LlamaBatch::ComputeAndOutputLogits(const Tensor& hidden_states, int first, int last) { - int token_num = 0; - bool found = false; - for (int i = first; i < last; ++i) { - if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { - const auto& s = *state_->sequences[i]; - // Skip when the seq is filling missed cache only - if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { - found = true; + auto enable = [&] { + for (int i = first; i < last; ++i) { + if (state_->requests[i]->gen_cfg.output_logits == GenerationConfig::kAll) { + const auto& s = *state_->sequences[i]; + // Skip when the seq is filling missed cache only + if (s.cache_len + h_input_length_buf_[i] > s.tokens.size()) { + return true; + } } } - token_num += h_input_length_buf_[i]; - } + return false; + }(); - if (!found) { + if (!enable) { return; } - if (tp_size_ > 1) { - FT_CHECK(model_->vocab_size_padded_ % tp_size_ == 0); - const size_t byte_size = sizeof(T) * 
model_->vocab_size_padded_ * token_num; + const int vocab_size_padded = model_->vocab_size_padded_; + const int token_num = hidden_states.shape(0); - if (local_context_logits_buf_size_ < byte_size) { + if (symm_logits_buf_.shape(0) < token_num) { + if (tp_size_ > 1) { check_cuda_error(cudaStreamSynchronize(stream_)); comm_.h_tp_group->Sync(); - - CommBufFree((void**)&local_context_logits_buf_, true); - local_context_logits_buf_ = (T*)CommBufAlloc(byte_size, true); - local_context_logits_buf_size_ = byte_size; - + } + symm_logits_buf_ = {{token_num, vocab_size_padded}, data_type_, symm_alloc_}; + if (tp_size_ > 1) { check_cuda_error(cudaStreamSynchronize(stream_)); comm_.h_tp_group->Sync(); } } - if (model_->use_allgather_2d_) { - // No intermediate transpose needed - context_logits_buf_ = local_context_logits_buf_; - } - else { - context_logits_buf_ = - (T*)allocator_->reMalloc(context_logits_buf_, sizeof(T) * model_->vocab_size_padded_ * token_num, false); - } - - model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, hidden_states, token_num); + auto logits = model_->postDecodeEmbedding(hidden_states, symm_logits_buf_.buffer()); - if (tp_rank_ != 0) { - return; + if (tp_rank_ == 0) { + OutputLogits(logits, first, last, GenerationConfig::kAll); } - - OutputLogits(context_logits_buf_, first, last, GenerationConfig::kAll); } -template -void LlamaBatch::OutputLogits(const T* logits, int first, int last, GenerationConfig::OutType out_type) +void LlamaBatch::OutputLogits(const Tensor& logits, int first, int last, GenerationConfig::OutType out_type) { + const auto& src_buf = logits.buffer(); + const auto elem_size = byte_size(logits.dtype(), 1); // when `is_all` is true, logits only contains last token of the sequences const bool is_all = out_type == GenerationConfig::kAll; + int base = 0; + for (int i = first; i < last; ++i) { const int input_len = h_input_length_buf_[i]; // input lenght for this iter - const T* src_ptr = logits; - - logits += (is_all ? input_len : 1) * model_->vocab_size_padded_; if (state_->requests[i]->gen_cfg.output_logits == out_type) { - auto dst_ptr = state_->requests[i]->outputs.getPtr("logits"); + auto& dst_buf = state_->requests[i]->outputs.at("logits").buffer(); const int cache_len = state_->sequences[i]->cache_len; const int history_len = state_->sequences[i]->tokens.size(); @@ -1300,7 +1005,7 @@ void LlamaBatch::OutputLogits(const T* logits, int first, int last, Generatio // C C C C // offset to the last token prompt - const int offset = is_all ? 0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape(0) - 1; int diff = (history_len + offset) - cache_len; @@ -1319,67 +1024,72 @@ void LlamaBatch::OutputLogits(const T* logits, int first, int last, Generatio continue; } + int src_base = base; + if (is_all) { // Skip invalid tokens caused by cache miss - src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->vocab_size_padded_; + src_base += std::max(0, (history_len + offset) - cache_len); } // Skip previous chunks - dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->vocab_size_; + int dst_base = std::max(0, cache_len - (history_len + offset)); - check_cuda_error(cudaMemcpy2DAsync(dst_ptr, - sizeof(T) * model_->vocab_size_, - src_ptr, - sizeof(T) * model_->vocab_size_padded_, - sizeof(T) * model_->vocab_size_, + check_cuda_error(cudaMemcpy2DAsync(dst_buf.raw_data(dst_base * model_->vocab_size_), + elem_size * model_->vocab_size_, + src_buf.raw_data(src_base * model_->vocab_size_padded_), + elem_size * model_->vocab_size_padded_, + elem_size * model_->vocab_size_, valid_len, cudaMemcpyDefault, stream_)); } + + base += is_all ? input_len : 1; } } -template -void LlamaBatch::OutputLastHiddenState(const T* hidden_states, int first, int last) +void LlamaBatch::OutputLastHiddenState(const Tensor& hidden_states, int first, int last) { - for (int i = first; i < last; ++i) { + const auto& src_buf = hidden_states.buffer(); + const auto data_type = src_buf.dtype(); + int base = 0; + for (int i = first; i < last; ++i) { const int input_len = h_input_length_buf_[i]; // input lenght for this iter - const T* src_ptr = hidden_states; - - hidden_states += input_len * model_->hidden_units_; if (auto out_type = state_->requests[i]->gen_cfg.output_last_hidden_state) { const bool is_all = out_type == GenerationConfig::kAll; - T* dst_ptr = state_->requests[i]->outputs.getPtr("last_hidden_state"); + auto& dst_buf = state_->requests[i]->outputs.at("last_hidden_state").buffer(); const int cache_len = state_->sequences[i]->cache_len; const int history_len = state_->sequences[i]->tokens.size(); // offset to the last prompt token - const int offset = is_all ? 0 : state_->requests[i]->inputs.at("input_ids").shape[0] - 1; + const int offset = is_all ? 
0 : state_->requests[i]->inputs.at("input_ids").shape(0) - 1; const int valid_len = input_len - std::max(0, (history_len + offset) - cache_len); // TM_LOG_ERROR("%d %d %d %d %d", history_len, offset, cache_len, input_len, valid_len); - if (valid_len <= 0) { - continue; - } - - // Skip invalid tokens caused by cache miss - src_ptr += std::max(0, (history_len + offset) - cache_len) * model_->hidden_units_; - // Skip previous chunks - dst_ptr += std::max(0, cache_len - (history_len + offset)) * model_->hidden_units_; + if (valid_len > 0) { + // Skip invalid tokens caused by cache miss + int src_base = std::max(0, (history_len + offset) - cache_len) + base; + // Skip previous chunks + int dst_base = std::max(0, cache_len - (history_len + offset)); - Copy(src_ptr, valid_len * model_->hidden_units_, dst_ptr); + core::Copy(src_buf.raw_data(src_base * model_->hidden_units_), + byte_size(data_type, valid_len * model_->hidden_units_), + dst_buf.raw_data(dst_base * model_->hidden_units_)); + } } + + // hidden_states += input_len * model_->hidden_units_; + base += input_len; } } -template -void LlamaBatch::Finish(GenerationState& g, std::vector& signals) +void LlamaBatch::Finish(GenerationState& g, std::vector& signals) { NvtxScope scope("Finish"); const int batch_size = state_->active_size; @@ -1390,9 +1100,9 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) FT_CHECK(g.step >= 0); // [s,b] -> [b,s] and skip padding in [context_len, max_context_len) - invokeGatherOutput(state_->output_ids, - token_ids_buf_, - init_context_length_, + invokeGatherOutput(state_->output_ids.data(), + token_ids_buf_.data(), + init_context_length_.data(), g.max_init_ctx_len, g.step, session_len_, @@ -1401,7 +1111,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) sync_check_cuda_error(); } - Copy(token_ids_buf_ + (g.step - 1) * (batch_size - g.partial), batch_size - g.partial, h_output_ids_); + Copy(token_ids_buf_.slice((g.step - 1) * (batch_size - g.partial), -1), batch_size - g.partial, h_output_ids_); Copy(finished_buf_, batch_size, state_->h_finished); Copy(sequence_lengths_, batch_size, state_->h_context_length); @@ -1430,14 +1140,14 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) if (tp_rank_ == 0 && output_logprobs) { NvtxScope scope("logprobs"); // output logprobs, should be set before sequence_length - T* sampled_logprobs_ptr = h_sampled_logprobs_; - uint32_t* sampled_indexes_ptr = h_sampled_indexes_; - uint32_t* sampled_nums_ptr = h_sampled_nums_; + float* sampled_logprobs_ptr = h_sampled_logprobs_.data(); + uint32_t* sampled_indexes_ptr = h_sampled_indexes_.data(); + uint32_t* sampled_nums_ptr = h_sampled_nums_.data(); for (int i = 0; i < batch_size - g.partial; ++i) { if (state_->requests[i] && state_->requests[i]->gen_cfg.output_logprobs) { - auto logprob_vals = state_->requests[i]->outputs.getPtr("logprob_vals"); - auto logprob_indexes = state_->requests[i]->outputs.getPtr("logprob_indexes"); - auto logprob_nums = state_->requests[i]->outputs.getPtr("logprob_nums"); + auto logprob_vals = state_->requests[i]->outputs.at("logprob_vals").data(); + auto logprob_indexes = state_->requests[i]->outputs.at("logprob_indexes").data(); + auto logprob_nums = state_->requests[i]->outputs.at("logprob_nums").data(); int offset = state_->h_context_length[i] - state_->h_prompt_length[i] - 1; std::copy(sampled_logprobs_ptr, @@ -1457,35 +1167,13 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) // ! 
Only rank-0 writes to output if (tp_rank_ == 0) { NvtxScope scope("output_ids"); - if constexpr (0) { - // set output tokens ids and sequence length - int* output_ptr = h_output_ids_; - for (int i = 0; i < batch_size - g.partial; ++i) { - if (auto& r = state_->requests[i]) { - auto output_ids = static_cast(r->output_ids.data); - auto output_len = static_cast(r->sequence_length.data); - const int count = state_->h_context_length[i]; - if (r->stream_output) { - output_ids[count - 1] = output_ptr[count - 1]; - *output_len = count; - } - else if (state_->h_finished[i]) { - std::copy(output_ptr, output_ptr + count, output_ids); - *output_len = count; - } - } - output_ptr += session_len_; - } - } - else { - for (int i = 0; i < batch_size - g.partial; ++i) { - if (auto& r = state_->requests[i]) { - auto output_ids = static_cast(r->output_ids.data); - auto output_len = static_cast(r->sequence_length.data); - const int count = state_->h_context_length[i]; - output_ids[count - 1] = h_output_ids_[i]; - *output_len = count; - } + for (int i = 0; i < batch_size - g.partial; ++i) { + if (auto& r = state_->requests[i]) { + auto output_ids = r->output_ids.data(); + auto output_len = r->sequence_length.data(); + const int count = state_->h_context_length[i]; + output_ids[count - 1] = h_output_ids_[i]; + *output_len = count; } } } @@ -1497,7 +1185,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) for (int i = 0; i < batch_size; ++i) { // ss << (i ? ", " : "") << "(" << state_->h_context_length[i] << "," << state_->h_finished[i] << ")"; std::vector tokens(state_->h_context_length[i]); - Copy(state_->output_ids + i * session_len_, tokens.size(), tokens.data()); + core::Copy(state_->output_ids.data() + i * session_len_, tokens.size(), tokens.data()); cudaStreamSynchronize(stream_); std::stringstream ss; for (const auto& t : tokens) { @@ -1535,7 +1223,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) FT_CHECK(!r); } else if (r->stream_output && tp_rank_ == 0) { - const auto seq_len = r->sequence_length.getVal(); + const auto seq_len = *r->sequence_length.data(); // Create signals by copying the request handles for non-finished streaming requests signals.push_back([this, r, seq_len] { // UpdateState(*r, Request::kOk, seq_len); @@ -1556,8 +1244,7 @@ void LlamaBatch::Finish(GenerationState& g, std::vector& signals) } } -template -auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Signal +auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Signal { if (tp_rank_ == 0) { TM_LOG_INFO("[Interrupt] slot %d, request %lu, stop %d, end %d", @@ -1569,7 +1256,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig if (debug_ && tp_rank_ == 0) { std::vector tokens(state_->h_context_length[index]); - Copy(state_->output_ids + index * session_len_, tokens.size(), tokens.data()); + core::Copy(state_->output_ids.data() + index * session_len_, tokens.size(), tokens.data()); cudaStreamSynchronize(stream_); std::stringstream ss; for (const auto& t : tokens) { @@ -1590,13 +1277,13 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig seq.tokens.resize(output_len); // output_ids is updated & synced in `Finish` - const auto output_ids = state_->requests[index]->output_ids.getPtr(); + const auto output_ids = state_->requests[index]->output_ids.data(); std::copy_n(output_ids, output_len, seq.tokens.data()); // Save random state in host memory seq.random_state.resize(sizeof(curandState_t)); // This 
async copy must be synchronized by the caller - Copy(state_->curand_state + index, 1, (curandState_t*)seq.random_state.data()); + core::Copy((curandState_t*)state_->curand_state.data() + index, 1, (curandState_t*)seq.random_state.data()); // Set unlock flag for corresponding blocks, will be unlocked in the next `Materialize()` sequence_manager_->UpdateAndSetUnlock(seq); @@ -1606,7 +1293,7 @@ auto LlamaBatch::Interrupt(int index, bool force_stop, bool force_end) -> Sig auto ec = std::exchange(state_->errors[index], Request::kOk); - const auto len = state_->requests[index]->sequence_length.getVal(); + const auto len = *state_->requests[index]->sequence_length.data(); // move the request handle into the signal return [this, len, force_stop, r = std::move(state_->requests[index])] { // UpdateState(*r, force_stop ? Request::kCancel : Request::kFinish, len); @@ -1625,12 +1312,13 @@ struct RequestData { } // namespace -template -void LlamaBatch::InternalThreadEntry() +void LlamaBatch::InternalThreadEntry() { // TM_LOG_INFO("[InternalThreadEntry] %d", (int)rank_); check_cuda_error(cudaSetDevice(device_id_)); + core::ContextGuard guard{context_->core_stream, context_->allocator}; + // Initialize `AnomalyHandler` AnomalyHandler::instance().Init(tp_rank_, model_->vocab_size_padded_, 0, max_batch_size_, stream_); @@ -1712,8 +1400,7 @@ void LlamaBatch::InternalThreadEntry() DestroyCommunicators(); } -template -void LlamaBatch::Start() +void LlamaBatch::Start() { TM_LOG_INFO("LlamaBatch::Start()"); internal_thread_ = std::thread([this] { @@ -1727,8 +1414,7 @@ void LlamaBatch::Start() }); } -template -bool LlamaBatch::Forward(GenerationState& g) +bool LlamaBatch::Forward(GenerationState& g) { NvtxScope _("Forward"); @@ -1749,7 +1435,7 @@ bool LlamaBatch::Forward(GenerationState& g) // const int missing = state_->h_context_length[i] - seq.cache_len; FT_CHECK(seq.input_length >= 1); h_input_length_buf_[i] = seq.input_length; - input_d_ptrs[i] = state_->output_ids + i * session_len_ + seq.cache_len; + input_d_ptrs[i] = state_->output_ids.data() + i * session_len_ + seq.cache_len; if (seq.input_length > 1 && pf_offset < 0) { pf_offset = i; } @@ -1800,7 +1486,7 @@ bool LlamaBatch::Forward(GenerationState& g) const int first = offsets[p]; const int last = offsets[p + 1]; const int mini_batch_size = last - first; - int* input_ids = context_decoder_ids_buf_; + int* input_ids = input_ids_buf_.data(); BatchedCopy batched_copy; int sum_k = 0; @@ -1810,7 +1496,7 @@ bool LlamaBatch::Forward(GenerationState& g) sum_k += state_->h_context_length[i]; } } - int sum_q = input_ids - context_decoder_ids_buf_; + int sum_q = input_ids - input_ids_buf_.data(); batched_copy.Submit(stream_); @@ -1819,8 +1505,10 @@ bool LlamaBatch::Forward(GenerationState& g) if (tp_rank_ == 0) { if (pf_batch_size) { - const auto max_q = *std::max_element(h_input_length_buf_ + first, h_input_length_buf_ + last); - const auto max_k = *std::max_element(state_->h_context_length + first, state_->h_context_length + last); + const auto max_q = + *std::max_element(h_input_length_buf_.data() + first, h_input_length_buf_.data() + last); + const auto max_k = + *std::max_element(state_->h_context_length.data() + first, state_->h_context_length.data() + last); TM_LOG_INFO("[Forward] [%d, %d), dc=%d, pf=%d, sum_q=%d, sum_k=%d, max_q=%d, max_k=%d", first, last, @@ -1835,68 +1523,71 @@ bool LlamaBatch::Forward(GenerationState& g) // Synchronize batch token num with sync DP ranks auto local_token_nums = AllGather(comm_.h_dp_group, sum_q); + auto global_token_num 
= std::accumulate(local_token_nums.begin(), local_token_nums.end(), 0); - // if (comm_.h_comm->rank() == 0) { - // std::stringstream ss; - // for (auto x : local_token_nums) { - // ss << x << " "; - // } - // TM_LOG_ERROR("%s", ss.str().c_str()); - // } - - model_->forwardUnified(decoder_output_buf_ + first * model_->hidden_units_, - context_decoder_output_buf_, // temp - context_decoder_input_buf_, // temp - (void**)block_ptrs_, - cu_block_counts_ + first, - context_decoder_ids_buf_, // temp - h_input_length_buf_ + first, - state_->h_context_length + first, - rope_theta_ + first, - finished_buf_ + first, - sum_q, - local_token_nums.data(), - dc_batch_size, - pf_batch_size, - lora_mask_buf_, - state_->sequences.data() + first); - - ComputeAndOutputLogits(context_decoder_output_buf_, first, last); - OutputLastHiddenState(context_decoder_output_buf_, first, last); - } - - if (active_size > g.partial) { - model_->postDecodeEmbedding(logits_buf_, local_logits_buf_, decoder_output_buf_, active_size - g.partial); - - AnomalyHandler::instance().FixLogits(logits_buf_, active_size - g.partial, 1); - - OutputLogits(logits_buf_, 0, active_size - g.partial, GenerationConfig::kGeneration); + auto hidden_states = symm_hidden_states_buf_.slice(0, global_token_num); - FT_CHECK(g.step >= 0); + model_->Forward(input_ids_buf_.slice(0, sum_q), // temp + hidden_states, // temp + decoder_output_buf_.slice(first, mini_batch_size), + block_ptrs_, + cu_block_counts_.slice(first, mini_batch_size + 1), + h_input_length_buf_.slice(first, mini_batch_size), + state_->h_context_length.slice(first, mini_batch_size), + rope_theta_.slice(first, mini_batch_size), + finished_buf_.slice(first, mini_batch_size), + Buffer(local_token_nums.data(), local_token_nums.size(), kCPU), + lora_mask_buf_, + dc_batch_size, + pf_batch_size, + state_->sequences.data() + first); + + ComputeAndOutputLogits(hidden_states, first, last); + OutputLastHiddenState(hidden_states, first, last); + } + + if (const auto bsz = active_size - g.partial; bsz > 0) { + + auto logits = model_->postDecodeEmbedding(decoder_output_buf_.slice(0, bsz), symm_logits_buf_.buffer()); + + // AnomalyHandler::instance().FixLogits(logits.data(), bsz, 1); + + OutputLogits(logits, 0, bsz, GenerationConfig::kGeneration); + + TM_CHECK_GE(g.step, 0); if (!g.skip_init_sampling) { InitializeSampling(g); } + + bool output_logprobs = [&] { + for (int i = 0; i < bsz; ++i) { + if (state_->requests[i]->gen_cfg.output_logprobs) { + return true; + } + } + return false; + }(); + // stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is - // not supported yet. + // not supported. model_->dynamicDecode(token_ids_buf_, finished_buf_, sequence_lengths_, - nullptr, state_->curand_state, - &inputs_, - &outputs_, - logits_buf_, + logits, // <- batch size indicator seq_limit_len_, init_context_length_, + state_->h_context_length, + state_->h_prompt_length, + output_logprobs ? 
sampled_indexes_ : Buffer{}, // <- indicator + sampled_logprobs_, + sampled_nums_, g.step, - 0, - g.max_init_ctx_len, - session_len_ * 2, - active_size - g.partial); + g.max_init_ctx_len); } - std::fill(h_input_length_buf_, h_input_length_buf_ + active_size, 0); + std::fill(h_input_length_buf_.data(), h_input_length_buf_.data() + active_size, 0); // `SequenceManager` needs real-time value of cache length for (int i = 0; i < active_size; ++i) { @@ -1918,7 +1609,7 @@ bool LlamaBatch::Forward(GenerationState& g) if (debug_ && tp_rank_ == 0) { std::vector curr(active_size); - Copy(token_ids_buf_ + g.step * active_size, active_size, curr.data()); + core::Copy(token_ids_buf_.data() + g.step * active_size, active_size, curr.data()); cudaStreamSynchronize(stream_); std::stringstream scurr; for (int k = 0; k < curr.size(); ++k) { @@ -1927,14 +1618,10 @@ bool LlamaBatch::Forward(GenerationState& g) TM_LOG_INFO("[Forward] step = %d, [%s]", g.step - 1, scurr.str().c_str()); } - // check_cuda_error(cudaStreamSynchronize(stream_)); - //////////////////////////////////////////////// /// ! increase the counters g.step += 1; - // PrintDecodeTokens(token_ids_buf_, g.step, active_size, stream_, "Forward"); - return true; } @@ -1954,11 +1641,10 @@ std::string Join(First first, Last last, const std::string& delim) return oss.str(); } -template struct TuningContext { - LlamaLinear& linear_; - cudaStream_t stream_; - TuningContext(LlamaLinear& linear, cudaStream_t stream): linear_{linear}, stream_{stream} + LlamaLinear& linear_; + cudaStream_t stream_; + TuningContext(LlamaLinear& linear, cudaStream_t stream): linear_{linear}, stream_{stream} { isTuning() = true; linear_.set_measure(true); @@ -1972,8 +1658,7 @@ struct TuningContext { } // namespace -template -void LlamaBatch::Warmup() +void LlamaBatch::Warmup() { auto& linear = *context_->linear; if (auto str = std::getenv("TM_GEMM_IMPORT")) { @@ -2006,7 +1691,7 @@ void LlamaBatch::Warmup() for (auto& x : input_ids) { x = d(g); } - Copy(input_ids.data(), max_bs, context_decoder_ids_buf_); + core::Copy(input_ids.data(), max_bs, input_ids_buf_.data()); check_cuda_error(cudaStreamSynchronize(stream_)); TuningContext context{linear, stream_}; @@ -2014,29 +1699,31 @@ void LlamaBatch::Warmup() auto tick = std::chrono::steady_clock::now(); /// NOTE: No explicit barrier can be used here as internal threads are waiting on it now - for (auto bs : bss) { + for (auto token_num : bss) { if (tp_rank_ == 0) { - TM_LOG_INFO("[Gemm2] %d", bs); + TM_LOG_INFO("[Gemm2] %d", token_num); } - const int input_length = bs; - auto local_token_nums = AllGather(comm_.h_dp_group, bs); - - model_->forwardUnified(decoder_output_buf_, - context_decoder_output_buf_, - context_decoder_input_buf_, - (void**)block_ptrs_, // invalid data - cu_block_counts_, // invalid data - context_decoder_ids_buf_, - &input_length, - &input_length, - rope_theta_, // invalid data - finished_buf_, // invalid data - bs, - local_token_nums.data(), - 0, - 1, - nullptr, - nullptr); + + int input_length = token_num; + auto local_token_nums = AllGather(comm_.h_dp_group, token_num); + + const auto bsz = 1; + + // A single sequence containing `token_num` prefill tokens + model_->Forward(input_ids_buf_.slice(0, token_num), + symm_hidden_states_buf_.slice(0, token_num * param_.attn_dp_size), + decoder_output_buf_.slice(0, bsz), + block_ptrs_, + cu_block_counts_.slice(0, bsz + 1), + Buffer{&input_length, 1, kCPU}, + Buffer{&input_length, 1, kCPU}, + rope_theta_.slice(0, bsz), + finished_buf_.slice(0, bsz), + 
Buffer{local_token_nums.data(), (int)local_token_nums.size(), kCPU}, + Buffer{}, + 0, + bsz, + nullptr); } auto tock = std::chrono::steady_clock::now(); @@ -2060,8 +1747,7 @@ void LlamaBatch::Warmup() } } -template -void* LlamaBatch::CommBufAlloc(size_t size, bool register_) +void* LlamaBatch::SymmAlloc(size_t size, bool register_) { if (auto& comm = model_->comm_->d_comm) { auto ptr = comm->Allocate(size); @@ -2071,52 +1757,39 @@ void* LlamaBatch::CommBufAlloc(size_t size, bool register_) return ptr; } else { - return allocator_->malloc(size); + return context_->allocator->allocate(size); } } -template -void LlamaBatch::CommBufFree(void** ptr, bool deregister) +void LlamaBatch::SymmFree(void* ptr, size_t size, bool deregister) { if (!ptr) { return; } - if (auto& comm = model_->comm_->d_comm) { + if (auto& comm = comm_.d_comm) { if (deregister) { - comm->Deregister(*ptr); + comm->Deregister(ptr); } - comm->Free(*ptr); - *ptr = {}; + comm->Free(ptr); } else { - return allocator_->free(ptr); + context_->allocator->deallocate(ptr, size); } } -template -void LlamaBatch::DestroyCommunicators() +void LlamaBatch::DestroyCommunicators() { - if (comm_.d_comm) { - cudaStreamSynchronize(stream_); - comm_.h_comm->Sync(); + cudaStreamSynchronize(stream_); + comm_.h_comm->Sync(); - FreeCommBuffers(); - comm_.h_comm->Sync(); + FreeSymmBuffers(); + comm_.h_comm->Sync(); - // Destroy device communicator - comm_.d_comm = {}; + // Destroy device communicator + comm_.d_comm = {}; - cudaStreamSynchronize(stream_); - comm_.h_comm->Sync(); - } + cudaStreamSynchronize(stream_); + comm_.h_comm->Sync(); } -template class LlamaBatch; -#ifdef ENABLE_FP32 -template class LlamaBatch; -#endif -#ifdef ENABLE_BF16 -template class LlamaBatch<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaBatch.h b/src/turbomind/models/llama/LlamaBatch.h index 4fc5cee93a..837f841188 100644 --- a/src/turbomind/models/llama/LlamaBatch.h +++ b/src/turbomind/models/llama/LlamaBatch.h @@ -4,30 +4,31 @@ #include +#include "src/turbomind/core/core.h" + #include "src/turbomind/engine/gateway.h" #include "src/turbomind/engine/request.h" -#include "src/turbomind/models/llama/Barrier.h" #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { struct BatchState { - int* h_prompt_length; // history + input, ignore generated - int* h_context_length; - bool* h_finished; - curandState_t* curand_state; - int* output_ids; // output ids in [B, S] + Buffer_ h_prompt_length; // history + input, ignore generated + Buffer_ h_context_length; + Buffer_ h_finished; + + Tensor_ curand_state; // [n, sizeof(curandState_t)] + + Tensor_ output_ids; // output ids in [B, S] - float* h_rope_theta; + Buffer_ h_rope_theta; std::vector seq_len_limit; @@ -42,7 +43,6 @@ struct BatchState { int size; }; -template class LlamaV2; struct GenerationState { @@ -62,14 +62,12 @@ struct GenerationState { int finished_count; }; -template class LlamaBatch { public: - void AllocateBuffer(size_t batch_size, size_t session_len, int cache_block_seq_len); - void AllocatePersistantBuffer(size_t max_batch_size, int cache_block_seq_len); + void AllocateBuffer(ssize_t batch_size, ssize_t session_len, int cache_block_seq_len); - void 
AllocCommBuffers(); - void FreeCommBuffers(); + void AllocSymmBuffers(); + void FreeSymmBuffers(); void FreeBuffer(); @@ -96,24 +94,25 @@ class LlamaBatch { [[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false); - void ComputeAndOutputLogits(T* hidden_states, int first, int last); + void ComputeAndOutputLogits(const Tensor& hidden_states, int first, int last); - void OutputLogits(const T* logits, int first, int last, GenerationConfig::OutType out_type); + void OutputLogits(const Tensor& logits, int first, int last, GenerationConfig::OutType out_type); - void OutputLastHiddenState(const T* hidden_states, int first, int last); + void OutputLastHiddenState(const Tensor& hidden_states, int first, int last); - explicit LlamaBatch(const EngineParam& param, - std::unique_ptr> model, - std::unique_ptr> ctx, - std::shared_ptr gateway, - int device_id, - int dp_rank); + explicit LlamaBatch(DataType data_type, + const EngineParam& param, + std::unique_ptr model, + std::unique_ptr ctx, + std::shared_ptr gateway, + int device_id, + int dp_rank); ~LlamaBatch(); void Start(); - LlamaV2& model() noexcept + LlamaV2& model() noexcept { return *model_; } @@ -136,21 +135,6 @@ class LlamaBatch { void CopyState(const std::vector>& desc); - // analogs to `std::copy_n` - template - U* Copy(const U* src, size_t count, U* dst) - { - check_cuda_error(cudaMemcpyAsync(dst, src, sizeof(U) * count, cudaMemcpyDefault, stream_)); - return dst += count; - } - - template - U* Clear(U* data, size_t count) - { - check_cuda_error(cudaMemsetAsync(data, 0, sizeof(U) * count, stream_)); - return data += count; - } - template void IndexedCopyImpl(const int* src_idx, const int* dst_idx, int count, const std::tuple&... cpys) { @@ -192,9 +176,9 @@ class LlamaBatch { IndexedCopyImpl(nullptr, nullptr, count, cpys...); } - void* CommBufAlloc(size_t size, bool register_); + void* SymmAlloc(size_t size, bool register_); - void CommBufFree(void** ptr, bool deregister); + void SymmFree(void* ptr, size_t size, bool deregister); void DestroyCommunicators(); @@ -216,86 +200,68 @@ class LlamaBatch { const bool debug_; // Refs into `Context` - cudaStream_t const stream_{}; - cublasMMWrapper* const cublas_wrapper_{}; - IAllocator* const allocator_{}; + cudaStream_t const stream_{}; int session_len_; // May be truncated in ctor - std::unique_ptr> context_; - std::unique_ptr> model_; + std::unique_ptr context_; + std::unique_ptr model_; std::unique_ptr sequence_manager_; Communicators& comm_; + Allocator symm_alloc_; + /////////////////////////////////////////////////////////////////// // k/v cache block buffers - int* cu_block_counts_{}; - uintptr_t* block_ptrs_{}; + Buffer_ cu_block_counts_; + Buffer_ block_ptrs_; //////////////////////////////////////////////////////////////////// // context decoding temp buffers - T* context_decoder_input_buf_{}; - T* context_decoder_output_buf_{}; - int* context_decoder_ids_buf_{}; - int* input_ids_buf_{}; - // lengths - int* input_length_buf_{}; // input + cache missed length - int* context_length_buf_{}; // history length + input_length - int* init_context_length_{}; + Tensor symm_hidden_states_buf_; + Tensor symm_logits_buf_; + + Tensor decoder_output_buf_; - T* decoder_input_buf_{}; - T* decoder_output_buf_{}; - int* sequence_lengths_{}; // current sequence length - int* init_ctx_lens_{}; - int* lora_mask_buf_{}; // lora + Buffer_ input_ids_buf_; - T* logits_buf_{}; // combined logits - T* local_logits_buf_{}; // tensor parallel local logits - T* context_logits_buf_{}; 
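For context on these header changes: the raw pointers that required explicit reMalloc/free bookkeeping (and the is_allocate_* flags) are replaced by owning buffer/tensor members, so destruction releases memory and assigning `{}` drops a buffer. A minimal sketch of that ownership idea, assuming plain cudaMalloc/cudaFree; the real core types additionally carry dtype, shape, a memory-location tag (kDEVICE/kCPUpinned) and slicing:

    #include <cstddef>
    #include <cuda_runtime.h>

    // Move-only RAII device buffer: construction allocates, destruction frees.
    template<class T>
    class DeviceBuffer {
    public:
        DeviceBuffer() = default;
        explicit DeviceBuffer(std::size_t count): count_{count}
        {
            if (count_) {
                cudaMalloc(reinterpret_cast<void**>(&data_), sizeof(T) * count_);
            }
        }
        ~DeviceBuffer()
        {
            if (data_) {
                cudaFree(data_);
            }
        }
        DeviceBuffer(DeviceBuffer&& o) noexcept: data_{o.data_}, count_{o.count_}
        {
            o.data_  = nullptr;
            o.count_ = 0;
        }
        DeviceBuffer& operator=(DeviceBuffer&& o) noexcept
        {
            if (this != &o) {
                if (data_) {
                    cudaFree(data_);
                }
                data_    = o.data_;
                count_   = o.count_;
                o.data_  = nullptr;
                o.count_ = 0;
            }
            return *this;
        }
        T*          data() const noexcept { return data_; }
        std::size_t size() const noexcept { return count_; }

    private:
        T*          data_{};
        std::size_t count_{};
    };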
- T* local_context_logits_buf_{}; + // lengths + Buffer_ input_length_buf_; // input + cache missed length + Buffer_ context_length_buf_; // history length + input_length + Buffer_ init_context_length_; - size_t local_context_logits_buf_size_{}; + Buffer_ sequence_lengths_; // current sequence length + Buffer_ init_ctx_lens_; + Buffer_ lora_mask_buf_; // lora - T* sampled_logprobs_{}; - uint32_t* sampled_indexes_{}; - uint32_t* sampled_nums_{}; - T* h_sampled_logprobs_{}; - uint32_t* h_sampled_indexes_{}; - uint32_t* h_sampled_nums_{}; + Buffer_ sampled_logprobs_; + Buffer_ sampled_indexes_; + Buffer_ sampled_nums_; + Buffer_ h_sampled_logprobs_; + Buffer_ h_sampled_indexes_; + Buffer_ h_sampled_nums_; - float* rope_theta_{}; + Buffer_ rope_theta_; // used by dynamic decoder - int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step` - bool* finished_buf_{}; - uint32_t* seq_limit_len_{}; - int* h_end_ids_buf_{}; - int* d_end_ids_buf_{}; + Buffer_ token_ids_buf_; // all token IDs in [S, B], indexed using `step` + Buffer_ finished_buf_; + Buffer_ seq_limit_len_; // pinned buffers - int* h_input_ids_buf_{}; - int* h_input_length_buf_{}; - uint32_t* h_seq_limit_len_{}; - int* h_cu_block_counts_{}; - uintptr_t* h_block_ptrs_{}; - - int* h_min_length_{}; - int* h_runtime_top_k_{}; - float* h_runtime_top_p_{}; - float* h_runtime_min_p_{}; - float* h_temperature_{}; - float* h_repetition_penalty_{}; - int* h_stop_words_{}; // [batch_size, 2, kMaxStopWordsLen] - int* h_bad_words_{}; - int* d_stop_words_{}; // [batch_size, 2, kMaxStopWordsLen] - int* d_bad_words_{}; - - unsigned long long* h_random_seed_{}; - unsigned long long* d_random_seed_{}; - - curandState_t* h_curand_state_{}; - curandState_t* d_curand_state_{}; + Buffer_ h_output_ids_; + Buffer_ h_input_length_buf_; + Buffer_ h_seq_limit_len_; + + Buffer_ h_cu_block_counts_; + Buffer_ h_block_ptrs_; + + Buffer_ h_random_seed_; + Buffer_ d_random_seed_; + + Tensor_ h_curand_state_; // [n, sizeof(curandState_t)] + Tensor_ d_curand_state_; std::array states_{}; @@ -307,18 +273,9 @@ class LlamaBatch { static constexpr int kMaxStopBadWordsLen = 32; static constexpr int kMaxEndIdsSize = 32; - bool is_allocate_persistant_buffer_ = false; - bool is_allocate_buffer_ = false; - - TensorMap inputs_; - TensorMap outputs_; - std::thread internal_thread_; - - int* h_output_ids_{}; }; -template -using Engine = LlamaBatch; +using Engine = LlamaBatch; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc index 9e1e2eb4dc..5fc7040c99 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.cc @@ -18,19 +18,18 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc +#include + +#include +#include + #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" -#include "src/turbomind/kernels/gemm/cast.h" -#include "src/turbomind/kernels/gemm/gemm.h" -#include "src/turbomind/kernels/gemm/types.h" -#include "src/turbomind/kernels/gpt_kernels.h" + #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" -#include -#include -#include + namespace turbomind { static bool is_fuse_silu_act() @@ -52,17 +51,18 @@ static 
bool is_fuse_silu_act() return value; } -template -LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, - const ModelParam& model, - const EngineParam& engine, - const LoraParam& lora_param, - const MoeParam& moe_param): +LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(DataType data_type, + int layer_id, + const ModelParam& model, + const EngineParam& engine, + const LoraParam& lora_param, + const MoeParam& moe_param): head_num_(model.head_num), kv_head_num_(model.kv_head_num), size_per_head_(model.head_dim), hidden_units_(model.hidden_units), inter_size_(model.inter_size.at(layer_id)), + data_type_{data_type}, weight_type_(model.weight_type), attn_bias_(model.attn_bias), attn_tp_size_(engine.attn_tp_size), @@ -70,663 +70,68 @@ LlamaDecoderLayerWeight::LlamaDecoderLayerWeight(int layer_id, mlp_tp_size_(engine.mlp_tp_size), mlp_tp_rank_(engine.mlp_tp_rank) { - self_attn_weights = LlamaAttentionWeight{hidden_units_, - size_per_head_, - head_num_, - kv_head_num_, - model.mla, - attn_bias_, - model.qk_norm, - attn_tp_size_, - weight_type_, - model.group_size}; - - ffn_weights = LlamaFfnWeight{ - hidden_units_, - inter_size_, - mlp_tp_size_, - weight_type_, - model.group_size, - weight_type_ == WeightType::kINT4 && is_fuse_silu_act(), - }; - - moe_weights = MoeFfnWeight{ - layer_id, moe_param, hidden_units_, weight_type_, model.group_size, mlp_tp_size_, is_fuse_silu_act()}; - - if (lora_param.policy == LoraPolicy::kPlora) { - std::vector keys = { - "attention.w_qkv", "attention.wo", "feed_forward.w1", "feed_forward.w2", "feed_forward.w3"}; - std::vector*> weights = {&self_attn_weights.qkv, - &self_attn_weights.output, - &ffn_weights.gating, - &ffn_weights.output, - &ffn_weights.intermediate}; - for (int i = 0; i < keys.size(); i++) { - const auto& name = keys[i]; - auto& weight = *weights[i]; - int rank = lora_param.r; - float scale = lora_param.scale; - std::string full_name = "layers." + std::to_string(layer_id) + "." 
+ name; - - for (const auto& [re, pr] : lora_param.rank_pattern) { - if (std::regex_search(full_name, pr.first)) { - rank = pr.second; - TM_LOG_DEBUG("find rank, pattern=%s, name=%s, value=%d", re.c_str(), full_name.c_str(), rank); - break; - } - } - for (const auto& [re, pr] : lora_param.scale_pattern) { - if (std::regex_search(full_name, pr.first)) { - scale = pr.second; - TM_LOG_DEBUG("find scale pattern=%s, name=%s, value=%f", re.c_str(), full_name.c_str(), scale); - break; - } - } - if (rank) { - weight.lora.r = rank; - weight.lora.scale = scale; - weight.lora.policy = lora_param.policy; - } - } - } - - fused_up_and_gate_ = ffn_weights.gating.lora.policy != LoraPolicy::kPlora; -} - -template -void LlamaDecoderLayerWeight::malloc(cudaStream_t st) -{ - deviceMalloc((T**)&self_attn_norm_weights, hidden_units_, st); - deviceMalloc((T**)&ffn_norm_weights, hidden_units_, st); - - self_attn_weights.malloc(st); - - if (inter_size_) { - ffn_weights.malloc(st); - } - - if (!moe_weights.experts.empty()) { - moe_weights.malloc(st); - } -} - -template -size_t LlamaDecoderLayerWeight::workspace_size() const noexcept -{ - // Space to hold the largest weight in full precision - - auto get_size = [](const auto& w) { return (size_t)w.input_dims * w.output_dims; }; - - size_t size = 0; - - size = std::max(size, get_size(self_attn_weights.qkv)); - size = std::max(size, get_size(self_attn_weights.output)); - size = std::max(size, get_size(ffn_weights.gating)); - size = std::max(size, get_size(ffn_weights.fused_gating_intermediate)); - - for (const auto& e : moe_weights.experts) { - size = std::max(size, get_size(e.gating)); - size = std::max(size, get_size(e.fused_gating_intermediate)); - } - - return size * sizeof(uint16_t); -} - -template -std::string concat(FirstArg&& first, Args&&... args) -{ - std::stringstream stream; - stream << first; - ((stream << "." 
<< args), ...); - return stream.str(); -} - -template -void getWeightTensor(LlamaDenseWeight& weights, bool bias, const std::string& prefix, TensorMap& output) -{ - auto get_name = [=](const std::string& name) { return concat(prefix, name); }; - - if (bias) { - output.insert(get_name("bias"), Tensor{MEMORY_GPU, getTensorType(), {weights.bias_size()}, weights.bias}); - } - - const size_t bit_size = getBitSize(weights.type); - if (bit_size >= 16) { - output.insert(get_name("weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.kernel_size()}, weights.kernel}); - } - else { - output.insert(get_name("qweight"), Tensor{MEMORY_GPU, TYPE_INT32, {weights.kernel_size()}, weights.kernel}); - output.insert(get_name("scales"), - Tensor{MEMORY_GPU, getTensorType(), {weights.scales_size()}, weights.scales}); - output.insert(get_name("zeros"), - Tensor{MEMORY_GPU, getTensorType(), {weights.scales_size()}, weights.zeros}); - } - - if (weights.lora.r) { - auto n = prefix.rfind("."); - - std::string _prefix = prefix.substr(0, n); - std::string _num = prefix.substr(n + 1); - - output.insert(concat(_prefix, "lora_a", _num, "weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.lora_size().first}, weights.lora.a}); - output.insert(concat(_prefix, "lora_b", _num, "weight"), - Tensor{MEMORY_GPU, getTensorType(), {weights.lora_size().second}, weights.lora.b}); - - TM_LOG_DEBUG("allocate lora weight, layer_name=%s input_dims=%d, output_dims=%d, lora_r=%d", - get_name("weight").c_str(), - weights.input_dims, - weights.output_dims, - weights.lora.r); - } -} - -template -void loadWeights( - LlamaDenseWeight& w, std::string prefix, int rank, FtCudaDataType model_file_type, size_t tensor_para_size) -{ - auto weight_file = prefix + "." + std::to_string(tensor_para_size - 1) + ".weight"; - auto qweight_file = prefix + "." + std::to_string(tensor_para_size - 1) + ".qweight"; - - if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) { - TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str()); - FT_CHECK(false); - } - - prefix += "." + std::to_string(rank); - - size_t dim0 = w.input_dims; - size_t dim1 = w.output_dims; - const auto type = model_file_type; - - if (w.bias) { - loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type); - } - const size_t bit_size = getBitSize(w.type); - if (bit_size >= 16) { // fp16, fp32 - loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - - FT_CHECK(dim1 % factor == 0); - - std::vector w_shape{dim0, dim1 / factor * sizeof(uint32_t)}; - loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8); - - const size_t group_count = w.group_size > 0 ? 
dim0 / w.group_size : 1; - - loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type); - loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type); - } -} - -template -void loadWeights(LlamaDenseWeight& w, std::string prefix, FtCudaDataType model_file_type) -{ - auto weight_file = prefix + ".weight"; - auto qweight_file = prefix + ".qweight"; - - if (!std::filesystem::exists(weight_file) && !std::filesystem::exists(qweight_file)) { - TM_LOG_ERROR("%s and %s does not exist", weight_file.c_str(), qweight_file.c_str()); - FT_CHECK(false); - } - - size_t dim0 = w.input_dims; - size_t dim1 = w.output_dims; - const auto type = model_file_type; - - if (w.bias) { - loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type); - } - const size_t bit_size = getBitSize(w.type); - if (bit_size >= 16) { // fp16, fp32 - loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - - FT_CHECK(dim1 % factor == 0); - - std::vector w_shape{dim0, dim1 / factor * sizeof(uint32_t)}; - loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8); - - const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1; - - loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type); - loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type); - } -} - -template -void LlamaDecoderLayerWeight::free(cudaStream_t st) -{ - deviceFree(self_attn_norm_weights, st); - deviceFree(ffn_norm_weights, st); - - self_attn_weights.free(st); - - if (inter_size_) { - ffn_weights.free(st); - } - - if (!moe_weights.experts.empty()) { - moe_weights.free(st); - } -} - -template -LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; - -template -void LlamaDecoderLayerWeight::loadModel(std::string dir_path, FtCudaDataType model_file_type) -{ - const auto type = model_file_type; - - loadWeightFromBin( - (T*)self_attn_norm_weights, {hidden_units_}, dir_path + ".attention_norm.weight", model_file_type); - loadWeightFromBin((T*)ffn_norm_weights, {hidden_units_}, dir_path + ".ffn_norm.weight", model_file_type); - - loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", attn_tp_rank_, type, attn_tp_size_); - - loadWeights(self_attn_weights.output, dir_path + ".attention.wo", attn_tp_rank_, type, attn_tp_size_); - if (moe_weights.experts.empty()) { - loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", mlp_tp_rank_, type, mlp_tp_size_); - } - else { - loadWeights(moe_weights.gate, dir_path + ".moe_ffn.gate", type); - for (size_t i = 0; i < moe_weights.experts.size(); ++i) { - std::string weight_name = dir_path + ".moe_ffn.experts." 
+ std::to_string(i); - loadWeights(moe_weights.experts[i].gating, weight_name + ".w1", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(moe_weights.experts[i].intermediate, weight_name + ".w3", mlp_tp_rank_, type, mlp_tp_size_); - loadWeights(moe_weights.experts[i].output, weight_name + ".w2", mlp_tp_rank_, type, mlp_tp_size_); - } - } -} - -template -void getMLATensor(LlamaAttentionWeight& w, const std::string& p, TensorMap& m, int tp_rank) -{ - if (w.q_proj.output_dims) { - getWeightTensor(w.q_proj, false, concat(p, "attention.q_proj", tp_rank), m); - } - else { - getWeightTensor(w.q_a_proj, false, concat(p, "attention.q_a_proj"), m); - getWeightTensor(w.q_b_proj, false, concat(p, "attention.q_b_proj", tp_rank), m); - m.insert(concat(p, "attention.q_a_layernorm"), - Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.q_b_proj.input_dims}, w.q_a_layernorm}); - } - getWeightTensor(w.kv_a_proj, false, concat(p, "attention.kv_a_proj"), m); - getWeightTensor(w.kv_b_proj, false, concat(p, "attention.kv_b_proj", tp_rank), m); - m.insert(concat(p, "attention.kv_a_layernorm"), - Tensor{MEMORY_GPU, getTensorType(), {sizeof(T) * w.kv_b_proj.input_dims}, w.kv_a_layernorm}); -} - -template -TensorMap LlamaDecoderLayerWeight::getParams(std::string prefix) -{ - TensorMap output; - - output.insert(concat(prefix, "attention_norm.weight"), - Tensor{MEMORY_GPU, getTensorType(), {hidden_units_ * sizeof(T)}, self_attn_norm_weights}); - - output.insert(concat(prefix, "ffn_norm.weight"), - Tensor{MEMORY_GPU, getTensorType(), {hidden_units_ * sizeof(T)}, ffn_norm_weights}); - - auto get_attn = [=](std::string_view name) { return concat(prefix, name, attn_tp_rank_); }; - - if (self_attn_weights.qkv.output_dims) { - getWeightTensor(self_attn_weights.qkv, attn_bias_, get_attn("attention.w_qkv"), output); - - if (self_attn_weights.qk_norm) { - output.insert(concat(prefix, "attention.q_norm"), - Tensor{MEMORY_GPU, - getTensorType(), - {sizeof(T) * self_attn_weights.head_dim}, - self_attn_weights.q_a_layernorm}); - output.insert(concat(prefix, "attention.k_norm"), - Tensor{MEMORY_GPU, - getTensorType(), - {sizeof(T) * self_attn_weights.head_dim}, - self_attn_weights.kv_a_layernorm}); - } - } - else { - getMLATensor(self_attn_weights, prefix, output, attn_tp_rank_); - } - getWeightTensor(self_attn_weights.output, attn_bias_, get_attn("attention.wo"), output); - - auto get_mlp = [=](std::string_view name) { return concat(prefix, name, mlp_tp_rank_); }; + self_attn_weights.reset(new LlamaAttentionWeight{hidden_units_, + size_per_head_, + head_num_, + kv_head_num_, + model.mla, + attn_bias_, + model.qk_norm, + attn_tp_size_, + attn_tp_rank_, + data_type_, + weight_type_, + model.group_size}); + register_module("attention", *self_attn_weights); if (inter_size_) { - getWeightTensor(ffn_weights.gating, false, get_mlp("feed_forward.w1"), output); - getWeightTensor(ffn_weights.intermediate, false, get_mlp("feed_forward.w3"), output); - getWeightTensor(ffn_weights.output, false, get_mlp("feed_forward.w2"), output); - } - - if (!moe_weights.experts.empty()) { - output.insert( - concat(prefix, "moe_ffn.gate.weight"), - Tensor{MEMORY_GPU, getTensorType(), {moe_weights.gate.kernel_size()}, moe_weights.gate.kernel}); - auto& experts = moe_weights.experts; - for (size_t i = 0; i < experts.size(); ++i) { - const std::string name = "moe_ffn.experts." 
+ std::to_string(i); - getWeightTensor(experts[i].gating, false, get_mlp(concat(name, "w1")), output); - getWeightTensor(experts[i].intermediate, false, get_mlp(concat(name, "w3")), output); - getWeightTensor(experts[i].output, false, get_mlp(concat(name, "w2")), output); - } - if (moe_weights.shared_gate.kernel) { - output.insert(concat(prefix, "moe_ffn.shared_gate.weight"), - Tensor{MEMORY_GPU, - getTensorType(), - {moe_weights.shared_gate.kernel_size()}, - moe_weights.shared_gate.kernel}); - } - } - - return output; -} - -// template -static void convert_u4( - LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) -{ - FT_CHECK(weight.type == WeightType::kINT4); - - using namespace gemm; - - auto [order_b, pack_b, order_v, pack_v] = - get_weight_and_scales_layout(gemm::DataType::U4, is_fused_moe, getSMVersion(), use_simt); - - if (order_b == kColMajor) { - transpose_u4((uint4_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims, weight.output_dims, st); - cudaMemcpyAsync(weight.kernel, workspace, weight.input_dims * weight.output_dims / 2, cudaMemcpyDefault, st); - } - - extend_to_u16((uint16_t*)workspace, (const uint4_t*)weight.kernel, weight.input_dims * weight.output_dims, st); - sync_check_cuda_error(); - - MatrixLayout w_desc{ - gemm::DataType::F16, - order_b, - (int)weight.input_dims, // k - (int)weight.output_dims, // n - order_b == kRowMajor ? (int)weight.output_dims : (int)weight.input_dims, - }; - - MatrixLayout k_desc = w_desc; - k_desc.type = gemm::DataType::U4; - k_desc.pack = pack_b; - - cudaMemsetAsync(weight.kernel, 0, weight.input_dims * weight.output_dims / 2, st); - - FT_CHECK(Convert(workspace, w_desc, weight.kernel, k_desc, st) == 0); - sync_check_cuda_error(); - - const int scale_count = (weight.input_dims / weight.group_size) * weight.output_dims; - - // std::cout << "fuse_scales_and_zeros\n"; - fuse_scales_and_zeros((half*)workspace, weight.scales, weight.zeros, scale_count, st); - // cudaMemset((T*)workspace, 0, sizeof(T) * scale_count * 2); - sync_check_cuda_error(); - - deviceFree(weight.scales, st); - deviceFree(weight.zeros, st); - - deviceMalloc((half**)&weight.scales_zeros, scale_count * 2, st); - - MatrixLayout s_desc{ - gemm::DataType::U32, - order_v, - (int)weight.input_dims / weight.group_size, // k - (int)weight.output_dims, // n - (int)weight.output_dims, - }; - - MatrixLayout q_desc = s_desc; - q_desc.pack = pack_v; - - FT_CHECK(Convert(workspace, s_desc, weight.scales_zeros, q_desc, st) == 0); - sync_check_cuda_error(); - - weight.k_desc = k_desc; - weight.q_desc = q_desc; - - // FT_CHECK(0); -} - -template -static void -convert_fp(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) -{ - using namespace gemm; - - if (!is_fused_moe) { - return; - } - - const auto [order_b, pack_b, order_v, pack_v] = - get_weight_and_scales_layout(get_data_type_v, is_fused_moe, getSMVersion(), use_simt); - - const int input_dim = weight.input_dims; - const int output_dim = weight.output_dims; - - if (order_b == kColMajor) { - invokeTransposeAxis01((uint16_t*)workspace, (uint16_t*)weight.kernel, input_dim, output_dim, 1, st); - sync_check_cuda_error(); - // FT_CHECK(0); - } - else { - check_cuda_error( - cudaMemcpyAsync(workspace, weight.kernel, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); - } - - MatrixLayout src{ - get_data_type_v, - order_b, - input_dim, // k - output_dim, // n - order_b == kRowMajor ? 
output_dim : input_dim, - }; - - MatrixLayout dst = src; - dst.pack = pack_b; - - if (pack_b) { - FT_CHECK(Convert(workspace, src, weight.kernel, dst, st) == 0); - sync_check_cuda_error(); - // FT_CHECK(0); - } - else { - check_cuda_error( - cudaMemcpyAsync(weight.kernel, workspace, sizeof(T) * input_dim * output_dim, cudaMemcpyDefault, st)); + ffn_weights.reset(new LlamaFfnWeight{ + hidden_units_, + inter_size_, + mlp_tp_size_, + mlp_tp_rank_, + data_type_, + weight_type_, + model.group_size, + weight_type_ == data_type_v && is_fuse_silu_act(), + }); + register_module("feed_forward", *ffn_weights); } - weight.k_desc = dst; -} - -template -static void -convert(LlamaDenseWeight& weight, bool is_fused_moe, void* workspace, size_t size, bool use_simt, cudaStream_t st) -{ - if (weight.type == WeightType::kINT4) { - if constexpr (std::is_same_v) { - convert_u4(weight, is_fused_moe, workspace, size, use_simt, st); - } - else { - FT_CHECK(0); - } - } - else { - convert_fp(weight, is_fused_moe, workspace, size, use_simt, st); - } -} - -template -void interleave(LlamaDenseWeight& c, - LlamaDenseWeight& a, - LlamaDenseWeight& b, - void* workspace, - size_t size, - cudaStream_t st) -{ - FT_CHECK(c.input_dims == a.input_dims); - FT_CHECK(c.input_dims == b.input_dims); - FT_CHECK(c.output_dims == a.output_dims * 2); - FT_CHECK(c.output_dims == b.output_dims * 2); - FT_CHECK(c.group_size == a.group_size); - FT_CHECK(c.group_size == b.group_size); - - if (a.type == WeightType::kINT4) { - uint8_t* tmp_a = (uint8_t*)workspace; - uint8_t* tmp_b = tmp_a + a.output_dims * a.input_dims; - uint8_t* tmp_c = tmp_b + b.output_dims * b.input_dims; - - const auto sentinel = tmp_c + c.output_dims * c.input_dims; - FT_CHECK(sentinel <= (uint8_t*)workspace + size); - - extend_to_u8(tmp_a, (const uint4_t*)a.kernel, a.output_dims * a.input_dims, st); - extend_to_u8(tmp_b, (const uint4_t*)b.kernel, b.output_dims * b.input_dims, st); - - interleave_output_dims(tmp_c, tmp_a, tmp_b, a.output_dims, a.input_dims, st); - - compact_to_u4((uint4_t*)c.kernel, tmp_c, c.output_dims * c.input_dims, st); - - interleave_output_dims(c.scales, a.scales, b.scales, a.output_dims, a.input_dims / a.group_size, st); - interleave_output_dims(c.zeros, a.zeros, b.zeros, a.output_dims, a.input_dims / a.group_size, st); - } - else { - interleave_output_dims((T*)c.kernel, (const T*)a.kernel, (const T*)b.kernel, a.output_dims, a.input_dims, st); + if (layer_id < moe_param.expert_num.size() && moe_param.expert_num[layer_id]) { + moe_weights.reset(new MoeFfnWeight{layer_id, + moe_param, + hidden_units_, + data_type_, + weight_type_, + model.group_size, + mlp_tp_size_, + mlp_tp_rank_, + is_fuse_silu_act()}); + register_module("moe_ffn", *moe_weights); } - // Check at function level - sync_check_cuda_error(); + self_attn_norm = Tensor{{hidden_units_}, data_type_, kDEVICE}; + ffn_norm = Tensor{{hidden_units_}, data_type_, kDEVICE}; + register_parameter("attention_norm.weight", self_attn_norm); + register_parameter("ffn_norm.weight", ffn_norm); } -template -void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, void*, size_t, cudaStream_t st) -{ - FT_CHECK(c.input_dims == a.input_dims); - FT_CHECK(c.input_dims == b.input_dims); - FT_CHECK(c.output_dims == a.output_dims * 2); - FT_CHECK(c.output_dims == b.output_dims * 2); - FT_CHECK(c.group_size == a.group_size); - FT_CHECK(c.group_size == b.group_size); - - auto _chunks = [&](auto c, auto a, auto b, int height, int width) { - check_cuda_error( - cudaMemcpy2DAsync((char*)c + 0x000, 
width * 2, a, width, width, height, cudaMemcpyDefault, st)); - check_cuda_error( - cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); - }; - - if (c.type == WeightType::kINT4) { - _chunks(c.kernel, a.kernel, b.kernel, a.input_dims, 4 * a.output_dims / 8); - _chunks(c.scales, a.scales, b.scales, a.input_dims / a.group_size, sizeof(T) * a.output_dims); - _chunks(c.zeros, a.zeros, b.zeros, a.input_dims / a.group_size, sizeof(T) * a.output_dims); - } - else { - _chunks(c.kernel, a.kernel, b.kernel, a.input_dims, sizeof(T) * a.output_dims); - } - - // Check at function level - sync_check_cuda_error(); -} +LlamaDecoderLayerWeight::~LlamaDecoderLayerWeight() = default; -template -void LlamaDecoderLayerWeight::prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st) +void LlamaDecoderLayerWeight::prepare(const cudaDeviceProp& prop, cudaStream_t st) { - const bool is_16xx = is_16xx_series(prop.name); + const bool use_simt = is_16xx_series(prop.name); - convert(self_attn_weights.qkv, false, workspace, size, is_16xx, st); - convert(self_attn_weights.output, false, workspace, size, is_16xx, st); + self_attn_weights->prepare(use_simt); - auto process_ffn = [&](LlamaFfnWeight& ffn, bool is_fused_moe) { - if (fused_up_and_gate_) { - auto& fused_up_and_gate = ffn.fused_gating_intermediate; - - fused_up_and_gate.malloc(st); - - if (ffn.is_fused_silu) { - interleave(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); - } - else { - chunk(fused_up_and_gate, ffn.gating, ffn.intermediate, workspace, size, st); - } - - convert(ffn.fused_gating_intermediate, is_fused_moe, workspace, size, is_16xx, st); - - ffn.gating.free(st); - ffn.intermediate.free(st); - } - else { - convert(ffn.gating, is_fused_moe, workspace, size, is_16xx, st); - convert(ffn.intermediate, is_fused_moe, workspace, size, is_16xx, st); - } - - convert(ffn.output, is_fused_moe, workspace, size, is_16xx, st); - }; - - if (inter_size_) { - // std::cerr << "process FFN\n"; - process_ffn(ffn_weights, false); + if (ffn_weights) { + ffn_weights->prepare(false, use_simt); } - if (!moe_weights.experts.empty()) { - // std::cerr << "process MoE\n"; - std::vector> fused_ptrs; - std::vector> output_ptrs; - std::vector> fused_param_ptrs; - std::vector> output_param_ptrs; - - for (auto& e : moe_weights.experts) { - - process_ffn(e, moe_weights.method == MoeParam::kFused); - - const auto& fused = e.fused_gating_intermediate; - const auto& output = e.output; - - fused_ptrs.push_back({fused.kernel, fused.k_desc.ld}); - output_ptrs.push_back({output.kernel, output.k_desc.ld}); - - if (e.fused_gating_intermediate.scales_zeros) { - fused_param_ptrs.emplace_back(fused.scales_zeros, fused.q_desc.ld); - output_param_ptrs.emplace_back(output.scales_zeros, output.q_desc.ld); - } - } - - // Note: This assumes all experts has the same shape - moe_weights.block = moe_weights.experts.at(0); - - auto& fused = moe_weights.block.fused_gating_intermediate; - auto& output = moe_weights.block.output; - - // TODO: free these ptrs - fused.kernel = gemm::make_blocked_ptrs(fused_ptrs, st); - output.kernel = gemm::make_blocked_ptrs(output_ptrs, st); - - if (!fused_param_ptrs.empty()) { - fused.scales_zeros = (T*)gemm::make_blocked_ptrs(fused_param_ptrs, st); - output.scales_zeros = (T*)gemm::make_blocked_ptrs(output_param_ptrs, st); - } - - fused.k_desc.ld = output.k_desc.ld = 0; - fused.k_desc.num = output.k_desc.num = moe_weights.experts.size(); - - fused.q_desc.ld = output.q_desc.ld = 0; - 
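Both the old `chunk()` above and its Tensor-based replacement in the new LlamaDenseWeight.cc later in this patch fuse the gate (w1) and up (w3) projections by copying two row-major [K, N] weights into the left and right halves of a single [K, 2N] buffer using strided 2-D copies. A minimal standalone sketch of that layout trick, assuming plain CUDA runtime calls and unquantized half-precision weights (function and parameter names here are illustrative only):

    #include <cuda_fp16.h>
    #include <cuda_runtime.h>

    // Row k of `fused` becomes [ gate row k | up row k ].
    void fuse_gate_up(half* fused, const half* gate, const half* up,
                      int k_dim, int n_dim, cudaStream_t stream)
    {
        const size_t row_bytes = sizeof(half) * n_dim;  // bytes per source row
        // Left half: destination rows are spaced 2 * n_dim elements apart.
        cudaMemcpy2DAsync(fused, 2 * row_bytes,
                          gate, row_bytes, row_bytes, k_dim,
                          cudaMemcpyDefault, stream);
        // Right half: shift each destination row by n_dim elements.
        cudaMemcpy2DAsync(fused + n_dim, 2 * row_bytes,
                          up, row_bytes, row_bytes, k_dim,
                          cudaMemcpyDefault, stream);
    }
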
fused.q_desc.num = output.q_desc.num = moe_weights.experts.size(); + if (moe_weights) { + moe_weights->prepare(use_simt); } } -#ifdef ENABLE_FP32 -template struct LlamaDecoderLayerWeight; -#endif -template struct LlamaDecoderLayerWeight; -#ifdef ENABLE_BF16 -template struct LlamaDecoderLayerWeight<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h index 44838a747d..0df8077341 100644 --- a/src/turbomind/models/llama/LlamaDecoderLayerWeight.h +++ b/src/turbomind/models/llama/LlamaDecoderLayerWeight.h @@ -20,18 +20,19 @@ #pragma once +#include "src/turbomind/core/core.h" + #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { -template -struct LlamaDecoderLayerWeight { +struct LlamaDecoderLayerWeight: core::Module { public: LlamaDecoderLayerWeight() = delete; - LlamaDecoderLayerWeight(int layer_id, + LlamaDecoderLayerWeight(DataType data_type, + int layer_id, const ModelParam& model, const EngineParam& engine, const LoraParam& lora_param, @@ -41,41 +42,32 @@ struct LlamaDecoderLayerWeight { LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight&) = delete; LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight&) = delete; - void loadModel(std::string dir_path, FtCudaDataType model_file_type); - - TensorMap getParams(std::string prefix); - - void prepare(void* workspace, size_t size, const cudaDeviceProp& prop, cudaStream_t st); - - size_t workspace_size() const noexcept; - - void malloc(cudaStream_t st); - - void free(cudaStream_t st); + void prepare(const cudaDeviceProp& prop, cudaStream_t st); - T* self_attn_norm_weights{}; - T* ffn_norm_weights{}; + Tensor self_attn_norm; + Tensor ffn_norm; - LlamaAttentionWeight self_attn_weights{}; + std::unique_ptr self_attn_weights; - LlamaFfnWeight ffn_weights{}; - MoeFfnWeight moe_weights{}; + std::unique_ptr ffn_weights; + std::unique_ptr moe_weights; private: - size_t head_num_; - size_t kv_head_num_; - size_t size_per_head_; - size_t hidden_units_; - size_t inter_size_; - WeightType weight_type_; - size_t bit_size_; - bool attn_bias_; - size_t attn_tp_size_; - size_t attn_tp_rank_; - size_t mlp_tp_size_; - size_t mlp_tp_rank_; - bool is_maintain_buffer_ = false; - bool fused_up_and_gate_; + int head_num_; + int kv_head_num_; + int size_per_head_; + int hidden_units_; + int inter_size_; + + DataType data_type_; + DataType weight_type_; + + int bit_size_; + bool attn_bias_; + int attn_tp_size_; + int attn_tp_rank_; + int mlp_tp_size_; + int mlp_tp_rank_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaDenseWeight.cc b/src/turbomind/models/llama/LlamaDenseWeight.cc new file mode 100644 index 0000000000..24afe3122e --- /dev/null +++ b/src/turbomind/models/llama/LlamaDenseWeight.cc @@ -0,0 +1,502 @@ +#include "src/turbomind/models/llama/LlamaDenseWeight.h" + +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/kernels/gemm/cast.h" +#include "src/turbomind/kernels/gemm/gemm.h" +#include "src/turbomind/kernels/gemm/types.h" +#include "src/turbomind/kernels/gpt_kernels.h" + +#include "src/turbomind/utils/memory_utils.h" + +namespace turbomind { + +void LlamaDenseWeight::emplace( + int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size) +{ + this->data_type = data_type; + this->weight_type = weight_type; + this->input_dim = 
input_dim; + this->output_dim = output_dim; + this->group_size = group_size; + + const auto wbits = byte_size(weight_type, 8); + + weight = Tensor({input_dim, output_dim}, weight_type, kDEVICE); + register_parameter(wbits < 16 ? "qweight" : "weight", weight); + + if (bias) { + this->bias = Tensor{{output_dim}, data_type, kDEVICE}; + register_parameter("bias", this->bias); + } + + if (wbits < 16) { + TM_CHECK(input_dim % group_size == 0) << input_dim << " " << group_size; + scales = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE}; + zeros = Tensor{{input_dim / group_size, output_dim}, data_type, kDEVICE}; + register_parameter("scales", scales); + register_parameter("zeros", zeros); + } +} + +static void convert_u4(LlamaDenseWeight& dense, bool is_fused_moe, bool use_simt, cudaStream_t st) +{ + TM_CHECK_EQ(dense.weight_type, data_type_v); + + using namespace gemm; + + auto [order_b, pack_b, order_v, pack_v] = + get_weight_and_scales_layout(data_type_v, is_fused_moe, getSMVersion(), use_simt); + + if (order_b == kColMajor) { + Buffer trans{dense.input_dim * dense.output_dim, data_type_v, kDEVICE}; + transpose_u4( + (uint4_t*)trans.raw_data(), (const uint4_t*)dense.weight.raw_data(), dense.input_dim, dense.output_dim, st); + cudaMemcpyAsync( + dense.weight.raw_data(), trans.raw_data(), dense.input_dim * dense.output_dim / 2, cudaMemcpyDefault, st); + } + + Buffer_ tmp_w{dense.input_dim * dense.output_dim, kDEVICE}; + extend_to_u16(tmp_w.data(), (const uint4_t*)dense.weight.raw_data(), dense.input_dim * dense.output_dim, st); + sync_check_cuda_error(); + + MatrixLayout w_desc{ + data_type_v, + order_b, + (int)dense.input_dim, // k + (int)dense.output_dim, // n + order_b == kRowMajor ? (int)dense.output_dim : (int)dense.input_dim, + }; + + MatrixLayout k_desc = w_desc; + k_desc.type = data_type_v; + k_desc.pack = pack_b; + + cudaMemsetAsync(dense.weight.raw_data(), 0, dense.input_dim * dense.output_dim / 2, st); + + FT_CHECK(Convert(tmp_w.data(), w_desc, dense.weight.raw_data(), k_desc, st) == 0); + sync_check_cuda_error(); + + const int scale_count = (dense.input_dim / dense.group_size) * dense.output_dim; + + Buffer_ tmp_q{scale_count * 2, kDEVICE}; + fuse_scales_and_zeros(tmp_q.data(), dense.scales.data(), dense.zeros.data(), scale_count, st); + sync_check_cuda_error(); + + dense.scales = {}; + dense.zeros = {}; + + dense.scales_zeros = Tensor_{{scale_count, 2}, kDEVICE}; + + MatrixLayout s_desc{ + data_type_v, + order_v, + (int)dense.input_dim / dense.group_size, // k + (int)dense.output_dim, // n + (int)dense.output_dim, + }; + + MatrixLayout q_desc = s_desc; + q_desc.pack = pack_v; + + FT_CHECK(Convert(tmp_q.data(), s_desc, dense.scales_zeros.raw_data(), q_desc, st) == 0); + sync_check_cuda_error(); + + dense.k_desc = k_desc; + dense.q_desc = q_desc; +} + +static void convert_fp(LlamaDenseWeight& dense, bool is_fused_moe, bool use_simt, cudaStream_t st) +{ + using namespace gemm; + + if (!is_fused_moe) { + return; + } + + /// TODO: unify data types + auto data_type = dense.data_type; + + const auto [order_b, pack_b, order_v, pack_v] = + get_weight_and_scales_layout(data_type, is_fused_moe, getSMVersion(), use_simt); + + const int input_dim = dense.input_dim; + const int output_dim = dense.output_dim; + + TM_CHECK(dense.weight.is_contiguous()); + + Buffer_ tmp{input_dim * output_dim, kDEVICE}; + + if (order_b == kColMajor) { + invokeTransposeAxis01(tmp.data(), (uint16_t*)dense.weight.raw_data(), input_dim, output_dim, 1, st); + sync_check_cuda_error(); + } + else { + 
check_cuda_error( + cudaMemcpyAsync(tmp.data(), dense.weight.raw_data(), dense.weight.byte_size(), cudaMemcpyDefault, st)); + } + + MatrixLayout src{ + data_type, + order_b, + input_dim, // k + output_dim, // n + order_b == kRowMajor ? output_dim : input_dim, + }; + + MatrixLayout dst = src; + dst.pack = pack_b; + + if (pack_b) { + FT_CHECK(Convert(tmp.data(), src, dense.weight.raw_data(), dst, st) == 0); + sync_check_cuda_error(); + } + else { + check_cuda_error( + cudaMemcpyAsync(dense.weight.raw_data(), tmp.data(), dense.weight.byte_size(), cudaMemcpyDefault, st)); + } + + dense.k_desc = dst; +} + +static void convert(LlamaDenseWeight& dense, bool is_fused_moe, DataType data_type, bool use_simt, cudaStream_t st) {} + +void LlamaDenseWeight::prepare(bool fused_moe, bool use_simt) +{ + if (!weight) { + return; + } + + auto stream = core::Context::stream().handle(); + + if (weight_type == data_type_v) { + TM_CHECK_EQ(data_type, data_type_v); + convert_u4(*this, fused_moe, use_simt, stream); + } + else { + convert_fp(*this, fused_moe, use_simt, stream); + } +} + +LlamaAttentionWeight::LlamaAttentionWeight(int hidden_dim, + int head_dim, + int head_num, + int kv_head_num, + MLAParam mla, + bool bias, + bool qk_norm, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size) +{ + if (mla.kv_lora_rank == 0) { + qkv.emplace( + hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp_size, data_type, bias, weight_type, group_size); + register_module("w_qkv", qkv, tp_rank); + if (qk_norm) { + q_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; + kv_a_layernorm = Tensor{{head_dim}, data_type, kDEVICE}; + register_parameter("q_norm", q_a_layernorm); + register_parameter("k_norm", kv_a_layernorm); + } + } + else { + const int qk_nope_dim = head_dim - mla.qk_rope_dim; + if (mla.q_lora_rank) { + q_a_proj.emplace(hidden_dim, mla.q_lora_rank, data_type, false, weight_type, group_size); + q_b_proj.emplace(mla.q_lora_rank, head_num * head_dim / tp_size, data_type, false, weight_type, group_size); + q_a_layernorm = Tensor{{q_b_proj.input_dim}, data_type, kDEVICE}; + register_module("q_a_proj", q_a_proj); + register_module("q_b_proj", q_b_proj, tp_rank); + register_parameter("q_a_layernorm", q_a_layernorm); + } + else { + q_proj.emplace(hidden_dim, head_num * head_dim / tp_size, data_type, false, weight_type, group_size); + register_module("q_proj", q_proj, tp_rank); + } + kv_a_proj.emplace(hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, data_type, false, weight_type, group_size); + kv_b_proj.emplace(mla.kv_lora_rank, + head_num * (qk_nope_dim + mla.v_head_dim) / tp_size, + data_type, + false, + weight_type, + group_size); + + kv_a_layernorm = Tensor{{kv_b_proj.input_dim}, data_type, kDEVICE}; + register_module("kv_a_proj", kv_a_proj); + register_module("kv_b_proj", kv_b_proj, tp_rank); + register_parameter("kv_a_layernorm", kv_a_layernorm); + } + output.emplace((head_num * head_dim) / tp_size, hidden_dim, data_type, bias, weight_type, group_size); + register_module("wo", output, tp_rank); +} + +void LlamaAttentionWeight::prepare(bool use_simt) +{ + std::vector weights{ + &qkv, + &output, + &q_a_proj, + &q_a_proj, + &q_b_proj, + &kv_a_proj, + &kv_b_proj, + }; + for (auto& w : weights) { + w->prepare(false, use_simt); + } +} + +LlamaFfnWeight::LlamaFfnWeight(int hidden_dim, + int inter_size, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size, + bool fuse_silu_act) +{ + TM_CHECK(inter_size % tp_size == 0) << inter_size << " " << 
tp_size; + + inter_size /= tp_size; + + this->inter_size = inter_size; + + gating.emplace(hidden_dim, inter_size, data_type, false, weight_type, group_size); + + intermediate.emplace(hidden_dim, inter_size, data_type, false, weight_type, group_size); + + // fused_gating_intermediate = {hidden_dim, inter_size * 2, data_type, weight_type, group_size}; + is_fused_silu = fuse_silu_act; + + output.emplace(inter_size, hidden_dim, data_type, false, weight_type, group_size); + + register_module("w1", gating, tp_rank); + register_module("w3", intermediate, tp_rank); + register_module("w2", output, tp_rank); +} + +void interleave(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st) +{ + FT_CHECK(c.input_dim == a.input_dim); + FT_CHECK(c.input_dim == b.input_dim); + FT_CHECK(c.output_dim == a.output_dim * 2); + FT_CHECK(c.output_dim == b.output_dim * 2); + FT_CHECK(c.group_size == a.group_size); + FT_CHECK(c.group_size == b.group_size); + + auto invoke = [&](auto t) { + using T = decltype(t); + if (a.weight_type == data_type_v) { + Buffer_ tmp_a{a.weight.size(), kDEVICE}; + Buffer_ tmp_b{b.weight.size(), kDEVICE}; + Buffer_ tmp_c{c.weight.size(), kDEVICE}; + + extend_to_u8(tmp_a.data(), (const uint4_t*)a.weight.raw_data(), a.output_dim * a.input_dim, st); + extend_to_u8(tmp_b.data(), (const uint4_t*)b.weight.raw_data(), b.output_dim * b.input_dim, st); + + interleave_output_dims(tmp_c.data(), tmp_a.data(), tmp_b.data(), a.output_dim, a.input_dim, st); + + compact_to_u4((uint4_t*)c.weight.raw_data(), tmp_c.data(), c.output_dim * c.input_dim, st); + + interleave_output_dims(c.scales.data(), + a.scales.data(), + b.scales.data(), + a.output_dim, + a.input_dim / a.group_size, + st); + interleave_output_dims(c.zeros.data(), // + a.zeros.data(), + b.zeros.data(), + a.output_dim, + a.input_dim / a.group_size, + st); + } + else { + interleave_output_dims( + c.weight.data(), a.weight.data(), b.weight.data(), a.output_dim, a.input_dim, st); + } + // Check at function level + sync_check_cuda_error(); + }; + + TM_DISPATCH_DTYPES(data_type, invoke, half_t, bfloat16_t); +} + +void chunk(LlamaDenseWeight& c, LlamaDenseWeight& a, LlamaDenseWeight& b, DataType data_type, cudaStream_t st) +{ + FT_CHECK(c.input_dim == a.input_dim); + FT_CHECK(c.input_dim == b.input_dim); + FT_CHECK(c.output_dim == a.output_dim * 2); + FT_CHECK(c.output_dim == b.output_dim * 2); + FT_CHECK(c.group_size == a.group_size); + FT_CHECK(c.group_size == b.group_size); + + auto _chunks = [&](auto c, auto a, auto b, int height, int width) { + check_cuda_error( + cudaMemcpy2DAsync((char*)c + 0x000, width * 2, a, width, width, height, cudaMemcpyDefault, st)); + check_cuda_error( + cudaMemcpy2DAsync((char*)c + width, width * 2, b, width, width, height, cudaMemcpyDefault, st)); + }; + + auto invoke = [&](auto t) { + using T = decltype(t); + if (c.weight_type == data_type_v) { + _chunks(c.weight.raw_data(), a.weight.raw_data(), b.weight.raw_data(), a.input_dim, 4 * a.output_dim / 8); + _chunks(c.scales.data(), + a.scales.data(), + b.scales.data(), + a.input_dim / a.group_size, + sizeof(T) * a.output_dim); + _chunks(c.zeros.data(), + a.zeros.data(), + b.zeros.data(), + a.input_dim / a.group_size, + sizeof(T) * a.output_dim); + } + else { + _chunks(c.weight.data(), a.weight.data(), b.weight.data(), a.input_dim, sizeof(T) * a.output_dim); + } + // Check at function level + sync_check_cuda_error(); + }; + + TM_DISPATCH_DTYPES(data_type, invoke, half_t, bfloat16_t); +} + +void LlamaFfnWeight::prepare(bool 
fused_moe, bool use_simt) +{ + const auto data_type = gating.data_type; + + auto stream = core::Context().stream().handle(); + + if (fuse_up_and_gate) { + auto& fused_up_and_gate = fused_gating_intermediate; + + fused_up_and_gate.emplace(gating.input_dim, // + gating.output_dim * 2, + gating.data_type, + false, + gating.weight_type, + gating.group_size); + + if (is_fused_silu) { + interleave(fused_up_and_gate, gating, intermediate, data_type, stream); + } + else { + chunk(fused_up_and_gate, gating, intermediate, data_type, stream); + } + + fused_gating_intermediate.prepare(fused_moe, use_simt); + + gating = {}; + intermediate = {}; + } + else { + gating.prepare(fused_moe, use_simt); + intermediate.prepare(fused_moe, use_simt); + } + + output.prepare(fused_moe, use_simt); +} + +MoeFfnWeight::MoeFfnWeight(int layer_id, + const MoeParam& param, + int hidden_dim, + DataType data_type, + DataType weight_type, + int group_size, + int tp_size, + int tp_rank, + bool fuse_silu_act) +{ + if ((int)param.expert_num.size() <= layer_id) { + return; + } + + const int expert_num = param.expert_num[layer_id]; + + if (expert_num == 0) { + return; + } + + gate.emplace(hidden_dim, expert_num, data_type, false, data_type, 1); + register_module("gate", gate); + + method = param.method; + fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; + + experts.reserve(expert_num); + for (int i = 0; i < expert_num; ++i) { + experts.emplace_back(new LlamaFfnWeight{ + hidden_dim, param.inter_size, tp_size, tp_rank, data_type, weight_type, group_size, fuse_silu_act}); + register_module("experts", *experts.back(), i); + } + + if (param.shared_gate) { + shared_gate.emplace(hidden_dim, 1, data_type, false, data_type, 1); + register_module("shared_gate", shared_gate); + } +} + +void MoeFfnWeight::prepare(bool use_simt) +{ + const auto fused_moe = method == MoeParam::kFused; + + for (auto& e : experts) { + e->prepare(fused_moe, use_simt); + } + const int n_expert = experts.size(); + const auto st = core::Context::stream().handle(); + + auto make_block_ptr = [&](const auto& ptrs) { + return std::shared_ptr{gemm::make_blocked_ptrs(ptrs, st), [](auto p) { cudaFree(p); }}; + }; + + auto process = [&](auto getter) { + std::vector> weight_ptrs; + std::vector> quant_ptrs; + + for (auto& e : experts) { + auto& m = (*e).*getter; + weight_ptrs.push_back({m.weight.raw_data(), m.k_desc.ld}); + if (m.scales_zeros) { + quant_ptrs.emplace_back(m.scales_zeros.raw_data(), m.q_desc.ld); + } + } + + LlamaDenseWeight& m = block.*getter; + + { // Copy properties from exemplar, this assumes all experts has the same shape + LlamaDenseWeight& e = (*experts.at(0)).*getter; + m.input_dim = e.input_dim; + m.output_dim = e.output_dim; + m.group_size = e.group_size; + m.data_type = e.data_type; + m.weight_type = e.weight_type; + m.k_desc = e.k_desc; + m.q_desc = e.q_desc; + } + + // Dummy tensors to hold the blocked ptrs + m.weight = Tensor{make_block_ptr(weight_ptrs), {n_expert}, m.weight_type, kDEVICE}; + if (!quant_ptrs.empty()) { + TM_CHECK_EQ(quant_ptrs.size(), n_expert); + m.scales_zeros = Tensor{make_block_ptr(quant_ptrs), {n_expert}, m.data_type, kDEVICE}; + } + + m.k_desc.num = m.q_desc.num = experts.size(); + m.k_desc.ld = m.q_desc.ld = 0; // `ld` is meaningless in this case + }; + + process(&LlamaFfnWeight::fused_gating_intermediate); + process(&LlamaFfnWeight::output); + + auto& e = *experts.at(0); + // Copy MLP properties + block.inter_size = e.inter_size; + block.is_fused_silu = e.is_fused_silu; +} + +} // namespace turbomind diff 
--git a/src/turbomind/models/llama/LlamaDenseWeight.h b/src/turbomind/models/llama/LlamaDenseWeight.h index b12592c757..794aa10b97 100644 --- a/src/turbomind/models/llama/LlamaDenseWeight.h +++ b/src/turbomind/models/llama/LlamaDenseWeight.h @@ -19,12 +19,11 @@ #pragma once +#include "src/turbomind/core/core.h" +#include "src/turbomind/core/module.h" + #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/models/llama/weight_type.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/memory_utils.h" -#include namespace turbomind { @@ -44,319 +43,128 @@ struct LoraWeight { void* b; }; -template -struct LlamaDenseWeight { - size_t input_dims = 0; - size_t output_dims = 0; - WeightType type; // uninitialized - void* kernel = nullptr; - T* bias = nullptr; - T* scales = nullptr; - T* zeros = nullptr; - T* scales_zeros = nullptr; - int group_size = 1; - - LoraWeight lora; - - gemm::MatrixLayout k_desc; - gemm::MatrixLayout q_desc; +struct LlamaDenseWeight: public core::Module { - LlamaDenseWeight(): type{}, lora{}, k_desc{}, q_desc{} {} + LlamaDenseWeight(): data_type{}, weight_type{}, lora{}, k_desc{}, q_desc{} {} - LlamaDenseWeight(size_t input_dim, size_t output_dim, WeightType type, int group_size): LlamaDenseWeight{} - { - this->input_dims = input_dim; - this->output_dims = output_dim; - this->type = type; - this->group_size = group_size; - } + void emplace(int input_dim, int output_dim, DataType data_type, bool bias, DataType weight_type, int group_size); - size_t kernel_size() const noexcept - { - return getBitSize(type) * input_dims * output_dims / 8; - } - - size_t bias_size() const noexcept - { - return sizeof(T) * output_dims; - } - - size_t scales_size() const noexcept - { - return sizeof(T) * input_dims / group_size * output_dims; - } + void prepare(bool fused_moe, bool use_simt); - std::pair lora_size() const noexcept + LlamaDenseWeight& operator=(std::nullptr_t) { - return {sizeof(T) * input_dims * lora.r, sizeof(T) * lora.r * output_dims}; + this->~LlamaDenseWeight(); + new (this) LlamaDenseWeight{}; + return *this; } - void malloc(cudaStream_t st, bool with_bias = false) + operator bool() const noexcept { - if (with_bias) { - deviceMalloc((T**)&bias, output_dims, st); - } - const size_t bit_size = getBitSize(type); - if (bit_size >= 16) { // fp16, fp32 - deviceMalloc((T**)&kernel, input_dims * output_dims, st); - } - else { // int8, int4 - const int factor = sizeof(float) * 8 / bit_size; - FT_CHECK(input_dims % factor == 0); - deviceMalloc((int**)&kernel, input_dims * output_dims / factor, st); - deviceMalloc((T**)&scales, input_dims / group_size * output_dims, st); - deviceMalloc((T**)&zeros, input_dims / group_size * output_dims, st); - } - - if (lora.r > 0) { - deviceMalloc((T**)&lora.a, input_dims * lora.r, st); - deviceMalloc((T**)&lora.b, lora.r * output_dims, st); - } + return static_cast(weight); } - void free(cudaStream_t st) - { - deviceFree(kernel, st); - deviceFree(bias, st); - deviceFree(scales, st); - deviceFree(zeros, st); - deviceFree(lora.a, st); - deviceFree(lora.b, st); - } -}; + int input_dim = 0; + int output_dim = 0; + int group_size = 1; -template -struct LlamaAttentionWeight { + DataType data_type; + DataType weight_type; - LlamaAttentionWeight() = default; + Tensor weight; + Tensor bias; - LlamaAttentionWeight(size_t hidden_dim, - size_t head_dim, - size_t head_num, - size_t kv_head_num, - MLAParam mla, - bool bias, - bool qk_norm, - size_t tp, - WeightType 
weight_type, - int group_size) - { - this->bias = bias; - this->head_dim = head_dim; - this->qk_norm = qk_norm; - - if (mla.kv_lora_rank == 0) { - qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size}; - } - else { - const int qk_nope_dim = head_dim - mla.qk_rope_dim; - if (mla.q_lora_rank) { - q_a_proj = {hidden_dim, mla.q_lora_rank, weight_type, group_size}; - q_b_proj = {mla.q_lora_rank, head_num * head_dim / tp, weight_type, group_size}; - } - else { - q_proj = {hidden_dim, head_num * head_dim / tp, weight_type, group_size}; - } - kv_a_proj = {hidden_dim, mla.kv_lora_rank + mla.qk_rope_dim, weight_type, group_size}; - kv_b_proj = {mla.kv_lora_rank, head_num * (qk_nope_dim + mla.v_head_dim) / tp, weight_type, group_size}; - } - output = {(head_num * head_dim) / tp, hidden_dim, weight_type, group_size}; - } + Tensor scales; + Tensor zeros; - void malloc(cudaStream_t st) - { - if (qkv.output_dims) { - qkv.malloc(st, bias); - if (qk_norm) { - deviceMalloc((T**)&q_a_layernorm, head_dim, st); - deviceMalloc((T**)&kv_a_layernorm, head_dim, st); - } - } - else { // MLA - if (q_proj.output_dims) { - q_proj.malloc(st); - } - else { - q_a_proj.malloc(st); - q_b_proj.malloc(st); - deviceMalloc((T**)&q_a_layernorm, q_b_proj.input_dims, st); - } - kv_a_proj.malloc(st); - kv_b_proj.malloc(st); - deviceMalloc((T**)&kv_a_layernorm, kv_b_proj.input_dims, st); - } - output.malloc(st, bias); - } + Tensor scales_zeros; - void free(cudaStream_t st) - { - qkv.free(st); - q_proj.free(st); - q_a_proj.free(st); - q_b_proj.free(st); - kv_a_proj.free(st); - kv_b_proj.free(st); - output.free(st); - deviceFree(q_a_layernorm, st); - deviceFree(kv_a_layernorm, st); - } + LoraWeight lora; - int head_dim{}; - bool bias{}; - bool qk_norm{}; + gemm::MatrixLayout k_desc; + gemm::MatrixLayout q_desc; +}; - LlamaDenseWeight qkv; - LlamaDenseWeight output; +struct LlamaAttentionWeight: public core::Module { - LlamaDenseWeight q_proj; - LlamaDenseWeight q_a_proj; - LlamaDenseWeight q_b_proj; - LlamaDenseWeight kv_a_proj; - LlamaDenseWeight kv_b_proj; + LlamaAttentionWeight() = default; - T* q_a_layernorm{}; - T* kv_a_layernorm{}; + LlamaAttentionWeight(int hidden_dim, + int head_dim, + int head_num, + int kv_head_num, + MLAParam mla, + bool bias, + bool qk_norm, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size); + + void prepare(bool use_simt); + + LlamaDenseWeight qkv; + LlamaDenseWeight output; + + LlamaDenseWeight q_proj; + LlamaDenseWeight q_a_proj; + LlamaDenseWeight q_b_proj; + LlamaDenseWeight kv_a_proj; + LlamaDenseWeight kv_b_proj; + + Tensor q_a_layernorm; + Tensor kv_a_layernorm; }; -template -struct LlamaFfnWeight { +struct LlamaFfnWeight: core::Module { LlamaFfnWeight() = default; - LlamaFfnWeight( - size_t hidden_dim, size_t inter_size, size_t tp, WeightType weight_type, int group_size, bool fuse_silu_act) - { - inter_size /= tp; - - this->inter_size = inter_size; - - gating.input_dims = hidden_dim; - gating.output_dims = inter_size; - gating.type = weight_type; - gating.group_size = group_size; + LlamaFfnWeight(int hidden_dim, + int inter_size, + int tp_size, + int tp_rank, + DataType data_type, + DataType weight_type, + int group_size, + bool fuse_silu_act); - intermediate.input_dims = hidden_dim; - intermediate.output_dims = inter_size; - intermediate.type = weight_type; - intermediate.group_size = group_size; + static constexpr bool fuse_up_and_gate = true; - fused_gating_intermediate.input_dims = hidden_dim; - 
fused_gating_intermediate.output_dims = inter_size * 2; - fused_gating_intermediate.type = weight_type; - fused_gating_intermediate.group_size = group_size; + void prepare(bool fused_moe, bool use_simt); - is_fused_silu = fuse_silu_act; - - output.input_dims = inter_size; - output.output_dims = hidden_dim; - output.type = weight_type; - output.group_size = group_size; - } - - void malloc(cudaStream_t st) - { - gating.malloc(st); - intermediate.malloc(st); - output.malloc(st); - } - - void free(cudaStream_t st) - { - gating.free(st); - intermediate.free(st); - output.free(st); - fused_gating_intermediate.free(st); - } - - LlamaDenseWeight gating; - LlamaDenseWeight intermediate; - LlamaDenseWeight output; - LlamaDenseWeight fused_gating_intermediate; + LlamaDenseWeight gating; + LlamaDenseWeight intermediate; + LlamaDenseWeight output; + LlamaDenseWeight fused_gating_intermediate; int inter_size{}; bool is_fused_silu{}; }; -template -struct MoeFfnWeight { +struct MoeFfnWeight: core::Module { MoeFfnWeight() = default; MoeFfnWeight(int layer_id, const MoeParam& param, - size_t hidden_dim, - WeightType weight_type, + int hidden_dim, + DataType data_type, + DataType weight_type, int group_size, - size_t tp, - bool fuse_silu_act) - { - - if (param.expert_num.size() <= layer_id) { - return; - } + int tp_size, + int tp_rank, + bool fuse_silu_act); - const int expert_num = param.expert_num[layer_id]; - - if (expert_num == 0) { - return; - } - - // printf("%d %d %d\n", (int)hidden_dim, (int)param.inter_size, (int)expert_num); - - gate.input_dims = hidden_dim; - gate.output_dims = expert_num; - gate.type = get_default_weight_type(); - gate.group_size = group_size; - - experts.resize(expert_num); - - method = param.method; - fuse_silu_act = fuse_silu_act && method == MoeParam::kFused; - - for (auto& e : experts) { - // inter size is divided by tp in `FfnWeight` - e = LlamaFfnWeight{hidden_dim, (size_t)param.inter_size, tp, weight_type, group_size, fuse_silu_act}; - } - - if (param.shared_gate) { - shared_gate.input_dims = hidden_dim; - shared_gate.output_dims = 1; - shared_gate.type = get_default_weight_type(); - gate.group_size = group_size; - } - else { - shared_gate = {}; - } - } - - void malloc(cudaStream_t st) - { - gate.malloc(st); - if (shared_gate.output_dims) { - shared_gate.malloc(st); - } - for (auto& e : experts) { - e.malloc(st); - } - } - - void free(cudaStream_t st) - { - gate.free(st); - shared_gate.free(st); - for (auto& e : experts) { - e.free(st); - } - block.free(st); - } + void prepare(bool use_simt); - LlamaDenseWeight gate; - std::vector> experts; + LlamaDenseWeight gate; + LlamaDenseWeight shared_gate; - LlamaDenseWeight shared_gate; + std::vector> experts; // reference into `experts` - LlamaFfnWeight block; + LlamaFfnWeight block; MoeParam::Method method{}; }; diff --git a/src/turbomind/models/llama/LlamaFfnLayer.cc b/src/turbomind/models/llama/LlamaFfnLayer.cc index 7fc15dba38..dd3def0518 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.cc +++ b/src/turbomind/models/llama/LlamaFfnLayer.cc @@ -21,153 +21,63 @@ #include "src/turbomind/kernels/activation_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/utils/anomaly_handler.h" -#include "src/turbomind/utils/nvtx_utils.h" namespace turbomind { -template -void LlamaFfnLayer::allocateBuffer( - size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r) +void LlamaFfnLayer::activation(Tensor& gating, Tensor& inter, cudaStream_t stream) { - const 
size_t sz = token_num * inter_size; - - gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * sz * inter_buf_factor, false); - inter_buf_ = gating_buf_ + sz; - - if (gating_lora_r + inter_lora_r) { - lora_buf_ = (T*)allocator_->reMalloc(lora_buf_, sizeof(T) * token_num * (gating_lora_r + inter_lora_r)); - } - - is_allocate_buffer_ = true; -} - -template -void LlamaFfnLayer::freeBuffer() -{ - if (is_allocate_buffer_) { - allocator_->free((void**)&gating_buf_); - allocator_->free((void**)&lora_buf_); - is_allocate_buffer_ = false; - } -} - -template -void LlamaFfnLayer::activation(int token_num, int inter_size, bool is_chunked) -{ - NvtxScope scope("activation"); - if (is_chunked) { - // gate & up are in the SAME buffer - invokeGenericActivation_v2( - gating_buf_, gating_buf_ + inter_size, inter_size * 2, token_num, inter_size, stream_); - sync_check_cuda_error(); - } - else { - // gate & up are in separate buffers - invokeGenericActivation_v2(gating_buf_, inter_buf_, inter_size, token_num, inter_size, stream_); - sync_check_cuda_error(); - } + // Code for dispatching activation types + invokeGenericActivation_v3(gating, inter, stream); } -template -void LlamaFfnLayer::forward(TensorMap* output_tensors, - const TensorMap* input_tensors, - const LlamaFfnWeight* weights) +void LlamaFfnLayer::forward(ForwardParam param) { - /** - * input_tensors: - * \param ffn_input [token_num, hidden_dimension] - * - * output_tensors: - * \param ffn_output [token_num, hidden_dimension] - */ - NvtxScope scope("ffn"); - const size_t token_num = input_tensors->at("ffn_input").shape[0]; - const int layer_id = input_tensors->getVal("layer_id"); - const int inter_size = weights->inter_size; - - const bool is_fused_silu = weights->fused_gating_intermediate.kernel && weights->is_fused_silu; + const auto& mlp = *param.weights; - allocateBuffer(token_num, inter_size, is_fused_silu ? 1 : 2, weights->gating.lora.r, weights->intermediate.lora.r); + const int token_num = param.input.shape(0); + const int inter_size = mlp.inter_size; + const int layer_id = param.layer_id; - const T* ffn_input_data = input_tensors->at("ffn_input").getPtr(); - T* ffn_output_data = output_tensors->at("ffn_output").getPtr(); - int* lora_mask = input_tensors->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + const auto stream = core::Context::stream().handle(); - if (weights->fused_gating_intermediate.kernel) { - NvtxScope scope("fused_silu_ffn"); + Tensor gating; + Tensor inter; - const auto type = weights->is_fused_silu ? LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; + if (mlp.fused_gating_intermediate.weight) { + const auto type = mlp.is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm; - linear_->forward(gating_buf_, ffn_input_data, token_num, weights->fused_gating_intermediate, type); + auto mix = linear_.forward(param.input, mlp.fused_gating_intermediate, type); sync_check_cuda_error(); - if (!weights->is_fused_silu) { - activation(token_num, inter_size, true); + gating = mix.slice({0, 0}, {(int)token_num, inter_size}); + if (!mlp.is_fused_silu) { + inter = mix.slice({0, inter_size}, {(ssize_t)token_num, inter_size}); } - - count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("w1_w3_silu", layer_id), 3); } else { - { // w1(x) - NvtxScope scope("w1"); - linear_->forward(gating_buf_, // - ffn_input_data, - token_num, - weights->gating, - LlamaLinear::kGemm, - lora_buf_, - lora_mask); - sync_check_cuda_error(); - } - count_and_fix(gating_buf_, token_num * weights->gating.output_dims, Concat("w1", layer_id), 3); - - { // w3(x) - NvtxScope scope("w3"); - linear_->forward(inter_buf_, - ffn_input_data, - token_num, - weights->intermediate, - LlamaLinear::kGemm, - lora_buf_, - lora_mask); - sync_check_cuda_error(); - } - count_and_fix(inter_buf_, token_num * weights->intermediate.output_dims, Concat("w3", layer_id), 3); + gating = linear_.forward(param.input, mlp.gating, LlamaLinear::kGemm); + sync_check_cuda_error(); + TM_DEBUG_TENSOR(gating, Concat("w1", layer_id), 3); - // silu(w1(x)) * w3(x) - activation(token_num, inter_size, false); + inter = linear_.forward(param.input, mlp.intermediate, LlamaLinear::kGemm); + sync_check_cuda_error(); + TM_DEBUG_TENSOR(inter, Concat("w3", layer_id), 3); + } - count_and_fix(gating_buf_, token_num * weights->output.input_dims, Concat("act", layer_id), 3); + if (!mlp.is_fused_silu) { + // silu(w1(x)) * w3(x) + activation(gating, inter, stream); + sync_check_cuda_error(); + TM_DEBUG_TENSOR(gating, Concat("act", layer_id), 3); } { // w2(x) NvtxScope scope("w2"); - const int pitch = (weights->fused_gating_intermediate.kernel && !weights->is_fused_silu) ? 
inter_size * 2 : 0; - linear_->forward(ffn_output_data, - {gating_buf_, pitch}, - token_num, - weights->output, - LlamaLinear::kGemm, - lora_buf_, - lora_mask); + linear_.forward(gating, mlp.output, LlamaLinear::kGemm, param.output); sync_check_cuda_error(); } - - count_and_fix(ffn_output_data, token_num * weights->output.output_dims, Concat("w2", layer_id), 3); - - if (is_free_buffer_after_forward_) { - freeBuffer(); - } } -#ifdef ENABLE_FP32 -template class LlamaFfnLayer; -#endif -template class LlamaFfnLayer; -#ifdef ENABLE_BF16 -template class LlamaFfnLayer<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaFfnLayer.h b/src/turbomind/models/llama/LlamaFfnLayer.h index c11c5e56fb..6b5e339fbc 100644 --- a/src/turbomind/models/llama/LlamaFfnLayer.h +++ b/src/turbomind/models/llama/LlamaFfnLayer.h @@ -19,50 +19,35 @@ #pragma once +#include "src/turbomind/core/core.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/Tensor.h" namespace turbomind { -template class LlamaFfnLayer { public: - LlamaFfnLayer(const ModelParam& model, const Context& ctx): - hidden_units_(model.hidden_units), - stream_(ctx.stream), - linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()) + LlamaFfnLayer(const ModelParam& model, const Context& ctx): hidden_units_(model.hidden_units), linear_(*ctx.linear) { } - ~LlamaFfnLayer() - { - freeBuffer(); - } + struct ForwardParam { + Tensor input; + Tensor output; + const LlamaFfnWeight* weights; + int layer_id; + }; - void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight* weights); + void forward(ForwardParam param); private: - void allocateBuffer( - size_t token_num, int inter_size, size_t inter_buf_factor, size_t gating_lora_r, size_t inter_lora_r); - - void freeBuffer(); - - void activation(int token_num, int inter_size, bool is_chunked); + void activation(Tensor& gating, Tensor& inter, cudaStream_t stream); - const size_t hidden_units_; - cudaStream_t const stream_; - LlamaLinear* const linear_; - IAllocator* const allocator_; - bool is_free_buffer_after_forward_{}; - - T* gating_buf_{}; - T* inter_buf_{}; - T* lora_buf_{}; - - bool is_allocate_buffer_{}; +private: + const size_t hidden_units_; + LlamaLinear& linear_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaLinear.cu b/src/turbomind/models/llama/LlamaLinear.cu index 81dcff7a74..1696920d9b 100644 --- a/src/turbomind/models/llama/LlamaLinear.cu +++ b/src/turbomind/models/llama/LlamaLinear.cu @@ -3,176 +3,176 @@ #include "src/turbomind/kernels/gemm/gemm.h" #include "src/turbomind/kernels/gemm/types.h" #include "src/turbomind/models/llama/LlamaLinear.h" -#include "src/turbomind/models/llama/llama_decoder_kernels.h" -#include +#include "src/turbomind/utils/cuda_utils.h" + +#include "src/turbomind/core/cuda_data_type.h" namespace turbomind { -template -struct LlamaLinear::Impl { +struct LlamaLinear::Impl { - Impl(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): cublas_wrapper_(cublas_wrapper), stream_(stream) + explicit Impl(cudaStream_t stream): stream_(stream) { workspace_ = {}; workspace_.barriers_size = gemm::Gemm::kBarriersSize; workspace_.partials_size = gemm::Gemm::kPartialsSize; - cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_); - cudaMallocAsync(&workspace_.partials, 
workspace_.partials_size, stream_); - cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_); + + check_cuda_error(cudaMallocAsync(&workspace_.barriers, workspace_.barriers_size, stream_)); + check_cuda_error(cudaMallocAsync(&workspace_.partials, workspace_.partials_size, stream_)); + check_cuda_error(cudaMemsetAsync(workspace_.barriers, 0, workspace_.barriers_size, stream_)); + + check_cuda_error(cublasCreate(&cublas_)); + check_cuda_error(cublasSetStream(cublas_, stream_)); + check_cuda_error(cublasSetWorkspace(cublas_, workspace_.partials, workspace_.partials_size)); + + if (0) { + check_cuda_error(cublasSetMathMode(cublas_, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION)); + } } ~Impl() { + cublasDestroy(cublas_); cudaFreeAsync(workspace_.barriers, stream_); cudaFreeAsync(workspace_.partials, stream_); workspace_ = {}; } - void forward(T* output_data, - Pitched input_data, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - T* lora_buff, - int* lora_mask) + void forward(Tensor& output, const Tensor& input, const LlamaDenseWeight& dense, Type type) { - if (input_data.pitch == 0) { - input_data.pitch = weight.input_dims; - } - if (lora_mask != nullptr && weight.lora.r > 0) { - FT_CHECK(type == kGemm); - // output = lora(x) * scale - // output = mask(output) - // output = x*W + output - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.lora.r, // m - batch_size, // n - weight.input_dims, // k - (const T*)weight.lora.a, // A - weight.lora.r, // lda - input_data.ptr, // B - input_data.pitch, // ldb - lora_buff, // C - weight.lora.r); // ldc - - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, // m - batch_size, // n - weight.lora.r, // k - (const T*)weight.lora.b, // A - weight.output_dims, // lda - lora_buff, // B - weight.lora.r, // ldb - output_data, // C - weight.output_dims, // ldc - weight.lora.scale, // alpha - 0.0f); // beta - - invokeMask(output_data, lora_mask, batch_size, weight.output_dims, stream_); - sync_check_cuda_error(); - - type = kFusedAdd; - } - switch (weight.type) { - case WeightType::kFP16: - case WeightType::kFP32: - case WeightType::kBF16: - return forwardFp(output_data, input_data, batch_size, weight, type); - case WeightType::kINT4: - return forwardInt4(output_data, input_data, batch_size, weight, type); + switch (dense.weight_type) { + case kFloat16: + case kFloat32: + case kBfloat16: + return forwardFp(output, input, dense.weight); + case kUint4: + return forwardInt4(output, input, dense, type); default: - FT_CHECK(0); + TM_CHECK(0) << "not implemented for weight type: " << dense.weight_type; } } - void forwardFp(T* output_data, Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type) + void forwardFp(Ref output_, const Tensor& input, const Tensor& weight) { - cublas_wrapper_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, - batch_size, - weight.input_dims, - (const T*)weight.kernel, - weight.output_dims, - input_data.ptr, - input_data.pitch, - output_data, - weight.output_dims, - 1.0f, - type == kFusedAdd ? 1.0f : 0.0f); - // sync_check_cuda_error(); + auto& output = output_.get(); + TM_CHECK_EQ(weight.ndim(), 2); + TM_CHECK_EQ(input.ndim(), 2); + TM_CHECK_EQ(output.ndim(), 2); + + int m, n, k; + std::tie(k, m) = weight.shapes(0, 1); + n = input.shape(0); + + TM_CHECK_EQ(input.shape(1), k); + TM_CHECK_EQ(output.shape(0), n); + TM_CHECK_EQ(output.shape(1), m); + + // [k, m] + cublasOperation_t transa = weight.stride(1) == 1 ? 
CUBLAS_OP_N : CUBLAS_OP_T; + // [n, k] + cublasOperation_t transb = input.stride(1) == 1 ? CUBLAS_OP_N : CUBLAS_OP_T; + + const float alpha = 1.f; + const float beta = 0.f; + + check_cuda_error(cublasGemmEx(cublas_, + transa, + transb, + m, + n, + k, + &alpha, + weight.raw_data(), + to_cuda_dtype(weight.dtype()), + weight.stride(0) * weight.stride(1), // one of these is 1 + input.raw_data(), + to_cuda_dtype(input.dtype()), + input.stride(0) * input.stride(1), // one of these is 1 + &beta, + output.raw_data(), + to_cuda_dtype(output.dtype()), + output.stride(0) * output.stride(1), // one of these is 1 + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - void forwardInt4(T* output_data, Pitched input_data, int batch_size, const LlamaDenseWeight& weight, Type type) + void forwardInt4(Tensor& output, const Tensor& input, const LlamaDenseWeight& dense, Type type) { + TM_CHECK_EQ(output.ndim(), 2); // A [m, k] + TM_CHECK_EQ(input.ndim(), 2); // C [m, n] + + TM_CHECK_EQ(input.stride(1), 1) << "input must be row-major"; + TM_CHECK_EQ(output.stride(1), 1) << "output must be row-major"; + + TM_CHECK_EQ(output.shape(0), input.shape(0)); + TM_CHECK_EQ(input.shape(1), dense.input_dim); + // TM_CHECK_EQ(output.shape(1), dense.output_dim); + using namespace gemm; const Operation operation{dispatch_policy_, type == kFusedSiluFfn ? Epilogue::kGatedSilu : Epilogue::kNone, {QuantType::kNone}, - {QuantType::kDefault, weight.group_size}, + {QuantType::kDefault, dense.group_size}, 0, {}, nullptr}; const MatrixLayout a_desc{ - get_data_type_v, + input.dtype(), kRowMajor, - batch_size, - (int)weight.input_dims, - input_data.pitch, + (int)input.shape(0), + dense.input_dim, + (int)input.stride(0), }; const MatrixLayout c_desc{ - get_data_type_v, + output.dtype(), // kRowMajor, - batch_size, - (int)weight.output_dims, - type == kFusedSiluFfn ? (int)weight.output_dims / 2 : (int)weight.output_dims, + (int)output.shape(0), + dense.output_dim, + (int)output.stride(0), + // type == kFusedSiluFfn ? (int)weight.output_dim / 2 : (int)weight.output_dim, }; auto ec = gemm_.Run(operation, 1.f, - input_data.ptr, + input.raw_data(), a_desc, nullptr, {}, - weight.kernel, - weight.k_desc, - weight.scales_zeros, - weight.q_desc, + dense.weight.raw_data(), + dense.k_desc, + dense.scales_zeros.raw_data(), + dense.q_desc, type == kFusedAdd ? 
1.0f : 0.0f, - output_data, + output.raw_data(), c_desc, - output_data, + output.raw_data(), c_desc, workspace_, stream_); if (ec) { TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); - // std::abort(); } } - void forward_moe(T* output_data, - Pitched input_data, - const int* indexes, - const int* offsets, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - gemm::Context* context) + void forward_moe(Tensor& output, + const Tensor& input, + const int* indexes, + const int* offsets, + const LlamaDenseWeight& dense, + Type type, + gemm::Context* context) { using namespace gemm; QuantDesc quant_b{}; - if (weight.k_desc.type == gemm::DataType::U4) { + if (dense.k_desc.type == kUint4) { quant_b.type = QuantType::kDefault; - quant_b.group_size = weight.group_size; + quant_b.group_size = dense.group_size; } const Operation operation{dispatch_policy_, @@ -184,56 +184,57 @@ struct LlamaLinear::Impl { nullptr}; MatrixLayout a_desc{ - get_data_type_v, + input.dtype(), kRowMajor, - batch_size, // m - (int)weight.input_dims, // k - input_data.pitch, + (int)output.shape(0), // batch size + dense.input_dim, // k + (int)input.stride(0), }; - // std::cout << "m" << batch_size << "n" << weight.output_dims << "k" << weight.input_dims << " " - // << input_data.pitch << "\n"; - a_desc.offsets = (int*)offsets; a_desc.idxs = (int*)indexes; + // std::cout << "m" << batch_size << "n" << weight.output_dims << "k" << weight.input_dims << " " + // << input_data.pitch << "\n"; + MatrixLayout c_desc{ - get_data_type_v, + output.dtype(), // kRowMajor, - batch_size, - (int)weight.output_dims, - type == kFusedSiluFfn ? (int)weight.output_dims / 2 : (int)weight.output_dims, + (int)output.shape(0), // batch size + dense.output_dim, + (int)output.stride(0), + // type == kFusedSiluFfn ? (int)weight.output_dims / 2 : (int)weight.output_dims, }; c_desc.offsets = (int*)offsets; - a_desc.num = c_desc.num = weight.k_desc.num; + a_desc.num = c_desc.num = dense.k_desc.num; auto ec = gemm_.Run(operation, 1.f, - input_data.ptr, + input.raw_data(), a_desc, nullptr, {}, - weight.kernel, - weight.k_desc, - weight.scales_zeros, - weight.q_desc, + dense.weight.raw_data(), + dense.k_desc, + dense.scales_zeros.data_or((void*)nullptr), + dense.q_desc, type == kFusedAdd ? 1.0f : 0.0f, - output_data, + output.raw_data(), c_desc, - output_data, + output.raw_data(), c_desc, workspace_, stream_); if (ec) { TM_LOG_ERROR("%s: %d", __PRETTY_FUNCTION__, ec); - // std::abort(); } } - cublasMMWrapper* cublas_wrapper_; + // cublasMMWrapper* cublas_wrapper_; + cublasHandle_t cublas_; gemm::Gemm gemm_; gemm::DispatchPolicy dispatch_policy_{gemm::DispatchPolicy::kDefault}; cudaStream_t stream_{}; @@ -241,45 +242,50 @@ struct LlamaLinear::Impl { gemm::Workspace workspace_; }; -template -LlamaLinear::LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): - impl_{std::make_shared(cublas_wrapper, stream)} -{ -} +LlamaLinear::LlamaLinear(cudaStream_t stream): impl_{std::make_shared(stream)} {} -template -void LlamaLinear::forward(T* output_data, - Pitched input_data, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - T* lora_buff, - int* lora_mask) +Tensor LlamaLinear::forward(const Tensor& input, // + const LlamaDenseWeight& dense, + Type type, + std::optional output) { - impl_->forward(output_data, input_data, batch_size, weight, type, lora_buff, lora_mask); + ssize_t output_dim = type == kFusedSiluFfn ? 
dense.output_dim / 2 : dense.output_dim; + + Tensor in = input.view({-1, input.shape(-1)}); + Tensor out; + + if (output) { + out = output->view({in.shape(0), output_dim}); + } + else { + out = Tensor({in.shape(0), output_dim}, input.dtype(), input.device()); + } + + impl_->forward(out, in, dense, type); + + auto shape = input.shape(); + shape.back() = out.shape(-1); + + return out.view(shape); } -template -void LlamaLinear::forward_moe(T* output_data, - Pitched input_data, - const int* indexes, - const int* offsets, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - gemm::Context* context) +void LlamaLinear::forward_moe(Tensor& output, + const Tensor& input, + const int* indexes, + const int* offsets, + const LlamaDenseWeight& dense, + Type type, + gemm::Context* context) { - impl_->forward_moe(output_data, input_data, indexes, offsets, batch_size, weight, type, context); + return impl_->forward_moe(output, input, indexes, offsets, dense, type, context); } -template -void LlamaLinear::set_measure(bool measure) +void LlamaLinear::set_measure(bool measure) { impl_->dispatch_policy_ = measure ? gemm::DispatchPolicy::kMeasure : gemm::DispatchPolicy::kReuse; } -template -int LlamaLinear::Export(std::ostream& os) +int LlamaLinear::Export(std::ostream& os) { if (os) { return impl_->gemm_.Export(os); @@ -287,8 +293,7 @@ int LlamaLinear::Export(std::ostream& os) return 0; } -template -int LlamaLinear::Import(std::istream& is) +int LlamaLinear::Import(std::istream& is) { auto n_records = 0; if (is) { @@ -300,18 +305,9 @@ int LlamaLinear::Import(std::istream& is) return n_records; } -template -std::vector LlamaLinear::GetTuningSeq() const +std::vector LlamaLinear::GetTuningSeq() const { return impl_->gemm_.GetTuningSeq(); } -#ifdef ENABLE_FP32 -template class LlamaLinear; -#endif -template class LlamaLinear; -#ifdef ENABLE_BF16 -template class LlamaLinear<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaLinear.h b/src/turbomind/models/llama/LlamaLinear.h index a22eb69ebd..625376aeb7 100644 --- a/src/turbomind/models/llama/LlamaLinear.h +++ b/src/turbomind/models/llama/LlamaLinear.h @@ -2,14 +2,14 @@ #pragma once -#include "src/turbomind/models/llama/LlamaDenseWeight.h" -#include "src/turbomind/utils/cublasMMWrapper.h" #include #include +#include "src/turbomind/core/core.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" + namespace turbomind { -template class LlamaLinear { public: enum Type @@ -19,30 +19,20 @@ class LlamaLinear { kFusedAdd }; - struct Pitched { - const T* ptr; - int pitch; - Pitched(const T* ptr, int pitch = 0): ptr{ptr}, pitch{pitch} {} - }; + explicit LlamaLinear(cudaStream_t stream); + + Tensor forward(const Tensor& input, // + const LlamaDenseWeight& weight, + Type type = kGemm, + std::optional output = {}); - LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream); - - void forward(T* output_data, - Pitched input_data, - int batch_size, - const LlamaDenseWeight& weight, - Type type = kGemm, - T* lora_buff = nullptr, - int* lora_mask = nullptr); - - void forward_moe(T* output_data, - Pitched input_data, - const int* indexes, - const int* offsets, - int batch_size, - const LlamaDenseWeight& weight, - Type type, - gemm::Context* context); + void forward_moe(Tensor& output, + const Tensor& input, + const int* indexes, + const int* offsets, + const LlamaDenseWeight& weight, + Type type, + gemm::Context* context); void set_measure(bool measure); diff --git a/src/turbomind/models/llama/LlamaV2.cc 
b/src/turbomind/models/llama/LlamaV2.cc index ad79b91789..6739cce2fa 100644 --- a/src/turbomind/models/llama/LlamaV2.cc +++ b/src/turbomind/models/llama/LlamaV2.cc @@ -24,9 +24,10 @@ #include #include "src/turbomind/comm/device_comm.h" +#include "src/turbomind/core/core.h" #include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/LlamaBatch.h" +#include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/SequenceManager.h" @@ -36,7 +37,6 @@ #include "src/turbomind/kernels/gpt_kernels.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" @@ -50,15 +50,16 @@ inline int pad_vocab_size(int vocab_size, int tp) return (vocab_size + tp - 1) / tp * tp; } -template -LlamaV2::LlamaV2(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx, - int max_batch_size, - std::shared_ptr> weights): +LlamaV2::LlamaV2(DataType dtype, + const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx, + int max_batch_size, + std::shared_ptr weights): + dtype_{dtype}, param_(model), attn_param_(attn), lora_param_(lora), @@ -76,10 +77,7 @@ LlamaV2::LlamaV2(const ModelParam& model, local_kv_head_num_(model.kv_head_num / engine.attn_tp_size), weights_(std::move(weights)), stream_(ctx.stream), - cublas_wrapper_(ctx.cublas_wrapper.get()), - allocator_(ctx.allocator.get()), - linear_(ctx.linear.get()), - is_free_buffer_after_forward_(false), + linear_(*ctx.linear), debug_(isDebug()) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -88,34 +86,19 @@ LlamaV2::LlamaV2(const ModelParam& model, use_allgather_2d_ = true; } - unified_decoder_ = std::make_unique>(model, engine, attn, moe, lora, ctx); + unified_decoder_ = std::make_unique(model, engine, attn, moe, lora, ctx); - dynamic_decode_layer_ = std::make_unique>(vocab_size_, - vocab_size_padded_, - stream_, - cublas_wrapper_, - allocator_, - is_free_buffer_after_forward_, - (cudaDeviceProp*)&ctx.cuda_device_prop); - - unified_decoder_->allocateBuffer(max_batch_size); -} - -template -LlamaV2::~LlamaV2() -{ - dynamic_decode_layer_.reset(); - unified_decoder_.reset(); + dynamic_decode_ = std::make_unique( + dtype_, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop); } -template -void LlamaV2::updateEmbedding(T* decoder_input, - const int bsz, - const int* h_input_length, - const Sequence** sequences, - int token_num, - int* lora_mask, - bool* have_embeddings) +void LlamaV2::updateEmbedding(char* decoder_input, + const int bsz, + const int* h_input_length, + const Sequence** sequences, + int token_num, + int* lora_mask, + bool* have_embeddings) { if (isTuning()) return; @@ -130,6 +113,8 @@ void LlamaV2::updateEmbedding(T* decoder_input, mask_ptr = mask.data(); } + const size_t elem_size = byte_size(dtype_, 1); + for (int i = 0; i < bsz; i++) { const auto& seq = *sequences[i]; const auto& embeddings = seq.input_embeddings; @@ -148,16 +133,16 @@ void LlamaV2::updateEmbedding(T* decoder_input, // calculate intersection of [begin, end) and [seq.cache_len, seq.cache_len + h_input_length[i]) begin = std::max(begin, seq.cache_len); end = std::min(end, seq.cache_len + h_input_length[i]); - size_t byte_size = (end - begin) * 
hidden_units_ * sizeof(T); - T* dst_ptr = decoder_input + off_dst * hidden_units_; - auto src_ptr = embeddings[j].data() + off_src * hidden_units_ * sizeof(T); - cudaMemcpyAsync(dst_ptr, src_ptr, byte_size, cudaMemcpyDefault, stream_); + size_t byte_size = elem_size * (end - begin) * hidden_units_; + char* dst_ptr = decoder_input + elem_size * off_dst * hidden_units_; + auto src_ptr = embeddings[j].data() + elem_size * off_src * hidden_units_; + check_cuda_error(cudaMemcpyAsync(dst_ptr, src_ptr, byte_size, cudaMemcpyDefault, stream_)); if (lora_mask != nullptr) { std::fill_n(mask_ptr + off_dst, (end - begin), 1); *have_embeddings = true; } } - decoder_input += h_input_length[i] * hidden_units_; + decoder_input += elem_size * h_input_length[i] * hidden_units_; mask_ptr += h_input_length[i]; } @@ -168,271 +153,206 @@ void LlamaV2::updateEmbedding(T* decoder_input, sync_check_cuda_error(); } -template -void LlamaV2::forwardUnified(T* out, - T* decoder_output, - T* decoder_input, - void** block_ptrs, - const int* cu_block_cnts, - const int* input_ids, - const int* h_input_length, - const int* h_context_length, - const float* rope_theta, - const bool* finished, - size_t token_num, - const int* local_token_nums, - int dc_batch_size, - int pf_batch_size, - int* lora_mask, - const Sequence** sequences) +void LlamaV2::Forward(Buffer_ input_ids, + Tensor hidden_states_out, + Tensor decoder_out, + Buffer kv_block_ptrs, + Buffer cu_block_nums, + Buffer_ h_input_length, + Buffer_ h_context_length, + Buffer rope_base, + Buffer finished, + Buffer local_token_nums, + Buffer lora_mask, + int decode_num, + int prefil_num, + const Sequence** sequences) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); + Tensor input_embeds; + + const int token_num = input_ids.size(); + if (token_num) { + const auto& embedding_table = weights_->pre_decoder_embedding.weight; + TM_CHECK_EQ(embedding_table.shape(1) * tp_size_, hidden_units_); + + input_embeds = Tensor{{token_num, (int)hidden_units_}, dtype_, kDEVICE}; + if (tp_size_ == 1) { - invokeInputIdsEmbeddingLookupPosEncoding(decoder_input, - nullptr, // processed somewhere else - weights_->pre_decoder_embedding_table, - static_cast(nullptr), - pPromptTuningParam{}, - input_ids, - 0, // only used for position encoding - token_num, - token_num, - 1, - hidden_units_, - stream_); + invokeEmbeddingLookup(input_embeds, input_ids, embedding_table, stream_); sync_check_cuda_error(); } - else { - const size_t local_hidden_units = hidden_units_ / tp_size_; - const size_t slice = token_num * local_hidden_units; - invokeInputIdsEmbeddingLookupPosEncoding(decoder_output + tp_rank_ * slice, - nullptr, // processed somewhere else - weights_->pre_decoder_embedding_table, - static_cast(nullptr), - pPromptTuningParam{}, - input_ids, - 0, // only used for position encoding - token_num, - token_num, - 1, - local_hidden_units, - stream_); + else if (use_allgather_2d_) { + const auto local_hidden_units = embedding_table.shape(1); + Tensor temp{hidden_states_out.buffer(), {token_num, tp_size_, local_hidden_units}}; + + auto local = temp.slice({0, tp_rank_, 0}, {-1, 1, -1}).squeeze(1); + + invokeEmbeddingLookup(local, input_ids, embedding_table, stream_); sync_check_cuda_error(); - comm_->d_comm->AllGather(decoder_output + tp_rank_ * slice, - decoder_output, - slice, - getTensorType(), - comm_->d_tp_group, - stream_); + comm_->d_comm->AllGather2D(local.raw_data(), + temp.raw_data(), + hidden_units_, + local_hidden_units, + local_hidden_units, + token_num, + local.dtype(), + {true, true}, + 
comm_->d_tp_group, + stream_); sync_check_cuda_error(); - invokeInPlaceTranspose102( - decoder_input, decoder_output, tp_size_, token_num, local_hidden_units, false, stream_); + Copy(temp.buffer(), input_embeds.buffer()); + } + else { + const auto local_hidden_units = embedding_table.shape(1); + Tensor temp{hidden_states_out.buffer(), {tp_size_, token_num, local_hidden_units}}; + + auto local = temp.slice(tp_rank_).squeeze(0); + + invokeEmbeddingLookup(local, input_ids, embedding_table, stream_); + sync_check_cuda_error(); + comm_->d_comm->AllGather( + local.raw_data(), temp.raw_data(), local.size(), dtype_, comm_->d_tp_group, stream_); + sync_check_cuda_error(); + + invokeInPlaceTranspose102((uint16_t*)input_embeds.raw_data(), + (uint16_t*)temp.raw_data(), + tp_size_, + token_num, + local_hidden_units, + false, + stream_); sync_check_cuda_error(); } - count_and_fix(decoder_input, token_num * hidden_units_, "embedding", 1); + TM_DEBUG_TENSOR(input_embeds, "embeddings", 1); } bool have_embeddings = false; if (token_num) { - updateEmbedding(decoder_input, - dc_batch_size + pf_batch_size, - h_input_length, + // Copy input embeddings from corresponding sequences + updateEmbedding((char*)input_embeds.raw_data(), + h_input_length.size(), + h_input_length.data(), sequences, token_num, - lora_mask, + lora_mask ? lora_mask.data() : nullptr, &have_embeddings); sync_check_cuda_error(); } - const auto dtype = getTensorType(); - const size_t bsz = dc_batch_size + pf_batch_size; - - TensorMap inputs{ - {"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_input}}, - {"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}}, - {"h_q_len", {MEMORY_CPU, TYPE_INT32, {bsz}, h_input_length}}, - {"h_k_len", {MEMORY_CPU, TYPE_INT32, {bsz}, h_context_length}}, - {"finished", {MEMORY_GPU, TYPE_BOOL, {bsz}, finished}}, - {"dc_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &dc_batch_size}}, - {"pf_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &pf_batch_size}}, - {"rope_theta", {MEMORY_GPU, TYPE_FP32, {hidden_units_}, rope_theta}}, - {"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {bsz}, cu_block_cnts}}, - {"local_token_nums", {MEMORY_GPU, TYPE_INT32, {1}, local_token_nums}}, - }; - - TensorMap outputs{{"decoder_output", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_output}}, - {"block_ptrs", {MEMORY_GPU, TYPE_UINT64, {bsz}, block_ptrs}}, - {"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, out}}}; - - if (lora_mask != nullptr && have_embeddings) { - inputs.insert({"lora_mask", {MEMORY_GPU, TYPE_INT32, {token_num}, lora_mask}}); - } - - unified_decoder_->forward(&outputs, &inputs, &weights_->decoder_layer_weights); + TensorMap args{{"decoder_input", input_embeds}, + {"decoder_output", hidden_states_out.view({-1, (int)hidden_units_}).borrow()}, + {"last_token_hidden_units", decoder_out}, + {"output_norm_weight", weights_->output_norm_weight}, + {"h_q_len", h_input_length}, + {"h_k_len", h_context_length}, + {"finished", finished}, + {"decode_num", Buffer{&decode_num, 1, kCPU}}, + {"prefil_num", Buffer{&prefil_num, 1, kCPU}}, + {"rope_base", rope_base}, + {"cu_block_nums", cu_block_nums}, + {"kv_block_ptrs", kv_block_ptrs}, + {"local_token_nums", local_token_nums}}; + + unified_decoder_->Forward(args, weights_->decoder_layer_weights); } -template -void LlamaV2::postDecodeEmbedding(T* logits, T* local_logits, const T* decoder_output, int batch_size) +Tensor LlamaV2::postDecodeEmbedding(const Tensor& features, Buffer local_logits) { NvtxScope 
scope("postDecodeEmbedding"); TM_LOG_DEBUG(__PRETTY_FUNCTION__); - cudaDataType_t data_type = getCudaDataType(); - float alpha = 1.f; - float beta = 0.f; - FT_CHECK(vocab_size_padded_ % tp_size_ == 0); - const size_t local_vocab_size = vocab_size_padded_ / tp_size_; - - auto invoke_gemm = [&](int first, int n, auto C, size_t batch_stride_C, size_t rank_stride_C) { - cublas_wrapper_->Gemm(CUBLAS_OP_T, - CUBLAS_OP_N, - local_vocab_size, // m - n, - hidden_units_, // k - &alpha, - weights_->post_decoder_embedding_kernel, - data_type, - hidden_units_, // k - decoder_output + first * hidden_units_, - data_type, - hidden_units_, // k - &beta, - C + first * batch_stride_C + tp_rank_ * rank_stride_C, - data_type, - batch_stride_C, // ldc - CUDA_R_32F, - cublasGemmAlgo_t(-1)); - }; + TM_CHECK(vocab_size_padded_ % tp_size_ == 0) << vocab_size_padded_ << " " << tp_size_; + + const int bsz = features.shape(0); + const int local_vocab_size = vocab_size_padded_ / tp_size_; if (tp_size_ == 1) { - invoke_gemm(0, batch_size, logits, vocab_size_padded_, 0); + Tensor logits{local_logits, {bsz, (int)vocab_size_padded_}}; + linear_.forward(features, weights_->post_decoder_embedding, LlamaLinear::kGemm, logits); sync_check_cuda_error(); + + TM_DEBUG_TENSOR(logits, "logits", 1); + return logits; } - else if (use_allgather_2d_ == false) { - FT_CHECK(logits != local_logits); - const size_t slice = batch_size * local_vocab_size; - invoke_gemm(0, batch_size, local_logits, local_vocab_size, slice); - sync_check_cuda_error(); - comm_->d_comm->AllGather( - local_logits + tp_rank_ * slice, local_logits, slice, getTensorType(), comm_->d_tp_group, stream_); + else if (use_allgather_2d_) { + Tensor logits{local_logits, {bsz, tp_size_, local_vocab_size}}; + Tensor local = logits.slice({0, tp_rank_, 0}, {-1, 1, -1}); + linear_.forward(features, weights_->post_decoder_embedding, LlamaLinear::kGemm, local.squeeze(1)); sync_check_cuda_error(); - invokeTransposeAxis01(logits, local_logits, tp_size_, batch_size, local_vocab_size, stream_); + comm_->d_comm->AllGather2D(local.raw_data(), + logits.raw_data(), + vocab_size_padded_, + local_vocab_size, + local_vocab_size, + bsz, + logits.dtype(), + {true, true}, + comm_->d_tp_group, + stream_); sync_check_cuda_error(); + return logits.view({bsz, -1}); } else { - FT_CHECK(logits == local_logits); - const int max_stages = 1; - const int min_stage_tokens = 512; - const int step = std::max(std::min(batch_size, min_stage_tokens), (batch_size + max_stages - 1) / max_stages); - cudaStream_t comm_stream = stream_; - cudaEvent_t comm_event{}; - if (step < batch_size) { - check_cuda_error(cudaStreamCreateWithFlags(&comm_stream, cudaStreamNonBlocking)); - check_cuda_error(cudaEventCreateWithFlags(&comm_event, cudaEventDisableTiming)); - } - for (int first = 0; first < batch_size; first += step) { - const int n = std::min(first + step, batch_size) - first; - invoke_gemm(first, n, local_logits, vocab_size_padded_, local_vocab_size); - sync_check_cuda_error(); - if (comm_stream != stream_) { - check_cuda_error(cudaEventRecord(comm_event, stream_)); - check_cuda_error(cudaStreamWaitEvent(comm_stream, comm_event)); - } - comm_->d_comm->AllGather2D(local_logits + first * vocab_size_padded_ + tp_rank_ * local_vocab_size, - local_logits + first * vocab_size_padded_, - vocab_size_padded_, - local_vocab_size, - local_vocab_size, - n, - getTensorType(), - {first == 0, first + n == batch_size}, - comm_->d_tp_group, - comm_stream); - sync_check_cuda_error(); - } - if (comm_stream != stream_) { - 
check_cuda_error(cudaEventRecord(comm_event, comm_stream)); - check_cuda_error(cudaStreamWaitEvent(stream_, comm_event)); - check_cuda_error(cudaEventDestroy(comm_event)); - check_cuda_error(cudaStreamDestroy(comm_stream)); - } + Tensor logits{local_logits, {tp_size_, bsz, local_vocab_size}}; + Tensor local = logits.slice({tp_rank_, 0, 0}, {1, -1, -1}); + linear_.forward(features, weights_->post_decoder_embedding, LlamaLinear::kGemm, local.squeeze(0)); + sync_check_cuda_error(); + comm_->d_comm->AllGather( + local.raw_data(), logits.raw_data(), local.size(), local.dtype(), comm_->d_tp_group, stream_); + sync_check_cuda_error(); + Tensor out{{bsz, (int)vocab_size_padded_}, features.dtype(), features.device()}; + invokeTransposeAxis01( + (uint16_t*)out.raw_data(), (uint16_t*)logits.raw_data(), tp_size_, bsz, local_vocab_size, stream_); + sync_check_cuda_error(); + return out; } } -template -void LlamaV2::dynamicDecode(int* token_ids, - bool* finished, - int* sequence_length, - bool* should_stop, - curandState_t* curand_state, - TensorMap* inputs, - TensorMap* outputs, - const T* logits, - const uint32_t* seq_limit_len, - const int* context_length, - int step, - int ite, - size_t max_context_len, - size_t token_ids_len, - size_t batch_size) +void LlamaV2::dynamicDecode(Buffer token_ids, + Buffer finished, + Buffer sequence_length, + Tensor curand_state, + Tensor logits, + Buffer seq_limit_len, + Buffer init_context_length, + Buffer context_length, + Buffer prompt_length, + Buffer sampled_logprobs, + Buffer sampled_indexes, + Buffer sampled_nums, + int step, + int max_context_len) { NvtxScope scope("dynamicDecode"); TM_LOG_DEBUG(__PRETTY_FUNCTION__); - int local_batch_size = (int)batch_size; - - std::unordered_map dynamic_decode_input_tensors{ - {"logits", {MEMORY_GPU, getTensorType(), {batch_size, (size_t)1, vocab_size_padded_}, logits}}, - {"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}}, - {"max_input_length", {MEMORY_CPU, TYPE_INT32, {1}, &max_context_len}}, - {"sequence_limit_length", {MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len}}, - {"input_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size, 1}, context_length}}, - {"ite", {MEMORY_CPU, TYPE_UINT32, {1}, &ite}}, - {"local_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}, + TensorMap args{ + {"logits", logits}, + {"step", Buffer{&step, 1, kCPU}}, + {"max_input_length", Buffer{&max_context_len, 1, kCPU}}, + {"sequence_limit_length", seq_limit_len}, + {"init_context_length", init_context_length}, + {"context_length", context_length}, + {"prompt_length", prompt_length}, + {"output_ids", token_ids}, // inout + {"finished", finished}, // inout + {"sequence_length", sequence_length}, // inout + {"curand_state", curand_state}, // inout }; - const std::vector optional_inputs{"end_ids", - "stop_words_list", - "bad_words_list", - "runtime_top_k", - "runtime_top_p", - "temperature", - "repetition_penalty"}; - for (const auto& key : optional_inputs) { - if (inputs->isExist(key)) { - dynamic_decode_input_tensors.insert({key, inputs->at(key)}); - } + if (sampled_logprobs) { + args.emplace("sampled_logprobs", sampled_logprobs); + args.emplace("sampled_indexes", sampled_indexes); + args.emplace("sampled_nums", sampled_nums); } - std::unordered_map dynamic_decode_output_tensors{ - {"output_ids", {MEMORY_GPU, TYPE_INT32, {token_ids_len, batch_size, 1U}, token_ids}}, - {"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}}, - {"sequence_length", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}}, - {"should_stop", {MEMORY_CPU, 
TYPE_BOOL, {1}, should_stop}}, - {"curand_state", {MEMORY_GPU, TYPE_VOID, {batch_size}, curand_state}}}; - - const std::vector optional_outputs{ - "cum_log_probs", "output_log_probs", "sampled_indexes", "sampled_logprobs", "sampled_nums"}; - for (const auto& key : optional_outputs) { - if (outputs->isExist(key)) { - dynamic_decode_output_tensors.insert({key, outputs->at(key)}); - } - } - - dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors); + dynamic_decode_->Forward(args); } -template class LlamaV2; -#ifdef ENABLE_FP32 -template class LlamaV2; -#endif -#ifdef ENABLE_BF16 -template class LlamaV2<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaV2.h b/src/turbomind/models/llama/LlamaV2.h index 445c778c5d..e799070b3a 100644 --- a/src/turbomind/models/llama/LlamaV2.h +++ b/src/turbomind/models/llama/LlamaV2.h @@ -29,24 +29,22 @@ #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/unified_decoder.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" namespace turbomind { -template +class LlamaBatch; + class LlamaV2 { public: - ~LlamaV2(); - - LlamaV2(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx, - int max_batch_size, - std::shared_ptr> weights); + LlamaV2(DataType dtype, + const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx, + int max_batch_size, + std::shared_ptr weights); size_t vocab_size() const noexcept { @@ -54,7 +52,7 @@ class LlamaV2 { } private: - void updateEmbedding(T* decoder_input, + void updateEmbedding(char* decoder_input, const int bsz, const int* h_input_length, const Sequence** sequences, @@ -62,43 +60,42 @@ class LlamaV2 { int* lora_mask, bool* have_embeddings); - void forwardUnified(T* out, - T* decoder_output, - T* decoder_input, - void** block_ptrs, - const int* cu_block_cnts, - const int* input_ids, - const int* h_input_length, - const int* h_context_length, - const float* rope_theta, - const bool* finished, - size_t token_num, - const int* local_token_nums, - int dc_batch_size, - int pf_batch_size, - int* lora_mask, - const Sequence** sequences); - - void postDecodeEmbedding(T* logits, T* local_logits, const T* decoder_output, int batch_size); - - void dynamicDecode(int* token_ids, - bool* finished, - int* sequence_length, - bool* should_stop, - curandState_t* curand_state, - TensorMap* inputs, - TensorMap* outputs, - const T* logits, - const uint32_t* seq_limit_len, - const int* context_length, - int step, - int ite, - size_t max_context_len, - size_t token_ids_len, - size_t batch_size); + void Forward(Buffer_ input_ids, + Tensor hidden_states_out, + Tensor decoder_out, + Buffer kv_block_ptrs, + Buffer cu_block_nums, + Buffer_ h_input_length, + Buffer_ h_context_length, + Buffer rope_base, + Buffer finished, + Buffer local_token_nums, + Buffer lora_mask, + int decode_num, + int prefil_num, + const Sequence** sequences); + + Tensor postDecodeEmbedding(const Tensor& features, Buffer local_logits); + + void dynamicDecode(Buffer token_ids, + Buffer finished, + Buffer sequence_length, + Tensor curand_state, + Tensor logits, + Buffer seq_limit_len, + Buffer init_context_length, + Buffer context_length, + Buffer prompt_length, + Buffer sampled_logprobs, // 
<- indicator + Buffer sampled_indexes, + Buffer sampled_nums, + int step, + int max_context_len); private: - friend class LlamaBatch; + friend class LlamaBatch; + + const DataType dtype_; const ModelParam param_; const AttentionParam attn_param_; @@ -118,21 +115,18 @@ class LlamaV2 { const size_t local_head_num_; const size_t local_kv_head_num_; - const std::shared_ptr> weights_{}; + const std::shared_ptr weights_; - // Refs into `Context`, make the pointer constant (not the pointed objects) - cudaStream_t const stream_; - cublasMMWrapper* const cublas_wrapper_; - IAllocator* const allocator_; - LlamaLinear* const linear_; + // Refs into `Context`, make the pointer constant (not the pointed objects) + cudaStream_t const stream_; + LlamaLinear& linear_; bool use_allgather_2d_{false}; - const bool is_free_buffer_after_forward_; const bool debug_; - std::unique_ptr> unified_decoder_; - std::unique_ptr> dynamic_decode_layer_; + std::unique_ptr unified_decoder_; + std::unique_ptr dynamic_decode_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaWeight.cc b/src/turbomind/models/llama/LlamaWeight.cc index 4aca3c0056..0a23f986d6 100644 --- a/src/turbomind/models/llama/LlamaWeight.cc +++ b/src/turbomind/models/llama/LlamaWeight.cc @@ -18,26 +18,30 @@ // Modified from // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc +#include + +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/context.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/memory_utils.h" -#include namespace turbomind { -template -LlamaWeight::LlamaWeight(const ModelParam& model, - const EngineParam& engine_param, - const LoraParam& lora_param, - const MoeParam& moe_param): +LlamaWeight::LlamaWeight(DataType data_type, + const ModelParam& model, + const EngineParam& engine_param, + const LoraParam& lora_param, + const MoeParam& moe_param): hidden_units_(model.hidden_units), inter_size_(model.inter_size), vocab_size_(model.vocab_size), vocab_size_padded_(model.vocab_size), embedding_size_(model.embedding_size), num_layer_(model.layer_num), - weight_type_(model.weight_type), + data_type_{data_type}, + weight_type_{model.weight_type}, tp_size_(engine_param.attn_tp_size), tp_rank_(engine_param.attn_tp_rank) { @@ -51,138 +55,65 @@ LlamaWeight::LlamaWeight(const ModelParam& model, } FT_CHECK(hidden_units_ % tp_size_ == 0); - check_cuda_error(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + stream_ = core::Stream::create(); + alloca_ = core::Allocator{stream_, false}; + + core::ContextGuard guard = context(); + + TM_CHECK_EQ(vocab_size_padded_ % tp_size_, 0); + TM_CHECK_EQ(hidden_units_ % tp_size_, 0); + + pre_decoder_embedding.emplace(embedding_size_, hidden_units_ / tp_size_, data_type, false, data_type, 1); + post_decoder_embedding.emplace(hidden_units_, vocab_size_padded_ / tp_size_, data_type, false, data_type, 1); + register_module("tok_embeddings", pre_decoder_embedding, tp_rank_); + register_module("output", post_decoder_embedding, tp_rank_); decoder_layer_weights.reserve(num_layer_); - for (unsigned l = 0; l < num_layer_; ++l) { + for (int i = 0; i < num_layer_; ++i) { decoder_layer_weights.emplace_back( - new LlamaDecoderLayerWeight(l, model, engine_param, lora_param, moe_param)); - 
decoder_layer_weights.back()->malloc(stream_); + new LlamaDecoderLayerWeight(data_type, i, model, engine_param, lora_param, moe_param)); + register_module("layers", *decoder_layer_weights.back(), i); } - FT_CHECK(vocab_size_padded_ % tp_size_ == 0); - deviceMalloc((T**)&pre_decoder_embedding_table, embedding_size_ * hidden_units_ / tp_size_, stream_); - deviceMalloc((T**)&output_norm_weight, hidden_units_, stream_); - deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_ / tp_size_, stream_); - - // Wait for allocations - check_cuda_error(cudaStreamSynchronize(stream_)); + output_norm_weight = Tensor{{hidden_units_}, data_type_, kDEVICE}; + register_parameter("norm.weight", output_norm_weight); } -template -LlamaWeight::~LlamaWeight() +LlamaWeight::~LlamaWeight() { - deviceFree(pre_decoder_embedding_table, stream_); - deviceFree(output_norm_weight, stream_); - deviceFree(post_decoder_embedding_kernel, stream_); + core::ContextGuard guard = context(); + + pre_decoder_embedding = {}; + post_decoder_embedding = {}; + output_norm_weight = {}; for (auto& p : decoder_layer_weights) { - p->free(stream_); delete p; } decoder_layer_weights.clear(); // Wait for deallocations - check_cuda_error(cudaStreamSynchronize(stream_)); - check_cuda_error(cudaStreamDestroy(stream_)); - stream_ = {}; -} - -template -void LlamaWeight::loadModel(std::string dir_path) -{ - FtCudaDataType model_file_type = FtCudaDataType::FP16; - if (weight_type_ == WeightType::kBF16) { - model_file_type = FtCudaDataType::BF16; - } - dir_path += '/'; - - loadWeightFromBin((T*)pre_decoder_embedding_table, - {embedding_size_ * hidden_units_ / tp_size_}, - dir_path + "tok_embeddings." + std::to_string(tp_rank_) + ".weight", - model_file_type); - - loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type); - - loadWeightFromBin((T*)post_decoder_embedding_kernel, - {hidden_units_ * vocab_size_padded_ / tp_size_}, - dir_path + "output." + std::to_string(tp_rank_) + ".weight", - model_file_type); - - for (unsigned layer = 0; layer < num_layer_; ++layer) { - decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type); - } + core::Context::stream().Sync(); } -template -TensorMap LlamaWeight::getParams() +core::ContextGuard LlamaWeight::context() const { - TensorMap output; - - output.insert("tok_embeddings." + std::to_string(tp_rank_) + ".weight", - Tensor{MEMORY_GPU, - getTensorType(), - {embedding_size_ * hidden_units_ / tp_size_ * sizeof(T)}, - pre_decoder_embedding_table}); - - output.insert("norm.weight", - Tensor{MEMORY_GPU, getTensorType(), {hidden_units_ * sizeof(T)}, output_norm_weight}); - - output.insert("output." 
+ std::to_string(tp_rank_) + ".weight", - Tensor{MEMORY_GPU, - getTensorType(), - {hidden_units_ * vocab_size_padded_ * sizeof(T) / tp_size_}, - post_decoder_embedding_kernel}); - - // transformer layers - for (size_t i = 0; i < num_layer_; i++) { - std::string prefix = fmtstr("layers.%d", i); - TensorMap layeri = decoder_layer_weights[i]->getParams(prefix); - for (auto [name, tensor] : layeri) { - output.insert(name, tensor); - } - } - - return output; + return core::ContextGuard{stream_, alloca_}; } -template -void LlamaWeight::prepare(const cudaDeviceProp& prop) +void LlamaWeight::prepare(const cudaDeviceProp& prop) { - const auto workspace_size = [&] { - size_t size{}; - for (const auto& layer : decoder_layer_weights) { - size = std::max(size, layer->workspace_size()); - } - return size; - }(); - - char* workspace{}; - - TM_LOG_INFO("[LlamaWeight::prepare] workspace size: %d", workspace_size); + core::ContextGuard guard = context(); // Wait for the weights to be filled externally check_cuda_error(cudaDeviceSynchronize()); - if (workspace_size) { - deviceMalloc((char**)&workspace, workspace_size, stream_); - } + auto stream = core::Context::stream().handle(); + for (auto& layer : decoder_layer_weights) { - layer->prepare(workspace, workspace_size, prop, stream_); + layer->prepare(prop, stream); } - - deviceFree(workspace, stream_); - - check_cuda_error(cudaStreamSynchronize(stream_)); } -#ifdef ENABLE_FP32 -template struct LlamaWeight; -#endif -template struct LlamaWeight; -#ifdef ENABLE_BF16 -template struct LlamaWeight<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/models/llama/LlamaWeight.h b/src/turbomind/models/llama/LlamaWeight.h index 8bc77dc26b..08a3f7d7d7 100644 --- a/src/turbomind/models/llama/LlamaWeight.h +++ b/src/turbomind/models/llama/LlamaWeight.h @@ -20,16 +20,18 @@ #pragma once +#include "src/turbomind/core/context.h" #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h" +#include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/llama_params.h" namespace turbomind { -template -struct LlamaWeight { +struct LlamaWeight: core::Module { LlamaWeight() = default; - LlamaWeight(const ModelParam& model_param, + LlamaWeight(DataType data_type, + const ModelParam& model_param, const EngineParam& engine_param, const LoraParam& lora_param, const MoeParam& moe_param); @@ -39,31 +41,34 @@ struct LlamaWeight { LlamaWeight(const LlamaWeight&) = delete; LlamaWeight& operator=(const LlamaWeight&) = delete; - void loadModel(std::string dir_path); + void prepare(const cudaDeviceProp& prop); - TensorMap getParams(); + core::ContextGuard context() const; - void prepare(const cudaDeviceProp& prop); + std::vector decoder_layer_weights; - std::vector*> decoder_layer_weights; + LlamaDenseWeight pre_decoder_embedding; + LlamaDenseWeight post_decoder_embedding; - T* pre_decoder_embedding_table{}; - T* output_norm_weight{}; - T* post_decoder_embedding_kernel{}; + Tensor output_norm_weight; private: - size_t hidden_units_; - size_t vocab_size_; - size_t vocab_size_padded_; - size_t embedding_size_; - size_t num_layer_; - WeightType weight_type_; - size_t tp_size_; // this will follow attn tp param - size_t tp_rank_; + int hidden_units_; + int vocab_size_; + int vocab_size_padded_; + int embedding_size_; + int num_layer_; + + DataType data_type_; + DataType weight_type_; + + int tp_size_; // this will follow attn tp param + int tp_rank_; std::vector inter_size_; - cudaStream_t stream_; + core::Stream stream_; + 
core::Allocator alloca_; }; } // namespace turbomind diff --git a/src/turbomind/models/llama/SequenceManager.cc b/src/turbomind/models/llama/SequenceManager.cc index 9497f42164..623ae3e332 100644 --- a/src/turbomind/models/llama/SequenceManager.cc +++ b/src/turbomind/models/llama/SequenceManager.cc @@ -3,15 +3,12 @@ #include "src/turbomind/models/llama/SequenceManager.h" #include "src/turbomind/kernels/attention/block.h" #include "src/turbomind/models/llama/BlockManager.h" -#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/debug_utils.h" #include "src/turbomind/utils/logger.h" #include #include #include #include -#include - namespace turbomind { SequenceManager::SequenceManager(size_t layer_num, @@ -20,7 +17,7 @@ SequenceManager::SequenceManager(size_t layer_num, int chunk_size, bool enable_prefix_caching, int rank, - IAllocator* allocator, + core::Allocator allocator, GetFreeMemSize get_free_size): block_seq_len_(block_config.block_len_), rank_(rank) { diff --git a/src/turbomind/models/llama/SequenceManager.h b/src/turbomind/models/llama/SequenceManager.h index a71a556aaa..3e17ff3553 100644 --- a/src/turbomind/models/llama/SequenceManager.h +++ b/src/turbomind/models/llama/SequenceManager.h @@ -2,9 +2,12 @@ #pragma once +#include + +#include "src/turbomind/core/allocator.h" + #include "src/turbomind/models/llama/BlockManager.h" #include "src/turbomind/models/llama/BlockTrie.h" -#include namespace turbomind { @@ -78,7 +81,7 @@ class SequenceManager { int chunk_size, bool enable_prefix_caching, int rank, - IAllocator* allocator, + core::Allocator allocator, GetFreeMemSize get_free_size); SequenceManager(const SequenceManager&) = delete; diff --git a/src/turbomind/models/llama/context.h b/src/turbomind/models/llama/context.h index 062db42247..33b7be29ac 100644 --- a/src/turbomind/models/llama/context.h +++ b/src/turbomind/models/llama/context.h @@ -10,9 +10,8 @@ #include #include "src/turbomind/comm/device_comm.h" +#include "src/turbomind/core/core.h" #include "src/turbomind/models/llama/LlamaLinear.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasMMWrapper.h" namespace turbomind { @@ -26,76 +25,21 @@ struct Communicators { }; // Execution context for the model -template struct Context { - cudaStream_t stream; - std::unique_ptr> allocator; - cublasHandle_t cublas_handle; - cublasLtHandle_t cublasLt_handle; - std::unique_ptr cublas_algo_map; - std::unique_ptr cublas_wrapper_mutex; - std::unique_ptr cublas_wrapper; - std::unique_ptr> linear; - Communicators comm; - cudaDeviceProp cuda_device_prop; - - Context(int device_id) + core::Stream core_stream; + core::Allocator allocator; + cudaStream_t stream; + std::unique_ptr linear; + cudaDeviceProp device_prop; + Communicators comm; // initialize later + + Context(int device_id): + core_stream{core::Stream::create()}, + allocator{core::Allocator(core_stream, false)}, + stream{core_stream.handle()}, + linear{std::make_unique(stream)} { - check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - allocator = std::make_unique>(device_id, false); - allocator->setStream(stream); - - cublasCreate(&cublas_handle); - cublasLtCreate(&cublasLt_handle); - cublasSetStream(cublas_handle, stream); - - if (0) { - cublasSetWorkspace(cublas_handle, nullptr, 0); - cublasSetMathMode(cublas_handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); - } - - cublas_algo_map = std::make_unique("gemm_config.in"); - cublas_wrapper_mutex = std::make_unique(); - cublas_wrapper = 
std::make_unique( - cublas_handle, cublasLt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get()); - linear = std::make_unique>(cublas_wrapper.get(), stream); - - check_cuda_error(cudaGetDeviceProperties(&cuda_device_prop, device_id)); - - if (std::is_same::value) { - cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F); - } -#ifdef ENABLE_FP32 - else if (std::is_same::value) { - cublas_wrapper->setFP32GemmConfig(); - } -#endif -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - cublas_wrapper->setBF16GemmConfig(); - } -#endif - } - - ~Context() - { - linear.reset(); - cublas_wrapper.reset(); - cublas_algo_map.reset(); - - cublasDestroy(cublas_handle); - cublas_handle = {}; - - cublasLtDestroy(cublasLt_handle); - cublasLt_handle = {}; - - allocator.reset(); - - // `comm` destroyed by infer threads collectively - - cudaStreamDestroy(stream); - stream = {}; + check_cuda_error(cudaGetDeviceProperties(&device_prop, device_id)); } }; diff --git a/src/turbomind/models/llama/llama_decoder_kernels.cu b/src/turbomind/models/llama/llama_decoder_kernels.cu deleted file mode 100644 index f0ed63ca72..0000000000 --- a/src/turbomind/models/llama/llama_decoder_kernels.cu +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/llama_decoder_kernels.h" -#include "src/turbomind/utils/cuda_type_utils.cuh" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include - -namespace cg = cooperative_groups; - -namespace turbomind { - -template -struct res_norm_ops_t { -}; - -template -struct res_norm_t { - res_norm_ops_t f; - __device__ uint4 addvec(const uint4& a, const uint4& b, const uint4& bias, float& accum) const - { - uint4 c; - c.x = f.cast(f.add(f.cast(a.x), f.cast(b.x), f.cast(bias.x), accum)); - c.y = f.cast(f.add(f.cast(a.y), f.cast(b.y), f.cast(bias.y), accum)); - c.z = f.cast(f.add(f.cast(a.z), f.cast(b.z), f.cast(bias.z), accum)); - c.w = f.cast(f.add(f.cast(a.w), f.cast(b.w), f.cast(bias.w), accum)); - return c; - } - __device__ uint4 normvec(const uint4& u, const uint4& s, float factor) const - { - uint4 v; - v.x = f.cast(f.norm(f.cast(u.x), f.cast(s.x), factor)); - v.y = f.cast(f.norm(f.cast(u.y), f.cast(s.y), factor)); - v.z = f.cast(f.norm(f.cast(u.z), f.cast(s.z), factor)); - v.w = f.cast(f.norm(f.cast(u.w), f.cast(s.w), factor)); - return v; - } -}; - -template<> -struct res_norm_ops_t { - __device__ float2 cast(const uint& x) const - { - return __half22float2(reinterpret_cast(x)); - } - __device__ uint cast(const float2& x) const - { - auto y = __float22half2_rn(x); - return reinterpret_cast(y); - } - __device__ float2 add(const float2& a, const float2& b, const float2& bias, float& accum) const - { - float2 c{a.x + b.x + bias.x, a.y + b.y + bias.y}; - accum += c.x * c.x + c.y * c.y; - return c; - } - __device__ float2 norm(const float2& a, const float2& s, float factor) const - { - return {a.x * s.x * factor, a.y * s.y * factor}; - } -}; - -template<> -struct res_norm_ops_t { - __device__ float cast(const uint& x) const - { - return reinterpret_cast(x); - } - __device__ uint cast(const float& x) const - { - return reinterpret_cast(x); - } - __device__ float add(const float& a, const float& b, const float& bias, float& accum) const - { - float c = a + b + bias; - accum += c * c; - return c; - } - __device__ float norm(const float& a, const float& s, float factor) const - { - return a * s * factor; - } -}; - -#ifdef 
ENABLE_BF16 -template<> -struct res_norm_ops_t<__nv_bfloat16> { - __device__ float2 cast(const uint& x) const - { - return cuda_cast(reinterpret_cast(x)); - } - __device__ uint cast(const float2& x) const - { - auto y = cuda_cast<__nv_bfloat162, float2>(x); - return reinterpret_cast(y); - } - __device__ float2 add(const float2& a, const float2& b, const float2& bias, float& accum) const - { - float2 c{a.x + b.x + bias.x, a.y + b.y + bias.y}; - accum += c.x * c.x + c.y * c.y; - return c; - } - __device__ float2 norm(const float2& a, const float2& s, float factor) const - { - return {a.x * s.x * factor, a.y * s.y * factor}; - } -}; - -#endif - -template -__device__ T blockReduceSum(const cg::thread_block& block, T value) -{ - __shared__ float partial[32]; - - auto tile = cg::tiled_partition<32>(block); - value = cg::reduce(tile, value, cg::plus{}); - - if (tile.thread_rank() == 0) { - partial[tile.meta_group_rank()] = value; - } - - block.sync(); - - value = tile.thread_rank() < tile.meta_group_size() ? partial[tile.thread_rank()] : T{}; - return cg::reduce(tile, value, cg::plus{}); -} - -// r' = r + x -// x' = norm(r') * scales -template -__global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data, - T* __restrict__ x_data, - const T* __restrict__ bias, - const T* __restrict__ scale, - float eps, - int batch_size, - int n_dims) -{ - auto block = cg::this_thread_block(); - auto grid = cg::this_grid(); - - constexpr int PACK_DIM = sizeof(uint4) / sizeof(T); - - const auto batch_idx = block.group_index().x; - uint4* __restrict__ r_ptr = reinterpret_cast(r_data + batch_idx * n_dims); - uint4* __restrict__ x_ptr = reinterpret_cast(x_data + batch_idx * n_dims); - const uint4* __restrict__ b_ptr = reinterpret_cast(bias); - - res_norm_t ops; - - float thread_sum{}; - for (auto i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.size()) { - auto r = r_ptr[i]; - auto x = x_ptr[i]; - uint4 b = b_ptr ? b_ptr[i] : uint4{}; - r = ops.addvec(r, x, b, thread_sum); - r_ptr[i] = r; - } - - auto total_sum = blockReduceSum(block, thread_sum); - - float s_inv_mean = rsqrt(total_sum / n_dims + eps); - - const uint4* __restrict__ s_ptr = reinterpret_cast(scale); - for (uint i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.size()) { - auto r = r_ptr[i]; - auto s = s_ptr[i]; - auto o = ops.normvec(r, s, s_inv_mean); - x_ptr[i] = o; - } -} - -template -void invokeFusedAddBiasResidualRMSNorm( - T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream) -{ - constexpr int PACK_DIM = sizeof(uint4) / sizeof(T); - FT_CHECK(n_dims % PACK_DIM == 0); - const int n_pack = n_dims / PACK_DIM; - const int n_iter = ((n_pack + 1023) / 1024); // iterations when block size == 1024 - int n_threads = (n_pack + n_iter - 1) / n_iter; // adjust block size to avoid tail effect - n_threads = (n_threads + 31) / 32 * 32; // round up to the nearest multiple of warp size - - fusedAddBiasResidualNorm<<>>( - residual, in_out, bias, scale, eps, batch_size, n_dims); -} - -template -__global__ void maskOutput(T* output, const int* mask, int dim) -{ - int batch_idx = blockIdx.x; - output += dim * batch_idx; - int masked = mask[batch_idx]; - for (int i = threadIdx.x; i < dim; i += blockDim.x) { - output[i] = (masked) ? 
output[i] : T(); - } -} - -template -void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream) -{ - maskOutput<<>>(output, mask, dim); -} - -#ifdef ENABLE_FP32 -template void -invokeFusedAddBiasResidualRMSNorm(float*, float*, const float*, const float*, float, int, int, cudaStream_t); -template void invokeMask(float* output, const int* mask, int batch_size, int dim, cudaStream_t stream); -#endif -template void invokeFusedAddBiasResidualRMSNorm(half*, half*, const half*, const half*, float, int, int, cudaStream_t); -template void invokeMask(half* output, const int* mask, int batch_size, int dim, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeFusedAddBiasResidualRMSNorm( - __nv_bfloat16*, __nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); -template void invokeMask(__nv_bfloat16* output, const int* mask, int batch_size, int dim, cudaStream_t stream); -#endif -} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_decoder_kernels.h b/src/turbomind/models/llama/llama_decoder_kernels.h deleted file mode 100644 index 9d4dc51fe7..0000000000 --- a/src/turbomind/models/llama/llama_decoder_kernels.h +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) OpenMMLab. All rights reserved. - -#include - -namespace turbomind { - -template -void invokeFusedAddBiasResidualRMSNorm( - T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream); - -template -void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream); - -} // namespace turbomind diff --git a/src/turbomind/models/llama/llama_gemm.cc b/src/turbomind/models/llama/llama_gemm.cc deleted file mode 100644 index f9a0191e4b..0000000000 --- a/src/turbomind/models/llama/llama_gemm.cc +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Copied from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/gpt_gemm.cc - -#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h" -#include "src/turbomind/utils/memory_utils.h" - -namespace ft = turbomind; - -int main(int argc, char* argv[]) -{ - if (argc < 9 || argc > 11) { - TM_LOG_ERROR("./bin/llama_gemm batch_size \\ \n" - " beam_width \\ \n" - " max_input_len \\ \n" - " head_number \\ \n" - " size_per_head \\ \n" - " inter_size \\ \n" - " vocab_size \\ \n" - " data_type \\ \n" - " tensor_para_size \\\n" - " is_append (append new config into exist gemm_config.ini or not)"); - TM_LOG_ERROR("e.g. 
./bin/llama_gemm 8 4 32 96 128 49152 51200 1 8 1"); - return 0; - } - - const int batch_size = atoi(argv[1]); - const int beam_width = atoi(argv[2]); - const int max_input_len = atoi(argv[3]); - const int head_num = atoi(argv[4]); - const int size_per_head = atoi(argv[5]); - const int inter_size = atoi(argv[6]); - const int vocab_size = atoi(argv[7]); - const ft::CublasDataType data_type = static_cast(atoi(argv[8])); // 0 FP32, 1 FP16, 2 BF 16 - const int tensor_para_size = argc < 10 ? 1 : atoi(argv[9]); - const bool is_append = argc < 11 ? false : (bool)(atoi(argv[10])); - - TM_LOG_INFO("Arguments:"); - TM_LOG_INFO(" batch_size: %d", batch_size); - TM_LOG_INFO(" beam_width: %d", beam_width); - TM_LOG_INFO(" max_input_len: %d", max_input_len); - TM_LOG_INFO(" head_num: %d", head_num); - TM_LOG_INFO(" size_per_head: %d", size_per_head); - TM_LOG_INFO(" inter_size: %d", inter_size); - TM_LOG_INFO(" vocab_size: %d", vocab_size); - TM_LOG_INFO(" data_type: %d", data_type); - TM_LOG_INFO(" tensor_para_size: %d", tensor_para_size); - TM_LOG_INFO(" is_append: %d", (int)is_append); - std::cout << std::endl; - - void* gemm_test_buf; - size_t buf_size_in_byte = ft::calGptGemmTestBufSizeInByte(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - data_type); - size_t total, free; - ft::check_cuda_error(cudaMemGetInfo(&free, &total)); - if (free < buf_size_in_byte + 10 * 1024 * 1024) { - printf("[ERROR] There is no enough device memory for gemm test!\n" - " %ld Bytes is needed, but only %ld Bytes is free.\n", - buf_size_in_byte, - free); - gemm_test_buf = NULL; - return -1; - } - else { - ft::deviceMalloc(reinterpret_cast(&gemm_test_buf), buf_size_in_byte, nullptr, false); - } - - if (0) {} -#ifdef ENABLE_FP32 - else if (data_type == ft::FLOAT_DATATYPE) { - ft::generate_gpt_gemm_config(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - is_append); - } -#endif - else if (data_type == ft::HALF_DATATYPE) { - ft::generate_gpt_gemm_config(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - is_append); - } -#ifdef ENABLE_BF16 - else if (data_type == ft::BFLOAT16_DATATYPE) { - ft::generate_gpt_gemm_config<__nv_bfloat16>(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - is_append); - } -#endif -#ifdef ENABLE_FP8 - else if (data_type == ft::FP8_DATATYPE) { - ft::generate_gpt_gemm_config<__nv_fp8_e4m3>(batch_size, - beam_width, - max_input_len, - head_num, - size_per_head, - inter_size, - vocab_size, - tensor_para_size, - gemm_test_buf, - false); - } -#endif - else { - printf("[ERROR] data type only supports fp32(0), fp16(1), bf16(2), fp8(4). \n"); - return -1; - } - - ft::check_cuda_error(cudaFree(gemm_test_buf)); - return 0; -} diff --git a/src/turbomind/models/llama/llama_kernels.cu b/src/turbomind/models/llama/llama_kernels.cu index 879a39d409..4a49460439 100644 --- a/src/turbomind/models/llama/llama_kernels.cu +++ b/src/turbomind/models/llama/llama_kernels.cu @@ -1,242 +1,20 @@ // Copyright (c) OpenMMLab. All rights reserved. 
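Note: the fusedAddBiasResidualNorm kernel deleted above and the rootMeanSquareNorm kernels removed in the hunk below implement the same RMS-norm semantics (r' = r + x + bias, then x' = r' / rms(r') * scale); they are presumably superseded by the kernels under src/turbomind/kernels/norm/ that the new code includes elsewhere in this diff. A reference-only scalar CUDA sketch of that computation (not part of this change; the kernel name and launch shape are hypothetical):

// One block per token row of length n; r is the residual, x the hidden state, b an optional bias.
template<typename T>
__global__ void fused_add_rmsnorm_ref(T* r, T* x, const T* b, const T* scale, float eps, int n)
{
    r += (size_t)blockIdx.x * n;
    x += (size_t)blockIdx.x * n;

    __shared__ float ssum;
    if (threadIdx.x == 0) ssum = 0.f;
    __syncthreads();

    float local = 0.f;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        float v = (float)r[i] + (float)x[i] + (b ? (float)b[i] : 0.f);  // r' = r + x + bias
        r[i] = (T)v;
        local += v * v;
    }
    atomicAdd(&ssum, local);  // naive block reduction; the deleted kernel uses cooperative groups instead
    __syncthreads();

    const float inv_rms = rsqrtf(ssum / n + eps);
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        x[i] = (T)((float)r[i] * inv_rms * (float)scale[i]);            // x' = norm(r') * scale
    }
}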
-#include "src/turbomind/kernels/core/array_ops.h" -#include "src/turbomind/kernels/reduce_kernel_utils.cuh" -#include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/llama_kernels.h" -#include "src/turbomind/models/llama/llama_utils.h" -#include "src/turbomind/utils/cuda_type_utils.cuh" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/dispatch.h" -#include "src/turbomind/utils/logger.h" #include #include -#include +#include #include #include -namespace turbomind { - -// fp16, bf16 -// n is divided by 2 for this impl -template -__global__ void rootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n) -{ - using T2 = typename TypeConverter::Type; - __shared__ float s_inv_mean; - float mean = 0.f; - - T2* out_ptr = (T2*)out; - const T2* input_ptr = (const T2*)input; - const T2* scale_ptr = (const T2*)scale; - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float2 tmp2 = cuda_cast(input_ptr[blockIdx.x * n + idx]); - mean += tmp2.x * tmp2.x; - mean += tmp2.y * tmp2.y; - } - - mean = blockReduceSum(mean); - if (threadIdx.x == 0) { - s_inv_mean = rsqrt(.5f * mean / (float)n + eps); - } - __syncthreads(); - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float2 tmp2 = cuda_cast(input_ptr[blockIdx.x * n + idx]); - float2 sca2 = cuda_cast(scale_ptr[idx]); - tmp2.x = tmp2.x * s_inv_mean * sca2.x; - tmp2.y = tmp2.y * s_inv_mean * sca2.y; - out_ptr[blockIdx.x * n + idx] = cuda_cast(tmp2); - } -} - -template<> -__global__ void rootMeanSquareNorm(float* out, const float* input, const float* scale, float eps, int m, int n) -{ - __shared__ float s_inv_mean; - float mean = 0.f; - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float tmp = input[blockIdx.x * n + idx]; - mean += tmp * tmp; - } - - mean = blockReduceSum(mean); - if (threadIdx.x == 0) { - s_inv_mean = rsqrt(mean / static_cast(n) + eps); - } - __syncthreads(); - - for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) { - float tmp = input[blockIdx.x * n + idx]; - out[blockIdx.x * n + idx] = tmp * s_inv_mean * scale[idx]; - } -} - -template -void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream) -{ - if (sizeof(T) == 2) { - FT_CHECK(n % 2 == 0); - n /= 2; - } - dim3 grid(m); - dim3 block(std::min(n, 1024)); - rootMeanSquareNorm<<>>(out, input, scale, eps, m, n); -} - -template void invokeRootMeanSquareNorm(float*, const float*, const float*, float, int, int, cudaStream_t); -template void invokeRootMeanSquareNorm(half*, const half*, const half*, float, int, int, cudaStream_t); -#ifdef ENABLE_BF16 -template void -invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); -#endif - -// #ifdef ENABLE_BF16 - -// template void invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t); - -// #endif - -template -__device__ T saturate_cast(T0 x) -{ - return x; -} - -template<> -__device__ half saturate_cast(float x) -{ - return (x > 64512.f || x < -64512.f) ? (x > 0.f ? 
64512.f : -64512.f) : x; -} - -template -__global__ void addResidual(T* out, const T* in, size_t n) -{ - auto idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x; - if (idx < n) { - out[idx] = static_cast(static_cast(out[idx]) + static_cast(in[idx])); - } -} - -template -void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream) -{ - auto total = static_cast(m) * n; - dim3 block(std::min((unsigned long)total, 1024UL)); - dim3 grid((total + block.x - 1) / block.x); - - addResidual<<>>(out, in, total); -} - -template void invokeAddResidual(float*, const float*, int, int, cudaStream_t); -template void invokeAddResidual(half*, const half*, int, int, cudaStream_t); - -// ids [seq_len, batch_size] -// input_ids [batch_size, max_input_len] -__global__ void -fixInputIds(int* ids, const int* input_ids, const int* input_lengths, int batch_size, int seq_len, int max_input_len) -{ - int seq_id = threadIdx.x; - int batch_id = blockIdx.x; - for (; seq_id < input_lengths[batch_id]; seq_id += blockDim.x) { - ids[seq_id * batch_size + batch_id] = input_ids[batch_id * max_input_len + seq_id]; - } -} - -void invokeFixInputIds(int* ids, - const int* input_ids, - const int* input_lengths, - int batch_size, - int seq_len, - int max_input_len, - cudaStream_t st) -{ - dim3 block(std::min(1024, max_input_len)); - dim3 grid(batch_size); - fixInputIds<<>>(ids, input_ids, input_lengths, batch_size, seq_len, max_input_len); -} - -template -__global__ void sliceCausalMask(T* mask, int seq_len, int key_len, int step) -{ - mask += (size_t)blockIdx.x * seq_len * key_len; - for (int i = threadIdx.x; i < seq_len * key_len; i += blockDim.x) { - int row = i / key_len; - int col = i % key_len; - if (col <= row + step) { - mask[i] = static_cast(1.f); - } - else { - mask[i] = static_cast(0.f); - } - } -} - -// [step: step+Q, :] of the K*K causal mask -template -void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream) -{ - FT_CHECK(step == key_len - seq_len); - sliceCausalMask<<>>(mask, seq_len, key_len, step); -} - -template void invokeSliceCausalMask(half*, int, int, int, int, cudaStream_t); -template void invokeSliceCausalMask(float*, int, int, int, int, cudaStream_t); - -// mask [bsz, max_q_len, max_k_len] - -template -__global__ void createCausalMasks(T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len) -{ - const auto q_len = q_lens ? q_lens[blockIdx.x] : max_q_len; - const auto k_len = k_lens ? 
k_lens[blockIdx.x] : max_k_len; - mask += blockIdx.x * max_q_len * max_k_len; - for (int i = threadIdx.x; i < max_q_len * max_k_len; i += blockDim.x) { - const int q = i / max_k_len; // [0, max_q_len) - const int k = i % max_k_len; // [0, max_k_len) - bool is_valid = q < q_len && k < k_len && k <= q + (k_len - q_len); - mask[i] = static_cast(is_valid); - } -} - -template -void invokeCreateCausalMasks( - T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream) -{ - createCausalMasks<<>>(mask, q_lens, k_lens, max_q_len, max_k_len); -} - -template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t); -template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t); -#ifdef ENABLE_BF16 -template<> -__global__ void createCausalMasks<__nv_bfloat16>( - __nv_bfloat16* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len) -{ - const auto q_len = q_lens[blockIdx.x]; - const auto k_len = k_lens[blockIdx.x]; - mask += blockIdx.x * max_q_len * max_k_len; - for (int i = threadIdx.x; i < max_q_len * max_k_len; i += blockDim.x) { - const int q = i / max_k_len; // [0, max_q_len) - const int k = i % max_k_len; // [0, max_k_len) - bool is_valid = q < q_len && k < k_len && k <= q + (k_len - q_len); - mask[i] = static_cast<__nv_bfloat16>(float(is_valid)); - } -} -template void invokeCreateCausalMasks(__nv_bfloat16* mask, const int*, const int*, int, int, int, cudaStream_t); -#endif - -namespace { +#include -template -__global__ void KernelWrapper(Params params) -{ - Kernel{}(params); -}; +#include "src/turbomind/kernels/core/array.h" +#include "src/turbomind/macro.h" +#include "src/turbomind/models/llama/llama_kernels.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/dispatch.h" -} // namespace +namespace turbomind { __global__ void gatherOutput(int* output_ids, const int* ids, @@ -477,19 +255,12 @@ __global__ void getFeatureOfLastToken(T* output, const T* input, const int* cu_s } } -template void invokeGetFeatureOfLastToken( - T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream) + uint16_t* output, const uint16_t* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream) { getFeatureOfLastToken<<>>(output, input, cu_seqlens, dims); } -template void invokeGetFeatureOfLastToken(half*, const half*, const int*, int, int, cudaStream_t); -template void invokeGetFeatureOfLastToken(float*, const float*, const int*, int, int, cudaStream_t); -#ifdef ENABLE_BF16 -template void invokeGetFeatureOfLastToken(__nv_bfloat16*, const __nv_bfloat16*, const int*, int, int, cudaStream_t); -#endif // ENABLE_BF16 - template struct BatchedCopyParam { Array src_ptr; @@ -560,4 +331,29 @@ void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cud }); } +template +__global__ void maskOutput(T* output, const int* mask, int dim) +{ + int batch_idx = blockIdx.x; + output += dim * batch_idx; + int masked = mask[batch_idx]; + for (int i = threadIdx.x; i < dim; i += blockDim.x) { + output[i] = (masked) ? 
output[i] : T(); + } +} + +template +void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream) +{ + maskOutput<<>>(output, mask, dim); +} + +#ifdef ENABLE_FP32 +template void invokeMask(float* output, const int* mask, int batch_size, int dim, cudaStream_t stream); +#endif +template void invokeMask(half* output, const int* mask, int batch_size, int dim, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeMask(__nv_bfloat16* output, const int* mask, int batch_size, int dim, cudaStream_t stream); +#endif + } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_kernels.h b/src/turbomind/models/llama/llama_kernels.h index aaade1a513..82dbeb13e8 100644 --- a/src/turbomind/models/llama/llama_kernels.h +++ b/src/turbomind/models/llama/llama_kernels.h @@ -2,72 +2,11 @@ #pragma once -#include "src/turbomind/kernels/gpt_kernels.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include +#include +#include namespace turbomind { -template -void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream); - -template -void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream); - -void invokeFixInputIds(int* ids, - const int* input_ids, - const int* input_lengths, - int batch_size, - int seq_len, - int max_input_len, - cudaStream_t st); - -template -void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream); - -template -void invokeCreateCausalMasks( - T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream); - -template -void invokeExtendKVCache(void** k_dst_ptrs, - void** v_dst_ptrs, - const T* k_src, - const T* v_src, - const int* cu_block_counts, - const int* query_length, - const int* context_length, - int batch_size, - int block_length, - size_t dst_layer_offset, - int max_q_len, - int head_dim, - int head_num, - int quant, - const float* kv_scale, - cudaStream_t stream); - -template -void invokeTransposeKVCache(T* key_cache_trans, - T* val_cache_trans, - const T** key_cache, - const T** val_cache, - size_t layer_offset, - int batch_size, - const int* key_length, - int max_kv_len, - int max_seq_len, - int size_per_head, - int head_num, - int head_n_rep, - cudaStream_t stream, - int quant_policy, - const float* kv_scale); - void invokeGatherOutput(int* output_ids, const int* ids, const int* context_length, @@ -115,56 +54,10 @@ void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cud void invokePadLastTokenIds( int* token_ids, const int* context_length, int max_context_len, int batch_size, cudaStream_t stream); -template void invokeGetFeatureOfLastToken( - T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream); - -void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st); + uint16_t* output, const uint16_t* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream); template -inline void dump(const T* x, int size, cudaStream_t st, const char* msg, bool full = false) -{ - std::vector h_x(size); - cudaMemcpyAsync(h_x.data(), x, sizeof(T) * size, cudaMemcpyDefault, st); - cudaStreamSynchronize(st); - fprintf(stderr, "\n%s:\n", msg); - std::vector h_y(h_x.begin(), h_x.end()); - float asum = 0.f; - for (const auto& x : h_y) { - asum += std::fabs(x); - } - if (full) { - for (int i = 0; i < 
size; ++i) { - printf("%d %.8f\n", i, h_y[i]); - } - } - else { - for (int i = 0; i < 8; ++i) { - fprintf(stderr, "%.8f\n", h_y[i]); - } - for (int i = size - 8; i < size; ++i) { - fprintf(stderr, "%.8f\n", h_y[i]); - } - } - fprintf(stderr, "\nasum = %f\n", asum); - // getchar(); -} - -template -struct TempBuffer { - TempBuffer(size_t size) - { - cudaMalloc(&data, size); - } - T* data; -}; - -inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_t st) -{ - int h_seq_len = -1; - cudaMemcpyAsync(&h_seq_len, d_seq_len, sizeof(int), cudaMemcpyDefault, st); - cudaStreamSynchronize(st); - TM_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len); -} +void invokeMask(T* output, const int* mask, int batch_size, int dim, cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/models/llama/llama_params.h b/src/turbomind/models/llama/llama_params.h index a5dd8bcb49..4b88e10fb8 100644 --- a/src/turbomind/models/llama/llama_params.h +++ b/src/turbomind/models/llama/llama_params.h @@ -7,34 +7,34 @@ #include #include +#include "src/turbomind/core/data_type.h" #include "src/turbomind/models/llama/llama_rope.h" -#include "src/turbomind/models/llama/weight_type.h" namespace turbomind { struct MLAParam { - size_t q_lora_rank; - size_t kv_lora_rank; - size_t qk_rope_dim; - size_t v_head_dim; + int q_lora_rank; + int kv_lora_rank; + int qk_rope_dim; + int v_head_dim; }; struct ModelParam { - size_t head_num; - size_t head_dim; - size_t kv_head_num; - size_t hidden_units; - size_t layer_num; - size_t vocab_size; - size_t embedding_size; - float norm_eps; - int quant_policy; - bool attn_bias; - WeightType weight_type; - int group_size; - MLAParam mla; - bool qk_norm; - int tune_layer_num; + size_t head_num; + size_t head_dim; + size_t kv_head_num; + size_t hidden_units; + size_t layer_num; + size_t vocab_size; + size_t embedding_size; + float norm_eps; + int quant_policy; + bool attn_bias; + DataType weight_type; + int group_size; + MLAParam mla; + bool qk_norm; + int tune_layer_num; std::vector inter_size; }; @@ -81,7 +81,7 @@ struct EngineParam { bool enable_prefix_caching; // chunking params - int max_prefill_token_num; + int max_forward_token_num; int max_context_token_num; int num_tokens_per_iter; int max_prefill_iters; diff --git a/src/turbomind/models/llama/llama_utils.cu b/src/turbomind/models/llama/llama_utils.cu index eaa450ae20..e4220a8e47 100644 --- a/src/turbomind/models/llama/llama_utils.cu +++ b/src/turbomind/models/llama/llama_utils.cu @@ -70,7 +70,8 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) Tacc asum{}; Tacc rsum{}; - Tacc amean{}; + Tacc amean_r{}; + Tacc amean_x{}; for (size_t i = 0; i < size; ++i) { Tacc x = (Tacc)h_b[i]; Tacc r = (Tacc)h_a[i]; @@ -78,10 +79,18 @@ void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream) Tacc rel_diff = abs_diff / std::max(std::max(std::abs(r), std::abs(x)), eps); asum += abs_diff; rsum += rel_diff; - amean += std::abs(r); + amean_x += std::abs(x); + amean_r += std::abs(r); } - std::cerr << key << ": " << amean / size << " " << asum << " " << asum / size << " " << rsum / size << "\n"; + fprintf(stderr, + "%12s%12f%12f%12f%12f%12f\n", + key.c_str(), + (float)amean_x / (float)size, + (float)amean_r / (float)size, + (float)asum, + (float)asum / (float)size, + (float)rsum / (float)size); check_cuda_error(cudaMemcpyAsync(ptr, h_a.data(), sizeof(T) * h_a.size(), cudaMemcpyDefault, stream)); 
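For readability: the reworked CmpRead above prints one fixed-width row per tensor with five unlabeled columns, which are (in order) the mean absolute value of each of the two tensors being compared (x and r in the loop), the total absolute difference, the mean absolute difference, and the mean relative difference. A host-side equivalent of the accumulation, for illustration only and not part of the change:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Computes the same five statistics that CmpRead prints, on two host vectors.
void cmp_stats(const char* key, const std::vector<float>& x, const std::vector<float>& r, float eps = 1e-8f)
{
    double amean_x = 0, amean_r = 0, asum = 0, rsum = 0;
    for (size_t i = 0; i < x.size(); ++i) {
        const float abs_diff = std::abs(x[i] - r[i]);
        const float rel_diff = abs_diff / std::max(std::max(std::abs(r[i]), std::abs(x[i])), eps);
        amean_x += std::abs(x[i]);
        amean_r += std::abs(r[i]);
        asum += abs_diff;
        rsum += rel_diff;
    }
    const double n = (double)x.size();
    std::fprintf(stderr, "%12s%12f%12f%12f%12f%12f\n", key, amean_x / n, amean_r / n, asum, asum / n, rsum / n);
}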
check_cuda_error(cudaStreamSynchronize(stream)); @@ -124,19 +133,6 @@ template void Compare(__nv_bfloat16* ptr, size_t size, std::string key, CmpMode template void CheckNan(const float* ptr, size_t size, std::string key, cudaStream_t stream); template void CheckNan(const half* ptr, size_t size, std::string key, cudaStream_t stream); -std::string format(const std::pair& p) -{ - std::stringstream ss; - ss << p.first << " ["; - bool first = true; - for (const auto& x : p.second.shape) { - ss << (first ? "" : ", ") << x; - first = false; - } - ss << "]"; - return ss.str(); -} - size_t curandStateGetSize() { return sizeof(curandState_t); diff --git a/src/turbomind/models/llama/llama_utils.h b/src/turbomind/models/llama/llama_utils.h index e50364bbd1..193bbfb87c 100644 --- a/src/turbomind/models/llama/llama_utils.h +++ b/src/turbomind/models/llama/llama_utils.h @@ -1,7 +1,6 @@ // Copyright (c) OpenMMLab. All rights reserved. #pragma once -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/nvtx_utils.h" #include #include @@ -62,8 +61,6 @@ std::string Concat(std::string key, Args&&... args) return key; } -std::string format(const std::pair& p); - size_t curandStateGetSize(); bool isDebug(); diff --git a/src/turbomind/models/llama/mla_utils.cu b/src/turbomind/models/llama/mla_utils.cu index 2f9e786f2a..74478401e2 100644 --- a/src/turbomind/models/llama/mla_utils.cu +++ b/src/turbomind/models/llama/mla_utils.cu @@ -1,5 +1,10 @@ // Copyright (c) OpenMMLab. All rights reserved. + +#include + +#include "src/turbomind/core/check.h" #include "src/turbomind/kernels/core/array_ops.h" +#include "src/turbomind/utils/cuda_utils.h" namespace turbomind { @@ -78,16 +83,37 @@ void invokeMLACopyQKV(T* qkv, qkv, q, kv_a, kv_b, head_num, head_dim, nope_dim, rope_dim, kv_lora_rank, v_head_dim); } -template void invokeMLACopyQKV(uint16_t* qkv, - const uint16_t* q, - const uint16_t* kv_a, - const uint16_t* kv_b, - int token_num, - int head_num, - int nope_dim, - int rope_dim, - int kv_lora_rank, - int v_head_dim, - cudaStream_t stream); +void MLACopyQKV(DataType dtype, + void* qkv, + const void* q, + const void* kv_a, + const void* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream) +{ + auto invoke = [&](auto t) { + using T = decltype(t); + invokeMLACopyQKV((T*)qkv, + (const T*)q, + (const T*)kv_a, + (const T*)kv_b, + token_num, + head_num, + nope_dim, + rope_dim, + kv_lora_rank, + v_head_dim, + stream); + }; + + TM_CHECK_EQ(byte_size(dtype, 1), 2) << "unsupported data type: " << dtype; + + return invoke(uint16_t{}); +} } // namespace turbomind diff --git a/src/turbomind/models/llama/mla_utils.h b/src/turbomind/models/llama/mla_utils.h index bc06a352f9..255318306f 100644 --- a/src/turbomind/models/llama/mla_utils.h +++ b/src/turbomind/models/llama/mla_utils.h @@ -1,57 +1,23 @@ // Copyright (c) OpenMMLab. All rights reserved. 
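The rewritten mla_utils above replaces the templated dispatch helper with a single type-erased entry point that takes a runtime DataType and internally reinterprets any 2-byte element type (fp16/bf16) as uint16_t. A call-site sketch, mirroring the usage that appears in unified_attention_layer.cc later in this diff (variable names and comments are illustrative, following the dimension arithmetic in forward_mla):

// qkv: [token_num, q_kv_head_num, head_dim] output buffer, as allocated in forward_mla
MLACopyQKV(dtype,            // must be a 2-byte dtype; enforced by TM_CHECK_EQ(byte_size(dtype, 1), 2)
           qkv.raw_data(),
           q.raw_data(),     // output of q_proj / q_b_proj
           kv_a.raw_data(),  // normalized latent KV from kv_a_proj (plus the rope part)
           kv_b.raw_data(),  // output of kv_b_proj
           token_num,
           head_num,
           nope_dim,
           rope_dim,
           kv_lora_rank,
           v_head_dim,
           stream);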
#pragma once -#include #include -#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/core/data_type.h" namespace turbomind { -template -void invokeMLACopyQKV(T* qkv, - const T* q, - const T* kv_a, - const T* kv_b, - int token_num, - int head_num, - int nope_dim, - int rope_dim, - int kv_lora_rank, - int v_head_dim, - cudaStream_t stream); - -template -void dispatchMLACopyQKV(T* qkv, - const T* q, - const T* kv_a, - const T* kv_b, - int token_num, - int head_num, - int nope_dim, - int rope_dim, - int kv_lora_rank, - int v_head_dim, - cudaStream_t stream) -{ - auto invoke = [&](auto x) { - using type = decltype(x); - invokeMLACopyQKV((type*)qkv, - (const type*)q, - (const type*)kv_a, - (const type*)kv_b, - token_num, - head_num, - nope_dim, - rope_dim, - kv_lora_rank, - v_head_dim, - stream); - }; - if constexpr (sizeof(T) == 2) { - return invoke(uint16_t{}); - } - FT_CHECK(0); -} +void MLACopyQKV(DataType dtype, + void* qkv, + const void* q, + const void* kv_a, + const void* kv_b, + int token_num, + int head_num, + int nope_dim, + int rope_dim, + int kv_lora_rank, + int v_head_dim, + cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.cc b/src/turbomind/models/llama/moe_ffn_layer.cc index ab5d42bd7b..5b84da56e7 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.cc +++ b/src/turbomind/models/llama/moe_ffn_layer.cc @@ -1,131 +1,94 @@ // Copyright (c) OpenMMLab. All rights reserved. -#include "src/turbomind/models/llama/moe_ffn_layer.h" +#include + #include "src/turbomind/kernels/activation_kernels.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/monotonic.h" -#include "src/turbomind/utils/nvtx_utils.h" -#include "src/turbomind/utils/string_utils.h" -#include -#include namespace turbomind { -template -void MoeFfnLayer::AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor) +MoeFfnLayer::MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx): + inter_size_(param.inter_size / engine.mlp_tp_size), + hidden_dim_(model.hidden_units), + param_(param), + stream_(ctx.stream), + linear_(*ctx.linear) { - char* base = 0; - - auto allocate = [&](void* base) { - Monotonic alloc{base}; - alloc(&inout_buf_, tokens * param_.experts_per_token * hidden_dim_); - alloc(&inter_buf_, tokens * param_.experts_per_token * inter_size_ * inter_buf_factor); - alloc(&logits_, tokens * expert_num); - alloc(&masks_, expert_num * padded); - alloc(&f2n_, param_.experts_per_token * tokens); - alloc(&en2f_, param_.experts_per_token * tokens); - alloc(&scales_, param_.experts_per_token * tokens); - alloc(&shared_scales_, tokens); - return (char*)alloc.ptr() - (char*)base; - }; - - const auto workspace_size = allocate(0); - - workspace_ = (char*)allocator_->reMalloc(workspace_, workspace_size); - - allocate(workspace_); -} + TM_CHECK(!param.expert_num.empty()); -template -void MoeFfnLayer::FreeBuffer() -{ - allocator_->free((void**)&workspace_); + const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); + + if (param_.method == MoeParam::kFused) { + context_ = + std::make_unique(max_expert_num, param.experts_per_token, ctx.device_prop, stream_); + 
} + else { + expert_ffn_ = std::make_unique(model, ctx); + } - allocator_->free((void**)&accum_); - allocator_->free((void**)&offsets_); + h_offsets_ = {max_expert_num + 1, kCPUpinned}; - allocator_->free((void**)&h_offsets_, true); + const int max_token_num = engine.max_forward_token_num; + const int pad_token_num = (max_token_num + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; + + masks_ = {max_expert_num * pad_token_num, kDEVICE}; + f2n_ = {param_.experts_per_token * max_token_num, kDEVICE}; + en2f_ = {param_.experts_per_token * max_token_num, kDEVICE}; + scales_ = {param_.experts_per_token * max_token_num, kDEVICE}; + offsets_ = {max_expert_num + 1, kDEVICE}; + accum_ = {max_expert_num * kMoeGateMaxTiles, kDEVICE}; + + shared_scales_ = {max_token_num, kDEVICE}; } -template -void MoeFfnLayer::gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight) +Tensor_ MoeFfnLayer::Gate(const Tensor& input, const LlamaDenseWeight& gate) { - const float alpha = 1.f; - const float beta = 0.f; - cublas_->Gemm(CUBLAS_OP_N, - CUBLAS_OP_N, - weight.output_dims, - tokens, - weight.input_dims, - &alpha, - weight.kernel, - getCudaDataType(), - weight.output_dims, - input, - getCudaDataType(), - hidden_dim_, - &beta, - logits, - CUDA_R_32F, - weight.output_dims, - CUDA_R_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP); + auto& weight = gate.weight; + TM_CHECK_EQ(input.shape(1), weight.shape(0)); + Tensor_ logits{{input.shape(0), weight.shape(1)}, kDEVICE}; + linear_.forward(input, gate, LlamaLinear::kGemm, logits); + sync_check_cuda_error(); + return logits; } -template -void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::Forward(ForwardParam& p) { + const int tokens = p.input.shape(0); + const auto& moe = *p.weights; + const size_t padded = (tokens + kMoeGateVecSize - 1) / kMoeGateVecSize * kMoeGateVecSize; const int expert_num = moe.experts.size(); FT_CHECK(expert_num); - const size_t inter_buf_factor = [&] { - if (param_.method == MoeParam::kNaive) { - return 0; // managed by ffn - } - else if (moe.block.is_fused_silu) { - return 1; - } - else { - return 2; - } - }(); - - AllocateBuffer(tokens, padded, expert_num, inter_buf_factor); - - gate(logits_, input, tokens, moe.gate); - sync_check_cuda_error(); - - // if (tensor_para_.rank_ == 0) { - // Compare(logits_, tokens * expert_num, Concat("logit", layer_id), compare_mode, stream_); - // } + auto logits = Gate(p.input, moe.gate); - check_cuda_error(cudaMemsetAsync(accum_, 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); - check_cuda_error(cudaMemsetAsync(masks_, -1, sizeof(int8_t) * expert_num * padded, stream_)); + check_cuda_error(cudaMemsetAsync(accum_.data(), 0, sizeof(int) * expert_num * kMoeGateMaxTiles, stream_)); + check_cuda_error(cudaMemsetAsync(masks_.data(), -1, sizeof(int8_t) * expert_num * padded, stream_)); // dump_logits(tokens, layer_id); bool softmax = true; if (param_.topk_method == "group_limited_greedy") { invokeMoeSoftmaxMaskTopKGroups( - logits_, tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); + logits.data(), tokens, expert_num, expert_num / param_.n_group, param_.topk_group, stream_); sync_check_cuda_error(); softmax = false; } /// TODO: fix illegal memory access even if NaN are present in logits - invokeMoeGate_V2(f2n_, - en2f_, - offsets_, - scales_, - masks_, - accum_, - logits_, + invokeMoeGate_V2(f2n_.data(), + en2f_.data(), + offsets_.data(), + scales_.data(), + masks_.data(), + 
accum_.data(), + logits.data(), tokens, padded, expert_num, @@ -147,143 +110,87 @@ void MoeFfnLayer::forward(T* output, const T* input, int tokens, int layer_id for (int i = 0; i < expert_num; ++i) { h_offsets_[i + 1] = h_offsets_[i] + cnt[i]; } - check_cuda_error( - cudaMemcpyAsync(offsets_, h_offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); + check_cuda_error(cudaMemcpyAsync( + offsets_.data(), h_offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); } + temp_ = Tensor{{param_.experts_per_token * tokens, hidden_dim_}, p.input.dtype(), p.input.device()}; + if (param_.method == MoeParam::kNaive) { - dispatchMoeGather(inout_buf_, input, f2n_, tokens, param_.experts_per_token, hidden_dim_, stream_); + invokeMoeDispatch(temp_, p.input, f2n_.data(), param_.experts_per_token, stream_); sync_check_cuda_error(); - check_cuda_error( - cudaMemcpyAsync(h_offsets_, offsets_, sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); + check_cuda_error(cudaMemcpyAsync( + h_offsets_.data(), offsets_.data(), sizeof(int) * (expert_num + 1), cudaMemcpyDefault, stream_)); check_cuda_error(cudaStreamSynchronize(stream_)); - if (h_offsets_[expert_num] != tokens * param_.experts_per_token) { - FT_CHECK_WITH_INFO(0, fmtstr("%d vs %d", h_offsets_[expert_num], tokens * param_.experts_per_token)); - } + TM_CHECK_EQ(h_offsets_[expert_num], tokens * param_.experts_per_token); for (int i = 0; i < expert_num; ++i) { - - FT_CHECK(moe.experts[i].is_fused_silu == false); - - if (size_t count = h_offsets_[i + 1] - h_offsets_[i]) { - auto io = inout_buf_ + h_offsets_[i] * hidden_dim_; - - TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {count, hidden_dim_}, io}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}}; - TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {count, hidden_dim_}, io}}}; - - expert_ffn_->forward(&ffn_outputs, &ffn_inputs, &moe.experts[i]); + if (int count = h_offsets_[i + 1] - h_offsets_[i]) { + auto io = temp_.slice({h_offsets_[i], 0}, {count, -1}); + expert_ffn_->forward({io, io, moe.experts.at(i).get(), p.layer_id}); } } } else { - context_->update(expert_num, param_.experts_per_token, offsets_); + context_->update(expert_num, param_.experts_per_token, offsets_.data()); auto& block = moe.block; - linear_->forward_moe(inter_buf_, - {input, (int)hidden_dim_}, - f2n_, - offsets_, - tokens * param_.experts_per_token, - block.fused_gating_intermediate, - block.is_fused_silu ? LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm, - context_.get()); - sync_check_cuda_error(); - auto mode = kCmpRead; + const int inter_dim = block.is_fused_silu ? inter_size_ : inter_size_ * 2; + Tensor inter{{tokens * param_.experts_per_token, inter_dim}, p.input.dtype(), p.input.device()}; - // if (tensor_para_.rank_ == 0) { - // Compare(inter_buf_, // - // tokens * param_.experts_per_token * inter_size_ * 2, - // "inter_buf", - // mode, - // stream_); - // } + linear_.forward_moe(inter, + p.input, + f2n_.data(), + offsets_.data(), + block.fused_gating_intermediate, + block.is_fused_silu ? 
LlamaLinear::kFusedSiluFfn : LlamaLinear::kGemm, + context_.get()); + sync_check_cuda_error(); if (!block.is_fused_silu) { - invokeGenericActivation_v2(inter_buf_, - inter_buf_ + inter_size_, - inter_size_ * 2, - tokens * param_.experts_per_token, - inter_size_, + invokeGenericActivation_v3(inter.slice({0, 0}, {-1, inter_size_}), // + inter.slice({0, inter_size_}, {-1, -1}), stream_); sync_check_cuda_error(); } - linear_->forward_moe(inout_buf_, - {inter_buf_, block.is_fused_silu ? (int)inter_size_ : (int)inter_size_ * 2}, - nullptr, - offsets_, - tokens * param_.experts_per_token, - block.output, - LlamaLinear::kGemm, - context_.get()); + linear_.forward_moe(temp_, + inter.slice({0, 0}, {-1, inter_size_}), + nullptr, + offsets_.data(), + block.output, + LlamaLinear::kGemm, + context_.get()); sync_check_cuda_error(); - auto mode1 = kCmpRead; - - // if (tensor_para_.rank_ == 0) { - // Compare(inter_buf_2_, // - // tokens * param_.experts_per_token * inter_size_, - // "inter_buf_2_", - // mode1, - // stream_); - // Compare(inout_buf_, // - // tokens * param_.experts_per_token * hidden_dim_, - // "inout_buf", - // mode1, - // stream_); - // } - } - - if (moe.shared_gate.kernel) { - gate(shared_scales_, input, tokens, moe.shared_gate); } } -template -void MoeFfnLayer::reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe) +void MoeFfnLayer::Combine(ForwardParam& p) { - invokeMoeReduce(output, - inout_buf_, - scales_, - en2f_, - moe.shared_gate.kernel ? shared_scales_ : nullptr, - tokens, - param_.experts_per_token, - hidden_dim_, - output_scale, - stream_); - sync_check_cuda_error(); -} + auto& moe = *p.weights; -template -void MoeFfnLayer::dump_logits(int token_num, int layer_id, int expert_num) -{ - std::vector logits(token_num * expert_num); - check_cuda_error( - cudaMemcpyAsync(logits.data(), logits_, sizeof(float) * logits.size(), cudaMemcpyDefault, stream_)); - check_cuda_error(cudaStreamSynchronize(stream_)); - - auto ptr = logits.data(); - std::cout << "layer_id: " << layer_id << std::endl; - for (int i = 0; i < token_num; ++i) { - for (int e = 0; e < expert_num; ++e) { - std::cout << *ptr++ << " "; - } - std::cout << std::endl; + Tensor_ shared_scales; + + if (moe.shared_gate.weight) { + shared_scales = Gate(p.input, moe.shared_gate); } -} -#ifdef ENABLE_FP32 -template class MoeFfnLayer; -#endif -template class MoeFfnLayer; -#ifdef ENABLE_BF16 -template class MoeFfnLayer<__nv_bfloat16>; -#endif + invokeMoeCombine(p.output, + temp_, + scales_.data(), + en2f_.data(), + shared_scales.data_or((float*)nullptr), + param_.experts_per_token, + p.scale, + stream_); + sync_check_cuda_error(); + + temp_ = {}; +} } // namespace turbomind diff --git a/src/turbomind/models/llama/moe_ffn_layer.h b/src/turbomind/models/llama/moe_ffn_layer.h index 67c13609bb..abad2402cf 100644 --- a/src/turbomind/models/llama/moe_ffn_layer.h +++ b/src/turbomind/models/llama/moe_ffn_layer.h @@ -7,89 +7,54 @@ #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaFfnLayer.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/cublasMMWrapper.h" -#include namespace turbomind { -template class MoeFfnLayer { public: - MoeFfnLayer(ModelParam model, const MoeParam& param, size_t tp_size, const Context& ctx): - inter_size_(param.inter_size / tp_size), - hidden_dim_(model.hidden_units), - param_(param), - dtype_(getTensorType()), - stream_(ctx.stream), - cublas_(ctx.cublas_wrapper.get()), - linear_(ctx.linear.get()), - 
allocator_(ctx.allocator.get()) - { - FT_CHECK(!param.expert_num.empty()); - const int max_expert_num = *std::max_element(param.expert_num.begin(), param.expert_num.end()); + MoeFfnLayer(const ModelParam& model, const MoeParam& param, const EngineParam& engine, const Context& ctx); - if (param_.method == MoeParam::kFused) { - context_ = std::make_unique( - max_expert_num, param.experts_per_token, ctx.cuda_device_prop, stream_); - } - else { - expert_ffn_ = std::make_unique>(model, ctx); - } + struct ForwardParam { + Tensor input; + Tensor output; + const MoeFfnWeight* weights; + float scale; + int layer_id; + }; - h_offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1), false, true); + void Forward(ForwardParam& p); - offsets_ = (int*)allocator_->malloc(sizeof(int) * (max_expert_num + 1)); - accum_ = (int*)allocator_->malloc(sizeof(int) * max_expert_num * kMoeGateMaxTiles); - } - - void AllocateBuffer(size_t tokens, size_t padded, size_t expert_num, size_t inter_buf_factor); - - void FreeBuffer(); - - ~MoeFfnLayer() - { - FreeBuffer(); - } - - void forward(T* output, const T* input, int tokens, int layer_id, const MoeFfnWeight& moe); - - void reduce(T* output, int tokens, float output_scale, int layer_id, const MoeFfnWeight& moe); - - void gate(float* logits, const T* input, int tokens, const LlamaDenseWeight& weight); - - void dump_logits(int token_num, int layer_id, int expert_num); + void Combine(ForwardParam& p); private: - const size_t inter_size_; - const size_t hidden_dim_; - const MoeParam param_; - const DataType dtype_; - cudaStream_t const stream_; - cublasMMWrapper* const cublas_; - LlamaLinear* const linear_; - IAllocator* const allocator_; - - std::unique_ptr> expert_ffn_; - std::unique_ptr context_; + Tensor_ Gate(const Tensor& input, const LlamaDenseWeight& gate); - int* h_offsets_{}; + void dump_logits(int token_num, int layer_id, int expert_num); - char* workspace_{}; + const int inter_size_; + const int hidden_dim_; + const MoeParam param_; - T* inout_buf_{}; // [n * e, hidden_dim] - T* inter_buf_{}; // [n * e, inter_size] + cudaStream_t const stream_; + LlamaLinear& linear_; - float* logits_{}; - int* masks_{}; + std::unique_ptr expert_ffn_; + std::unique_ptr context_; - int* f2n_{}; - int* en2f_{}; - float* scales_{}; + /////////////////////////////////////////////////////// + /// runtime states + Buffer_ h_offsets_; - float* shared_scales_{}; + Buffer_ masks_; + Buffer_ f2n_; + Buffer_ en2f_; + Buffer_ scales_; + Buffer_ shared_scales_; + Buffer_ accum_; + Buffer_ offsets_; - int* accum_{}; - int* offsets_{}; + Tensor temp_; + /////////////////////////////////////////////////////// }; } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc index 30efbdedf2..692a68997b 100644 --- a/src/turbomind/models/llama/unified_attention_layer.cc +++ b/src/turbomind/models/llama/unified_attention_layer.cc @@ -21,27 +21,49 @@ #include #include +#include + +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/core/tensor.h" #include "src/turbomind/kernels/attention/attention.h" #include "src/turbomind/kernels/attention/decoding.h" #include "src/turbomind/kernels/attention/kv_cache_utils_v2.h" #include "src/turbomind/kernels/norm/rms_norm.h" + #include "src/turbomind/macro.h" -#include "src/turbomind/models/llama/llama_kernels.h" + #include "src/turbomind/models/llama/llama_utils.h" #include 
"src/turbomind/models/llama/mla_utils.h" #include "src/turbomind/models/llama/unified_attention_layer.h" -#include "src/turbomind/utils/Tensor.h" + #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" namespace turbomind { -template -UnifiedAttentionLayer::UnifiedAttentionLayer( - const ModelParam& model, const AttentionParam& attn, const LoraParam& lora, size_t tp_size, const Context& ctx): +UnifiedAttentionLayer::~UnifiedAttentionLayer() +{ + for (auto& s : streams_) { + s = {}; + } + + check_cuda_error(cudaEventDestroy(aux_event_)); + check_cuda_error(cudaEventDestroy(qkv_event_)); + check_cuda_error(cudaStreamDestroy(aux_stream_)); + + aux_event_ = qkv_event_ = {}; + aux_stream_ = {}; +} + +UnifiedAttentionLayer::UnifiedAttentionLayer(const ModelParam& model, + const AttentionParam& attn, + const EngineParam& engine, + const LoraParam& lora, + int tp_size, + const Context& ctx): head_num_(model.head_num), kv_head_num_(model.kv_head_num), size_per_head_(model.head_dim), @@ -53,11 +75,11 @@ UnifiedAttentionLayer::UnifiedAttentionLayer( lora_param_(lora), context_(ctx), stream_(ctx.stream), - linear_(ctx.linear.get()), - allocator_(ctx.allocator.get()), + linear_(*ctx.linear), arch_(getSMVersion()) { - FT_CHECK(head_num_ % kv_head_num_ == 0); + TM_CHECK_EQ(head_num_ % tp_size, 0) << head_num_ << " " << tp_size; + TM_CHECK_EQ(head_num_ % kv_head_num_, 0) << head_num_ << " " << kv_head_num_; check_cuda_error(cudaStreamCreateWithFlags(&aux_stream_, cudaStreamNonBlocking)); check_cuda_error(cudaEventCreateWithFlags(&qkv_event_, cudaEventDisableTiming)); @@ -68,77 +90,59 @@ UnifiedAttentionLayer::UnifiedAttentionLayer( init_rope_kernel_param(param_.rope, rope_param_); - allocateWorkspace(); + partial_M_ = Tensor_({kMaxWorkspaceTokens, local_head_num_}, kDEVICE); + partial_L_ = Tensor_({kMaxWorkspaceTokens, local_head_num_}, kDEVICE); + partial_O_ = Tensor_({kMaxWorkspaceTokens, local_head_num_, size_per_head_}, kDEVICE); + split_cnt_ = Tensor_({kMaxWorkspaceTokens}, kDEVICE); + barriers_ = Tensor_({kMaxWorkspaceTokens, local_head_num_}, kDEVICE); + + Clear(split_cnt_.buffer()); + Clear(barriers_.buffer()); + + const auto max_batch_size = engine.max_batch_size; + + d_cu_x_len_ = {2 * (max_batch_size + 1), kDEVICE}; + h_cu_x_len_ = {2 * (max_batch_size + 1), kCPUpinned}; + event_ = Event::create(); } -template -void UnifiedAttentionLayer::allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t max_lora_rank) +void UnifiedAttentionLayer::Initialize(TensorMap& args) { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - if (max_lora_rank) { - lora_buf_ = (T*)allocator_->reMalloc(lora_buf_, sizeof(T) * q_count * max_lora_rank); - } + h_q_len_ = args.at("h_q_len").buffer(); + h_k_len_ = args.at("h_k_len").buffer(); - const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; + const int bsz = h_q_len_.size(); - qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * q_count * local_q_kv_head_num * size_per_head_, false); + d_cu_q_len_ = d_cu_x_len_.data(); + h_cu_q_len_ = h_cu_x_len_.data(); + d_cu_k_len_ = d_cu_q_len_ + bsz + 1; + h_cu_k_len_ = h_cu_q_len_ + bsz + 1; - qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * q_count * local_head_num_ * size_per_head_, false); + h_cu_q_len_[0] = h_cu_k_len_[0] = 0; - // Pad the tmp buffer for linear KV cache by `MAX_CTA_S` to avoid illegal accesses - tmp_kv_buf_ = (T*)allocator_->reMalloc( - 
tmp_kv_buf_, sizeof(T) * local_kv_head_num_ * 2 * (k_count + MAX_CTA_S) * size_per_head_, false); + std::inclusive_scan(h_q_len_.data(), h_q_len_.data() + bsz, h_cu_q_len_ + 1); + std::inclusive_scan(h_k_len_.data(), h_k_len_.data() + bsz, h_cu_k_len_ + 1); - is_allocate_buffer_ = true; -} + Copy(h_cu_x_len_.slice(0, 2 * bsz + 2), d_cu_x_len_.slice(0, 2 * bsz + 2)); -template -void UnifiedAttentionLayer::allocateWorkspace() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - FT_CHECK(!is_allocate_workspace_); - partial_M_ = (float*)allocator_->malloc(sizeof(float) * kMaxWorkspaceTokens * local_head_num_); - partial_L_ = (float*)allocator_->malloc(sizeof(float) * kMaxWorkspaceTokens * local_head_num_); - partial_O_ = (float*)allocator_->malloc(sizeof(float) * kMaxWorkspaceTokens * local_head_num_ * size_per_head_); - split_cnt_ = (int*)allocator_->malloc(sizeof(int) * kMaxWorkspaceTokens); - barriers_ = (int*)allocator_->malloc(sizeof(int) * kMaxWorkspaceTokens * local_head_num_, true, false); - is_allocate_workspace_ = true; -} + event_.Record(core::Context::stream()); -template -void UnifiedAttentionLayer::freeWorkspace() -{ - if (is_allocate_workspace_) { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); + decode_num_ = *args.at("decode_num").data(); + prefil_num_ = *args.at("prefil_num").data(); - allocator_->free((void**)&partial_M_); - allocator_->free((void**)&partial_L_); - allocator_->free((void**)&partial_O_); - allocator_->free((void**)&split_cnt_); - allocator_->free((void**)&barriers_); + finished_ = args.at("finished").buffer(); + rope_base_ = args.at("rope_base").buffer(); - is_allocate_workspace_ = false; - } + cu_block_nums_ = args.at("cu_block_nums").buffer(); + kv_block_ptrs_ = args.at("kv_block_ptrs").buffer(); } -template -void UnifiedAttentionLayer::freeBuffer() +void UnifiedAttentionLayer::Finalize() { - if (is_allocate_buffer_) { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - allocator_->free((void**)&qkv_buf_); - allocator_->free((void**)&qkv_buf_3_); - allocator_->free((void**)&tmp_kv_buf_); - allocator_->free((void**)&lora_buf_); - - is_allocate_buffer_ = false; - } + event_.Sync(); } -template -inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights) +void UnifiedAttentionLayer::Forward(ForwardParam p) { TM_LOG_DEBUG(__PRETTY_FUNCTION__); @@ -165,100 +169,66 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa ///////////////////////////////////////////// /// parse inputs - const int token_num = inputs->at("input_query").shape[0]; - const int layer_id = inputs->getVal("layer_id"); - - const int dc_batch_size = inputs->getVal("dc_batch_size"); - const int pf_batch_size = inputs->getVal("pf_batch_size"); - const int batch_size = dc_batch_size + pf_batch_size; - - int* h_q_len = inputs->getPtr("h_q_len"); - int* h_k_len = inputs->getPtr("h_k_len"); - int* cu_q_len = inputs->getPtr("cu_q_len"); - int* cu_k_len = inputs->getPtr("cu_k_len"); - int* h_cu_q_len = inputs->getPtr("h_cu_q_len"); - int* h_cu_k_len = inputs->getPtr("h_cu_k_len"); - - bool* is_finished = inputs->getPtr("finished"); - float* rope_theta = inputs->getPtr("rope_theta"); - - void** block_ptrs = outputs->getPtr("block_ptrs"); - int* cu_block_count = inputs->getPtr("cu_block_counts"); - - T* attention_input = inputs->getPtr("input_query"); - T* attention_out = outputs->getPtr("hidden_features"); + const int token_num = p.input.shape(0); if (token_num == 0) { return; } - ///////////////////////////////////////////// - /// allocate buffers - 
allocateBuffer(token_num, // shared - h_cu_k_len[batch_size] - h_cu_k_len[dc_batch_size], // prefill - batch_size, - std::max(weights->qkv.lora.r, weights->output.lora.r)); + const int layer_id = p.layer_id; + + const auto& weights = *p.weights; // [L, 2, H, s, D] const size_t layer_offset = layer_id * 2 * local_kv_head_num_ * param_.cache_block_seq_len * size_per_head_; - // static int count = 0; - - // if (tensor_para_.rank_ == 0) { - // Compare(attention_input, token_num * hidden_units_, Concat("qkv_input", layer_id), compare_mode, stream_); - // } - - int* lora_mask = inputs->at("lora_mask", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr(); + Tensor qkv; - if (weights->qkv.output_dims) { - ////////////////////////////////////////////// - /// qkv gemm - // [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim] - linear_->forward( - qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear::kGemm, lora_buf_, lora_mask); + if (weights.qkv.output_dim) { + // [token_num, hidden_dim] -> [token_num, local_q_kv_head_num, head_dim] + qkv = linear_.forward(p.input, weights.qkv, LlamaLinear::kGemm); sync_check_cuda_error(); if (model_param_.qk_norm) { - qk_norm(qkv_buf_, token_num, *weights); + qk_norm(qkv, weights); } } else { - forward_mla(attention_input, token_num, *weights); + qkv = forward_mla(p.input, weights); } - // std::cerr << layer_id << " " << count << " " << tensor_para_.rank_ << "\n"; + TM_DEBUG_TENSOR(qkv, Concat("qkv", layer_id), 3); - count_and_fix(qkv_buf_, token_num * weights->qkv.output_dims, Concat("qkv", layer_id), 3); + auto invoke = [&](auto t) -> Tensor { + using T = decltype(t); + return core_attention(qkv, p, weights); + }; - // std::cerr << "token num: " << token_num << "\n"; + Tensor attn = [&]() -> Tensor { TM_DISPATCH_PRIMARY_DTYPES_RET(qkv.dtype(), invoke); }(); - // if (layer_id == 0 && count == 0 && tensor_para_.rank_ == 0) { - // Compare(qkv_buf_, token_num * (3 * local_head_num_ * size_per_head_), "qkv_buf", CMP_MODE, stream_); - // } + TM_DEBUG_TENSOR(attn, Concat("attn", layer_id), 3); - if constexpr (0) { - std::vector tmp(token_num * weights->qkv.output_dims); - cudaMemcpyAsync(tmp.data(), qkv_buf_, sizeof(T) * tmp.size(), cudaMemcpyDefault, stream_); - cudaStreamSynchronize(stream_); - int i = 0; - for (auto& x : tmp) { - std::cout << (float)x << " "; - if (++i == 256) { - break; - } - } - std::cout << "\n"; - i = 0; - for (auto it = tmp.rbegin(); it != tmp.rend(); ++it) { - std::cout << (float)*it << " "; - if (++i == 256) { - break; - } - } - std::cout << "\n"; - } + ////////////////////////////////////////////// + /// output gemm -> + (void)linear_.forward(attn, weights.output, LlamaLinear::kGemm, p.output); + sync_check_cuda_error(); +} + +template +Tensor UnifiedAttentionLayer::core_attention(Tensor& qkv, const ForwardParam& p, const WeightType& weights) +{ + const auto device = qkv.device(); + const auto dtype = qkv.dtype(); + + const int batch_size = decode_num_ + prefil_num_; + const int q_count = qkv.shape(0); + const int k_count = h_cu_k_len_[batch_size] - h_cu_k_len_[decode_num_]; + const int layer_id = p.layer_id; + + const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - // FT_CHECK(0); + Tensor attn{{q_count, (int)local_head_num_ * (int)size_per_head_}, dtype, device}; + Tensor tmp_kv{{2, (int)local_kv_head_num_, k_count + MAX_CTA_S, (int)size_per_head_}, dtype, device}; auto stream_ptr = streams_.data(); @@ -266,39 +236,40 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa 
AttentionParams params{}; // Batch offset for `out` and `q` are computed inside the kernel - params.out = qkv_buf_3_; + params.out = (T*)attn.raw_data(); - params.q = (T*)qkv_buf_; + params.q = (T*)qkv.raw_data(); params.k = params.q + local_head_num_ * size_per_head_; params.v = params.k + local_kv_head_num_ * size_per_head_; params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_; - if (weights->qkv.bias) { - params.q_bias = weights->qkv.bias; + if (weights.qkv.bias) { + params.q_bias = (T*)weights.qkv.bias.data_or(nullptr); params.k_bias = params.q_bias + local_head_num_ * size_per_head_; params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_; } - params.token_num = h_cu_q_len[offset + batch_size] - h_cu_q_len[offset]; + params.token_num = h_cu_q_len_[offset + batch_size] - h_cu_q_len_[offset]; params.batch_size = batch_size; - params.max_q_len = *std::max_element(h_q_len + offset, h_q_len + offset + batch_size); - params.max_k_len = *std::max_element(h_k_len + offset, h_k_len + offset + batch_size); + /// TODO: maximum on buffer slice + params.max_q_len = *std::max_element(h_q_len_.data() + offset, h_q_len_.data() + offset + batch_size); + params.max_k_len = *std::max_element(h_k_len_.data() + offset, h_k_len_.data() + offset + batch_size); // Decoding use only - params.block_iter_params = BlockIteratorParams{(char**)block_ptrs, // - (int*)cu_block_count + offset, + params.block_iter_params = BlockIteratorParams{(char**)kv_block_ptrs_.data(), // + cu_block_nums_.data() + offset, layer_id, (int)param_.cache_block_seq_len}; // Prefilling use only - const int sum_k_len = h_cu_k_len[offset + pf_batch_size] - h_cu_k_len[offset]; - params.linear_iter_params = LinearIteratorParams{tmp_kv_buf_, // + const int sum_k_len = h_cu_k_len_[offset + prefil_num_] - h_cu_k_len_[offset]; + params.linear_iter_params = LinearIteratorParams{tmp_kv.raw_data(), // int(2 * sum_k_len * size_per_head_), int(sum_k_len * size_per_head_)}; - params.finished = is_finished + offset; - params.cu_q_len = cu_q_len + offset; - params.cu_k_len = cu_k_len + offset; + params.finished = finished_.data() + offset; + params.cu_q_len = d_cu_q_len_ + offset; + params.cu_k_len = d_cu_k_len_ + offset; params.num_heads = local_head_num_; params.num_kv_heads = local_kv_head_num_; @@ -315,7 +286,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa // rotary embedding if (rope_param_.type == RopeType::kDynamic) { - rope_param_.base = rope_theta + offset; + rope_param_.base = const_cast(rope_base_.data()) + offset; } params.rope_param = rope_param_; @@ -324,12 +295,11 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa params.max_position_embeddings = param_.max_position_embeddings; // Decoding use only for now - FT_CHECK(barriers_); - params.split_cnt = split_cnt_; - params.partial_L = partial_L_; - params.partial_M = partial_M_; - params.partial_O = partial_O_; - params.locks = barriers_; + params.split_cnt = split_cnt_.data(); + params.partial_L = partial_L_.data(); + params.partial_M = partial_M_.data(); + params.partial_O = partial_O_.data(); + params.locks = barriers_.data(); params.max_split_k = std::min(std::max(1, kMaxWorkspaceTokens / params.token_num), max_kv_splits); params.arch = arch_; @@ -342,18 +312,18 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa cudaStream_t pf_stream = stream_; cudaStream_t dc_stream = stream_; - if (pf_batch_size && dc_batch_size) { + if (decode_num_ && prefil_num_) { 
pf_stream = aux_stream_; check_cuda_error(cudaEventRecord(qkv_event_, stream_)); check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_)); } - if (pf_batch_size && !isTuning()) { - const int offset = dc_batch_size; - const int sum_k_len = h_cu_k_len[offset + pf_batch_size] - h_cu_k_len[offset]; + if (prefil_num_ && !isTuning()) { + const int offset = decode_num_; + const int sum_k_len = h_cu_k_len_[offset + prefil_num_] - h_cu_k_len_[offset]; // We are executing prefill & decoding kernels concurrently, but only have 1 workspace // disable split kv for prefill for now - auto params = CreateParams(offset, pf_batch_size, 1, pf_stream); + auto params = CreateParams(offset, prefil_num_, 1, pf_stream); if constexpr (sizeof(T) == 2) { invokeProcessKV_v2_(params); sync_check_cuda_error(); @@ -367,170 +337,106 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa } } - if (dc_batch_size && !isTuning()) { - auto params = CreateParams(0, dc_batch_size, kMaxKVSplits, dc_stream); + if (decode_num_ && !isTuning()) { + auto params = CreateParams(0, decode_num_, kMaxKVSplits, dc_stream); if constexpr (sizeof(T) == 2) { dispatchDecoding(params); sync_check_cuda_error(); } } - if (pf_batch_size && dc_batch_size) { + if (decode_num_ && prefil_num_) { check_cuda_error(cudaEventRecord(aux_event_, aux_stream_)); check_cuda_error(cudaStreamWaitEvent(stream_, aux_event_)); } - // if (layer_id == 0 && count == 0) { - // Compare(qkv_buf_3_, num_token * weights->output.input_dims, "qkv_buf_3", kCmpRead, stream_); - - // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); - // } - if (isTuning()) { rng_.set_stream(stream_); - rng_.GenerateUniform(qkv_buf_3_, token_num * weights->output.input_dims, .02f, -.01f); - } - - count_and_fix(qkv_buf_3_, token_num * weights->output.input_dims, Concat("attn", layer_id), 3); - - ////////////////////////////////////////////// - /// output gemm -> - linear_->forward( - attention_out, qkv_buf_3_, token_num, weights->output, LlamaLinear::kGemm, lora_buf_, lora_mask); - sync_check_cuda_error(); - - count_and_fix(attention_out, token_num * weights->output.output_dims, Concat("wo", layer_id), 3); - - // if (tensor_para_.rank_ == 0) { - // Compare(attention_out, token_num * hidden_units_, Concat("attn_out", layer_id), compare_mode, stream_); - // // dump(qkv_buf_3_, num_token * weights->output.input_dims, stream_, "qkv_buf_3"); - // } - - if (is_free_buffer_after_forward_ == true) { - freeBuffer(); + rng_.GenerateUniform(attn.data(), attn.size(), .02f, -.01f); } - sync_check_cuda_error(); - // ++count; + return attn; } -template -void UnifiedAttentionLayer::forward_mla(const T* inputs, int token_num, const WeightType& w) +Tensor UnifiedAttentionLayer::forward_mla(const Tensor& hidden_state, const WeightType& w) { - const int q_lora_rank = w.q_a_proj.output_dims; - const int kv_lora_rank = w.kv_b_proj.input_dims; - const int qk_rope_dim = w.kv_a_proj.output_dims - kv_lora_rank; - const int qk_nope_dim = std::max(w.q_b_proj.output_dims, w.q_proj.output_dims) / local_head_num_ - qk_rope_dim; - const int v_head_dim = w.kv_b_proj.output_dims / local_head_num_ - qk_nope_dim; + const int q_lora_rank = w.q_a_proj.output_dim; + const int kv_lora_rank = w.kv_b_proj.input_dim; + const int qk_rope_dim = w.kv_a_proj.output_dim - kv_lora_rank; + const int qk_nope_dim = std::max(w.q_b_proj.output_dim, w.q_proj.output_dim) / local_head_num_ - qk_rope_dim; + const int v_head_dim = w.kv_b_proj.output_dim / local_head_num_ - qk_nope_dim; + + 
const auto token_num = hidden_state.shape(0); + const auto dtype = hidden_state.dtype(); - T* q{}; + Tensor q; - if (w.q_proj.kernel) { - deviceMalloc((T**)&q, (size_t)token_num * w.q_proj.output_dims, stream_); - linear_->forward(q, inputs, token_num, w.q_proj); + if (w.q_proj.weight) { + q = linear_.forward(hidden_state, w.q_proj); sync_check_cuda_error(); } else { - T* q_a{}; - deviceMalloc((T**)&q_a, (size_t)token_num * q_lora_rank, stream_); - - linear_->forward(q_a, inputs, token_num, w.q_a_proj); + Tensor q_a = linear_.forward(hidden_state, w.q_a_proj); sync_check_cuda_error(); - invokeRMSNorm(q_a, - q_lora_rank, - q_a, - q_lora_rank, - w.q_a_layernorm, - q_lora_rank, - token_num, - model_param_.norm_eps, - stream_); + invokeRMSNorm(q_a, q_a, w.q_a_layernorm, model_param_.norm_eps, stream_); sync_check_cuda_error(); - deviceMalloc((T**)&q, (size_t)token_num * w.q_b_proj.output_dims, stream_); - linear_->forward(q, q_a, token_num, w.q_b_proj); + q = linear_.forward(q_a, w.q_b_proj); sync_check_cuda_error(); - - deviceFree(q_a, stream_); } - T* kv_a{}; - const int kv_a_dim = w.kv_a_proj.output_dims; - deviceMalloc((T**)&kv_a, (size_t)token_num * kv_a_dim, stream_); - - linear_->forward(kv_a, inputs, token_num, w.kv_a_proj); + Tensor kv_a_k_pe = linear_.forward(hidden_state, w.kv_a_proj); sync_check_cuda_error(); - invokeRMSNorm( - kv_a, kv_a_dim, kv_a, kv_a_dim, w.kv_a_layernorm, kv_lora_rank, token_num, model_param_.norm_eps, stream_); + auto kv_a = kv_a_k_pe.slice({0, 0}, {-1, kv_lora_rank}); + invokeRMSNorm(kv_a, kv_a, w.kv_a_layernorm, model_param_.norm_eps, stream_); sync_check_cuda_error(); - T* kv_b{}; - deviceMalloc((T**)&kv_b, (size_t)token_num * w.kv_b_proj.output_dims, stream_); + Tensor kv_b = linear_.forward(kv_a, w.kv_b_proj); sync_check_cuda_error(); - linear_->forward(kv_b, {kv_a, kv_a_dim}, token_num, w.kv_b_proj); - sync_check_cuda_error(); + const int local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_; - dispatchMLACopyQKV(qkv_buf_, - q, - kv_a, - kv_b, - token_num, - local_head_num_, - qk_nope_dim, - qk_rope_dim, - kv_lora_rank, - v_head_dim, - stream_); + Tensor qkv{{token_num, local_q_kv_head_num, (int)size_per_head_}, dtype, hidden_state.device()}; + MLACopyQKV(dtype, + qkv.raw_data(), + q.raw_data(), + kv_a.raw_data(), + kv_b.raw_data(), + token_num, + local_head_num_, + qk_nope_dim, + qk_rope_dim, + kv_lora_rank, + v_head_dim, + stream_); sync_check_cuda_error(); - deviceFree(q, stream_); - deviceFree(kv_a, stream_); - deviceFree(kv_b, stream_); + return qkv; } -template -void UnifiedAttentionLayer::qk_norm(T* qkv, int token_num, const WeightType& weights) +void UnifiedAttentionLayer::qk_norm(Tensor& qkv, const WeightType& weights) { check_cuda_error(cudaEventRecord(qkv_event_, stream_)); check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_)); - FT_CHECK(model_param_.attn_bias == false); - - invokeQkRMSNorm(qkv_buf_, - weights.qkv.output_dims, - weights.q_a_layernorm, - getTensorType(), - size_per_head_, - local_head_num_, - token_num, - model_param_.norm_eps, - stream_); + TM_CHECK(model_param_.attn_bias == false) << "not implemented"; + + const auto token_num = qkv.shape(0); + + auto qkv3 = qkv.view({token_num, -1, (int)size_per_head_}); + + auto q = qkv3.slice({0, 0, 0}, {-1, (int)local_head_num_, -1}); + invokeRMSNormQK(q, weights.q_a_layernorm, model_param_.norm_eps, stream_); sync_check_cuda_error(); - invokeQkRMSNorm(qkv_buf_ + size_per_head_ * local_head_num_, - weights.qkv.output_dims, - weights.kv_a_layernorm, - getTensorType(), - 
size_per_head_, - local_kv_head_num_, - token_num, - model_param_.norm_eps, - aux_stream_); + auto k = qkv3.slice({0, (int)local_head_num_, 0}, {-1, (int)local_kv_head_num_, -1}); + invokeRMSNormQK(k, weights.kv_a_layernorm, model_param_.norm_eps, aux_stream_); sync_check_cuda_error(); check_cuda_error(cudaEventRecord(aux_event_, aux_stream_)); check_cuda_error(cudaStreamWaitEvent(stream_, aux_event_)); } -#ifdef ENABLE_FP32 -template class UnifiedAttentionLayer; -#endif -template class UnifiedAttentionLayer; -#ifdef ENABLE_BF16 -template class UnifiedAttentionLayer<__nv_bfloat16>; -#endif // ENABLE_BF16 - } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_attention_layer.h b/src/turbomind/models/llama/unified_attention_layer.h index 2dd114e0b4..a498b3b881 100644 --- a/src/turbomind/models/llama/unified_attention_layer.h +++ b/src/turbomind/models/llama/unified_attention_layer.h @@ -21,116 +21,79 @@ #pragma once +#include + #include +#include "src/turbomind/core/core.h" #include "src/turbomind/kernels/gemm/test/test_utils.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaLinear.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template class UnifiedAttentionLayer { public: - using WeightType = LlamaAttentionWeight; + using WeightType = LlamaAttentionWeight; static constexpr int kMaxKVSplits = 128; static constexpr int kMaxWorkspaceTokens = 4096; - void freeBuffer(); - void allocateBuffer(size_t q_count, size_t k_count, size_t batch_size, size_t qkv_lora_rank); + struct ForwardParam { + Tensor input; + Tensor output; + const WeightType* weights; + int layer_id; + }; - void allocateWorkspace(); - void freeWorkspace(); + ~UnifiedAttentionLayer(); - ~UnifiedAttentionLayer() - { - freeBuffer(); - freeWorkspace(); + UnifiedAttentionLayer(const ModelParam& model, + const AttentionParam& attn, + const EngineParam& engine, + const LoraParam& lora, + int tp_size, + const Context& context); - for (auto& s : streams_) { - s = {}; - } + void Forward(ForwardParam p); - check_cuda_error(cudaEventDestroy(aux_event_)); - check_cuda_error(cudaEventDestroy(qkv_event_)); - check_cuda_error(cudaStreamDestroy(aux_stream_)); + void Initialize(TensorMap& args); - aux_event_ = qkv_event_ = {}; - aux_stream_ = {}; - } + void Finalize(); - UnifiedAttentionLayer(const ModelParam& model, - const AttentionParam& attn, - const LoraParam& lora, - size_t tp_size, - const Context& context); - - void forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights); - - void prefill(T* output, - T* tmp_kv_buffer, - const T* qkv, - void** block_ptrs, - const int* cu_q_len, - const int* cu_k_len, - const int* input_length, - const int* context_length, - const int* cu_block_count, - const bool* is_finished, - const float* rope_theta, - int pf_batch_size, - int pf_num_token, - size_t layer_offset, - int pf_max_q_len, - int pf_max_k_len, - int pf_session_len, - const WeightType* weights); - - void decode(T* output, - const T* qkv, - void** block_ptrs, - const int* cu_q_len, - const int* cu_block_count, - const int* input_length, - const int* context_length, - const bool* is_finished, - const float* rope_theta, - size_t layer_offset, - int batch_size, - int dc_sum_seq_len, - int dc_max_seq_len, - int max_split_k, - const WeightType* weights); + const int* d_cu_q_len() + { + return 
d_cu_q_len_; + } private: - void forward_mla(const T* inputs, int token_num, const WeightType& weights); + Tensor forward_mla(const Tensor& hidden_state, const WeightType& weights); + + /// TODO: dropping the `T` here requires deep refactor of attention dispatch + template + Tensor core_attention(Tensor& qkv, const ForwardParam& p, const WeightType& weights); - void qk_norm(T* qkv, int token_num, const WeightType& weights); + void qk_norm(Tensor& qkv, const WeightType& weights); private: - const size_t head_num_; - const size_t kv_head_num_; - const size_t size_per_head_; - const size_t hidden_units_; - const size_t local_head_num_; - const size_t local_kv_head_num_; + const int head_num_; + const int kv_head_num_; + const int size_per_head_; + const int hidden_units_; + const int local_head_num_; + const int local_kv_head_num_; const AttentionParam param_; const ModelParam model_param_; const LoraParam lora_param_; - const Context& context_; - - cudaStream_t const stream_; - LlamaLinear* const linear_; - IAllocator* const allocator_; - const int arch_{}; + const Context& context_; - const bool is_free_buffer_after_forward_{false}; + cudaStream_t const stream_; + LlamaLinear& linear_; + const int arch_{}; cudaStream_t aux_stream_; cudaEvent_t qkv_event_; @@ -142,28 +105,37 @@ class UnifiedAttentionLayer { RopeKernelParam rope_param_{}; - T* qkv_buf_{}; - T* q_buf_2_{}; - T* k_buf_2_{}; - T* v_buf_2_{}; - T* k_cache_buf_{}; - T* v_cache_buf_{}; - T* qk_buf_{}; - float* qk_buf_float_{}; - T* qkv_buf_2_{}; - T* qkv_buf_3_{}; - T* lora_buf_{}; - - float* partial_M_{}; - float* partial_L_{}; - float* partial_O_{}; - int* split_cnt_{}; - int* barriers_{}; // always zero - - T* tmp_kv_buf_{}; - - bool is_allocate_buffer_ = false; - bool is_allocate_workspace_ = false; + /////////////////////////////////////////////////////// + /// runtime states + int decode_num_; + int prefil_num_; + + Tensor_ partial_M_; + Tensor_ partial_L_; + Tensor_ partial_O_; + Tensor_ split_cnt_; + Tensor_ barriers_; // always zero + + Event event_; + + Buffer_ h_q_len_; + Buffer_ h_k_len_; + + Buffer_ d_cu_x_len_; + Buffer_ h_cu_x_len_; + + // references into d/h_cu_x_len_ + int* d_cu_q_len_; + int* d_cu_k_len_; + int* h_cu_q_len_; + int* h_cu_k_len_; + + Buffer_ finished_; + Buffer_ rope_base_; + + Buffer_ cu_block_nums_; + Buffer_ kv_block_ptrs_; + /////////////////////////////////////////////////////// }; } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_decoder.cc b/src/turbomind/models/llama/unified_decoder.cc index d801539483..c875c7852f 100644 --- a/src/turbomind/models/llama/unified_decoder.cc +++ b/src/turbomind/models/llama/unified_decoder.cc @@ -1,30 +1,28 @@ -#include -#include #include +#include + +#include #include "src/turbomind/kernels/core/math.h" #include "src/turbomind/kernels/norm/rms_norm.h" -#include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" #include "src/turbomind/models/llama/unified_decoder.h" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/anomaly_handler.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template -UnifiedDecoder::UnifiedDecoder(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx): 
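The decoder hunks that follow drive the attention layer through the new `ForwardParam` struct and an `Initialize` / per-layer `Forward` / `Finalize` sequence instead of the old `TensorMap` plumbing. The sketch below only condenses that calling convention; the types are stand-ins rather than the project's real `Tensor`/`TensorMap`, and the comments describe the apparent intent, not guaranteed behavior.

    // Condensed mock of the Initialize/Forward/Finalize protocol used in this diff.
    #include <vector>

    struct Tensor {};   // stand-in
    struct Weights {};  // stand-in

    struct AttentionLayer {
        struct ForwardParam {
            Tensor         input;
            Tensor         output;
            const Weights* weights;
            int            layer_id;
        };
        void Initialize() {}            // once per batch: lengths, block tables, rope bases
        void Forward(ForwardParam) {}   // attention for one layer, in place
        void Finalize() {}              // drop per-batch state
    };

    inline void run_decoder(AttentionLayer& attn, const std::vector<Weights>& w, Tensor& hidden)
    {
        attn.Initialize();
        for (int layer = 0; layer < (int)w.size(); ++layer) {
            attn.Forward({hidden, hidden, &w[layer], layer});
        }
        attn.Finalize();
    }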
+UnifiedDecoder::UnifiedDecoder(const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx): layer_num_(model.layer_num), hidden_units_(model.hidden_units), attn_tp_size_(engine.attn_tp_size), @@ -34,91 +32,39 @@ UnifiedDecoder::UnifiedDecoder(const ModelParam& model, attn_tp_group_(ctx.comm.d_tp_group), rmsnorm_eps_(model.norm_eps), stream_(ctx.stream), - allocator_(ctx.allocator.get()), d_comm_(ctx.comm.d_comm), - dtype_(getTensorType()), tune_layer_num_(model.tune_layer_num) { - attn_layer_ = std::make_unique>(model, attn, lora, attn_tp_size_, ctx); + attn_layer_ = std::make_unique(model, attn, engine, lora, attn_tp_size_, ctx); if (std::accumulate(moe.expert_num.begin(), moe.expert_num.end(), 0LL)) { - moe_ffn_layer_ = std::make_unique>(model, moe, mlp_tp_size_, ctx); + moe_ffn_layer_ = std::make_unique(model, moe, engine, ctx); } if (std::accumulate(model.inter_size.begin(), model.inter_size.end(), 0LL)) { - ffn_layer_ = std::make_unique>(model, ctx); + ffn_layer_ = std::make_unique(model, ctx); } - - check_cuda_error(cudaEventCreateWithFlags(&ev_h_cu_x_, cudaEventDisableTiming)); -} - -template -UnifiedDecoder::~UnifiedDecoder() -{ - freeBuffer(); - check_cuda_error(cudaEventDestroy(ev_h_cu_x_)); -} - -template -void UnifiedDecoder::allocateBuffer(size_t batch_size) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - cu_q_len_ = (int*)allocator_->reMalloc(cu_q_len_, 2 * sizeof(int) * (batch_size + 1), false); - h_cu_q_len_ = (int*)allocator_->reMalloc(h_cu_q_len_, 2 * sizeof(int) * (batch_size + 1), false, true); -} - -template -void UnifiedDecoder::freeBuffer() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - - allocator_->free((void**)&cu_q_len_); - allocator_->free((void**)&h_cu_q_len_, true); -} - -template -void UnifiedDecoder::forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const WeightType* weight) -{ - TensorMap inputs(*_inputs); - inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - inputs.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}); - inputs.insert("cu_q_len", {MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_q_len_}); - inputs.insert("cu_k_len", {MEMORY_GPU, TYPE_INT32, {batch_size + 1}, cu_k_len_}); - inputs.insert("h_cu_q_len", {MEMORY_CPU, TYPE_INT32, {batch_size + 1}, h_cu_q_len_}); - inputs.insert("h_cu_k_len", {MEMORY_CPU, TYPE_INT32, {batch_size + 1}, h_cu_k_len_}); - - TensorMap outputs(*_outputs); - outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io}); - - attn_layer_->forward(&outputs, &inputs, &weight->self_attn_weights); } -template -void UnifiedDecoder::AllreduceResidualRMSnorm(T* hidden_states, - T* residual, - const T* bias, - const T* weight, - int token_num, - int group0, - int group1, - const int* local_token_nums) +void UnifiedDecoder::AllreduceResidualRMSnorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& bias, + const Tensor& weight, + int token_num, + int group0, + int group1, + const int* local_token_nums) { + const auto dtype = hidden_states.dtype(); if (0) {} else if (group0 || group1) { - d_comm_->AllreduceResidualBiasRMSnormEx(hidden_states, - residual, - bias, - weight, + d_comm_->AllreduceResidualBiasRMSnormEx(hidden_states.raw_data(), + residual.raw_data(), + bias.data_or((void*)nullptr), + weight.raw_data(), rmsnorm_eps_, hidden_units_, - dtype_, + dtype, group0, group1, 
local_token_nums, @@ -126,19 +72,33 @@ void UnifiedDecoder::AllreduceResidualRMSnorm(T* hidden_states, sync_check_cuda_error(); } else if (d_comm_) { - d_comm_->AllreduceResidualBiasRMSnorm( - hidden_states, residual, bias, weight, rmsnorm_eps_, hidden_units_, token_num, dtype_, 0, stream_); + d_comm_->AllreduceResidualBiasRMSnorm(hidden_states.raw_data(), + residual.raw_data(), + bias.data_or((void*)nullptr), + weight.raw_data(), + rmsnorm_eps_, + hidden_units_, + token_num, + dtype, + 0, + stream_); sync_check_cuda_error(); } else { - invokeBiasResidualRMSNorm( - residual, hidden_states, weight, bias, hidden_units_, token_num, rmsnorm_eps_, stream_); + invokeResidualBiasRMSNorm(hidden_states.raw_data(), + residual.raw_data(), + weight.raw_data(), + bias.data_or((void*)nullptr), + dtype, + hidden_units_, + token_num, + rmsnorm_eps_, + stream_); sync_check_cuda_error(); } } -template -void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, const std::vector* weights) +void UnifiedDecoder::Forward(TensorMap& args, const std::vector& weights) { /** * input tensors: @@ -158,70 +118,42 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con * \param block_ptrs [total_block_counts], void* */ - const size_t token_num = inputs->at("decoder_input").shape[0]; - - const int pf_batch_size = inputs->getVal("pf_batch_size"); - const int dc_batch_size = inputs->getVal("dc_batch_size"); - const int batch_size = pf_batch_size + dc_batch_size; - - const int* h_q_len = inputs->getPtr("h_q_len"); - const int* h_k_len = inputs->getPtr("h_k_len"); + const int decode_num = *args.at("decode_num").data(); + const int prefil_num = *args.at("prefil_num").data(); + const int batch_size = prefil_num + decode_num; - T* residual = inputs->getPtr("decoder_input"); - T* hidden_states = outputs->getPtr("decoder_output"); + constexpr auto device = kDEVICE; - T* last_token_hidden_units = outputs->getPtr("last_token_hidden_units"); + Tensor_ local_token_nums = args.at("local_token_nums"); - { // compute cumulative lengths + Tensor local_residual = args.at("decoder_input"); + Tensor global_hidden_states = args.at("decoder_output"); - h_cu_k_len_ = h_cu_q_len_ + batch_size + 1; - cu_k_len_ = cu_q_len_ + batch_size + 1; + Tensor local_hidden_states = global_hidden_states; - h_cu_q_len_[0] = h_cu_k_len_[0] = 0; + const auto global_token_num = global_hidden_states.shape(0); + const auto local_token_num = local_residual.shape(0); - for (int i = 1; i <= batch_size; ++i) { - h_cu_q_len_[i] = h_cu_q_len_[i - 1] + h_q_len[i - 1]; - h_cu_k_len_[i] = h_cu_k_len_[i - 1] + h_k_len[i - 1]; - } - - check_cuda_error( - cudaMemcpyAsync(cu_q_len_, h_cu_q_len_, 2 * sizeof(int) * (batch_size + 1), cudaMemcpyDefault, stream_)); - - check_cuda_error(cudaEventRecord(ev_h_cu_x_, stream_)); + if (attn_dp_size_ > 1) { // Offset hidden states buffer for mixed DP + TM_CHECK_EQ(local_token_nums.size(), attn_dp_size_); + std::vector cumul_token_nums(attn_dp_size_ + 1, 0); + std::inclusive_scan( + local_token_nums.data(), local_token_nums.data() + attn_dp_size_, cumul_token_nums.begin() + 1); + const int offset = cumul_token_nums[attn_dp_rank_]; + local_hidden_states = global_hidden_states.slice({offset, 0}, {local_token_num, -1}); } - const int pf_offset = dc_batch_size; + attn_layer_->Initialize(args); - /// Offset hidden states buffer for mixed DP - T* global_hidden_states = hidden_states; - size_t global_token_num = token_num; - const int* local_token_nums = inputs->getPtr("local_token_nums", nullptr); - if 
(attn_dp_size_ > 1) { - FT_CHECK(local_token_nums); - std::vector cumul_token_nums(attn_dp_size_ + 1, 0); - std::inclusive_scan(local_token_nums, local_token_nums + attn_dp_size_, cumul_token_nums.begin() + 1); - hidden_states = hidden_states + (size_t)cumul_token_nums[attn_dp_rank_] * hidden_units_; - global_token_num = cumul_token_nums.back(); - // TM_LOG_ERROR("rank %d, global_token_num %d, offset %d", - // attn_dp_rank_, - // global_token_num, - // cumul_token_nums[attn_dp_rank_]); - } + TM_DEBUG_TENSOR(local_residual, "res", 1); + TM_DEBUG_TENSOR(weights.at(0)->self_attn_norm, "norm_weight", 2); - ///////////////////////////////////////////// - /// RMSNorm - invokeRMSNorm(hidden_states, - residual, - weights->at(0)->self_attn_norm_weights, - hidden_units_, - token_num, - rmsnorm_eps_, - stream_); + invokeRMSNorm(local_hidden_states, local_residual, weights.at(0)->self_attn_norm, rmsnorm_eps_, stream_); sync_check_cuda_error(); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("norm0", 0), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("norm0", 0), 2); - for (size_t layer = 0; layer < layer_num_; ++layer) { + for (int layer = 0; layer < layer_num_; ++layer) { /// TODO: do not skip the layers when they are heterogeneous if (isTuning() && layer >= tune_layer_num_) { @@ -230,113 +162,99 @@ void UnifiedDecoder::forward(TensorMap* outputs, const TensorMap* inputs, con ///////////////////////////////////////////// /// self-attention - forwardSelfAttn(hidden_states, // - outputs, - inputs, - token_num, - batch_size, - layer, - weights->at(layer)); + attn_layer_->Forward({local_hidden_states, // + local_hidden_states, + weights.at(layer)->self_attn_weights.get(), + layer}); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("attn_block", layer), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("attn_block", layer), 2); AllreduceResidualRMSnorm(global_hidden_states, - residual, - weights->at(layer)->self_attn_weights.output.bias, - weights->at(layer)->ffn_norm_weights, - token_num, + local_residual, + weights.at(layer)->self_attn_weights->output.bias, + weights.at(layer)->ffn_norm, + local_token_num, attn_tp_group_, 0, - local_token_nums); + local_token_nums.data()); - count_and_fix(residual, token_num * hidden_units_, Concat("residual0", layer), 2); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("norm1", layer), 2); + TM_DEBUG_TENSOR(local_residual, Concat("residual0", layer), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("norm1", layer), 2); //////////////////////////////////////////// /// feed-forward network - const bool is_moe = !weights->at(layer)->moe_weights.experts.empty(); - if (is_moe) { - // Writes to internal buffer - moe_ffn_layer_->forward( - nullptr, global_hidden_states, global_token_num, layer, weights->at(layer)->moe_weights); + std::optional moe_fwd_param; + + if (weights.at(layer)->moe_weights) { + moe_fwd_param = MoeFfnLayer::ForwardParam{global_hidden_states, + global_hidden_states, + weights.at(layer)->moe_weights.get(), + ffn_layer_ ? 
1.f : 0.f, + layer}; + moe_ffn_layer_->Forward(*moe_fwd_param); } - if (weights->at(layer)->ffn_weights.output.kernel) { - int layer_id = layer; // int is needed - TensorMap ffn_inputs{ - {"ffn_input", {MEMORY_GPU, dtype_, {global_token_num, hidden_units_}, global_hidden_states}}, - {"layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id}}, - }; - TensorMap ffn_outputs{ - {"ffn_output", {MEMORY_GPU, dtype_, {global_token_num, hidden_units_}, global_hidden_states}}, - }; - if (inputs->isExist("lora_mask")) { - ffn_inputs.insert({"lora_mask", inputs->at("lora_mask")}); - } - ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights); + if (weights.at(layer)->ffn_weights) { + ffn_layer_->forward( + {global_hidden_states, global_hidden_states, weights.at(layer)->ffn_weights.get(), (int)layer}); } - if (is_moe) { - moe_ffn_layer_->reduce( - global_hidden_states, global_token_num, (bool)ffn_layer_, layer, weights->at(layer)->moe_weights); + if (moe_fwd_param) { + moe_ffn_layer_->Combine(*moe_fwd_param); } - count_and_fix(global_hidden_states, global_token_num * hidden_units_, Concat("ffn_block", layer), 2); + TM_DEBUG_TENSOR(global_hidden_states, Concat("ffn_block", layer), 2); - const bool is_last_layer = layer == layer_num_ - 1; + const bool last = layer == layer_num_ - 1; - auto scale_weight = !is_last_layer ? weights->at(layer + 1)->self_attn_norm_weights : - inputs->at("output_norm_weight").getPtr(); + auto& scale_weight = !last ? weights.at(layer + 1)->self_attn_norm : args.at("output_norm_weight"); AllreduceResidualRMSnorm(global_hidden_states, - residual, - weights->at(layer)->ffn_weights.output.bias, + local_residual, + {}, scale_weight, - token_num, + local_token_num, 0, attn_tp_group_, - local_token_nums); + local_token_nums.data()); sync_check_cuda_error(); - count_and_fix(residual, token_num * hidden_units_, Concat("residual1", layer), 2); - count_and_fix(hidden_states, token_num * hidden_units_, Concat("norm0", layer + 1), 2); + TM_DEBUG_TENSOR(local_residual, Concat("residual1", layer), 2); + TM_DEBUG_TENSOR(local_hidden_states, Concat("norm0", layer + 1), 2); } - if (dc_batch_size) { + /// TODO + using T = uint16_t; + + auto last_token_hidden_units = (T*)args.at("last_token_hidden_units").raw_data(); + + if (decode_num) { check_cuda_error(cudaMemcpyAsync(last_token_hidden_units, - hidden_states, - sizeof(T) * dc_batch_size * hidden_units_, + (T*)local_hidden_states.raw_data(), + sizeof(T) * decode_num * hidden_units_, cudaMemcpyDefault, stream_)); - count_and_fix(last_token_hidden_units, dc_batch_size * hidden_units_, "dc_out", 2); + // TM_DEBUG_RAW(last_token_hidden_units, decode_num * hidden_units_, "dc_out", 2); } - if (pf_batch_size) { - invokeGetFeatureOfLastToken(last_token_hidden_units + pf_offset * hidden_units_, // - hidden_states, - cu_q_len_ + pf_offset, + if (prefil_num) { + invokeGetFeatureOfLastToken(last_token_hidden_units + decode_num * hidden_units_, // + (T*)local_hidden_states.raw_data(), + attn_layer_->d_cu_q_len() + decode_num, hidden_units_, - pf_batch_size, + prefil_num, stream_); sync_check_cuda_error(); - count_and_fix(last_token_hidden_units + pf_offset * hidden_units_, pf_batch_size * hidden_units_, "pf_out", 2); + // TM_DEBUG_RAW(last_token_hidden_units + decode_num * hidden_units_, prefil_num * hidden_units_, "pf_out", 2); } - if (is_free_buffer_after_forward_) { - freeBuffer(); - } + Buffer out( + (void*)last_token_hidden_units, (decode_num + prefil_num) * hidden_units_, local_residual.dtype(), kDEVICE); - // Wait for 
`h_cu_q/k_len_` to be consumed - check_cuda_error(cudaEventSynchronize(ev_h_cu_x_)); -} + TM_DEBUG_TENSOR(out, "out", 1); -#ifdef ENABLE_FP32 -template class UnifiedDecoder; -#endif -template class UnifiedDecoder; -#ifdef ENABLE_BF16 -template class UnifiedDecoder<__nv_bfloat16>; -#endif // ENABLE_BF16 + attn_layer_->Finalize(); +} } // namespace turbomind diff --git a/src/turbomind/models/llama/unified_decoder.h b/src/turbomind/models/llama/unified_decoder.h index 3dcb3e04a0..dd03293744 100644 --- a/src/turbomind/models/llama/unified_decoder.h +++ b/src/turbomind/models/llama/unified_decoder.h @@ -7,16 +7,24 @@ #include "src/turbomind/models/llama/llama_params.h" #include "src/turbomind/models/llama/moe_ffn_layer.h" #include "src/turbomind/models/llama/unified_attention_layer.h" -#include "src/turbomind/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cuda_utils.h" namespace turbomind { -template class UnifiedDecoder { -private: - void freeBuffer(); +public: + using WeightType = LlamaDecoderLayerWeight; + UnifiedDecoder(const ModelParam& model, + const EngineParam& engine, + const AttentionParam& attn, + const MoeParam& moe, + const LoraParam& lora, + const Context& ctx); + + void Forward(TensorMap& args, const std::vector& weights); + +private: const size_t layer_num_; const size_t hidden_units_; @@ -29,58 +37,23 @@ class UnifiedDecoder { const float rmsnorm_eps_; cudaStream_t const stream_; - IAllocator* const allocator_; comm::DeviceCommImpl* const d_comm_; - const DataType dtype_; - const int tune_layer_num_; - bool is_free_buffer_after_forward_{}; - - int* cu_q_len_{}; - int* cu_k_len_{}; - - int* h_cu_q_len_{}; - int* h_cu_k_len_{}; - - std::unique_ptr> attn_layer_; - std::unique_ptr> ffn_layer_; - std::unique_ptr> moe_ffn_layer_; - - cudaEvent_t ev_h_cu_x_{}; - - using WeightType = LlamaDecoderLayerWeight; - - void forwardSelfAttn(T* attn_io, - TensorMap* _outputs, - const TensorMap* _inputs, - size_t token_num, - size_t batch_size, - int layer_id, - const WeightType* weight); - - void AllreduceResidualRMSnorm(T* hidden_states, - T* residual, - const T* bias, - const T* weight, - int token_num, - int t0, - int t1, - const int* local_token_nums); - -public: - UnifiedDecoder(const ModelParam& model, - const EngineParam& engine, - const AttentionParam& attn, - const MoeParam& moe, - const LoraParam& lora, - const Context& ctx); - - void allocateBuffer(size_t max_batch_size); + const int tune_layer_num_; - ~UnifiedDecoder(); + std::unique_ptr attn_layer_; + std::unique_ptr ffn_layer_; + std::unique_ptr moe_ffn_layer_; - void forward(TensorMap* outputs, const TensorMap* inputs, const std::vector* weights); + void AllreduceResidualRMSnorm(Tensor& hidden_states, + Tensor& residual, + const Tensor& bias, + const Tensor& weight, + int token_num, + int t0, + int t1, + const int* local_token_nums); }; } // namespace turbomind diff --git a/src/turbomind/models/llama/weight_type.h b/src/turbomind/models/llama/weight_type.h deleted file mode 100644 index bc2f49a08e..0000000000 --- a/src/turbomind/models/llama/weight_type.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace turbomind { - -enum class WeightType : int -{ - kFP32, - kFP16, - kFP8, // not supported yet - kBF16, - kINT8, - kINT4 -}; - -template -constexpr WeightType get_default_weight_type() -{ - if constexpr (std::is_same_v) { - return WeightType::kFP16; - } - else if constexpr (std::is_same_v) { - return WeightType::kBF16; - } - else if constexpr (std::is_same_v) { - return 
WeightType::kFP32; - } - else { - static_assert(sizeof(T) != sizeof(T), "not implemented"); - return {}; - } -} - -inline size_t getBitSize(WeightType type) -{ - switch (type) { - case WeightType::kFP32: - return 32; - case WeightType::kFP16: - return 16; - case WeightType::kFP8: - return 8; - case WeightType::kBF16: - return 16; - case WeightType::kINT8: - return 8; - case WeightType::kINT4: - return 4; - } - return 0; -} - -} // namespace turbomind diff --git a/src/turbomind/python/CMakeLists.txt b/src/turbomind/python/CMakeLists.txt index bc7b063e95..8e8c07de2a 100644 --- a/src/turbomind/python/CMakeLists.txt +++ b/src/turbomind/python/CMakeLists.txt @@ -13,8 +13,7 @@ if(NOT pybind11_FOUND) endif() pybind11_add_module(${PROJECT_NAME} bind.cpp) -target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend - LlamaTritonBackend) +target_link_libraries(${PROJECT_NAME} PRIVATE LlamaTritonBackend) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14) set(_INSTALL_CUDA_RPATH diff --git a/src/turbomind/python/bind.cpp b/src/turbomind/python/bind.cpp index 1dea57375b..a25daab2f7 100644 --- a/src/turbomind/python/bind.cpp +++ b/src/turbomind/python/bind.cpp @@ -12,44 +12,42 @@ #include #include +#include "src/turbomind/core/tensor.h" #include "src/turbomind/engine/model_request.h" #include "src/turbomind/python/dlpack.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" -#include "src/turbomind/utils/Tensor.h" #include "src/turbomind/utils/cuda_utils.h" namespace py = pybind11; namespace ft = turbomind; using namespace pybind11::literals; -using ft::ManagedTensor; -using ft::Tensor; +using ft::core::Tensor; // prepare to bind container -using TensorMap = std::unordered_map; +using TensorMap = ft::core::TensorMap; PYBIND11_MAKE_OPAQUE(TensorMap); static const char kDlTensorCapsuleName[] = "dltensor"; -DLDevice getDLDevice(const ft::Tensor& tensor) +DLDevice getDLDevice(const Tensor& tensor) { int device_id = 0; - if (tensor.where == ft::MEMORY_GPU) { + if (tensor.device().type == ft::kDEVICE) { cudaPointerAttributes ptr_attr{}; - cudaPointerGetAttributes(&ptr_attr, tensor.data); + cudaPointerGetAttributes(&ptr_attr, tensor.raw_data()); device_id = ptr_attr.device; } DLDevice device{kDLCPU, device_id}; - switch (tensor.where) { - case ft::MEMORY_CPU: + switch (tensor.device().type) { + case ft::kCPU: device.device_type = DLDeviceType::kDLCPU; break; - case ft::MEMORY_CPU_PINNED: + case ft::kCPUpinned: device.device_type = DLDeviceType::kDLCUDAHost; break; - case ft::MEMORY_GPU: + case ft::kDEVICE: device.device_type = DLDeviceType::kDLCUDA; break; default: @@ -59,179 +57,170 @@ DLDevice getDLDevice(const ft::Tensor& tensor) return device; } -DLManagedTensor* TritonTensorToDLManagedTensor(ManagedTensor& tensor) +DLManagedTensor* TritonTensorToDLManagedTensor(Tensor& tensor) { - DLDevice device = getDLDevice(*tensor); - + DLDevice device = getDLDevice(tensor); DLDataType data_type{0, 0, 1}; - switch (tensor->type) { - case ft::TYPE_BOOL: + using ft::data_type_v; + switch (tensor.dtype()) { + case data_type_v: data_type.code = DLDataTypeCode::kDLBool; data_type.bits = 8; break; - case ft::TYPE_UINT8: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 8; break; - case ft::TYPE_UINT16: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 16; break; - case ft::TYPE_UINT32: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 
32; break; - case ft::TYPE_UINT64: + case data_type_v: data_type.code = DLDataTypeCode::kDLUInt; data_type.bits = 64; break; - case ft::TYPE_INT8: - case ft::TYPE_BYTES: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 8; break; - case ft::TYPE_INT16: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 16; break; - case ft::TYPE_INT32: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 32; break; - case ft::TYPE_INT64: + case data_type_v: data_type.code = DLDataTypeCode::kDLInt; data_type.bits = 64; break; - case ft::TYPE_FP16: + case data_type_v: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 16; break; - case ft::TYPE_FP32: + case data_type_v: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 32; break; - case ft::TYPE_FP64: + case data_type_v: data_type.code = DLDataTypeCode::kDLFloat; data_type.bits = 64; break; - case ft::TYPE_BF16: + case data_type_v: data_type.code = DLDataTypeCode::kDLBfloat; data_type.bits = 16; break; default: break; } - ManagedTensor* ctx = new ManagedTensor(tensor); - DLTensor dl_tensor{const_cast((*ctx)->data), + + static_assert(sizeof(int64_t) == sizeof(tensor.shape(0))); + + Tensor* ctx = new Tensor(tensor); + DLTensor dl_tensor{const_cast(ctx->raw_data()), device, - (int32_t)((*ctx)->shape.size()), + (int32_t)(ctx->ndim()), data_type, - reinterpret_cast(const_cast((*ctx)->shape.data())), + (int64_t*)ctx->shape().data(), (int64_t*)(nullptr), 0}; return new DLManagedTensor{dl_tensor, ctx, [](DLManagedTensor* dlmt) { // - // auto& x = *(ManagedTensor*)dlmt->manager_ctx; - // std::stringstream ss; - // ss << "("; - // for (const auto& d : x->shape) { - // ss << d << ","; - // } - // ss << ")"; - // std::cerr << "turbomind tensor dtor " << ss.str() << " " << std::endl; - delete (ManagedTensor*)dlmt->manager_ctx; + delete (Tensor*)dlmt->manager_ctx; delete dlmt; }}; } -ft::MemoryType getMemoryType(DLDevice device) +ft::DeviceType getMemoryType(DLDevice device) { switch (device.device_type) { case DLDeviceType::kDLCUDAHost: - return ft::MemoryType::MEMORY_CPU_PINNED; + return ft::DeviceType::kCPUpinned; case DLDeviceType::kDLCUDA: - return ft::MemoryType::MEMORY_GPU; + return ft::DeviceType::kDEVICE; case DLDeviceType::kDLCPU: default: - return ft::MemoryType::MEMORY_CPU; + return ft::DeviceType::kCPU; } } ft::DataType getDataType(DLDataType data_type) { + using ft::data_type_v; switch (data_type.code) { case DLDataTypeCode::kDLUInt: switch (data_type.bits) { case 8: - return ft::TYPE_UINT8; + return data_type_v; case 16: - return ft::TYPE_UINT16; + return data_type_v; case 32: - return ft::TYPE_UINT32; + return data_type_v; case 64: - return ft::TYPE_UINT64; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLInt: switch (data_type.bits) { case 8: - return ft::TYPE_INT8; + return data_type_v; case 16: - return ft::TYPE_INT16; + return data_type_v; case 32: - return ft::TYPE_INT32; + return data_type_v; case 64: - return ft::TYPE_INT64; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLFloat: switch (data_type.bits) { case 16: - return ft::TYPE_FP16; + return data_type_v; case 32: - return ft::TYPE_FP32; + return data_type_v; case 64: - return ft::TYPE_FP64; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLBfloat: switch (data_type.bits) { case 16: - return ft::TYPE_BF16; + 
return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } break; case DLDataTypeCode::kDLBool: - return ft::TYPE_BOOL; + return data_type_v; default: - return ft::TYPE_INVALID; + return data_type_v; } } -std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) +std::shared_ptr DLManagedTensorToTritonTensor(DLManagedTensor* tensor) { auto& dl_tensor = tensor->dl_tensor; auto where = getMemoryType(dl_tensor.device); auto dtype = getDataType(dl_tensor.dtype); assert(dl_tensor.ndim > 0); - std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); - auto data = dl_tensor.data; - - auto ret = std::make_shared(); - ret->tensor = Tensor(where, dtype, std::move(shape), data); - ret->data_holder.reset((void*)nullptr, [tensor](void*) { - // std::cerr << "dlpack tensor dtor" << std::endl; - if (tensor->deleter) { - tensor->deleter(tensor); - } - }); - return ret; + std::vector shape(dl_tensor.shape, dl_tensor.shape + dl_tensor.ndim); + + std::shared_ptr ptr{dl_tensor.data, [tensor](void* p) { + if (tensor->deleter) { + tensor->deleter(tensor); + } + }}; + return std::make_shared(ptr, std::move(shape), dtype, where); } static void safe_memcpy(void* dst, const void* src, size_t size) @@ -352,80 +341,54 @@ PYBIND11_MODULE(_turbomind, m) .def("consume", [](ft::AtomicRequestState& s) { return s.exchange(nullptr); }); // data type - py::enum_(m, "DataType") - .value("TYPE_INVALID", ft::DataType::TYPE_INVALID) - .value("TYPE_BOOL", ft::DataType::TYPE_BOOL) - .value("TYPE_UINT8", ft::DataType::TYPE_UINT8) - .value("TYPE_UINT16", ft::DataType::TYPE_UINT16) - .value("TYPE_UINT32", ft::DataType::TYPE_UINT32) - .value("TYPE_UINT64", ft::DataType::TYPE_UINT64) - .value("TYPE_INT8", ft::DataType::TYPE_INT8) - .value("TYPE_INT16", ft::DataType::TYPE_INT16) - .value("TYPE_INT32", ft::DataType::TYPE_INT32) - .value("TYPE_INT64", ft::DataType::TYPE_INT64) - .value("TYPE_FP16", ft::DataType::TYPE_FP16) - .value("TYPE_FP32", ft::DataType::TYPE_FP32) - .value("TYPE_FP64", ft::DataType::TYPE_FP64) - .value("TYPE_BYTES", ft::DataType::TYPE_BYTES) - .value("TYPE_BF16", ft::DataType::TYPE_BF16); - - // memory type - py::enum_(m, "MemoryType") - .value("MEMORY_CPU", ft::MemoryType::MEMORY_CPU) - .value("MEMORY_CPU_PINNED", ft::MemoryType::MEMORY_CPU_PINNED) - .value("MEMORY_GPU", ft::MemoryType::MEMORY_GPU); + { + using namespace turbomind; + py::enum_(m, "DataType") + .value("TYPE_INVALID", kNull) + .value("TYPE_BOOL", kBool) + .value("TYPE_UINT8", kUint8) + .value("TYPE_UINT16", kUint16) + .value("TYPE_UINT32", kUint32) + .value("TYPE_UINT64", kUint64) + .value("TYPE_INT8", kInt8) + .value("TYPE_INT16", kInt16) + .value("TYPE_INT32", kInt32) + .value("TYPE_INT64", kInt64) + .value("TYPE_FP16", kFloat16) + .value("TYPE_FP32", kFloat32) + .value("TYPE_FP64", kFloat64) + .value("TYPE_BF16", kBfloat16); + + // memory type + py::enum_(m, "MemoryType") + .value("MEMORY_CPU", ft::DeviceType::kCPU) + .value("MEMORY_CPU_PINNED", ft::DeviceType::kCPUpinned) + .value("MEMORY_GPU", ft::DeviceType::kDEVICE); + } // tensor - py::class_>(m, "Tensor") - .def_property_readonly("where", [](const ManagedTensor& t) { return t->where; }) - .def_property_readonly("type", [](const ManagedTensor& t) { return t->type; }) - .def_property_readonly("shape", [](const ManagedTensor& t) { return t->shape; }) - .def_property_readonly("data", [](const ManagedTensor& t) { return t->data; }) - .def( - "view", - [](const ManagedTensor& self, ft::DataType new_type) { - auto x = self; - x->type = new_type; - return 
std::make_shared(std::move(x)); - }, - "new_type"_a) - .def( - "view", - [](const ManagedTensor& self, std::vector new_shape) { - auto x = self; - x->shape = new_shape; - return std::make_shared(std::move(x)); - }, - "new_shape"_a) + py::class_>(m, "Tensor") + .def_property_readonly("where", [](const Tensor& t) { return t.device().type; }) + .def_property_readonly("type", [](const Tensor& t) { return t.dtype(); }) + .def_property_readonly("shape", [](const Tensor& t) { return t.shape(); }) + .def_property_readonly("data", [](const Tensor& t) { return t.raw_data(); }) .def( "copy_from", - [](ManagedTensor& self, py::object obj) { + [](Tensor& self, py::object obj) { py::capsule cap = obj.attr("__dlpack__")(); DLManagedTensor* dlmt = static_cast(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName)); auto src = DLManagedTensorToTritonTensor(dlmt); // take ownership of capsule's payload cap.set_name("used_dltensor"); - switch (self->type) { - case ft::TYPE_FP16: - case ft::TYPE_FP32: - case ft::TYPE_INT32: - case ft::TYPE_BF16: { - auto num_element = std::accumulate( - (*src)->shape.begin(), (*src)->shape.end(), 1LL, std::multiplies()); - auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8; - ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]); - safe_memcpy(const_cast(self->data), (*src)->data, num_bytes); - break; - } - default: - ft::FT_CHECK(0); - } + + TM_CHECK_EQ(self.byte_size(), src->byte_size()); + safe_memcpy(self.raw_data(), src->raw_data(), self.byte_size()); }, "tensor"_a) .def( "__dlpack__", - [](ManagedTensor& self, long stream) { + [](Tensor& self, long stream) { DLManagedTensor* dlmt = TritonTensorToDLManagedTensor(self); return py::capsule(dlmt, kDlTensorCapsuleName, [](PyObject* obj) { DLManagedTensor* dlmt = @@ -441,8 +404,8 @@ PYBIND11_MODULE(_turbomind, m) }); }, "stream"_a = 0) - .def("__dlpack_device__", [](const ManagedTensor& self) { - auto device = getDLDevice(*self); + .def("__dlpack_device__", [](const Tensor& self) { + auto device = getDLDevice(self); return std::tuple(int(device.device_type), device.device_id); }); m.def( @@ -458,9 +421,9 @@ PYBIND11_MODULE(_turbomind, m) }, "dl_managed_tensor"_a); - // transformer model instance - using ft::ModelRequest; py::bind_map>(m, "TensorMap"); + + using ft::ModelRequest; py::class_(m, "ModelRequest") .def( "forward", @@ -507,87 +470,80 @@ PYBIND11_MODULE(_turbomind, m) "session_id"_a); // transformer model - using ft::AbstractTransformerModel; using ft::LlamaTritonModel; - py::class_>(m, "AbstractTransformerModel") + py::class_>(m, "AbstractTransformerModel") .def_static( "create_llama_model", [](std::string model_dir, std::string config, - std::string data_type) -> std::shared_ptr { + std::string weight_type) -> std::shared_ptr { auto gil_factory = [] { // // erase the type return std::static_pointer_cast(std::make_shared()); }; - auto no_gil_deleter = [](AbstractTransformerModel* ptr) { + auto no_gil_deleter = [](LlamaTritonModel* ptr) { pybind11::gil_scoped_release release; delete ptr; }; - if (data_type == "half" || data_type == "fp16" || data_type == "float16" || data_type == "int4") { - std::shared_ptr> model( - new LlamaTritonModel(model_dir, config, gil_factory), no_gil_deleter); - return model; + turbomind::DataType data_type{}; + + if (weight_type == "half" || weight_type == "fp16" || weight_type == "float16" + || weight_type == "int4") { + data_type = turbomind::kFloat16; } - else if (data_type == "bf16" || data_type == "bfloat16") { + else if (weight_type == "bf16" || weight_type 
== "bfloat16") { #ifdef ENABLE_BF16 - std::shared_ptr> model( - new LlamaTritonModel<__nv_bfloat16>(model_dir, config, gil_factory), no_gil_deleter); - return model; + data_type = turbomind::kBfloat16; #else throw std::runtime_error("Error: turbomind has not been built with bf16 support."); #endif } else { #ifdef ENABLE_FP32 - auto model = std::make_shared>(model_dir, config, gil_factory); - return model; + data_type = turbomind::kF32; #else throw std::runtime_error("Error: turbomind has not been built with fp32 support."); #endif } + + std::shared_ptr model(new LlamaTritonModel(data_type, model_dir, config, gil_factory), + no_gil_deleter); + return model; }, "model_dir"_a, - "config"_a = "", - "data_type"_a = "half") + "config"_a = "", + "weight_type"_a = "half") .def( "create_model_instance", - [](AbstractTransformerModel* model, int deviceId) { return model->createModelInstance(deviceId); }, + [](LlamaTritonModel* model, int deviceId) { return model->createModelInstance(deviceId); }, py::call_guard(), "device_id"_a) .def("create_shared_weights", - &AbstractTransformerModel::createSharedWeights, + &LlamaTritonModel::createSharedWeights, py::call_guard(), "device_id"_a, "rank"_a) .def( "get_params", - [](AbstractTransformerModel* model, int deviceId, int rank) { - auto output = model->getParams(deviceId, rank); - TensorMap ret; - for (const auto& [k, v] : output) { - // export reference to weight data only (no ownership) - ret.emplace(k, ManagedTensor{v}); - } - return ret; - }, + [](LlamaTritonModel* model, int deviceId, int rank) { return model->getParams(deviceId, rank); }, py::call_guard(), "device_id"_a, "rank"_a) .def( "process_weight", - [](AbstractTransformerModel* model, int deviceId, int rank) { model->processWeights(deviceId, rank); }, + [](LlamaTritonModel* model, int deviceId, int rank) { model->processWeights(deviceId, rank); }, py::call_guard(), "device_id"_a, "rank"_a) .def( "create_engine", - [](AbstractTransformerModel* model, int deviceId, int rank) { model->createEngine(deviceId, rank); }, + [](LlamaTritonModel* model, int deviceId, int rank) { model->createEngine(deviceId, rank); }, py::call_guard(), "device_id"_a, "rank"_a) - .def("__str__", &AbstractTransformerModel::toString) - .def("__repr__", &AbstractTransformerModel::toString) - .def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize) - .def("get_pipeline_para_size", &AbstractTransformerModel::getPipelineParaSize); + .def("__str__", &LlamaTritonModel::toString) + .def("__repr__", &LlamaTritonModel::toString) + .def("get_tensor_para_size", &LlamaTritonModel::getTensorParaSize) + .def("get_pipeline_para_size", &LlamaTritonModel::getPipelineParaSize); } diff --git a/src/turbomind/triton_backend/CMakeLists.txt b/src/turbomind/triton_backend/CMakeLists.txt index e152073204..08c8e4e884 100644 --- a/src/turbomind/triton_backend/CMakeLists.txt +++ b/src/turbomind/triton_backend/CMakeLists.txt @@ -1,35 +1,2 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
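The `create_llama_model` binding above wraps the model in a `shared_ptr` whose deleter drops the GIL before running the C++ destructor, so engine teardown does not block other Python threads. The same pattern in isolation, as a sketch rather than the project's code (only `pybind11::gil_scoped_release` is a real API here):

    // Sketch: shared_ptr with a GIL-releasing deleter, mirroring `no_gil_deleter` above.
    #include <memory>
    #include <pybind11/pybind11.h>

    template <class T>
    std::shared_ptr<T> make_shared_no_gil(T* raw)
    {
        return std::shared_ptr<T>(raw, [](T* p) {
            pybind11::gil_scoped_release release;  // destruction may wait on CUDA/NCCL work
            delete p;
        });
    }

Construction still happens with the GIL held; only destruction runs GIL-free, matching the lambda in the binding above.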
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -cmake_minimum_required (VERSION 3.18) - -project(tritonturbomindbackend LANGUAGES C CXX) - -add_library(TransformerTritonBackend STATIC transformer_triton_backend.cpp) -set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) -install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR}) add_subdirectory(llama) diff --git a/src/turbomind/triton_backend/llama/CMakeLists.txt b/src/turbomind/triton_backend/llama/CMakeLists.txt index 7f745d64b9..756f5ac67d 100644 --- a/src/turbomind/triton_backend/llama/CMakeLists.txt +++ b/src/turbomind/triton_backend/llama/CMakeLists.txt @@ -26,11 +26,10 @@ add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files}) set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(LlamaTritonBackend PUBLIC - TransformerTritonBackend Llama device_comm host_comm - tensor + core memory_utils CUDA::cublasLt yaml-cpp::yaml-cpp) diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc index af48fbba3f..a1b33a8316 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc @@ -20,19 +20,23 @@ #include #include +#include #include + #include #include "src/turbomind/comm/device_comm.h" #include "src/turbomind/comm/host_comm.h" +#include "src/turbomind/core/allocator.h" +#include "src/turbomind/core/check.h" +#include "src/turbomind/core/tensor.h" #include "src/turbomind/engine/gateway.h" #include "src/turbomind/engine/model_request.h" #include "src/turbomind/models/llama/LlamaDenseWeight.h" #include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/utils/allocator.h" #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" @@ -141,8 +145,7 @@ std::map> getLoraPattern(std::string patte return res; } -template -void LlamaTritonModel::handleMissingParams() +void LlamaTritonModel::handleMissingParams() { if (model_param_.kv_head_num == 0) { model_param_.kv_head_num = model_param_.head_num; @@ -173,12 +176,6 @@ void LlamaTritonModel::handleMissingParams() TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)engine_param_.session_len); } - if (!engine_param_.max_prefill_token_num) { - engine_param_.max_prefill_token_num = 8192; - TM_LOG_WARNING("[LlamaTritonModel] `max_prefill_token_num` is not set, 
default to %d.", - (int)engine_param_.max_prefill_token_num); - } - if (!engine_param_.max_context_token_num) { engine_param_.max_context_token_num = engine_param_.session_len; TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.", @@ -219,8 +216,7 @@ void LlamaTritonModel::handleMissingParams() } } -template -LlamaTritonModel::~LlamaTritonModel() +LlamaTritonModel::~LlamaTritonModel() { FT_CHECK(weights_.size() == engines_.size()); @@ -235,11 +231,17 @@ LlamaTritonModel::~LlamaTritonModel() } } -template -LlamaTritonModel::LlamaTritonModel(std::string model_dir, - std::string config, - std::function()> ffi_ctx_factory): - model_param_{}, attn_param_{}, moe_param_{}, lora_param_{}, engine_param_{}, weights_(getDeviceCount()) +LlamaTritonModel::LlamaTritonModel(DataType dtype, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory): + dtype_{dtype}, + model_param_{}, + attn_param_{}, + moe_param_{}, + lora_param_{}, + engine_param_{}, + weights_(getDeviceCount()) { FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options"); @@ -298,8 +300,10 @@ LlamaTritonModel::LlamaTritonModel(std::string mod // rotary embedding parameters parse_rope_param(attention_reader["rope_param"], attn_param_.rope); - engine_param_.max_batch_size = engine_reader["max_batch_size"].as(0); - engine_param_.max_prefill_token_num = engine_reader["max_prefill_token_num"].as(0); + engine_param_.max_batch_size = engine_reader["max_batch_size"].as(0); + auto max_forward_token_num = engine_reader["max_prefill_token_num"].as(0); + max_forward_token_num += engine_param_.max_batch_size; + engine_param_.max_context_token_num = engine_reader["max_context_token_num"].as(0); engine_param_.session_len = model_reader["session_len"].as(0); @@ -319,6 +323,11 @@ LlamaTritonModel::LlamaTritonModel(std::string mod engine_param_.mlp_tp_size = engine_reader["mlp_tp_size"].as(); engine_param_.mlp_tp_rank = 0; + { + auto tp = engine_param_.attn_tp_size; + engine_param_.max_forward_token_num = ((size_t)max_forward_token_num + tp - 1) / tp * tp; + } + comm_size_ = engine_param_.attn_dp_size * engine_param_.attn_tp_size; FT_CHECK(engine_param_.mlp_tp_size == comm_size_); @@ -355,19 +364,19 @@ LlamaTritonModel::LlamaTritonModel(std::string mod const std::string weight_type_str = model_reader["weight_type"].as(); if (weight_type_str == "fp16" || weight_type_str == "float16") { - model_param_.weight_type = WeightType::kFP16; + model_param_.weight_type = kFloat16; } else if (weight_type_str == "bf16" || weight_type_str == "bfloat16") { - model_param_.weight_type = WeightType::kBF16; + model_param_.weight_type = kBfloat16; } else if (weight_type_str == "fp32") { - model_param_.weight_type = WeightType::kFP32; + model_param_.weight_type = kFloat32; } else if (weight_type_str == "int8") { - model_param_.weight_type = WeightType::kINT8; + model_param_.weight_type = kUint8; } else if (weight_type_str == "int4") { - model_param_.weight_type = WeightType::kINT4; + model_param_.weight_type = kUint4; } else { std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n"; @@ -402,51 +411,33 @@ LlamaTritonModel::LlamaTritonModel(std::string mod TM_LOG_INFO("%s", toString().c_str()); } -template -std::unique_ptr LlamaTritonModel::createModelInstance(int device_id) +std::unique_ptr LlamaTritonModel::createModelInstance(int device_id) { check_cuda_error(cudaSetDevice(device_id)); FT_CHECK(engines_[device_id] != nullptr); - return std::make_unique(gateway_.get(), - 
getTensorType(), - engine_param_.session_len, - model_param_.vocab_size, - model_param_.hidden_units); + return std::make_unique( + gateway_.get(), dtype_, engine_param_.session_len, model_param_.vocab_size, model_param_.hidden_units); } -template -void LlamaTritonModel::createSharedWeights(int device_id, int rank) noexcept +void LlamaTritonModel::createSharedWeights(int device_id, int rank) { check_cuda_error(cudaSetDevice(device_id)); - weights_[rank] = std::make_shared>(model_param_, engine_params_.at(rank), lora_param_, moe_param_); + weights_[rank] = + std::make_shared(dtype_, model_param_, engine_params_.at(rank), lora_param_, moe_param_); // model inited with model_dir - if (model_dir_ != "") { - weights_[device_id]->loadModel(model_dir_); - } + // if (model_dir_ != "") { + // weights_[device_id]->loadModel(model_dir_); + // } } -template -std::unordered_map LlamaTritonModel::getParams(int device_id, int rank) noexcept +TensorMap LlamaTritonModel::getParams(int device_id, int rank) { - check_cuda_error(cudaSetDevice(device_id)); - - // shared_weight should be created before getParams - FT_CHECK(weights_[rank] != nullptr); - - TensorMap output = weights_[rank]->getParams(); - - std::unordered_map result; - for (auto [name, tensor] : output) { - result.insert({{name, Tensor{tensor.where, tensor.type, tensor.shape, tensor.data}}}); - } - - return result; + return TM_CHECK_NOTNULL(weights_[rank])->get_parameters(); } -template -void LlamaTritonModel::processWeights(int device_id, int rank) noexcept +void LlamaTritonModel::processWeights(int device_id, int rank) { check_cuda_error(cudaSetDevice(device_id)); FT_CHECK(weights_[device_id] != nullptr); @@ -458,8 +449,7 @@ void LlamaTritonModel::processWeights(int device_id, int rank) noexcept sync_check_cuda_error(); } -template -Communicators LlamaTritonModel::createCommSplits(int rank) +Communicators LlamaTritonModel::createCommSplits(int rank) { Communicators comm{}; @@ -483,12 +473,13 @@ Communicators LlamaTritonModel::createCommSplits(int rank) return comm; } -template -void LlamaTritonModel::createEngine(int device_id, int rank) +void LlamaTritonModel::createEngine(int device_id, int rank) { check_cuda_error(cudaSetDevice(device_id)); - auto ctx = std::make_unique>(device_id); + auto ctx = std::make_unique(device_id); + + core::ContextGuard guard{ctx->core_stream, ctx->allocator, Allocator{kCPUpinned}}; ctx->comm = createCommSplits(rank); @@ -499,25 +490,27 @@ void LlamaTritonModel::createEngine(int device_id, int rank) h_comm->Sync(); - auto model = std::make_unique>(model_param_, // - engine_param, - attn_param_, - moe_param_, - lora_param_, - *ctx, - engine_param_.max_batch_size, - weights_[device_id]); + auto model = std::make_unique(dtype_, + model_param_, // + engine_param, + attn_param_, + moe_param_, + lora_param_, + *ctx, + engine_param_.max_batch_size, + weights_[device_id]); h_comm->Sync(); try { const int dp_rank = engine_param.outer_dp_rank * engine_param.attn_dp_size + engine_param.attn_dp_rank; - engines_[device_id] = std::make_unique>(engine_param_, // - std::move(model), - std::move(ctx), - gateway_, - device_id, - dp_rank); + engines_[device_id] = std::make_unique(dtype_, + engine_param_, // + std::move(model), + std::move(ctx), + gateway_, + device_id, + dp_rank); } catch (const std::exception& e) { TM_LOG_ERROR("[Engine][Init] %s", e.what()); @@ -544,8 +537,7 @@ void LlamaTritonModel::createEngine(int device_id, int rank) engine.Start(); } -template -std::string LlamaTritonModel::toString() +std::string 
LlamaTritonModel::toString() { std::stringstream ss; ss << "Model: " // @@ -556,7 +548,6 @@ std::string LlamaTritonModel::toString() << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size << "\nattn_bias: " << model_param_.attn_bias << "\nqk_norm: " << model_param_.qk_norm << "\nmax_batch_size: " << engine_param_.max_batch_size - << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num << "\nmax_context_token_num: " << engine_param_.max_context_token_num << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter << "\nmax_prefill_iters: " << engine_param_.max_prefill_iters << "\nsession_len: " << engine_param_.session_len @@ -574,24 +565,14 @@ std::string LlamaTritonModel::toString() return ss.str(); } -template -int LlamaTritonModel::getTensorParaSize() +int LlamaTritonModel::getTensorParaSize() { return engine_param_.attn_tp_size; } -template -int LlamaTritonModel::getPipelineParaSize() +int LlamaTritonModel::getPipelineParaSize() { return 1; } -#ifdef ENABLE_FP32 -template struct LlamaTritonModel; -#endif -template struct LlamaTritonModel; -#ifdef ENABLE_BF16 -template struct LlamaTritonModel<__nv_bfloat16>; -#endif - } // namespace turbomind diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.h b/src/turbomind/triton_backend/llama/LlamaTritonModel.h index 12fc3abffc..f58c982fd8 100644 --- a/src/turbomind/triton_backend/llama/LlamaTritonModel.h +++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.h @@ -21,38 +21,44 @@ #pragma once #include +#include +#include #include "src/turbomind/comm/device_comm.h" + #include "src/turbomind/engine/gateway.h" +#include "src/turbomind/engine/model_request.h" + #include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaWeight.h" #include "src/turbomind/models/llama/context.h" #include "src/turbomind/models/llama/llama_params.h" -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" - namespace turbomind { -template -class LlamaTritonModel: public AbstractTransformerModel { +class LlamaTritonModel { public: - LlamaTritonModel(std::string model_dir, std::string config, std::function()> ffi_ctx_factory); + LlamaTritonModel(DataType dtype, + std::string model_dir, + std::string config, + std::function()> ffi_ctx_factory); + + ~LlamaTritonModel(); - ~LlamaTritonModel() override; + std::unique_ptr createModelInstance(int deviceId); - std::unique_ptr createModelInstance(int deviceId) override; + void createSharedWeights(int deviceId, int rank); - void createSharedWeights(int deviceId, int rank) noexcept override; + TensorMap getParams(int deviceId, int rank); - std::unordered_map getParams(int deviceId, int rank) noexcept override; + void processWeights(int deviceId, int rank); - void processWeights(int deviceId, int rank) noexcept override; + void createEngine(int device_id, int rank); - void createEngine(int device_id, int rank) override; + std::string toString(); - std::string toString() override; - int getTensorParaSize() override; - int getPipelineParaSize() override; + int getTensorParaSize(); + int getPipelineParaSize(); private: void handleMissingParams(); @@ -60,6 +66,7 @@ class LlamaTritonModel: public AbstractTransformerModel { Communicators createCommSplits(int rank); private: + DataType dtype_; ModelParam model_param_; AttentionParam attn_param_; MoeParam moe_param_; @@ -76,8 +83,8 @@ class LlamaTritonModel: public AbstractTransformerModel { std::shared_ptr gateway_; // Weights & engine instances for the ranks - 
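With the class templates removed, the element type is now carried at runtime: the constructor hunk above maps the `weight_type` string onto a `DataType` tag that is threaded into the weights and engine. The helper below condenses that mapping for reference; the enum and function are illustrative stand-ins, not the project's `turbomind::DataType`, and the packing comments are interpretation.

    // Condensed from the weight_type parsing above (illustrative helper only).
    #include <stdexcept>
    #include <string>

    enum class WeightDType { kFloat16, kBfloat16, kFloat32, kUint8, kUint4 };

    inline WeightDType parse_weight_type(const std::string& s)
    {
        if (s == "fp16" || s == "float16") return WeightDType::kFloat16;
        if (s == "bf16" || s == "bfloat16") return WeightDType::kBfloat16;
        if (s == "fp32")                    return WeightDType::kFloat32;
        if (s == "int8")                    return WeightDType::kUint8;  // int8 weights kept as uint8 storage
        if (s == "int4")                    return WeightDType::kUint4;  // 4-bit weights in packed storage
        throw std::invalid_argument("unsupported weight type: " + s);
    }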
std::vector>> weights_; - std::vector>> engines_; + std::vector> weights_; + std::vector> engines_; bool is_fp16_; diff --git a/src/turbomind/triton_backend/transformer_triton_backend.cpp b/src/turbomind/triton_backend/transformer_triton_backend.cpp deleted file mode 100644 index 5268ad723c..0000000000 --- a/src/turbomind/triton_backend/transformer_triton_backend.cpp +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp - -#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" - -namespace turbomind { - -} // namespace turbomind diff --git a/src/turbomind/triton_backend/transformer_triton_backend.hpp b/src/turbomind/triton_backend/transformer_triton_backend.hpp deleted file mode 100644 index 6ebcdc9e11..0000000000 --- a/src/turbomind/triton_backend/transformer_triton_backend.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) OpenMMLab. All rights reserved. - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -// Modified from -// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp - -#pragma once - -#include "src/turbomind/comm/device_comm.h" -#include -#include -#include - -#ifdef __linux__ -#include -#endif - -#include "src/turbomind/utils/Tensor.h" - -#include "src/turbomind/engine/model_request.h" - -namespace turbomind { - -using triton_stream_cb_t = std::function>, void*)>; - -struct AbstractTransformerModel; -struct AbstractTransformerModelInstance; - -struct AbstractTransformerModelInstance { - virtual ~AbstractTransformerModelInstance() = default; - - virtual std::shared_ptr> - forward(std::shared_ptr> input_tensors) = 0; - - void registerCallback(triton_stream_cb_t cb, void* ctx) - { - stream_cb_ = cb; - stream_ctx_ = ctx; - } - - void unRegisterCallback() - { - stream_cb_ = nullptr; - stream_ctx_ = nullptr; - } - - triton_stream_cb_t stream_cb_ = nullptr; - void* stream_ctx_ = nullptr; -}; - -struct AbstractTransformerModel { - - virtual ~AbstractTransformerModel() = default; - - virtual std::unique_ptr createModelInstance(int deviceId) = 0; - - virtual void createSharedWeights(int deviceId, int rank) = 0; - - virtual std::unordered_map getParams(int deviceId, int rank) = 0; - - virtual void processWeights(int deviceId, int rank) = 0; - - virtual void createEngine(int device_id, int rank) = 0; - - virtual std::string toString() = 0; - - virtual int getTensorParaSize() = 0; - virtual int getPipelineParaSize() = 0; -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/CMakeLists.txt b/src/turbomind/utils/CMakeLists.txt index fe6584543a..f9aa832696 100644 --- a/src/turbomind/utils/CMakeLists.txt +++ b/src/turbomind/utils/CMakeLists.txt @@ -16,31 +16,16 @@ cmake_minimum_required(VERSION 3.8) find_package(CUDAToolkit REQUIRED) -add_subdirectory(gemm_test) - add_library(cuda_utils STATIC cuda_utils.cc) set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cuda_utils PUBLIC CUDA::cudart) +target_link_libraries(cuda_utils PUBLIC CUDA::cudart CUDA::cuda_driver) add_library(logger STATIC logger.cc) set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(logger PUBLIC CUDA::cudart) -add_library(cublasAlgoMap STATIC cublasAlgoMap.cc) -set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cublasAlgoMap PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cuda_utils logger) - -add_library(cublasMMWrapper STATIC cublasMMWrapper.cc) -set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cublasMMWrapper PUBLIC CUDA::cublas CUDA::cudart CUDA::curand cublasAlgoMap cuda_utils logger) -if (SPARSITY_SUPPORT) -target_link_libraries(cublasMMWrapper PUBLIC CUDA::cusparse -lcusparseLt) -endif() - add_library(nvtx_utils STATIC nvtx_utils.cc) set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) @@ -53,38 +38,7 @@ endif() add_library(memory_utils STATIC memory_utils.cu) set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET memory_utils PROPERTY 
CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(memory_utils PUBLIC cuda_utils logger tensor) - -# add_library(mpi_utils STATIC mpi_utils.cc) -# set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON) -# set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -# if (BUILD_MULTI_GPU) -# target_link_libraries(mpi_utils PUBLIC ${MPI_CXX_LIBRARIES} logger) -# endif() - -add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc) -set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(cublasINT8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand cublasAlgoMap cublasMMWrapper cuda_utils logger) - -add_library(gemm STATIC gemm.cc) -set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(gemm PUBLIC - CUDA::cublas CUDA::cublasLt CUDA::cudart CUDA::curand - cublasAlgoMap memory_utils cuda_utils logger) -if (SPARSITY_SUPPORT) - target_link_libraries(gemm PUBLIC CUDA::cusparse -lcusparseLt) -endif() - -add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu) -set_property(TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(tensor STATIC Tensor.cc) -set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(tensor PUBLIC cuda_utils logger) +target_link_libraries(memory_utils PUBLIC cuda_utils logger) add_library(anomaly_handler STATIC anomaly_handler.cu) set_property(TARGET anomaly_handler PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/src/turbomind/utils/Tensor.cc b/src/turbomind/utils/Tensor.cc deleted file mode 100644 index 7a2cedac13..0000000000 --- a/src/turbomind/utils/Tensor.cc +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/string_utils.h" - -#include "stdlib.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace fs = std::filesystem; -namespace turbomind { - -Tensor::Tensor(): - // a none tensor. 
- where(MEMORY_CPU), - type(TYPE_INVALID), - shape({}), - data(nullptr), - offsets({}) // only a record to record offset -{ -} - -Tensor::Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data): - where(_where), type(_type), shape(_shape), data(const_cast(_data)) -{ -} - -Tensor::Tensor(const MemoryType _where, - const DataType _type, - const std::vector _shape, - const void* _data, - const std::vector _offset): - where(_where), type(_type), shape(_shape), data(const_cast(_data)), offsets(_offset) -{ -} - -void Tensor::parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data) -{ - const char magic[] = "\x93" - "NUMPY"; - char magic_test[sizeof(magic)] = "\0"; - - size_t n_elems = fread((void*)magic_test, sizeof(char), sizeof(magic) - 1, f_ptr); - if (n_elems != sizeof(magic) - 1 || std::string(magic) != std::string(magic_test)) { - throw std::runtime_error("Could read magic token in NPY file"); - } - - uint8_t npy_major = 0; - uint8_t npy_minor = 0; - n_elems = fread((void*)&npy_major, sizeof(uint8_t), 1, f_ptr); - n_elems += fread((void*)&npy_minor, sizeof(uint8_t), 1, f_ptr); - - if (npy_major == 1) { - uint16_t header_len_u16 = 0; - n_elems = fread((void*)&header_len_u16, sizeof(uint16_t), 1, f_ptr); - header_len = header_len_u16; - } - else if (npy_major == 2) { - uint32_t header_len_u32 = 0; - n_elems = fread((void*)&header_len_u32, sizeof(uint32_t), 1, f_ptr); - header_len = header_len_u32; - } - else { - throw std::runtime_error("Unsupported npy version: " + std::to_string(npy_major)); - } - - start_data = 8 + 2 * npy_major + header_len; -} - -int Tensor::parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector& shape) -{ - char* header_c = (char*)malloc(header_len * sizeof(char)); - size_t n_elems = fread((void*)header_c, sizeof(char), header_len, f_ptr); - if (n_elems != header_len) { - free(header_c); - return -1; - } - std::string header(header_c, header_len); - free(header_c); - - size_t start, end; - start = header.find("'descr'") + 7; - start = header.find("'", start); - end = header.find("'", start + 1); - type = typeFromNumpyDesc(header.substr(start + 1, end - start - 1)); - - start = header.find("'fortran_order'") + 15; - start = header.find(":", start); - end = header.find(",", start + 1); - if (header.substr(start + 1, end - start - 1).find("False") == std::string::npos) { - throw std::runtime_error("Unsupported value for fortran_order while reading npy file"); - } - - start = header.find("'shape'") + 7; - start = header.find("(", start); - end = header.find(")", start + 1); - - std::istringstream shape_stream(header.substr(start + 1, end - start - 1)); - std::string token; - - shape.clear(); - while (std::getline(shape_stream, token, ',')) { - if (token.find_first_not_of(' ') == std::string::npos) { - break; - } - shape.push_back(std::stoul(token)); - } - - return 0; -} - -Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where) -{ - DataType type; - std::vector shape; - - FILE* f_ptr = fopen(npy_file.c_str(), "rb"); - if (f_ptr == nullptr) { - throw std::runtime_error("Could not open file " + npy_file); - } - uint32_t header_len, start_data; - parseNpyIntro(f_ptr, header_len, start_data); - parseNpyHeader(f_ptr, header_len, type, shape); - - const size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - void* data_cpu = malloc(size * Tensor::getTypeSize(type)); - void* data = data_cpu; - - size_t n_elems = fread(data_cpu, Tensor::getTypeSize(type), 
size, f_ptr); - FT_CHECK_WITH_INFO(n_elems == size, "reading tensor failed"); - if (where == MEMORY_GPU) { - cudaMalloc(&data, size * Tensor::getTypeSize(type)); - cudaMemcpy(data, data_cpu, size * Tensor::getTypeSize(type), cudaMemcpyHostToDevice); - free(data_cpu); - } - - fclose(f_ptr); - return Tensor(where, type, shape, data); -} - -size_t Tensor::size() const -{ - if (data == nullptr || shape.size() == 0) { - return 0; - } - return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies()); -} - -size_t Tensor::sizeBytes() const -{ - return size() * Tensor::getTypeSize(type); -} - -std::string Tensor::whereToString() const -{ - static const std::unordered_map mem_to_string{ - {MEMORY_CPU, "CPU"}, {MEMORY_CPU_PINNED, "CPU_PINNED"}, {MEMORY_GPU, "GPU"}}; - return mem_to_string.at(where); -} - -std::string Tensor::toString() const -{ - std::string memtype_str = whereToString(); - - static const std::unordered_map type_to_string{ - {TYPE_BOOL, "BOOL"}, - {TYPE_UINT8, "UINT8"}, - {TYPE_UINT16, "UINT16"}, - {TYPE_UINT32, "UINT32"}, - {TYPE_UINT64, "UINT64"}, - {TYPE_INT8, "INT8"}, - {TYPE_INT16, "INT16"}, - {TYPE_INT32, "INT32"}, - {TYPE_INT64, "INT64"}, - {TYPE_BF16, "BF16"}, - {TYPE_FP16, "FP16"}, - {TYPE_FP32, "FP32"}, - {TYPE_FP64, "FP64"}, - {TYPE_BYTES, "BYTES"}, - {TYPE_INVALID, "INVALID"}, - {TYPE_FP8_E4M3, "E4M3"}, - {TYPE_VOID, "VOID"}, - }; - return fmtstr("Tensor[where=%s, type=%s, shape=%s, data=%p]", - memtype_str.c_str(), - type_to_string.at(type).c_str(), - vec2str(shape).c_str(), - data); -} - -DataType Tensor::typeFromNumpyDesc(std::string type) -{ - static const std::unordered_map type_map{{"?", TYPE_BOOL}, - {"b", TYPE_BYTES}, - {"u1", TYPE_UINT8}, - {"u2", TYPE_UINT16}, - {"u4", TYPE_UINT32}, - {"u8", TYPE_UINT64}, - {"i1", TYPE_INT8}, - {"i2", TYPE_INT16}, - {"i4", TYPE_INT32}, - {"i8", TYPE_INT64}, - {"f2", TYPE_FP16}, - {"f4", TYPE_FP32}, - {"f8", TYPE_FP64}}; - return type_map.at(type); -} - -size_t Tensor::getTypeSize(DataType type) -{ - static const std::unordered_map type_map{{TYPE_BOOL, sizeof(bool)}, - {TYPE_BYTES, sizeof(char)}, - {TYPE_UINT8, sizeof(uint8_t)}, - {TYPE_UINT16, sizeof(uint16_t)}, - {TYPE_UINT32, sizeof(uint32_t)}, - {TYPE_UINT64, sizeof(uint64_t)}, - {TYPE_INT8, sizeof(int8_t)}, - {TYPE_INT16, sizeof(int16_t)}, - {TYPE_INT32, sizeof(int32_t)}, - {TYPE_INT64, sizeof(int64_t)}, -#ifdef ENABLE_BF16 - {TYPE_BF16, sizeof(__nv_bfloat16)}, -#endif -#ifdef ENABLE_FP8 - {TYPE_FP8_E4M3, sizeof(__nv_fp8_e4m3)}, -#endif - {TYPE_FP16, sizeof(half)}, - {TYPE_FP32, sizeof(float)}, - {TYPE_FP64, sizeof(double)}}; - return type_map.at(type); -} - -std::string Tensor::getNumpyTypeDesc(DataType type) const -{ - static const std::unordered_map type_map{{TYPE_INVALID, "x"}, - {TYPE_BOOL, "?"}, - {TYPE_BYTES, "b"}, - {TYPE_UINT8, "u1"}, - {TYPE_UINT16, "u2"}, - {TYPE_UINT32, "u4"}, - {TYPE_UINT64, "u8"}, - {TYPE_INT8, "i1"}, - {TYPE_INT16, "i2"}, - {TYPE_INT32, "i4"}, - {TYPE_INT64, "i8"}, - {TYPE_FP16, "f2"}, - {TYPE_FP32, "f4"}, - {TYPE_FP64, "f8"}}; - - if (type == TYPE_BF16) { - TM_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't " - "support bfloat16 as of now, it will be properly extended if numpy supports. " - "Please refer for the discussions https://github.com/numpy/numpy/issues/19808."); - } - - return type_map.count(type) > 0 ? 
type_map.at(type) : "x"; -} - -void Tensor::saveNpy(const std::string& filename) const -{ - // Save tensor to NPY 1.0 format (see https://numpy.org/neps/nep-0001-npy-format.html) - void* cpu_data = (void*)data; - bool is_data_temp = false; - size_t tensor_size = size(); - if (where == MemoryType::MEMORY_GPU) { - cpu_data = malloc(tensor_size * Tensor::getTypeSize(type)); - is_data_temp = true; - cudaDeviceSynchronize(); - cudaMemcpy(cpu_data, data, tensor_size * Tensor::getTypeSize(type), cudaMemcpyDeviceToHost); - } - - const char magic[] = "\x93" - "NUMPY"; - const uint8_t npy_major = 1; - const uint8_t npy_minor = 0; - - std::stringstream header_stream; - header_stream << "{'descr': '" << getNumpyTypeDesc(type) << "', 'fortran_order': False, 'shape': ("; - for (size_t i = 0; i < shape.size(); ++i) { - header_stream << shape[i]; - if (i + 1 < shape.size() || shape.size() == 1) { - header_stream << ", "; - } - } - header_stream << ")}"; - int base_length = 6 + 4 + header_stream.str().size(); - int pad_length = 16 * ((base_length + 1 + 15) / 16); // Take ceiling of base_length + 1 (for '\n' ending) - for (int i = 0; i < pad_length - base_length; ++i) { - header_stream << ((i == pad_length - base_length - 1) ? "\n" : "\x20"); - } - std::string header = header_stream.str(); - const uint16_t header_len = header.size(); - - FILE* f_ptr = fopen(filename.c_str(), "wb"); - FT_CHECK_WITH_INFO(f_ptr != nullptr, fmtstr("Unable to open %s for writing.\n", filename.c_str())); - - fwrite(magic, sizeof(char), sizeof(magic) - 1, f_ptr); - fwrite(&npy_major, sizeof(uint8_t), 1, f_ptr); - fwrite(&npy_minor, sizeof(uint8_t), 1, f_ptr); - fwrite(&header_len, sizeof(uint16_t), 1, f_ptr); - fwrite(header.c_str(), sizeof(char), header_len, f_ptr); - fwrite(cpu_data, Tensor::getTypeSize(type), tensor_size, f_ptr); - - fclose(f_ptr); - - if (is_data_temp) { - free(cpu_data); - } -} - -Tensor Tensor::slice(std::vector shape, size_t offset) const -{ - if (this->data != nullptr) { - size_t n_elts = this->size(); - size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - FT_CHECK_WITH_INFO( - n_sliced_elts + offset <= n_elts, - fmtstr("The number (%ld) of elements of sliced tensor exceeds that (%ld) of the original tensor", - n_sliced_elts + offset, - n_elts)); - } - return Tensor(this->where, this->type, shape, this->getPtrWithOffset(offset)); -} - -TensorMap::TensorMap(const std::unordered_map& tensor_map) -{ - for (auto& kv : tensor_map) { - insert(kv.first, kv.second); - } -} - -TensorMap::TensorMap(const std::vector& tensor_map) -{ - for (size_t i = 0; i < tensor_map.size(); i++) { - insert(std::to_string(i), tensor_map[i]); - } -} - -TensorMap::TensorMap(std::initializer_list> tensor_map) -{ - for (auto& pair : tensor_map) { - insert(pair.first, pair.second); - } -} - -TensorMap::~TensorMap() -{ - tensor_map_.clear(); -} - -std::vector TensorMap::keys() const -{ - std::vector key_names; - for (auto& kv : tensor_map_) { - key_names.push_back(kv.first); - } - return key_names; -} - -std::string TensorMap::toString() -{ - std::stringstream ss; - ss << "{"; - std::vector key_names = keys(); - for (size_t i = 0; i < tensor_map_.size(); ++i) { - ss << key_names[i] << ": " << at(key_names[i]).toString(); - if (i < tensor_map_.size() - 1) { - ss << ", "; - } - } - ss << "}"; - return ss.str(); -} - -TensorMap TensorMap::fromNpyFolder(const std::string& base_folder) -{ - TensorMap ret_tensor; - for (auto const& entry : fs::directory_iterator{base_folder}) { - std::string filename = 
entry.path().stem().string(); - size_t len = filename.length(); - if (len < 4 || filename.compare(len - 4, 4, ".npy")) { - continue; - } - - size_t pos = filename.find('-'); - FT_CHECK_WITH_INFO(pos != std::string::npos, fmtstr("Invalid filename: %s\n", filename.c_str())); - - MemoryType where; - if (filename.compare(0, pos, "GPU") == 0) { - where = MEMORY_GPU; - } - else if (filename.compare(0, pos, "CPU") == 0) { - where = MEMORY_CPU; - } - else if (filename.compare(0, pos, "CPU_PINNED") == 0) { - where = MEMORY_CPU_PINNED; - } - else { - FT_CHECK_WITH_INFO(false, fmtstr("Invalid filename: %s\n", filename.c_str())); - } - std::string key = filename.substr(pos + 1, len - pos - 5); - - ret_tensor.tensor_map_.insert({key, Tensor::loadNpy(base_folder + "/" + filename, where)}); - } - return ret_tensor; -} - -void TensorMap::saveNpy(const std::string& base_folder) -{ - bool ret = fs::exists(base_folder) | fs::create_directory(base_folder); - FT_CHECK_WITH_INFO(ret == true, fmtstr("Could not create folder %s.\n", base_folder.c_str())); - for (const auto& item : tensor_map_) { - item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy"); - } -} - -} // namespace turbomind diff --git a/src/turbomind/utils/Tensor.h b/src/turbomind/utils/Tensor.h deleted file mode 100644 index bf9840314c..0000000000 --- a/src/turbomind/utils/Tensor.h +++ /dev/null @@ -1,582 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/macro.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/string_utils.h" - -#include "stdlib.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace turbomind { - -typedef enum datatype_enum -{ - TYPE_INVALID, - TYPE_BOOL, - TYPE_UINT8, - TYPE_UINT16, - TYPE_UINT32, - TYPE_UINT64, - TYPE_INT8, - TYPE_INT16, - TYPE_INT32, - TYPE_INT64, - TYPE_FP16, - TYPE_FP32, - TYPE_FP64, - TYPE_BYTES, - TYPE_BF16, - TYPE_FP8_E4M3, - TYPE_STR, - TYPE_VOID, -} DataType; - -template -DataType getTensorType() -{ - if (std::is_same::value || std::is_same::value) { - return TYPE_FP32; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_FP16; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value || std::is_same::value) { - return TYPE_BF16; - } -#endif -#ifdef ENABLE_FP8 - else if (std::is_same::value || std::is_same::value) { - return TYPE_FP8_E4M3; - } -#endif - else if (std::is_same::value || std::is_same::value) { - return TYPE_INT32; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_INT8; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_UINT32; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_UINT64; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_BOOL; - } - else if (std::is_same::value || std::is_same::value) { - return TYPE_BYTES; - } - else if (std::is_pointer_v && sizeof(T) == sizeof(uint64_t)) { - return TYPE_UINT64; - } - else { - return TYPE_INVALID; - } -} - -static inline size_t get_elem_size(DataType type) -{ - switch (type) { - case DataType::TYPE_FP16: - case DataType::TYPE_BF16: - case DataType::TYPE_INT16: - return 2; - case DataType::TYPE_FP32: - case DataType::TYPE_INT32: - case DataType::TYPE_UINT32: - return 4; - case DataType::TYPE_UINT64: - case DataType::TYPE_INT64: - return 8; - case DataType::TYPE_UINT8: - return 1; - default: - throw std::runtime_error("not supported"); - } -} - -typedef enum memorytype_enum -{ - MEMORY_CPU, - MEMORY_CPU_PINNED, - MEMORY_GPU -} MemoryType; - -struct Tensor { - MemoryType where; - DataType type; - std::vector shape; - void* data; - std::vector offsets = std::vector{}; - - Tensor(); - Tensor(const MemoryType _where, const DataType _type, const std::vector _shape, const void* _data); - Tensor(const MemoryType _where, - const DataType _type, - const std::vector _shape, - const void* _data, - const std::vector _offset); - - size_t size() const; - size_t sizeBytes() const; - - std::string whereToString() const; - std::string toString() const; - std::string getNumpyTypeDesc(DataType type) const; - - void saveNpy(const std::string& filename) const; - static Tensor loadNpy(const std::string& npy_file, const MemoryType where); - - static DataType typeFromNumpyDesc(std::string type); - static size_t getTypeSize(DataType type); - - template - inline T getVal(size_t index) const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - FT_CHECK(where == MEMORY_CPU); - FT_CHECK(data != nullptr); - FT_CHECK_WITH_INFO(index < size(), "index is larger than buffer size"); - - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - return ((T*)data)[index]; - } - - template - inline T getVal() 
const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - return getVal(0); - } - - template - inline T* getPtr() const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (getTensorType() != type) { - TM_LOG_DEBUG("getPtr with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - return (T*)data; - } - - inline void* getPtrWithOffset(size_t offset) const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (data == nullptr) { - return (void*)data; - } - else { - FT_CHECK_WITH_INFO(offset < size(), "offset is larger than buffer size"); - return (void*)((char*)data + offset * Tensor::getTypeSize(type)); - } - } - - template - inline T* getPtrWithOffset(size_t offset) const - { - TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - if (data == nullptr) { - return (T*)data; - } - else { - FT_CHECK_WITH_INFO(offset < size(), - fmtstr("offset (%lu) is larger than buffer size (%lu)", offset, size())); - return ((T*)data) + offset; - } - } - - template - T max() const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED, - "max() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - size_t max_idx = 0; - T max_val = getVal(max_idx); - for (size_t i = 1; i < size(); ++i) { - T val = getVal(i); - if (val > max_val) { - max_idx = i; - max_val = val; - } - } - return max_val; - } - - template - T min() const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED, - "min() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - size_t min_idx = 0; - T min_val = getVal(min_idx); - for (size_t i = 1; i < size(); ++i) { - T val = getVal(i); - if (val < min_val) { - min_idx = i; - min_val = val; - } - } - return min_val; - } - - template - T any(T val) const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED, - "any() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - for (size_t i = 0; i < size(); ++i) { - if (getVal(i) == val) { - return true; - } - } - return false; - } - - template - T all(T val) const - { - if (getTensorType() != type) { - TM_LOG_DEBUG("getVal with type %s, but data type is: %s", - getNumpyTypeDesc(getTensorType()).c_str(), - getNumpyTypeDesc(type).c_str()); - } - FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor."); - FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == 
MEMORY_CPU_PINNED, - "all() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor."); - for (size_t i = 0; i < size(); ++i) { - if (getVal(i) != val) { - return false; - } - } - return true; - } - - void updateShape(size_t idx, size_t val) - { - // TODO: find a better way to update the shape - std::vector& shape_ref = const_cast&>(shape); - shape_ref[idx] = val; - } - - Tensor slice(std::vector shape, size_t offset = 0) const; - -private: - static void parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data); - static int parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector& shape); -}; - -class TensorMap { -private: - std::unordered_map tensor_map_; - - inline bool isValid(const Tensor& tensor) - { - return tensor.size() > 0 && tensor.data != nullptr; - } - -public: - TensorMap() = default; - TensorMap(const std::unordered_map& tensor_map); - TensorMap(const std::vector& tensor_map); - TensorMap(std::initializer_list> tensor_map); - ~TensorMap(); - - inline size_t size() const - { - return tensor_map_.size(); - } - - inline bool isExist(const std::string& key) const - { - TM_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str()); - return tensor_map_.find(key) != tensor_map_.end(); - } - - std::vector keys() const; - - inline void insert(const std::string& key, const Tensor& value) - { - FT_CHECK_WITH_INFO(!isExist(key), fmtstr("Duplicated key %s", key.c_str())); - tensor_map_.insert({key, value}); - } - - inline void insertIfValid(const std::string& key, const Tensor& value) - { - if (isValid(value)) { - insert({key, value}); - } - } - - inline void insert(std::pair p) - { - tensor_map_.insert(p); - } - - // prevent converting int or size_t to string automatically - Tensor at(int tmp) = delete; - Tensor at(size_t tmp) = delete; - - inline Tensor& at(const std::string& key) - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key); - } - - inline Tensor at(const std::string& key) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key); - } - - inline Tensor& at(const std::string& key, Tensor& default_tensor) - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - inline Tensor at(const std::string& key, Tensor& default_tensor) const - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - inline Tensor& at(const std::string& key, Tensor&& default_tensor) - { - TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str()); - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - inline Tensor at(const std::string& key, Tensor&& default_tensor) const - { - if (isExist(key)) { - return tensor_map_.at(key); - } - return default_tensor; - } - - template - inline T getVal(const std::string& key) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getVal(); - } - - template - inline T getVal(const std::string& key, T default_value) const - { - if 
(isExist(key)) { - return tensor_map_.at(key).getVal(); - } - return default_value; - } - - template - inline T getValWithOffset(const std::string& key, size_t index) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getVal(index); - } - - template - inline T getValWithOffset(const std::string& key, size_t index, T default_value) const - { - if (isExist(key)) { - return tensor_map_.at(key).getVal(index); - } - return default_value; - } - - template - inline T* getPtr(const std::string& key) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getPtr(); - } - - template - inline T* getPtr(const std::string& key, T* default_ptr) const - { - if (isExist(key)) { - return tensor_map_.at(key).getPtr(); - } - return default_ptr; - } - - template - inline T* getPtrWithOffset(const std::string& key, size_t index) const - { - FT_CHECK_WITH_INFO(isExist(key), - fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)", - key.c_str(), - vec2str(keys()).c_str())); - return tensor_map_.at(key).getPtrWithOffset(index); - } - - template - inline T* getPtrWithOffset(const std::string& key, size_t index, T* default_ptr) const - { - if (isExist(key)) { - return tensor_map_.at(key).getPtrWithOffset(index); - } - return default_ptr; - } - - inline std::unordered_map getMap() const - { - return tensor_map_; - } - - inline std::unordered_map::iterator begin() - { - return tensor_map_.begin(); - } - - inline std::unordered_map::iterator end() - { - return tensor_map_.end(); - } - - inline std::unordered_map& get() - { - return tensor_map_; - } - - inline std::unordered_map::const_iterator begin() const - { - return tensor_map_.begin(); - } - - inline std::unordered_map::const_iterator end() const - { - return tensor_map_.end(); - } - - int count(const std::string& key) const - { - return tensor_map_.count(key); - } - - bool empty() const - { - return tensor_map_.empty(); - } - - std::string toString(); - static TensorMap fromNpyFolder(const std::string& base_folder); - void saveNpy(const std::string& base_folder); -}; - -struct ManagedTensor { - Tensor tensor; - std::shared_ptr data_holder; - - Tensor* operator->() noexcept - { - return &tensor; - } - - const Tensor* operator->() const noexcept - { - return &tensor; - } - - Tensor& operator*() noexcept - { - return tensor; - } - - const Tensor& operator*() const noexcept - { - return tensor; - } -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/allocator.h b/src/turbomind/utils/allocator.h deleted file mode 100644 index 88c299c3de..0000000000 --- a/src/turbomind/utils/allocator.h +++ /dev/null @@ -1,493 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** - * Memory Allocator - **/ - -#pragma once - -#include "cuda_utils.h" -#include "src/turbomind/macro.h" -#include -#include -#include - -#ifdef GOOGLE_CUDA -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/errors.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#endif - -#ifdef TORCH_CUDA -#include "torch/extension.h" -#include -#endif - -#include "src/turbomind/utils/logger.h" - -#if defined(CUDART_VERSION) && CUDART_VERSION < 11020 -#define CUDA_MEMORY_POOL_DISABLED -#endif - -namespace turbomind { - -enum class AllocatorType -{ - CUDA, - TF, - TH -}; - -enum class ReallocType -{ - INCREASE, - REUSE, - DECREASE, -}; - -class IAllocator { -public: - virtual ~IAllocator(){}; - - virtual void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) = 0; - virtual void free(void** ptr, bool is_host = false) = 0; - virtual void setStream(cudaStream_t stream) = 0; - virtual cudaStream_t returnStream() = 0; - virtual void memSet(void* ptr, const int val, const size_t size) = 0; - - template - void* reMalloc(T* ptr, size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - size = ((size + 31) / 32) * 32; // make the buffer align with 32 bytes - void* void_ptr = (void*)ptr; - void* ptr_address = getAddress(void_ptr); - if (isExist(ptr_address)) { - ReallocType realloc_type = isReMalloc(ptr_address, size); - if (realloc_type == ReallocType::INCREASE) { - TM_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr); - free((void**)(&void_ptr), is_host); - return malloc(size, is_set_zero, is_host); - } -#if !defined(CUDA_MEMORY_POOL_DISABLED) - else if (realloc_type == ReallocType::DECREASE) { - TM_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr); - free((void**)(&void_ptr), is_host); - return malloc(size, is_set_zero, is_host); - } -#endif - else { - TM_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size); - if (is_set_zero) { - memSet(void_ptr, 0, size); - } - return void_ptr; - } - } - else { - TM_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr); - return malloc(size, is_set_zero, is_host); - } - } - -protected: - virtual bool isExist(void* address) const = 0; - virtual ReallocType isReMalloc(void* address, size_t size) const = 0; - - void* getAddress(void* ptr) const - { - return ptr; - } -}; - -template -class Allocator; - -template<> -class Allocator: public IAllocator { -private: - enum class MemoryType - { - HOST, - DEVICE - }; - - const int device_id_; - bool enable_peer_access_{false}; - cudaStream_t stream_ = 0; // initialize as default stream - cudaMemPool_t mempool_{}; - std::unordered_map> pointer_mapping_; - - bool isExist(void* address) const - { - return pointer_mapping_.count(address) > 0; - } - ReallocType isReMalloc(void* address, size_t size) const - { - FT_CHECK(isExist(address)); - if (pointer_mapping_.at(address).first < size) { - return ReallocType::INCREASE; - } - else if (pointer_mapping_.at(address).first == size) { - return ReallocType::REUSE; - } - else { - return ReallocType::DECREASE; - } - } - -public: - Allocator(int device_id, bool 
enable_peer_access = false): - device_id_(device_id), enable_peer_access_(enable_peer_access) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); -#if defined(CUDA_MEMORY_POOL_DISABLED) - TM_LOG_WARNING( - "Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free." - "Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP"); -#else - - if (enable_peer_access) { - cudaMemPoolProps props{}; - props.allocType = cudaMemAllocationTypePinned; - props.handleTypes = cudaMemHandleTypeNone; - props.location.type = cudaMemLocationTypeDevice; - props.location.id = device_id; - check_cuda_error(cudaMemPoolCreate(&mempool_, &props)); - cudaMemAccessDesc desc = {}; - int peer_access_available = 0; - int device_count = 1; - check_cuda_error(cudaGetDeviceCount(&device_count)); - for (int i = 0; i < device_count; i++) { - if (i == device_id) { - continue; - } - check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i)); - if (!peer_access_available) { - TM_LOG_WARNING("Devicle " + std::to_string(device_id) + " peer access Device " + std::to_string(i) - + " is not available."); - continue; - } - desc.location.type = cudaMemLocationTypeDevice; - desc.location.id = i; - desc.flags = cudaMemAccessFlagsProtReadWrite; - check_cuda_error(cudaMemPoolSetAccess(mempool_, &desc, 1)); - } - } - else { - check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool_, device_id)); - } - // set memory pool threshold to avoid shrinking the pool - uint64_t setVal = UINT64_MAX; - check_cuda_error(cudaMemPoolSetAttribute(mempool_, cudaMemPoolAttrReleaseThreshold, &setVal)); -#endif - } - - virtual ~Allocator() - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - while (!pointer_mapping_.empty()) { - auto ptr = pointer_mapping_.begin()->first; - auto size_and_type = pointer_mapping_.begin()->second; - free(&ptr, size_and_type.second == MemoryType::HOST); - } - if (enable_peer_access_) { // We own the pool in this case - check_cuda_error(cudaMemPoolDestroy(mempool_)); - mempool_ = {}; - } - } - - void setStream(cudaStream_t stream) - { - stream_ = stream; - } - - cudaStream_t returnStream() - { - return stream_; - }; - - void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (size == 0) { - return nullptr; - } - void* ptr = nullptr; - int o_device = 0; - - check_cuda_error(getSetDevice(device_id_, &o_device)); - if (is_host) { - check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32)); - } - else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32)); -#else - check_cuda_error(cudaMallocFromPoolAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, mempool_, stream_)); -#endif - } - if (is_set_zero) { - check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_)); - } - check_cuda_error(getSetDevice(o_device)); - TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size); - - pointer_mapping_.insert({getAddress(ptr), {size, is_host ? 
MemoryType::HOST : MemoryType::DEVICE}}); - - return ptr; - } - - void free(void** ptr, bool _ = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - void* address = getAddress(*ptr); - if (*ptr != nullptr) { - int o_device = 0; - if (pointer_mapping_.count(address)) { - const auto is_host = pointer_mapping_.at(address).second == MemoryType::HOST; - TM_LOG_DEBUG("Free buffer %p", address); - check_cuda_error(getSetDevice(device_id_, &o_device)); - if (is_host) { - check_cuda_error(cudaFreeHost(*ptr)); - } - else { -#if defined(CUDA_MEMORY_POOL_DISABLED) - check_cuda_error(cudaFree(*ptr)); -#else - check_cuda_error(cudaFreeAsync(*ptr, stream_)); -#endif - } - check_cuda_error(getSetDevice(o_device)); - pointer_mapping_.erase(address); - } - else { - FT_CHECK_WITH_INFO(0, - fmtstr("pointer_mapping_ does not have information of ptr at %p.", address).c_str()); - } - } - *ptr = nullptr; - return; - } - - void memSet(void* ptr, const int val, const size_t size) - { - check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_)); - } -}; - -#ifdef GOOGLE_CUDA -using namespace tensorflow; -template<> -class Allocator: public IAllocator { - OpKernelContext* context_; - std::unordered_map* pointer_mapping_; - cudaStream_t stream_; - - bool isExist(void* address) const - { - return pointer_mapping_->count(address) > 0; - } - ReallocType isReMalloc(void* address, size_t size) const - { - FT_CHECK(isExist(address)); - size_t current_buffer_size = 1; - for (int i = 0; i < pointer_mapping_->at(address).dims(); i++) { - current_buffer_size *= pointer_mapping_->at(address).dim_size(i); - } - TM_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size); - if (current_buffer_size < size) { - return ReallocType::INCREASE; - } - else if (current_buffer_size == size) { - return ReallocType::REUSE; - } - else { - return ReallocType::DECREASE; - } - } - -public: - Allocator(OpKernelContext* context, cudaStream_t stream): context_(context), stream_(stream) - { - pointer_mapping_ = new std::unordered_map(); - } - - void setStream(cudaStream_t stream) - { - stream_ = stream; - } - - cudaStream_t returnStream() - { - return stream_; - }; - - void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - tensorflow::Tensor buf; - long long int buf_size = ((long long int)ceil(size / 32.) 
* 32); - tensorflow::Status status; - if (is_host) { - tensorflow::AllocatorAttributes pinned_allocator; - pinned_allocator.set_on_host(true); - pinned_allocator.set_gpu_compatible(true); - status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf, pinned_allocator); - } - else { - status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf); - } - - if (status != tensorflow::Status::OK()) { - throw std::runtime_error("TF error: context->allocate_temp failed"); - } - - auto flat = buf.flat(); - void* ptr = (void*)flat.data(); - if (is_set_zero) { - cudaMemsetAsync(ptr, 0, buf_size, stream_); - } - pointer_mapping_->insert({getAddress(ptr), buf}); - - return ptr; - } - - void free(void** ptr, bool is_host = false) const - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - void* address = getAddress(*ptr); - pointer_mapping_->erase(address); - *ptr = nullptr; - return; - } - - virtual ~Allocator() - { - while (!pointer_mapping_->empty()) { - void* ptr = pointer_mapping_->begin()->second.flat().data(); - free(&ptr); - } - pointer_mapping_->clear(); - delete pointer_mapping_; - } - - void memSet(void* ptr, const int val, const size_t size) - { - check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_)); - } -}; -#endif - -#ifdef TORCH_CUDA -template<> -class Allocator: public IAllocator { - std::unordered_map* pointer_mapping_; - - bool isExist(void* address) const - { - return pointer_mapping_->count(address) > 0; - } - ReallocType isReMalloc(void* address, size_t size) const - { - FT_CHECK(isExist(address)); - size_t current_buffer_size = 1; - for (int i = 0; i < pointer_mapping_->at(address).dim(); i++) { - current_buffer_size *= pointer_mapping_->at(address).size(i); - } - TM_LOG_DEBUG( - "current_buffer_size: %d, original buffer: %p, new buffer: %d", current_buffer_size, address, size); - if (current_buffer_size < size) { - return ReallocType::INCREASE; - } - else if (current_buffer_size == size) { - return ReallocType::REUSE; - } - else { - return ReallocType::DECREASE; - } - } - -public: - Allocator() - { - pointer_mapping_ = new std::unordered_map(); - } - - void setStream(cudaStream_t stream) - { - // nothing to do here; - } - - cudaStream_t returnStream() - { - // nothing to do here; - return 0; - }; - - void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - int64_t buf_size = static_cast(ceil(size / 32.)) * 32; - torch::Tensor buf; - if (is_host) { - buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCPU).pinned_memory(true)); - } - else { - buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCUDA)); - } - void* ptr = buf.data_ptr(); - if (is_set_zero) { - cudaMemset(ptr, 0, buf_size); - } - TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size); - pointer_mapping_->insert({getAddress(ptr), buf}); - return ptr; - } - - void free(void** ptr, bool is_host = false) const - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - void* address = getAddress(*ptr); - pointer_mapping_->erase(address); - *ptr = nullptr; - return; - } - - virtual ~Allocator() - { - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - while (!pointer_mapping_->empty()) { - void* ptr = pointer_mapping_->begin()->second.data_ptr(); - free(&ptr); - } - pointer_mapping_->clear(); - delete pointer_mapping_; - } - - void memSet(void* ptr, const int val, const size_t size) - { - check_cuda_error(cudaMemset(ptr, val, size)); - } -}; -#endif -} // namespace turbomind diff --git a/src/turbomind/utils/anomaly_handler.cu 
b/src/turbomind/utils/anomaly_handler.cu index e4e1eb6228..693e7c3569 100644 --- a/src/turbomind/utils/anomaly_handler.cu +++ b/src/turbomind/utils/anomaly_handler.cu @@ -1,8 +1,5 @@ -#include "src/turbomind/utils/anomaly_handler.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" + #include #include #include @@ -10,6 +7,13 @@ #include #include +#include "src/turbomind/core/data_type.h" +#include "src/turbomind/models/llama/llama_utils.h" +#include "src/turbomind/utils/anomaly_handler.h" +#include "src/turbomind/utils/cuda_utils.h" +#include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/memory_utils.h" + namespace turbomind { static std::optional parse_float(const std::string& s, const std::string& key) @@ -378,10 +382,25 @@ void AnomalyHandler::FixLogits(T* logits, int batch_size, int level) impl_->invokeFixLogitsAnomaly(logits, batch_size, level); } +int AnomalyHandler::level() noexcept +{ + return Impl::g_level; +} + template void AnomalyHandler::FixLogits(float*, int, int); template void AnomalyHandler::FixLogits(half*, int, int); #ifdef ENABLE_BF16 template void AnomalyHandler::FixLogits(__nv_bfloat16*, int, int); #endif +void DebugTensor(Tensor& tensor, const std::string& key, int level) +{ + auto invoke = [&](auto t) { + using T = decltype(t); + AnomalyHandler::instance().CountAndFix((T*)tensor.raw_data(), tensor.size(), key, level); + // Compare((T*)tensor.raw_data(), tensor.size(), key, compare_mode, core::Context::stream().handle()); + }; + TM_DISPATCH_DTYPES(tensor.dtype(), invoke, float, half_t, bfloat16_t); +} + } // namespace turbomind diff --git a/src/turbomind/utils/anomaly_handler.h b/src/turbomind/utils/anomaly_handler.h index 9603b8e781..00325183f9 100644 --- a/src/turbomind/utils/anomaly_handler.h +++ b/src/turbomind/utils/anomaly_handler.h @@ -4,11 +4,14 @@ #pragma once #include +#include #include #include #include #include +#include "src/turbomind/core/core.h" + namespace turbomind { class AnomalyHandler { @@ -21,6 +24,8 @@ class AnomalyHandler { static AnomalyHandler& instance(); + static int level() noexcept; + void Init(int rank, int vocab_size, int fallback, int max_batch_size, cudaStream_t stream) noexcept; template @@ -47,4 +52,21 @@ void count_and_fix(T* data, size_t size, std::string key, int level) AnomalyHandler::instance().CountAndFix(data, size, key, level); } +void DebugTensor(Tensor& tensor, const std::string& key, int level); + +inline void DebugTensor(Tensor&& tensor, const std::string& key, int level) +{ + DebugTensor(tensor, key, level); +} + +#define TM_DEBUG_RAW(ptr, size, key, __level) \ + if (::turbomind::AnomalyHandler::level() >= __level) { \ + ::turbomind::count_and_fix(ptr, size, key, __level); \ + } + +#define TM_DEBUG_TENSOR(tensor, key, __level) \ + if (::turbomind::AnomalyHandler::level() >= __level) { \ + ::turbomind::DebugTensor(tensor, key, __level); \ + } + } // namespace turbomind diff --git a/src/turbomind/utils/cublasAlgoMap.cc b/src/turbomind/utils/cublasAlgoMap.cc deleted file mode 100644 index 1f9d5743c4..0000000000 --- a/src/turbomind/utils/cublasAlgoMap.cc +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasAlgoMap.h" - -namespace turbomind { - -cublasAlgoMap::cublasAlgoMap(const std::string filename, const std::string sp_config_filename): - config_filename_(filename), sp_config_filename_(sp_config_filename) -{ - loadGemmConfig(); - loadSpGemmConfig(); -} - -cublasAlgoMap::cublasAlgoMap(const cublasAlgoMap& algo_map): - config_filename_(algo_map.config_filename_), - sp_config_filename_(algo_map.sp_config_filename_), - algo_map_(algo_map.algo_map_), - sp_algo_map_(algo_map.sp_algo_map_) -{ -} - -cublasAlgoMap::~cublasAlgoMap() -{ - algo_map_.clear(); -} - -void cublasAlgoMap::loadGemmConfig() -{ - FILE* fd; - fd = fopen(config_filename_.c_str(), "r"); - if (fd == NULL) { - std::cout << "[WARNING] " << config_filename_ << " is not found; using default GEMM algo" << std::endl; - return; - } - - int batchCount2, m2, n2, k2, algoId, customOption, tile, splitK_val; - int batch_size, seq_len, head_num, size_per_head, dataType; - int swizzle, reductionScheme, workspaceSize, stages; - int inner_shapeId, cluster_shapeId, mma_shapeId, cga_shapeId, sche_mode; - float exec_time; - char tmp[1024]; - if (!fgets(tmp, 1024, fd)) { - printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__); - exit(-1); - } - while (fscanf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "%d %d " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "%d %d %d " -#endif - "%f\n", - &batch_size, - &seq_len, - &head_num, - &size_per_head, - &dataType, - &batchCount2, - &n2, - &m2, - &k2, - &algoId, - &customOption, - &tile, - &splitK_val, - &swizzle, - &reductionScheme, - &workspaceSize, - &stages, -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - &inner_shapeId, - &cluster_shapeId, -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - &mma_shapeId, - &cga_shapeId, - &sche_mode, -#endif - &exec_time) - != EOF) { - if (dataType != FLOAT_DATATYPE && dataType != HALF_DATATYPE && dataType != BFLOAT16_DATATYPE - && dataType != INT8_DATATYPE && dataType != FP8_DATATYPE) { - printf("[WARNING][readAlgoFromConfig] wrong dataType %d!\n", dataType); - continue; - } - cublasAlgoConfig_t markStr{batchCount2, m2, n2, k2, static_cast(dataType)}; - // workspaceSize should be zero - if (algo_map_.find(markStr) == algo_map_.end()) { - algo_map_[markStr].algoId = algoId; - algo_map_[markStr].customOption = customOption; - algo_map_[markStr].tile = tile; - algo_map_[markStr].splitK_val = splitK_val; - algo_map_[markStr].swizzle = swizzle; - algo_map_[markStr].reductionScheme = reductionScheme; - algo_map_[markStr].workspaceSize = workspaceSize; - algo_map_[markStr].stages = stages; -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - algo_map_[markStr].inner_shapeId = (uint16_t)inner_shapeId; - algo_map_[markStr].cluster_shapeId = (uint16_t)cluster_shapeId; -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - algo_map_[markStr].mma_shapeId = 
(uint16_t)mma_shapeId; - algo_map_[markStr].cga_shapeId = (uint16_t)cga_shapeId; - algo_map_[markStr].sche_mode = (uint16_t)sche_mode; -#endif - algo_map_[markStr].exec_time = exec_time; - } - } - fclose(fd); -} - -bool cublasAlgoMap::isExist( - const int batch_count, const int m, const int n, const int k, const CublasDataType data_type) -{ - cublasAlgoConfig_t mark{batch_count, n, m, k, data_type}; - return algo_map_.find(mark) != algo_map_.end(); -} - -cublasLtMatmulAlgo_info -cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type) -{ - cublasAlgoConfig_t mark{batch_count, n, m, k, data_type}; - if (algo_map_.find(mark) != algo_map_.end()) { - return algo_map_[mark]; - } - else { - cublasLtMatmulAlgo_info tmp_algo; - tmp_algo.algoId = - static_cast(data_type == FLOAT_DATATYPE ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP); - tmp_algo.customOption = -1; - tmp_algo.tile = -1; - tmp_algo.splitK_val = -1; - tmp_algo.swizzle = -1; - tmp_algo.reductionScheme = -1; - tmp_algo.workspaceSize = -1; - tmp_algo.stages = -1; - tmp_algo.exec_time = -1.0f; - return tmp_algo; - } -} - -void cublasAlgoMap::loadSpGemmConfig() -{ - if (sp_config_filename_.empty()) { - return; - } - FILE* fd = fopen(sp_config_filename_.c_str(), "r"); - if (fd == NULL) { - printf("[WARNING] %s is not found; using SPGEMM algo id 0\n", sp_config_filename_.c_str()); - return; - } - sp_algo_map_.clear(); - int batch_size, seq_len, head_num, size_per_head, data_type; - int batchCount, m, n, k, algoId; - float exec_time; - char tmp[1024]; - if (!fgets(tmp, 1024, fd)) { - printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__); - exit(-1); - } - while (fscanf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - &batch_size, - &seq_len, - &head_num, - &size_per_head, - &data_type, - &batchCount, - &m, - &n, - &k, - &algoId, - &exec_time) - != EOF) { - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batchCount, m, n, k); - std::string markStr(mark); - sp_algo_map_[markStr] = algoId; - } - fclose(fd); -} - -int cublasAlgoMap::getSpAlgo(const int batch_count, const int m, const int n, const int k) -{ - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k); - if (sp_algo_map_.find(mark) != sp_algo_map_.end()) { - return sp_algo_map_[mark]; - } - else { - // for remove padding, select algo 1 for simplicity - return 0; - } -} - -bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n, const int k) -{ - // not available to use cusparselt. - if (m % 8 != 0 || n % 8 != 0 || k % 8 != 0) { - return false; - } - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k); - if (sp_algo_map_.find(mark) != sp_algo_map_.end()) { - return sp_algo_map_[mark] != -1; - } - else { - // no gemm test case, choose sparse according to sparse flag - return true; - } -} - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasAlgoMap.h b/src/turbomind/utils/cublasAlgoMap.h deleted file mode 100644 index 3e5b534a1b..0000000000 --- a/src/turbomind/utils/cublasAlgoMap.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include -#include -#include -#include - -#pragma once -namespace turbomind { - -#define GEMM_NUM 6 -#define GEMM_CONFIG "gemm_config.in" -#define IGEMM_CONFIG "igemm_config.in" -#define SPGEMM_CONFIG "spgemm_config.in" -#define SPIGEMM_CONFIG "spigemm_config.in" - -typedef struct { - int algoId, customOption, tile, splitK_val; - int swizzle, reductionScheme, workspaceSize; - // only used in cublasLt >= 11.0 - int stages; -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - uint16_t inner_shapeId, cluster_shapeId; -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - uint16_t mma_shapeId, cga_shapeId, sche_mode; -#endif - float exec_time; -} cublasLtMatmulAlgo_info; - -/* Structure to store information about different run trials */ -typedef struct { - cublasLtMatmulAlgo_t algo; - cublasStatus_t status; - float time; - size_t workspaceSize; // actual memory workspace needed - cublasMath_t mathMode; - cublasLtReductionScheme_t reductionScheme; - int customOption; - float wavesCount; -} customMatmulPerf_t; - -struct cublasAlgoConfig_t { - int batch_count; - int m; - int n; - int k; - CublasDataType data_type; - bool operator==(cublasAlgoConfig_t const& config) const - { - return (batch_count == config.batch_count) && (m == config.m) && (n == config.n) && (k == config.k) - && (data_type == config.data_type); - } -}; - -class cublasAlgoConfig_hasher { -public: - std::size_t operator()(cublasAlgoConfig_t const& config) const - { - return config.batch_count * 98317ull ^ config.m * 49157ull ^ config.n * 24593ull ^ config.k * 196613ull - ^ static_cast(config.data_type) * 6151ull; - } -}; - -class cublasAlgoMap { -private: - std::unordered_map algo_map_; - std::string config_filename_; - std::string sp_config_filename_; - std::map sp_algo_map_; - -public: - cublasAlgoMap(){}; - explicit cublasAlgoMap(const std::string filename, const std::string sp_config_filename = ""); - cublasAlgoMap(const cublasAlgoMap& map); - ~cublasAlgoMap(); - void loadGemmConfig(); - void loadSpGemmConfig(); - int getSpAlgo(const int batch_count, const int m, const int n, const int k); - bool isUseSparse(const int batch_count, const int m, const int n, const int k); - - bool isExist(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type); - - cublasLtMatmulAlgo_info - getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type); -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasINT8MMWrapper.cc b/src/turbomind/utils/cublasINT8MMWrapper.cc deleted file mode 100644 index 9afd21d088..0000000000 --- a/src/turbomind/utils/cublasINT8MMWrapper.cc +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasINT8MMWrapper.h" - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#endif - -namespace turbomind { -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4): - cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, nullptr), - use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) -{ -} - -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4): - cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, nullptr), - use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) -{ -} - -#ifdef SPARSITY_ENABLED -cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4): - cublasMMWrapper(nullptr, cublaslt_handle, cusparselt_handle, stream, cublas_algo_map, mu, nullptr), - use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4) -{ -} -#endif - -cublasINT8MMWrapper::~cublasINT8MMWrapper() -{ - mu_ = nullptr; -} - -cublasINT8MMWrapper::cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper): -#ifdef SPARSITY_ENABLED - cublasMMWrapper(nullptr, - wrapper.cublaslt_handle_, - wrapper.cusparselt_handle_, - wrapper.stream_, - wrapper.cublas_algo_map_, - wrapper.mu_, - wrapper.allocator_), -#else - cublasMMWrapper( - nullptr, wrapper.cublaslt_handle_, wrapper.stream_, wrapper.cublas_algo_map_, wrapper.mu_, wrapper.allocator_), -#endif - use_ORDER_COL32_2R_4R4_(wrapper.use_ORDER_COL32_2R_4R4_) -{ -} - -// for int8 cublasLtMM with algo -// ATransform should be m*n, CUBLASLT_ORDER_COL32 -// kernel should be n*k, CUBLASLT_ORDER_COL4_4R2_8C or CUBLASLT_ORDER_COL32_2R_4R4 -// res is m*n, CUBLASLT_ORDER_COL32 -void cublasINT8MMWrapper::Gemm(int* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const int8_t* ATransform, - const int8_t* kernel) -{ - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4_) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - int ldbTransform; - if (use_ORDER_COL32_2R_4R4_) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - int ldcTransform = 32 * 
m; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform); - cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldcTransform); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec)); - } - - int alphaI = 1; - int betaI = 0; - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) { - // printf("find algo %s\n", markStr.c_str()); - findAlgo = 1; - - cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, - computeType, - CUDA_R_32I, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_32I, - CUDA_R_32I, - tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages)); -#endif - } - else { - findAlgo = 1; - int algoId; - if (use_ORDER_COL32_2R_4R4_) { - algoId = 7; - } - else { - algoId = 6; - } - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit( - cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, 
CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - if (use_ORDER_COL32_2R_4R4_) { - stages = 15; - } - else { - stages = 13; - } - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)); -#endif - } - - cublasLtMatmul(cublaslt_handle_, - matmulDesc, - &alphaI, - ATransform, - AtransformDesc, - kernel, - BtransformDesc, - &betaI, - res, - CtransformDesc, - res, - CtransformDesc, - (findAlgo == 1 ? (&algo) : NULL), - NULL, - 0, - stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -// for int8 IO cublasLtMM with algo -// ATransform should be m*k CUBLASLT_ORDER_COL32 -// kernel should be n*k CUBLASLT_ORDER_COL4_4R2_8C -// res is m*n CUBLASLT_ORDER_COL32 -void cublasINT8MMWrapper::Gemm(int8_t* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const float alpha, - const int8_t* ATransform, - const int8_t* kernel) -{ - mu_->lock(); - cublasOperation_t opTranspose = CUBLAS_OP_T; - // int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE - // cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; - cudaDataType_t scaleType = CUDA_R_32F; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmulDesc; - cublasLtMatrixLayout_t AtransformDesc = NULL; - cublasLtMatrixLayout_t BtransformDesc = NULL; - cublasLtMatrixLayout_t CtransformDesc = NULL; - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4_) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - - int ldbTransform; - if (use_ORDER_COL32_2R_4R4_) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - - int ldcTransform = 32 * m; - - // create matmulDesc -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&matmulDesc, computeType); -#endif - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scaleType, sizeof(scaleType)); - // cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode, - // sizeof(cublasLtPointerMode_t)); - cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform); - cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, 
CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, m, n, ldcTransform); - cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (batchCount > 1) { - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute( - CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec)); - } - - // get algo - cublasLtMatmulAlgo_t algo; - int findAlgo = 0; - if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) { - findAlgo = 1; - - cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE); - - cublasLtMatmulAlgoInit(cublaslt_handle_, - computeType, - CUDA_R_32F, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_8I, - tmp_info.algoId, - &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages)); -#endif - } - else { - findAlgo = 1; - int algoId; - if (use_ORDER_COL32_2R_4R4_) { - algoId = 7; - } - else { - algoId = 6; - } - int swizzle = 0; - int customOption = 0; - int tile = 20; - int splitK_val = 0; - int reductionScheme = 0; - cublasLtMatmulAlgoInit( - cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - int stages; - if (use_ORDER_COL32_2R_4R4_) { - stages = 15; - } - else { - stages = 13; - } - cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)); -#endif - } - - float beta = 0.0f; - cublasLtMatmul(cublaslt_handle_, - matmulDesc, - &alpha, - 
ATransform, - AtransformDesc, - kernel, - BtransformDesc, - &beta, - res, - CtransformDesc, - res, - CtransformDesc, - (findAlgo == 1 ? (&algo) : NULL), - NULL, - 0, - stream_); - - cublasLtMatmulDescDestroy(matmulDesc); - cublasLtMatrixLayoutDestroy(AtransformDesc); - cublasLtMatrixLayoutDestroy(BtransformDesc); - cublasLtMatrixLayoutDestroy(CtransformDesc); - sync_check_cuda_error(); - mu_->unlock(); -} - -template -int cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights) -{ - - int fusedINT8QKV_type = 0; - const int8_t* Q_weight = (const int8_t*)(attention_weights->query_weight.kernel); - const int8_t* K_weight = (const int8_t*)(attention_weights->key_weight.kernel); - const int8_t* V_weight = (const int8_t*)(attention_weights->value_weight.kernel); - // for QKV weight are DataType_ & continue - if ((attention_weights->query_weight.kernel + n * k == attention_weights->key_weight.kernel) - && (attention_weights->key_weight.kernel + n * k == attention_weights->value_weight.kernel)) { - fusedINT8QKV_type = 1; - } - // for QVK weight are int8 & continue - else if ((Q_weight + n * k == K_weight) && (K_weight + n * k == V_weight)) { - fusedINT8QKV_type = 2; - } - return fusedINT8QKV_type; -} - -bool cublasINT8MMWrapper::getUseOrderCol322R4R4() -{ - return use_ORDER_COL32_2R_4R4_; -} - -template int -cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights); - -template int -cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights); - -#ifdef SPARSITY_ENABLED -// A is sparse weight [m,k], non transposed row major -// B is activation input [k, n], non transposed col major -void cublasINT8MMWrapper::SpGemm( - const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C) -{ - cudaDataType_t Atype = CUDA_R_8I; - cudaDataType_t Btype = CUDA_R_8I; - cudaDataType_t Ctype = CUDA_R_8I; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I; - cusparseOrder_t col_order = CUSPARSE_ORDER_COL; - cusparseOrder_t row_order = CUSPARSE_ORDER_ROW; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - - auto num_A_rows = m; - auto num_A_cols = k; - auto num_B_rows = k; - auto num_B_cols = n; - auto num_C_rows = m; - auto num_C_cols = n; - unsigned alignment = 16; - auto lda = num_A_cols; - auto ldb = num_B_rows; - auto ldc = num_C_rows; - float _beta(0.0f); - - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", 1, m, n, k); - if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) { - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - else { - // initializing MatDesc takes a lot of time - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - sp_mat_A_desc_map_[mark] = mat_A; - sp_mat_B_desc_map_[mark] = mat_B; - sp_mat_C_desc_map_[mark] = mat_C; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &sp_mat_A_desc_map_[mark], - num_A_rows, - num_A_cols, - lda, - alignment, - Atype, - row_order, - CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, 
num_B_cols, ldb, alignment, Btype, col_order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype, col_order)) - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - mu_->lock(); - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols); - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size)) - - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream_}; - CHECK_CUSPARSE( - cusparseLtMatmul(&cusparselt_handle_, &plan, &alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - sync_check_cuda_error(); - mu_->unlock(); -} -#endif -} // namespace turbomind diff --git a/src/turbomind/utils/cublasINT8MMWrapper.h b/src/turbomind/utils/cublasINT8MMWrapper.h deleted file mode 100644 index 631ef1f842..0000000000 --- a/src/turbomind/utils/cublasINT8MMWrapper.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "cuda_utils.h" -#include "src/turbomind/layers/attention_layers/AttentionWeight.h" -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cublasMMWrapper.h" -#include -#include -#include -#include -#include -#include - -#pragma once -namespace turbomind { - -class cublasINT8MMWrapper: public cublasMMWrapper { -private: - bool use_ORDER_COL32_2R_4R4_; - -public: - cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4); - - cublasINT8MMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4); -#ifdef SPARSITY_ENABLED - cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - bool use_ORDER_COL32_2R_4R4); -#endif - - ~cublasINT8MMWrapper(); - - cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper); - - void Gemm(int* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const int8_t* ATransform, - const int8_t* kernel); - - void Gemm(int8_t* res, - int batchCount, - int m, - int n, - int k, - int64_t stridea, - int64_t strideb, - int64_t stridec, - const float alpha, - const int8_t* ATransform, - const int8_t* kernel); - - template - int getFusedINT8QKVType(const int k, const int n, const AttentionWeight* attention_weights); - - bool getUseOrderCol322R4R4(); - -#ifdef SPARSITY_ENABLED - void SpGemm(const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C); -#endif -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasMMWrapper.cc b/src/turbomind/utils/cublasMMWrapper.cc deleted file mode 100644 index cd70298b64..0000000000 --- a/src/turbomind/utils/cublasMMWrapper.cc +++ /dev/null @@ -1,1102 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cublasMMWrapper.h" -#include "cuda_utils.h" -#include "src/turbomind/macro.h" - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! 
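
// [Editorial aside, not part of the patch] The cublasAlgoMap removed above keys its tuned-GEMM
// lookups on a (batch, m, n, k, dtype) struct stored in an std::unordered_map with a hand-rolled
// hasher. A minimal, self-contained sketch of that pattern follows; GemmKey, GemmKeyHasher and
// AlgoTable are illustrative names (the real map stores a full cublasLtMatmulAlgo_info record,
// an int algorithm id stands in here).
#include <cstddef>
#include <unordered_map>

struct GemmKey {
    int batch, m, n, k, dtype;
    bool operator==(const GemmKey& o) const
    {
        return batch == o.batch && m == o.m && n == o.n && k == o.k && dtype == o.dtype;
    }
};

struct GemmKeyHasher {
    std::size_t operator()(const GemmKey& key) const
    {
        // Mix the fields with the same distinct prime multipliers the original hasher uses.
        return key.batch * 98317ull ^ key.m * 49157ull ^ key.n * 24593ull ^ key.k * 196613ull
               ^ static_cast<std::size_t>(key.dtype) * 6151ull;
    }
};

// Maps a problem shape to the id of the best cublasLt algorithm found offline.
using AlgoTable = std::unordered_map<GemmKey, int, GemmKeyHasher>;
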
-#endif - -namespace turbomind { -cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - IAllocator* allocator): - cublas_handle_(cublas_handle), - cublaslt_handle_(cublaslt_handle), - stream_(stream), - cublas_algo_map_(cublas_algo_map), - mu_(mu), - allocator_(allocator) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (allocator_ != nullptr) { - cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false); - } -} - -#ifdef SPARSITY_ENABLED -cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle, - cublasLtHandle_t cublaslt_handle, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* cublas_algo_map, - std::mutex* mu, - IAllocator* allocator): - cublas_handle_(cublas_handle), - cublaslt_handle_(cublaslt_handle), - cusparselt_handle_(cusparselt_handle), - stream_(stream), - cublas_algo_map_(cublas_algo_map), - mu_(mu), - allocator_(allocator) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (allocator_ != nullptr) { - cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false); - } -} -#endif - -cublasMMWrapper::~cublasMMWrapper() -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - mu_ = nullptr; - if (allocator_ != nullptr) { - allocator_->free((void**)(&cublas_workspace_)); - allocator_ = nullptr; - } -} - -cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper): - cublas_handle_(wrapper.cublas_handle_), - cublaslt_handle_(wrapper.cublaslt_handle_), -#ifdef SPARSITY_ENABLED - cusparselt_handle_(wrapper.cusparselt_handle_), -#endif - stream_(wrapper.stream_), - cublas_algo_map_(wrapper.cublas_algo_map_), - mu_(wrapper.mu_), - allocator_(wrapper.allocator_) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - if (allocator_ != nullptr) { - cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false); - } -} - -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* alpha, - const void* A, - cudaDataType_t Atype, - int lda, - const void* B, - cudaDataType_t Btype, - int ldb, - const void* beta, - void* C, - cudaDataType_t Ctype, - int ldc, - cudaDataType_t computeType, - cublasGemmAlgo_t algo) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - mu_->lock(); - check_cuda_error(cublasGemmEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - sync_check_cuda_error(); - mu_->unlock(); -} - -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* C, - const int ldc) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f); -} - -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* C, - const int ldc, - float f_alpha, - float f_beta) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - half h_alpha = (half)(f_alpha); - half h_beta = (half)(f_beta); - - mu_->lock(); - // TODO: default cublas libs - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - bool using_cublasLt = (Atype_ == CUDA_R_16F) ? 
true : false; - int batch_count = 1; - // fp32 use cublas as default - // fp16 use cublasLt as default - const void* alpha = is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - - int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(Atype_)); - - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - if (findAlgo) { - if (info.stages != -1) { - using_cublasLt = true; - } - else { - using_cublasLt = false; - } - } - - if (using_cublasLt) { - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cudaDataType_t scaleType; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType; -#else - cudaDataType_t computeType; -#endif - - if (is_fp16_computeType) { -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_16F; -#else - computeType = CUDA_R_16F; -#endif - scaleType = CUDA_R_16F; - } - else { -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_32F; -#else - computeType = CUDA_R_32F; -#endif - scaleType = CUDA_R_32F; - } - - // -------------------------------------- - // Create descriptors for the original matrices - cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); - cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); - cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - cublasLtMatmulDescCreate(&operationDesc, computeType); -#endif - - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); - - cublasLtMatmulAlgo_t algo; - void* workSpace = cublas_workspace_; - int workspaceSize = cublas_workspace_ == NULL ? 
0 : CUBLAS_WORKSPACE_SIZE; - if (findAlgo) { - if (info.workspaceSize > workspaceSize) { - findAlgo = 0; - } - else { - cublasLtMatmulAlgoInit( - cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(info.reductionScheme), - sizeof(info.reductionScheme)); - -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages)); -#endif - -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId)); - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, - &(info.cluster_shapeId), - sizeof(info.cluster_shapeId)); -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode)); -#endif - } - } - - cublasLtMatmul(cublaslt_handle_, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - C, - Cdesc, - (findAlgo == 1 ? 
(&algo) : NULL), - workSpace, - workspaceSize, - stream_); - - cublasLtMatmulDescDestroy(operationDesc); - cublasLtMatrixLayoutDestroy(Adesc); - cublasLtMatrixLayoutDestroy(Bdesc); - cublasLtMatrixLayoutDestroy(Cdesc); - sync_check_cuda_error(); - } - else { - int cublasAlgo = info.algoId; - check_cuda_error(cublasGemmEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype_, - lda, - B, - Btype_, - ldb, - beta, - C, - Ctype_, - ldc, - computeType_, - static_cast(cublasAlgo))); - } - mu_->unlock(); -} - -void cublasMMWrapper::setFP32GemmConfig() -{ - Atype_ = CUDA_R_32F; - Btype_ = CUDA_R_32F; - Ctype_ = CUDA_R_32F; - computeType_ = CUDA_R_32F; -} - -void cublasMMWrapper::setFP16GemmConfig() -{ - Atype_ = CUDA_R_16F; - Btype_ = CUDA_R_16F; - Ctype_ = CUDA_R_16F; - computeType_ = CUDA_R_32F; -} - -#ifdef ENABLE_BF16 -void cublasMMWrapper::setBF16GemmConfig() -{ - Atype_ = CUDA_R_16BF; - Btype_ = CUDA_R_16BF; - Ctype_ = CUDA_R_16BF; - computeType_ = CUDA_R_32F; -} -#endif - -void cublasMMWrapper::setGemmConfig(cudaDataType_t aType, - cudaDataType_t bType, - cudaDataType_t cType, - cudaDataType_t computeType) -{ - Atype_ = aType; - Btype_ = bType; - Ctype_ = cType; - computeType_ = computeType; -} - -CublasDataType cublasMMWrapper::getCublasDataType(cudaDataType_t data_type) -{ - if (data_type == CUDA_R_16F) { - return HALF_DATATYPE; - } - else if (data_type == CUDA_R_32F) { - return FLOAT_DATATYPE; - } -#ifdef ENABLE_BF16 - else if (data_type == CUDA_R_16BF) { - return BFLOAT16_DATATYPE; - } -#endif - return FLOAT_DATATYPE; -} - -#if (CUDART_VERSION >= 11000) -// input, weight, output are row-major -// only works for cublas 11.x -void cublasMMWrapper::Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - const void* bias, - void* C, - const int ldc) -{ - TM_LOG_DEBUG(__PRETTY_FUNCTION__); - cudaDataType_t Atype, Btype, Ctype; - cublasComputeType_t computeType; - cudaDataType_t scaleType; - float alpha_float = 1.0f; - float beta_float = 0.0f; - half alpha_half = half(1.0f); - half beta_half = half(0.0f); - void * alpha, *beta; - - // int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - if (Atype_ == CUDA_R_32F) { - computeType = CUBLAS_COMPUTE_32F_FAST_TF32; - Atype = CUDA_R_32F; - Btype = CUDA_R_32F; - Ctype = CUDA_R_32F; - scaleType = CUDA_R_32F; - alpha = &alpha_float; - beta = &beta_float; - } - else if (Atype_ == CUDA_R_16BF) { - computeType = CUBLAS_COMPUTE_32F_FAST_TF32; - Atype = CUDA_R_16BF; - Btype = CUDA_R_16BF; - Ctype = CUDA_R_16BF; - scaleType = CUDA_R_32F; - alpha = &alpha_float; - beta = &beta_float; - } - else { - computeType = CUBLAS_COMPUTE_16F; - Atype = CUDA_R_16F; - Btype = CUDA_R_16F; - Ctype = CUDA_R_16F; - scaleType = CUDA_R_16F; - alpha = &alpha_half; - beta = &beta_half; - } - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS; - cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda); - cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? 
n : k, ldb); - cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc); - - cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t)); - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*)); - check_cuda_error(cublasLtMatmul( - cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_)); - cublasLtMatrixLayoutDestroy(Adesc); - cublasLtMatrixLayoutDestroy(Bdesc); - cublasLtMatrixLayoutDestroy(Cdesc); - cublasLtMatmulDescDestroy(operationDesc); -} -#endif -void cublasMMWrapper::setStream(cudaStream_t stream) -{ - stream_ = stream; -} - -void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const int64_t strideA, - const void* B, - const int ldb, - const int64_t strideB, - void* C, - const int ldc, - const int64_t strideC, - const int batch_count, - const float f_alpha, - const float f_beta) -{ - half h_alpha = (half)f_alpha; - half h_beta = (half)f_beta; - - mu_->lock(); - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - const void* alpha = - is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - - check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype_, - lda, - strideA, - B, - Btype_, - ldb, - strideB, - beta, - C, - Ctype_, - ldc, - strideC, - batch_count, - computeType_, - static_cast(info.algoId))); - - mu_->unlock(); -} - -void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const float f_alpha, - const void* A, - cudaDataType_t AType, - const int lda, - const int64_t strideA, - const void* B, - cudaDataType_t BType, - const int ldb, - const int64_t strideB, - const float f_beta, - void* C, - cudaDataType_t CType, - const int ldc, - const int64_t strideC, - const int batch_count, - cudaDataType_t computeType) -{ - half h_alpha = (half)f_alpha; - half h_beta = (half)f_beta; - - mu_->lock(); - int is_fp16_computeType = computeType == CUDA_R_16F ? 1 : 0; - const void* alpha = - is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? 
reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - - check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - AType, - lda, - strideA, - B, - BType, - ldb, - strideB, - beta, - C, - CType, - ldc, - strideC, - batch_count, - computeType, - static_cast(info.algoId))); - - mu_->unlock(); -} - -void cublasMMWrapper::batchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* const* A, - const int lda, - const void* const* B, - const int ldb, - void* const* C, - const int ldc, - const int batch_count) -{ - float f_alpha = static_cast(1.0f); - float f_beta = static_cast(0.0f); - - half h_alpha = (half)1.0f; - half h_beta = (half)0.0f; - - mu_->lock(); - int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0; - const void* alpha = is_fp16_computeType ? reinterpret_cast(&h_alpha) : reinterpret_cast(&f_alpha); - const void* beta = is_fp16_computeType ? reinterpret_cast(&h_beta) : reinterpret_cast(&f_beta); - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_)); - - check_cuda_error(cublasGemmBatchedEx(cublas_handle_, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype_, - lda, - B, - Btype_, - ldb, - beta, - C, - Ctype_, - ldc, - batch_count, - computeType_, - static_cast(info.algoId))); - mu_->unlock(); -} - -bool cublasMMWrapper::isFuseBatchGemm(const int batch_count, const int m, const int k, const int n) -{ - CublasDataType data_type = getCublasDataType(Atype_); - - if (cublas_algo_map_->isExist(batch_count, m, k, n, data_type) == false - || cublas_algo_map_->isExist(1, m, k, n, data_type) == false) { - return false; - } - else { - return cublas_algo_map_->getAlgo(batch_count, m, k, n, data_type).exec_time - < 3 * cublas_algo_map_->getAlgo(1, m, k, n, data_type).exec_time; - } -} - -#ifdef SPARSITY_ENABLED -void cublasMMWrapper::SpGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const void* B, - void* C) -{ - if (Atype_ != CUDA_R_16F || Btype_ != CUDA_R_16F || Ctype_ != CUDA_R_16F) { - throw std::runtime_error("\n[TM][ERROR] sparse GEMM only supports FP16 data type now."); - } - static bool not_printed_fp32_accumulation_warning = true; - if (computeType_ != CUDA_R_16F && not_printed_fp32_accumulation_warning) { - printf("[TM][WARNING] cublasMMWrapper sets to FP32 compute type, " - "but sparse gemm will use FP16 compute type since cusparselt " - "supports FP16 accumulation only.\n"); - not_printed_fp32_accumulation_warning = false; - } - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = (transa == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; - cusparseOperation_t opB = (transb == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - - bool is_rowmajor = (order == CUSPARSE_ORDER_ROW); - bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE); - bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE); - auto num_A_rows = (isA_transposed) ? k : m; - auto num_A_cols = (isA_transposed) ? m : k; - auto num_B_rows = (isB_transposed) ? 
n : k; - auto num_B_cols = (isB_transposed) ? k : n; - auto num_C_rows = m; - auto num_C_cols = n; - unsigned alignment = 16; - auto lda = (is_rowmajor) ? num_A_cols : num_A_rows; - auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows; - auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows; - float _alpha(1.0f); - float _beta(0.0f); - - char mark[256]; - sprintf(mark, "%d_%d_%d_%d", 1, m, n, k); - if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) { - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - else { - // initializing MatDesc takes a lot of time - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - sp_mat_A_desc_map_[mark] = mat_A; - sp_mat_B_desc_map_[mark] = mat_B; - sp_mat_C_desc_map_[mark] = mat_C; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &sp_mat_A_desc_map_[mark], - num_A_rows, - num_A_cols, - lda, - alignment, - Atype_, - order, - CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, num_B_cols, ldb, alignment, Btype_, order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype_, order)) - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &sp_mat_A_desc_map_[mark], - &sp_mat_B_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - &sp_mat_C_desc_map_[mark], - compute_type)) - } - mu_->lock(); - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols); - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size)) - - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream_}; - CHECK_CUSPARSE( - cusparseLtMatmul(&cusparselt_handle_, &plan, &_alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - sync_check_cuda_error(); - mu_->unlock(); -} - -size_t cublasMMWrapper::getSparseMatrixSize(int m, int k) -{ - // Get a compressed matrix size of shape (m, k) used in cusparselt. 
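
// [Editorial aside, not part of the patch] Both SpGemm paths above cache cusparseLt matrix
// descriptors under a "batch_m_n_k" string key (built with sprintf("%d_%d_%d_%d", ...)) so the
// expensive descriptor initialisation runs only once per shape. The keying idiom, reduced to a
// self-contained sketch (descriptorForShape is an illustrative name; Desc must be
// default-constructible, mirroring the "insert uninitialised, then init in place" flow):
#include <cstdio>
#include <map>
#include <string>

template<typename Desc>
Desc& descriptorForShape(std::map<std::string, Desc>& cache, int batch, int m, int n, int k)
{
    char key[64];
    std::snprintf(key, sizeof(key), "%d_%d_%d_%d", batch, m, n, k);
    // operator[] default-constructs the descriptor on first use; callers check whether the
    // shape was already present before deciding to (re)initialise it.
    return cache[key];
}
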
- auto Atype_ = CUDA_R_16F; - cusparseOrder_t order = CUSPARSE_ORDER_COL; - unsigned alignment = 16; - int num_A_rows = m; - int num_A_cols = k; - int lda = num_A_rows; - - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &mat_A, - num_A_rows, - num_A_cols, - lda, - alignment, - Atype_, - order, - CUSPARSELT_SPARSITY_50_PERCENT)); - size_t compressed_size = 0; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &mat_A, &compressed_size)); - return compressed_size; -} - -void cublasMMWrapper::compressMatrix(const void* input, void* output, const int m, const int k) -{ - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseLtMatDescriptor_t mat_A; - unsigned alignment = 16; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &cusparselt_handle_, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &mat_A, true, opA, input, output, stream_)) - sync_check_cuda_error(); -} - -bool cublasMMWrapper::isUseSparse(const int batch_count, const int m, const int n, const int k) -{ - return cublas_algo_map_->isUseSparse(batch_count, m, n, k); -} -#endif - -std::pair cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - cudaStream_t stream) -{ -#if (CUBLAS_VERSION) <= 11601 - FT_CHECK_WITH_INFO(false, "CUBLAS version too low."); - return {false, cublasLtMatmulAlgo_t{}}; -#else - size_t returnSize; - int32_t pointer_mode; - cublasLtMatmulDescGetAttribute( - computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize); - - std::vector heuristics(200); - cublasLtMatmulPreference_t preference; - check_cuda_error(cublasLtMatmulPreferenceCreate(&preference)); - check_cuda_error(cublasLtMatmulPreferenceInit(preference)); - uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE; - check_cuda_error(cublasLtMatmulPreferenceSetAttribute( - preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); -#if (CUBLAS_VERSION) <= 12000 - uint32_t pointer_mode_mask = 0; - check_cuda_error(cublasLtMatmulPreferenceSetAttribute( - preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask))); -#endif - - int return_count = 0; - auto ret = cublasLtMatmulAlgoGetHeuristic(lightHandle, - computeDesc, - Adesc, - Bdesc, - Cdesc, - Ddesc, - preference, - heuristics.size(), - heuristics.data(), - &return_count); - heuristics.resize(return_count); - - std::map> algo_results; - for (const auto& heuristic : heuristics) { - cublasLtMatmulAlgo_t algo = heuristic.algo; - int32_t algo_id; - cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize); - - cudaEvent_t start_event, stop_event; - cudaEventCreate(&start_event); - cudaEventCreate(&stop_event); - - float my_alpha = 1.0f; - float my_beta = 0.0f; - - for (int i = 0; i < 11; i++) { - float duration_ms; - cudaEventRecord(start_event, stream); - check_cuda_error(cublasLtMatmul(lightHandle, - computeDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - cublas_workspace_, - CUBLAS_WORKSPACE_SIZE, - stream)); 
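
// [Editorial aside, not part of the patch] findBestAlgo above times each heuristic with CUDA
// events over 11 runs, sorts the samples, and later selects results[5], i.e. the median, to
// suppress timing jitter. The timing-and-median idiom in isolation (medianElapsedMs is an
// illustrative helper; the launch callable stands in for the cublasLtMatmul call, and error
// checking is omitted for brevity):
#include <algorithm>
#include <vector>
#include <cuda_runtime.h>

template<typename LaunchFn>
float medianElapsedMs(cudaStream_t stream, LaunchFn&& launch, int runs = 11)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    std::vector<float> ms(runs);
    for (int i = 0; i < runs; ++i) {
        cudaEventRecord(start, stream);
        launch(stream);                  // the operation under test
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);      // wait so the elapsed time is valid
        cudaEventElapsedTime(&ms[i], start, stop);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    std::sort(ms.begin(), ms.end());
    return ms[runs / 2];                 // median sample, as the original selection does
}
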
- cudaEventRecord(stop_event, stream); - cudaEventSynchronize(stop_event); - cudaEventElapsedTime(&duration_ms, start_event, stop_event); - - algo_results[algo_id].push_back(duration_ms); - } - std::sort(algo_results[algo_id].begin(), algo_results[algo_id].end()); - } - - cublasLtMatmulHeuristicResult_t result; - float best_time = INFINITY; - for (const auto& heuristic : heuristics) { - cublasLtMatmulAlgo_t algo = heuristic.algo; - int32_t algo_id; - cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize); - const auto& results = algo_results[algo_id]; - - if (results.size() > 0 && results[5] < best_time) { - best_time = results[5]; - result = heuristic; - } - } - - return {best_time != INFINITY, result.algo}; -#endif -} - -cublasMMWrapper::MatrixLayout cublasMMWrapper::createMatrixLayout(cublasLtMatrixLayout_t Mdesc) -{ - size_t returnSize; - MatrixLayout m_layout; - - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &std::get<0>(m_layout), sizeof(std::get<0>(m_layout)), &returnSize); - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &std::get<1>(m_layout), sizeof(std::get<1>(m_layout)), &returnSize); - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_ROWS, &std::get<2>(m_layout), sizeof(std::get<2>(m_layout)), &returnSize); - cublasLtMatrixLayoutGetAttribute( - Mdesc, CUBLASLT_MATRIX_LAYOUT_COLS, &std::get<3>(m_layout), sizeof(std::get<3>(m_layout)), &returnSize); - - return m_layout; -} - -cublasStatus_t cublasMMWrapper::cublasLtMatmulWrapper(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t* algo, - void* workspace, - size_t workspaceSizeInBytes, - cudaStream_t stream) -{ - cache_idx_t cache_idx{ - computeDesc, - {createMatrixLayout(Adesc), createMatrixLayout(Bdesc), createMatrixLayout(Cdesc), createMatrixLayout(Ddesc)}}; - - cublasLtMatmulAlgo_t algo_value; - bool found_algo = false; - if (algo == nullptr) { - if (algo_cache.find(cache_idx) == algo_cache.end()) { - auto result = - findBestAlgo(lightHandle, computeDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, D, Ddesc, stream); - if (result.first) { - algo_cache[cache_idx] = result.second; - algo_value = result.second; - found_algo = true; - } - } - else { - algo_value = algo_cache[cache_idx]; - found_algo = true; - } - } - - return cublasLtMatmul(lightHandle, - computeDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - found_algo ? &algo_value : algo, - workspace, - workspaceSizeInBytes, - stream); -} - -void cublasMMWrapper::_Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - void* C, - const int ldc, - const void* alpha, - const int mode, - const bool per_column_scaling) -{ - /* mode: - * - 0: int8 * int8 -> int32 -> int8 - * - 1: int8 * int8 -> int32 -> int32 - */ -#if (CUBLAS_VERSION) <= 11601 - FT_CHECK_WITH_INFO(false, "CUBLAS version too low."); -#else - - mu_->lock(); - const auto op_a = CUBLAS_OP_T; - const auto op_b = CUBLAS_OP_N; - const auto dataType = CUDA_R_8I; - const auto resultType = mode == 0 ? CUDA_R_8I : CUDA_R_32I; - const auto computeType = CUBLAS_COMPUTE_32I; - const auto scaleType = mode == 0 ? 
CUDA_R_32F : CUDA_R_32I; - const int batch_count = 1; - const void* beta; - - int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(dataType)); - - cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(dataType)); - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - - // -------------------------------------- - // Create descriptors for the original matrices - check_cuda_error(cublasLtMatrixLayoutCreate(&Adesc, dataType, k, m, lda)); - check_cuda_error(cublasLtMatrixLayoutCreate(&Bdesc, dataType, k, n, ldb)); - check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, resultType, m, n, ldc)); - - check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType)); - - auto pointer_mode = CUBLASLT_POINTER_MODE_HOST; - if (mode == 0) { - pointer_mode = - per_column_scaling ? CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST : CUBLASLT_POINTER_MODE_DEVICE; - } - check_cuda_error( - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_a, sizeof(cublasOperation_t))); - check_cuda_error( - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_b, sizeof(cublasOperation_t))); - check_cuda_error( - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSC, &op_b, sizeof(cublasOperation_t))); - check_cuda_error(cublasLtMatmulDescSetAttribute( - operationDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode))); - - const int32_t int_one = 1; - const int32_t int_zero = 0; - const float float_zero = 0; - if (mode == 0) { - beta = per_column_scaling ? &float_zero : NULL; - } - else { - alpha = &int_one; - beta = &int_zero; - } - - cublasLtMatmulAlgo_t algo; - void* workSpace = cublas_workspace_; - int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE; - - sync_check_cuda_error(); - auto ret = cublasLtMatmulWrapper(cublaslt_handle_, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - C, - Cdesc, - NULL, - workSpace, - workspaceSize, - stream_); - check_cuda_error(ret); - sync_check_cuda_error(); - - cublasLtMatmulDescDestroy(operationDesc); - cublasLtMatrixLayoutDestroy(Adesc); - cublasLtMatrixLayoutDestroy(Bdesc); - cublasLtMatrixLayoutDestroy(Cdesc); - sync_check_cuda_error(); - mu_->unlock(); -#endif -} - -void cublasMMWrapper::Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int8_t* C, - const int ldc, - const float* alpha, - const bool per_column_scaling) -{ - return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, alpha, 0, per_column_scaling); -} - -void cublasMMWrapper::Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int32_t* C, - const int ldc) -{ - return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, (float*)nullptr, 1, false); -} - -} // namespace turbomind diff --git a/src/turbomind/utils/cublasMMWrapper.h b/src/turbomind/utils/cublasMMWrapper.h deleted file mode 100644 index 0f90a44057..0000000000 --- a/src/turbomind/utils/cublasMMWrapper.h +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuda_utils.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasAlgoMap.h" -#include -#include -#include -#include -#include -#include -#include - -#pragma once -namespace turbomind { - -class cublasMMWrapper { -protected: - cublasHandle_t cublas_handle_; - cublasLtHandle_t cublaslt_handle_; -#ifdef SPARSITY_ENABLED - cusparseLtHandle_t cusparselt_handle_; - std::map sp_mat_A_desc_map_; - std::map sp_mat_B_desc_map_; - std::map sp_mat_C_desc_map_; -#endif - - cudaDataType_t Atype_; - cudaDataType_t Btype_; - cudaDataType_t Ctype_; - cudaDataType_t computeType_; - - cudaStream_t stream_; - cublasAlgoMap* cublas_algo_map_; - std::mutex* mu_; - - IAllocator* allocator_ = nullptr; - void* cublas_workspace_ = nullptr; - - friend class cublasINT8MMWrapper; - - void _Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - void* C, - const int ldc, - const void* alpha, - const int mode, - const bool per_column_scaling); - -public: - cublasMMWrapper(cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - IAllocator* allocator); - -#ifdef SPARSITY_ENABLED - cublasMMWrapper(cublasHandle_t cublas_handle_, - cublasLtHandle_t cublaslt_handle_, - cusparseLtHandle_t cusparselt_handle, - cudaStream_t stream, - cublasAlgoMap* map, - std::mutex* mu, - IAllocator* allocator); -#endif - - virtual ~cublasMMWrapper(); - - cublasMMWrapper(const cublasMMWrapper& wrapper); - - virtual void cublasVersionCheck() - { - return; - }; - cublasStatus_t cublasLtMatmulWrapper(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t* algo, - void* workspace, - size_t workspaceSizeInBytes, - cudaStream_t stream); - - std::pair findBestAlgo(cublasLtHandle_t lightHandle, - cublasLtMatmulDesc_t computeDesc, - const void* alpha, - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - cudaStream_t stream); - - using MatrixLayout = std::tuple; - using cache_idx_t = std::tuple>; - std::map algo_cache; - - MatrixLayout createMatrixLayout(cublasLtMatrixLayout_t Mdesc); - - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* alpha, - const void* A, - cudaDataType_t Atype, - int lda, - const void* B, - cudaDataType_t Btype, - int ldb, - const void* beta, - void* C, - cudaDataType_t Ctype, - int ldc, - cudaDataType_t computeType, - cublasGemmAlgo_t algo); - - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* 
C, - const int ldc); - - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - void* C, - const int ldc, - float f_alpha, - float f_beta); - - void Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int8_t* C, - const int ldc, - const float* alpha, - const bool per_column_scaling = false); - - void Int8Gemm(const int m, - const int n, - const int k, - const int8_t* A, - const int lda, - const int8_t* B, - const int ldb, - int32_t* C, - const int ldc); - - void setFP32GemmConfig(); - void setFP16GemmConfig(); -#ifdef ENABLE_BF16 - void setBF16GemmConfig(); -#endif - void setStream(cudaStream_t stream); - - void setGemmConfig(cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType); - - CublasDataType getCublasDataType(cudaDataType_t data_type); - -#if (CUDART_VERSION >= 11000) - void Gemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const void* B, - const int ldb, - const void* bias, - void* C, - const int ldc); -#endif - - void stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const int lda, - const int64_t strideA, - const void* B, - const int ldb, - const int64_t strideB, - void* C, - const int ldc, - const int64_t strideC, - const int batchCount, - const float f_alpha = 1.0f, - const float f_beta = 0.0f); - - void stridedBatchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const float f_alpha, - const void* A, - cudaDataType_t AType, - const int lda, - const int64_t strideA, - const void* B, - cudaDataType_t BType, - const int ldb, - const int64_t strideB, - const float f_beta, - void* C, - cudaDataType_t CType, - const int ldc, - const int64_t strideC, - const int batch_count, - cudaDataType_t computeType); - - void batchedGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* const* A, - const int lda, - const void* const* B, - const int ldb, - void* const* C, - const int ldc, - const int batch_count); - - bool isFuseBatchGemm(const int batch_count, const int m, const int k, const int n); - -#ifdef SPARSITY_ENABLED - void SpGemm(cublasOperation_t transa, - cublasOperation_t transb, - const int m, - const int n, - const int k, - const void* A, - const void* B, - void* C); - - size_t getSparseMatrixSize(int m, int k); - void compressMatrix(const void* input, void* output, const int m, const int k); - - bool isUseSparse(const int batch_count, const int m, const int n, const int k); -#endif -}; - -} // namespace turbomind diff --git a/src/turbomind/utils/cuda_fp8_utils.cu b/src/turbomind/utils/cuda_fp8_utils.cu deleted file mode 100644 index 5651dab2e7..0000000000 --- a/src/turbomind/utils/cuda_fp8_utils.cu +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cuda_fp8_utils.h" - -namespace turbomind { -#ifdef ENABLE_FP8 - -template -__global__ void quantizeMatrix(T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n) -{ - for (uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; i < size; i += blockDim.x * gridDim.x) { - if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) { - output[i] = T_OUT((float)(input[i]) * __ldg(input_scale + (i % n))); - } - else { - output[i] = T_OUT((float)(input[i]) * __ldg(input_scale)); - } - } -} - -template -void invokeQuantizeMatrix( - T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream) -{ - dim3 grid(32); - dim3 block(256); - quantizeMatrix<<>>(output, input_scale, input, size, n); -} - -#define defineinvokeQuantizeMatrix(type_out, type_in, mode) \ - template void invokeQuantizeMatrix(type_out * output, \ - float const* input_scale, \ - type_in const* input, \ - uint32_t size, \ - uint32_t n, \ - cudaStream_t stream); - -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR); -#ifdef ENABLE_BF16 -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_TENSOR); -defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL); -defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR); -#endif - -template -__global__ void fakeQuantize(T_OUT* dst, const T_IN* src, const int size) -{ - for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - T_FAKE tmp = (T_FAKE)((float)src[tid]); - dst[tid] = (T_OUT)((float)tmp); - } -} - -template -void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream) -{ - fakeQuantize<<<256, 256, 0, stream>>>(dst, src, size); -} - -template void -invokeFakeQuantize(float* dst, const float* src, const int size, cudaStream_t stream); -template void -invokeFakeQuantize(half* dst, const half* src, const int size, cudaStream_t stream); -template void invokeFakeQuantize<__nv_bfloat16, __nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* dst, - const __nv_bfloat16* src, - const int size, - cudaStream_t stream); - -template -__global__ void computeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n) -{ - float max = -10000.f; - for (int i = 0; i < k; i++) { - float val = fabs((float)weights[i * n + blockIdx.x * blockDim.x + threadIdx.x]); - max = max > val ? 
max : val; - if (threadIdx.x == 0 && blockIdx.x == 0 && i % 100 == 0) { - printf("max: %f, val: %f \n", max, val); - } - } - // quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f; - // quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = FP8_E4M3_MAX / max; - quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = std::max(max / FP8_E4M3_MAX, 1.0f / 32.f); -} - -template -void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream) -{ - dim3 block(256); - dim3 grid; - grid.x = (n + 255) / 256; - computeFP8QuantizeScale<<>>(quant_ptr, weights, k, n); -} - -#ifdef ENABLE_BF16 -template void invokeComputeFP8QuantizeScale( - float* quant_ptr, const __nv_bfloat16* weights, const int k, const int n, cudaStream_t stream); -#endif -template void -invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream); - -#endif // ENABLE_FP8 -} // namespace turbomind diff --git a/src/turbomind/utils/cuda_fp8_utils.h b/src/turbomind/utils/cuda_fp8_utils.h deleted file mode 100644 index ba7f91c8bf..0000000000 --- a/src/turbomind/utils/cuda_fp8_utils.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#ifdef ENABLE_FP8 -#include -#include -#include - -// #define FP8_MHA -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900 -#define FUSE_GEMM_ACT -#endif -#define FP8_GEMM_OUTPUT_QUANT_DISABLE - -#ifdef FUSE_GEMM_ACT -#define USE_QGMMA -#endif - -namespace turbomind { - -const float FP8_E4M3_MAX = 480.0f; - -enum QUANTIZE_MODE -{ - PER_CHANNEL, - PER_TENSOR, - PER_CHANNEL_WEIGHT_PER_TENSOR_ACT -}; - -// Packed Data Type -typedef struct __CUDA_ALIGN__(32) { - float array[8]; -} float8; - -typedef struct __CUDA_ALIGN__(16) { - half array[8]; -} half8; - -#ifdef ENABLE_BF16 -typedef struct __CUDA_ALIGN__(4) { - __nv_bfloat16 array[2]; -} __nv_bfloat16_2; - -typedef struct __CUDA_ALIGN__(8) { - __nv_bfloat162 x, y; -} __nv_bfloat162_2_xy; - -typedef struct __CUDA_ALIGN__(8) { - __nv_bfloat16 array[4]; -} __nv_bfloat164; - -typedef struct __CUDA_ALIGN__(8) { - __nv_bfloat162 array[2]; -} __nv_bfloat162_2; - -typedef struct __CUDA_ALIGN__(16) { - __nv_bfloat16 array[8]; -} __nv_bfloat168; - -typedef struct __CUDA_ALIGN__(16) { - __nv_bfloat162 array[4]; -} __nv_bfloat162_4; - -typedef struct __CUDA_ALIGN__(32) { - __nv_bfloat16 array[16]; -} __nv_bfloat1616; -#endif - -#ifdef ENABLE_FP8 -typedef struct __CUDA_ALIGN__(2) { - __nv_fp8_e4m3 array[2]; -} __nv_fp8_2_e4m3; - -typedef struct __CUDA_ALIGN__(4) { - __nv_fp8_e4m3 array[4]; -} __nv_fp8_4_e4m3; - -typedef struct __CUDA_ALIGN__(4) { - __nv_fp8x2_e4m3 array[2]; -} __nv_fp8x2_x2_e4m3; - -typedef struct __CUDA_ALIGN__(8) { - __nv_fp8_e4m3 array[8]; -} __nv_fp8_8_e4m3; - -typedef struct __CUDA_ALIGN__(8) { - __nv_fp8x2_e4m3 array[4]; -} __nv_fp8x2_x4_e4m3; - -typedef struct __CUDA_ALIGN__(16) { - __nv_fp8_e4m3 array[16]; -} __nv_fp8x16_e4m3; -#endif - -// only BF16 and FP8 -template -struct PackType { - using type = float; -}; - -#ifdef ENABLE_BF16 -template<> -struct PackType<__nv_bfloat16, 2> { - using type = __nv_bfloat16_2; -}; - -template<> -struct PackType<__nv_bfloat16, 4> { - using type = __nv_bfloat164; -}; - -template<> -struct PackType<__nv_bfloat16, 8> { - using type = __nv_bfloat168; -}; -#endif - -#ifdef ENABLE_FP8 -template<> -struct PackType<__nv_fp8_e4m3, 2> { - using type = __nv_fp8_2_e4m3; -}; - -template<> -struct PackType<__nv_fp8_e4m3, 4> { - using type = __nv_fp8_4_e4m3; -}; - -template<> -struct PackType<__nv_fp8_e4m3, 8> { - using type = __nv_fp8_8_e4m3; -}; -#endif - -__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, const __nv_fp8x4_e4m3* in) -{ - const char4 tmp_val = reinterpret_cast(in)[0]; - *out1 = __nv_bfloat162((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - *out2 = __nv_bfloat162((float)reinterpret_cast(&tmp_val.z)[0], - (float)reinterpret_cast(&tmp_val.w)[0]); -} - -__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(const __nv_fp8x2_e4m3* in) -{ - const char2 tmp_val = reinterpret_cast(in)[0]; - __nv_bfloat162 out = __nv_bfloat162((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - return out; -} - -__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, const __nv_fp8x4_e4m3* in) -{ - const char4 tmp_val = reinterpret_cast(in)[0]; - *out1 = half2((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - *out2 = half2((float)reinterpret_cast(&tmp_val.z)[0], - (float)reinterpret_cast(&tmp_val.w)[0]); -} - -__inline__ __device__ half2 fp8x2_e4m3_to_half2(const __nv_fp8x2_e4m3* in) -{ - const char2 tmp_val = 
reinterpret_cast(in)[0]; - half2 out = half2((float)reinterpret_cast(&tmp_val.x)[0], - (float)reinterpret_cast(&tmp_val.y)[0]); - return out; -} - -template -void invokeQuantizeMatrix( - T_OUT* output, float const* input_qua_amax_ptr, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream); - -template -void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream); - -template -void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream); - -} // namespace turbomind -#endif // ENABLE_FP8 diff --git a/src/turbomind/utils/cuda_type_utils.cuh b/src/turbomind/utils/cuda_type_utils.cuh index f7f7b95273..0b03442c74 100644 --- a/src/turbomind/utils/cuda_type_utils.cuh +++ b/src/turbomind/utils/cuda_type_utils.cuh @@ -18,7 +18,6 @@ #include "src/turbomind/utils/cuda_bf16_fallbacks.cuh" #include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" #include #include diff --git a/src/turbomind/utils/cuda_utils.cc b/src/turbomind/utils/cuda_utils.cc index 95b6e87c5c..455b7826cc 100644 --- a/src/turbomind/utils/cuda_utils.cc +++ b/src/turbomind/utils/cuda_utils.cc @@ -16,124 +16,31 @@ #include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/macro.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" #include namespace turbomind { -/* **************************** debug tools ********************************* */ - -template -void print_to_file(const T* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode) -{ - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); - printf("[INFO] file: %s with size %d.\n", file, size); - std::ofstream outFile(file, open_mode); - if (outFile) { - T* tmp = new T[size]; - check_cuda_error(cudaMemcpyAsync(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost, stream)); - for (int i = 0; i < size; ++i) { - float val = (float)(tmp[i]); - outFile << val << std::endl; - } - delete[] tmp; - } - else { - throw std::runtime_error(std::string("[TM][ERROR] Cannot open file: ") + file + "\n"); - } - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); -} - -template void -print_to_file(const float* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode); -template void -print_to_file(const half* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode); -#ifdef ENABLE_BF16 -template void print_to_file( - const __nv_bfloat16* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode); -#endif - -template -void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name) +void syncAndCheck(const char* const file, int const line) { - if (buf == nullptr) { - TM_LOG_WARNING("It is an nullptr, skip!"); - return; - } - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); - T* h_tmp = new T[size]; - cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream); - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); - double sum = 0.0f; - uint64_t zero_count = 0; - float max_val = -1e10; - bool find_inf = false; - for (uint i = 0; i < size; i++) { - if (std::isinf((float)(h_tmp[i]))) { - find_inf = true; - continue; - } - sum += abs((double)h_tmp[i]); - if ((float)h_tmp[i] == 0.0f) { - zero_count++; + // When FT_DEBUG_LEVEL=DEBUG, must check error + static char* level_name = std::getenv("TM_DEBUG_LEVEL"); + if (level_name != 
nullptr) { + static std::string level = std::string(level_name); + if (level == "DEBUG") { + cudaDeviceSynchronize(); + cudaError_t result = cudaGetLastError(); + if (result) { + TM_LOG_ERROR((std::string("CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + file + ":" + + std::to_string(line)) + .c_str()); + std::abort(); + } + TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); } - max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i])); } - printf("[TM][INFO] %20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s", - name.c_str(), - size, - sum / size, - sum, - max_val, - find_inf ? "true" : "false"); - std::cout << std::endl; - delete[] h_tmp; - cudaDeviceSynchronize(); - check_cuda_error(cudaGetLastError()); } -template void print_abs_mean(const float* buf, uint size, cudaStream_t stream, std::string name); -template void print_abs_mean(const half* buf, uint size, cudaStream_t stream, std::string name); -#ifdef ENABLE_BF16 -template void print_abs_mean(const __nv_bfloat16* buf, uint size, cudaStream_t stream, std::string name); -#endif -template void print_abs_mean(const int* buf, uint size, cudaStream_t stream, std::string name); -template void print_abs_mean(const uint* buf, uint size, cudaStream_t stream, std::string name); -template void print_abs_mean(const int8_t* buf, uint size, cudaStream_t stream, std::string name); -#ifdef ENABLE_FP8 -template void print_abs_mean(const __nv_fp8_e4m3* buf, uint size, cudaStream_t stream, std::string name); -#endif - -template -void print_to_screen(const T* result, const int size) -{ - if (result == nullptr) { - TM_LOG_WARNING("It is an nullptr, skip! \n"); - return; - } - T* tmp = reinterpret_cast(malloc(sizeof(T) * size)); - check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost)); - for (int i = 0; i < size; ++i) { - printf("%d, %f\n", i, static_cast(tmp[i])); - } - free(tmp); -} - -template void print_to_screen(const float* result, const int size); -template void print_to_screen(const half* result, const int size); -#ifdef ENABLE_BF16 -template void print_to_screen(const __nv_bfloat16* result, const int size); -#endif -template void print_to_screen(const int* result, const int size); -template void print_to_screen(const uint* result, const int size); -template void print_to_screen(const bool* result, const int size); -#ifdef ENABLE_FP8 -template void print_to_screen(const __nv_fp8_e4m3* result, const int size); -#endif +/* **************************** debug tools ********************************* */ template void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr) @@ -335,35 +242,38 @@ template void check_abs_mean_val(const __nv_bfloat16* result, const int size); /* ***************************** common utils ****************************** */ -cudaError_t getSetDevice(int i_device, int* o_device) +int getSMVersion() { - int current_dev_id = 0; - cudaError_t err = cudaSuccess; + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + int sm_major = 0; + int sm_minor = 0; + check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); + check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); + return sm_major * 10 + sm_minor; +} - if (o_device != NULL) { - err = cudaGetDevice(¤t_dev_id); - if (err != cudaSuccess) { - return err; - } - if (current_dev_id == i_device) { - *o_device = i_device; - } - else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return 
err; - } - *o_device = current_dev_id; - } - } - else { - err = cudaSetDevice(i_device); - if (err != cudaSuccess) { - return err; - } - } +std::string getDeviceName() +{ + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + cudaDeviceProp props; + check_cuda_error(cudaGetDeviceProperties(&props, device)); + return std::string(props.name); +} - return cudaSuccess; +int getDevice() +{ + int current_dev_id = 0; + check_cuda_error(cudaGetDevice(¤t_dev_id)); + return current_dev_id; +} + +int getDeviceCount() +{ + int count = 0; + check_cuda_error(cudaGetDeviceCount(&count)); + return count; } bool is_16xx_series(const char* name) diff --git a/src/turbomind/utils/cuda_utils.h b/src/turbomind/utils/cuda_utils.h index d764bb343a..543d90812a 100644 --- a/src/turbomind/utils/cuda_utils.h +++ b/src/turbomind/utils/cuda_utils.h @@ -37,46 +37,6 @@ namespace turbomind { -#define MAX_CONFIG_NUM 20 -#define COL32_ 32 -// workspace for cublas gemm : 32MB -#define CUBLAS_WORKSPACE_SIZE 33554432 - -typedef struct __align__(4) -{ - half x, y, z, w; -} -half4; - -/* **************************** type definition ***************************** */ - -enum CublasDataType -{ - FLOAT_DATATYPE = 0, - HALF_DATATYPE = 1, - BFLOAT16_DATATYPE = 2, - INT8_DATATYPE = 3, - FP8_DATATYPE = 4 -}; - -enum FtCudaDataType -{ - FP32 = 0, - FP16 = 1, - BF16 = 2, - INT8 = 3, - FP8 = 4 -}; - -enum class OperationType -{ - FP32, - FP16, - BF16, - INT8, - FP8 -}; - /* **************************** debug tools ********************************* */ static const char* _cudaGetErrorEnum(cudaError_t error) { @@ -123,40 +83,17 @@ template void check(T result, char const* const func, const char* const file, int const line) { if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " - + file + ":" + std::to_string(line) + " \n"); + TM_LOG_ERROR((std::string("CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " + file + ":" + + std::to_string(line)) + .c_str()); + std::abort(); } } #define check_cuda_error(val) check((val), #val, __FILE__, __LINE__) #define check_cuda_error_2(val, file, line) check((val), #val, file, line) -inline void syncAndCheck(const char* const file, int const line) -{ - // When FT_DEBUG_LEVEL=DEBUG, must check error - static char* level_name = std::getenv("TM_DEBUG_LEVEL"); - if (level_name != nullptr) { - static std::string level = std::string(level_name); - if (level == "DEBUG") { - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) - + " " + file + ":" + std::to_string(line) + " \n"); - } - TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line)); - } - } - -#ifndef NDEBUG - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " " - + file + ":" + std::to_string(line) + " \n"); - } -#endif -} +void syncAndCheck(const char* const file, int const line); #define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__) @@ -179,19 +116,6 @@ inline void syncAndCheck(const char* const file, int const line) } \ } -template -void print_to_file(const T* result, - const int size, - const char* file, - cudaStream_t stream = 0, - std::ios::openmode open_mode = std::ios::out); - -template -void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name = ""); 
- -template -void print_to_screen(const T* result, const int size); - template void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr); @@ -223,10 +147,10 @@ inline void myAssert(bool result, const char* const file, int const line, std::s } } -#define FT_CHECK(val) myAssert(val, __FILE__, __LINE__) +#define FT_CHECK(val) myAssert(bool(val), __FILE__, __LINE__) #define FT_CHECK_WITH_INFO(val, info) \ do { \ - bool is_valid_val = (val); \ + bool is_valid_val = bool(val); \ if (!is_valid_val) { \ turbomind::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \ } \ @@ -234,89 +158,11 @@ inline void myAssert(bool result, const char* const file, int const line, std::s #define FT_THROW(info) throwRuntimeError(__FILE__, __LINE__, info) -#ifdef SPARSITY_ENABLED -#define CHECK_CUSPARSE(func) \ - { \ - cusparseStatus_t status = (func); \ - if (status != CUSPARSE_STATUS_SUCCESS) { \ - throw std::runtime_error(std::string("[TM][ERROR] CUSPARSE API failed at line ") \ - + std::to_string(__LINE__) + " in file " + __FILE__ + ": " \ - + cusparseGetErrorString(status) + " " + std::to_string(status)); \ - } \ - } -#endif - -/*************Time Handling**************/ -class CudaTimer { -private: - cudaEvent_t event_start_; - cudaEvent_t event_stop_; - cudaStream_t stream_; - -public: - explicit CudaTimer(cudaStream_t stream = 0) - { - stream_ = stream; - } - void start() - { - check_cuda_error(cudaEventCreate(&event_start_)); - check_cuda_error(cudaEventCreate(&event_stop_)); - check_cuda_error(cudaEventRecord(event_start_, stream_)); - } - float stop() - { - float time; - check_cuda_error(cudaEventRecord(event_stop_, stream_)); - check_cuda_error(cudaEventSynchronize(event_stop_)); - check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_)); - check_cuda_error(cudaEventDestroy(event_start_)); - check_cuda_error(cudaEventDestroy(event_stop_)); - return time; - } - ~CudaTimer() {} -}; - /* ***************************** common utils ****************************** */ -inline void print_mem_usage(std::string time = "after allocation") -{ - size_t free_bytes, total_bytes; - check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes)); - float free = static_cast(free_bytes) / 1024.0 / 1024.0 / 1024.0; - float total = static_cast(total_bytes) / 1024.0 / 1024.0 / 1024.0; - float used = total - free; - printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used); -} - -inline int getSMVersion() -{ - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - int sm_major = 0; - int sm_minor = 0; - check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device)); - check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device)); - return sm_major * 10 + sm_minor; -} - -inline int getMaxSharedMemoryPerBlock() -{ - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - int max_shared_memory_size = 0; - check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, cudaDevAttrMaxSharedMemoryPerBlock, device)); - return max_shared_memory_size; -} +int getSMVersion(); -inline std::string getDeviceName() -{ - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - cudaDeviceProp props; - check_cuda_error(cudaGetDeviceProperties(&props, device)); - return std::string(props.name); -} +std::string getDeviceName(); template inline T div_up(T a, T n) @@ -324,175 +170,9 @@ inline T div_up(T a, T n) return (a + n - 1) / n; } -cudaError_t getSetDevice(int i_device, int* o_device = NULL); - 
-inline int getDevice() -{ - int current_dev_id = 0; - check_cuda_error(cudaGetDevice(¤t_dev_id)); - return current_dev_id; -} - -inline int getDeviceCount() -{ - int count = 0; - check_cuda_error(cudaGetDeviceCount(&count)); - return count; -} +int getDevice(); -template -CublasDataType getCublasDataType() -{ - if (std::is_same::value) { - return HALF_DATATYPE; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - return BFLOAT16_DATATYPE; - } -#endif - else if (std::is_same::value) { - return FLOAT_DATATYPE; - } - else { - FT_CHECK(false); - return FLOAT_DATATYPE; - } -} - -template -cudaDataType_t getCudaDataType() -{ - if (std::is_same::value) { - return CUDA_R_16F; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - return CUDA_R_16BF; - } -#endif - else if (std::is_same::value) { - return CUDA_R_32F; - } - else { - FT_CHECK(false); - return CUDA_R_32F; - } -} - -template -struct getTypeFromCudaDataType { - using Type = float; -}; - -template<> -struct getTypeFromCudaDataType { - using Type = half; -}; - -#ifdef ENABLE_BF16 -template<> -struct getTypeFromCudaDataType { - using Type = __nv_bfloat16; -}; -#endif - -// clang-format off -template struct packed_type; -template <> struct packed_type { using type = float; }; // we don't need to pack float by default -template <> struct packed_type { using type = half2; }; - -#ifdef ENABLE_BF16 -template<> -struct packed_type<__nv_bfloat16> { - using type = __nv_bfloat162; -}; -#endif - -template struct num_elems; -template <> struct num_elems { static constexpr int value = 1; }; -template <> struct num_elems { static constexpr int value = 2; }; -template <> struct num_elems { static constexpr int value = 4; }; -template <> struct num_elems { static constexpr int value = 1; }; -template <> struct num_elems { static constexpr int value = 2; }; -#ifdef ENABLE_BF16 -template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; }; -template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; }; -#endif - -template struct packed_as; -template struct packed_as { using type = T; }; -template<> struct packed_as { using type = half2; }; -template<> struct packed_as { using type = float2; }; -template<> struct packed_as { using type = int16_t; }; -template<> struct packed_as { using type = int2; }; -template<> struct packed_as { using type = half; }; -#ifdef ENABLE_BF16 -template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; }; -template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; }; -#endif - -inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } -inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } -// clang-format on - -template -void compareTwoTensor( - const T1* pred, const T2* ref, const int size, const int print_size = 0, const std::string filename = "") -{ - T1* h_pred = new T1[size]; - T2* h_ref = new T2[size]; - check_cuda_error(cudaMemcpy(h_pred, pred, size * sizeof(T1), cudaMemcpyDeviceToHost)); - check_cuda_error(cudaMemcpy(h_ref, ref, size * sizeof(T2), cudaMemcpyDeviceToHost)); - - FILE* fd = nullptr; - if (filename != "") { - fd = fopen(filename.c_str(), "w"); - fprintf(fd, "| %10s | %10s | %10s | %10s | \n", "pred", "ref", "abs_diff", "rel_diff(%)"); - } - - if (print_size > 0) { - TM_LOG_INFO(" id | pred | ref |abs diff | rel diff (%) |"); - } - float mean_abs_diff = 0.0f; - float mean_rel_diff = 0.0f; - int count = 0; - for (int i = 0; i < size; i++) { 
- if (i < print_size) { - TM_LOG_INFO("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |", - i, - (float)h_pred[i], - (float)h_ref[i], - abs((float)h_pred[i] - (float)h_ref[i]), - abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); - } - if ((float)h_pred[i] == 0) { - continue; - } - count += 1; - mean_abs_diff += abs((float)h_pred[i] - (float)h_ref[i]); - mean_rel_diff += abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f; - - if (fd != nullptr) { - fprintf(fd, - "| %10.5f | %10.5f | %10.5f | %11.5f |\n", - (float)h_pred[i], - (float)h_ref[i], - abs((float)h_pred[i] - (float)h_ref[i]), - abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f); - } - } - mean_abs_diff = mean_abs_diff / (float)count; - mean_rel_diff = mean_rel_diff / (float)count; - TM_LOG_INFO("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); - - if (fd != nullptr) { - fprintf(fd, "mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff); - fclose(fd); - } - delete[] h_pred; - delete[] h_ref; -} +int getDeviceCount(); bool is_16xx_series(const char* name); diff --git a/src/turbomind/utils/gemm.cc b/src/turbomind/utils/gemm.cc deleted file mode 100644 index 097c9a19e9..0000000000 --- a/src/turbomind/utils/gemm.cc +++ /dev/null @@ -1,1184 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/gemm.h" - -namespace turbomind { - -/* ***************************** GEMM Impl ******************************** */ - -Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file) -{ - allocator_ = allocator; - stream_ = stream; - mutex_ = new std::mutex(); // mutex per process - check_cuda_error(cublasCreate(&cublas_handle_)); - check_cuda_error(cublasLtCreate(&cublaslt_handle_)); - check_cuda_error(cublasSetStream(cublas_handle_, stream)); - - if (allocator_ != nullptr) { - workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE); - } - loadGemmConfig(config_file); -} - -Gemm::~Gemm() -{ - if (allocator_ != nullptr) { - allocator_->free((void**)(&workspace_)); - allocator_ = nullptr; - } - cublasLtDestroy(cublaslt_handle_); - cublasDestroy(cublas_handle_); - delete cublas_algo_map_; - delete mutex_; -} - -std::string Gemm::toString() -{ - const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* compute_type_str = compute_type_ == TYPE_FP16 ? 
"FP16" : "FP32"; - return fmtstr( - "Gemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]", a_type_str, b_type_str, c_type_str, compute_type_str); -} - -void Gemm::setAllocator(IAllocator* allocator) -{ - if (allocator_ != nullptr && workspace_ != nullptr) { - allocator_->free((void**)(&workspace_)); - } - allocator_ = allocator; - if (allocator_ != nullptr) { - workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE); - } -} - -void Gemm::setCudaStream(cudaStream_t& stream) -{ - stream_ = stream; - cublasSetStream(cublas_handle_, stream); -} - -void Gemm::setComputeType(DataType compute_type) -{ - checkDataTypeValidity(compute_type); - compute_type_ = compute_type; -} - -void Gemm::setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type) -{ - checkDataTypeValidity(a_type); - checkDataTypeValidity(b_type); - checkDataTypeValidity(c_type); - a_type_ = a_type; - b_type_ = b_type; - c_type_ = c_type; - setComputeType(compute_type); -} - -template -void Gemm::setDefaultTypes() -{ - if (std::is_same::value) { - setTypes(TYPE_FP32, TYPE_FP32, TYPE_FP32, TYPE_FP32); - } - else if (std::is_same::value) { - setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP16); - } - else { - throw GemmNotSupportedException("Gemm supports float or half type."); - } -} - -void Gemm::loadGemmConfig(std::string config_file) -{ - if (cublas_algo_map_ != nullptr) { - delete cublas_algo_map_; // unload the previous cublas map. - } - cublas_algo_map_ = new cublasAlgoMap(config_file); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - (transa == GEMM_OP_N) ? k : m, - (const void*)weight.kernel, - b_type_, - (transb == GEMM_OP_N) ? n : k, - output, - c_type_, - n, - alpha, - beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - (transa == GEMM_OP_N) ? k : m, - (const void*)weight.kernel, - b_type_, - (transb == GEMM_OP_N) ? n : k, - output, - c_type_, - n, - alpha, - beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const void* B, - const size_t ldb, - void* C, - const size_t ldc, - const float alpha, - const float beta) -{ - gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta); -} - -void Gemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha, - const float beta) -{ - TM_LOG_TRACE("Gemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc); - - // Implementation copied from cublasMMWrapper::Gemm - // Switch A and B since both cublas and cublasLt assume a column major layout, - // while A and B are both row major layout. - const void* a_data_ptr = B; - const void* b_data_ptr = A; - - cublasOperation_t a_op = getCublasOperation(transb); - cublasOperation_t b_op = getCublasOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - // swap m and n - const size_t _m = n; - const size_t _n = m; - - // swap lda and ldb; - const size_t _lda = ldb; - const size_t _ldb = lda; - - mutex_->lock(); - // Use cublas as default in FP32 and cublasLt as default in FP16 - bool is_fp16_compute_type = compute_type_ == TYPE_FP16; - bool using_cublasLt = Atype == TYPE_FP16; - int batch_count = 1; - - half h_alpha = (half)alpha; - half h_beta = (half)beta; - const void* alpha_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_alpha) : reinterpret_cast(&alpha); - const void* beta_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_beta) : reinterpret_cast(&beta); - - // TODO: unify CUBLAS_DATA_TYPE and DataType. - int findAlgo = - cublas_algo_map_->isExist(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE); - cublasLtMatmulAlgo_info info = - cublas_algo_map_->getAlgo(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE); - if (findAlgo) { - using_cublasLt = (info.stages != -1); - } - - if (using_cublasLt) { - const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k; - const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m; - const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n; - const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? 
_n : k; - - cublasLtMatmulDesc_t matmul_desc = NULL; - cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL; - cudaDataType_t scale_type = getCublasDataType(compute_type_); - auto compute_type = getCublasComputeType(compute_type_); - - // -------------------------------------- - // Create descriptors for the original matrices - cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda); - cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb); - cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type); -#else - cublasLtMatmulDescCreate(&matmul_desc, compute_type); -#endif - - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t)); - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t)); - - cublasLtMatmulAlgo_t algo; - void* workspace = workspace_; - int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE; - if (findAlgo) { - if (info.workspaceSize > workspace_size) { - findAlgo = 0; - } - else { - cublasLtMatmulAlgoInit( - cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int)); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages)); -#endif - } - } - - cublasLtMatmul(cublaslt_handle_, - matmul_desc, - alpha_ptr, - a_data_ptr, - a_desc, - b_data_ptr, - b_desc, - beta_ptr, - C, - c_desc, - C, - c_desc, - (findAlgo == 1 ? (&algo) : NULL), - workspace, - workspace_size, - stream_); - - cublasLtMatmulDescDestroy(matmul_desc); - cublasLtMatrixLayoutDestroy(a_desc); - cublasLtMatrixLayoutDestroy(b_desc); - cublasLtMatrixLayoutDestroy(c_desc); - sync_check_cuda_error(); - } - else { - cudaDataType_t compute_type = getCublasDataType(compute_type_); - int cublas_algo = info.algoId; - check_cuda_error(cublasGemmEx(cublas_handle_, - a_op, - b_op, - _m, - _n, - k, - alpha_ptr, - a_data_ptr, - a_type, - _lda, - b_data_ptr, - b_type, - _ldb, - beta_ptr, - C, - c_type, - ldc, - compute_type, - static_cast(cublas_algo))); - sync_check_cuda_error(); - } - mutex_->unlock(); -} - -void Gemm::batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const void* const* B, - void* const* C, - const size_t batch_size, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta); -} - -void Gemm::batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const size_t lda, - const void* const* B, - const size_t ldb, - void* const* C, - const size_t ldc, - const size_t batch_size, - const float alpha, - const float beta) -{ - batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta); -} - -void Gemm::batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const DataType Atype, - const size_t lda, - const void* const* B, - const DataType Btype, - const size_t ldb, - void* const* C, - const DataType Ctype, - const size_t ldc, - const size_t batch_size, - const float alpha, - const float beta) -{ - TM_LOG_TRACE( - "Gemm::batchedGemm [b=%ld m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", batch_size, m, n, k, lda, ldb, ldc); - - // Switch A and B. - const void* const* a_data_ptr = B; - const void* const* b_data_ptr = A; - - cublasOperation_t a_op = getCublasOperation(transb); - cublasOperation_t b_op = getCublasOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - // swap m and n, lda and ldb - const size_t _m = n; - const size_t _n = m; - const size_t _lda = ldb; - const size_t _ldb = lda; - - half h_alpha = (half)alpha; - half h_beta = (half)beta; - - mutex_->lock(); - bool is_fp16_compute_type = compute_type_ == TYPE_FP16; - const void* alpha_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_alpha) : reinterpret_cast(&alpha); - const void* beta_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_beta) : reinterpret_cast(&beta); - cublasLtMatmulAlgo_info info = - cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE); - - check_cuda_error(cublasGemmBatchedEx(cublas_handle_, - a_op, - b_op, - _m, - _n, - k, - alpha_ptr, - a_data_ptr, - a_type, - _lda, - b_data_ptr, - b_type, - _ldb, - beta_ptr, - C, - c_type, - ldc, - batch_size, - getCublasComputeType(compute_type_), - static_cast(info.algoId))); - mutex_->unlock(); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const size_t batch_size, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - int64_t stridea = m * k; - int64_t strideb = k * n; - int64_t stridec = m * n; - - stridedBatchedGemm(transa, - transb, - m, - n, - k, - A, - a_type_, - lda, - stridea, - B, - b_type_, - ldb, - strideb, - C, - c_type_, - ldc, - stridec, - batch_size, - compute_type_, - alpha, - beta); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const int64_t strideA, - const void* B, - const int64_t strideB, - void* C, - const int64_t strideC, - const size_t batch_size, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - stridedBatchedGemm(transa, - transb, - m, - n, - k, - A, - a_type_, - lda, - strideA, - B, - b_type_, - ldb, - strideB, - C, - c_type_, - ldc, - strideC, - batch_size, - compute_type_, - alpha, - beta); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const int64_t strideA, - const void* B, - const size_t ldb, - const int64_t strideB, - void* C, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - const float alpha, - const float beta) -{ - stridedBatchedGemm(transa, - transb, - m, - n, - k, - A, - a_type_, - lda, - strideA, - B, - b_type_, - ldb, - strideB, - C, - c_type_, - ldc, - strideC, - batch_size, - compute_type_, - alpha, - beta); -} - -void Gemm::stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - DataType Atype, - const size_t lda, - const int64_t strideA, - const void* B, - DataType Btype, - const size_t ldb, - const int64_t strideB, - void* C, - DataType Ctype, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - DataType compute_type, - const float alpha, - const float beta) -{ - TM_LOG_TRACE("Gemm::stridedBatchedGemm [b=%ld, m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", - batch_size, - m, - n, - k, - lda, - ldb, - ldc); - - // Switch A and B. - const void* a_data_ptr = B; - const void* b_data_ptr = A; - - cublasOperation_t a_op = getCublasOperation(transb); - cublasOperation_t b_op = getCublasOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - // swap m and n, lda and ldb, stride A and B - const size_t _m = n; - const size_t _n = m; - const size_t _lda = ldb; - const size_t _ldb = lda; - const int64_t _stridea = strideB; - const int64_t _strideb = strideA; - - half h_alpha = (half)alpha; - half h_beta = (half)beta; - - mutex_->lock(); - bool is_fp16_compute_type = compute_type_ == TYPE_FP16; - const void* alpha_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_alpha) : reinterpret_cast(&alpha); - const void* beta_ptr = - is_fp16_compute_type ? reinterpret_cast(&h_beta) : reinterpret_cast(&beta); - cublasLtMatmulAlgo_info info = - cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? 
HALF_DATATYPE : FLOAT_DATATYPE); - - check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_, - a_op, - b_op, - _m, - _n, - k, - alpha_ptr, - a_data_ptr, - a_type, - _lda, - _stridea, - b_data_ptr, - b_type, - _ldb, - _strideb, - beta_ptr, - C, - c_type, - ldc, - strideC, - batch_size, - getCublasComputeType(compute_type), - static_cast(info.algoId))); - mutex_->unlock(); -} - -void Gemm::checkDataTypeValidity(const DataType& type) -{ - if (type != TYPE_FP32 && type != TYPE_FP16) { - throw GemmNotSupportedException("Gemm supports TYPE_FP16 or TYPE_FP32"); - } -} - -/* ************************* End of GEMM Impl **************************** */ - -// void Int8Gemm::gemm(Tensor& C, -// const GemmOp transa, -// const GemmOp transb, -// const Tensor& A, -// const Tensor& B, -// const float alpha, -// const float beta) -// { - -// } - -/* ************************* SpGEMM Impl *********************************** */ -#ifdef SPARSITY_ENABLED -SpGemm::SpGemm(IAllocator* allocator, cudaStream_t stream, std::string config_file, std::string spconfig_file): - Gemm(allocator, stream, config_file) -{ - CHECK_CUSPARSE(cusparseLtInit(&cusparselt_handle_)); - // TODO(jaedeokk): - // Let's make cublasAlgoMap load gemm/spgemm config separtely, - // allowing us to inherit Gemm's constructor. - // cublas_algo_map_.loadSpGemmConfig(spconfig_file); // enable this line later. - - a_type_ = TYPE_FP16; - b_type_ = TYPE_FP16; - c_type_ = TYPE_FP16; - compute_type_ = TYPE_FP16; -} - -SpGemm::~SpGemm() -{ - cusparseLtDestroy(&cusparselt_handle_); - // Need to destroy matmul description cache. - for (auto& kv : a_desc_map_) { // kv = (mark, a_desc) - cusparseLtMatDescriptorDestroy(&a_desc_map_[kv.first]); - } - for (auto& kv : b_desc_map_) { // kv = (mark, b_desc) - cusparseLtMatDescriptorDestroy(&b_desc_map_[kv.first]); - } - for (auto& kv : c_desc_map_) { // kv = (mark, c_desc) - cusparseLtMatDescriptorDestroy(&c_desc_map_[kv.first]); - } -} - -std::string SpGemm::toString() -{ - const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32"; - const char* compute_type_str = compute_type_ == TYPE_FP16 ? "FP16" : "FP32"; - return fmtstr("SpGemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]", - a_type_str, - b_type_str, - c_type_str, - compute_type_str); -} - -void SpGemm::loadGemmConfig(std::string config_file, std::string spconfig_file) -{ - if (cublas_algo_map_ != nullptr) { - delete cublas_algo_map_; // unload algo map. - } - cublas_algo_map_ = new cublasAlgoMap(config_file, spconfig_file); -} - -void SpGemm::checkDataTypeValidity(const DataType& type) -{ - if (type != TYPE_FP16) { - throw GemmNotSupportedException("Sparse GEMM only supports FP16 data type now."); - } -} - -bool SpGemm::useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k) -{ - return !cublas_algo_map_->isUseSparse(batch_size, m, n, k); -} - -// Temporal gemm helper mtehod to use template T. -template -void SpGemm::weightGemmHelper(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - size_t lda = (transa == GEMM_OP_N) ? k : m; - size_t ldb = (transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - if (useBaseGemm(1, m, n, k) || weight.sp_kernel == nullptr) { - Gemm::gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - lda, - (const void*)weight.kernel, - b_type_, - ldb, - output, - c_type_, - ldc, - alpha, - beta); - } - else { - gemm(transa, - transb, - m, - n, - k, - input, - a_type_, - lda, - (const void*)weight.sp_kernel, - b_type_, - ldb, - output, - c_type_, - ldc, - alpha, - beta); - } -} - -void SpGemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - weightGemmHelper(transa, transb, m, n, k, input, weight, output, alpha, beta); -} -void SpGemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta) -{ - weightGemmHelper(transa, transb, m, n, k, input, weight, output, alpha, beta); -} - -void SpGemm::gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha, - const float beta) -{ - TM_LOG_TRACE("SpGemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc); - checkDataTypeValidity(Atype); - checkDataTypeValidity(Btype); - checkDataTypeValidity(Ctype); - checkDataTypeValidity(compute_type_); - - if (useBaseGemm(1, m, n, k)) { - // Compute by the base GEMM. - Gemm::gemm(transa, transb, m, n, k, A, Atype, lda, B, Btype, ldb, C, Ctype, ldc, alpha, beta); - return; - } - - // Switch A/B due to column major layout in computation. - // Typical usecase of Gemm family is to compute Y = X * W where X is an - // input tensor and W is a kernel weight. Compression takes a lot time - // so only the kernel weight (which is fixed in inference time) can be - // sparse. Using B as sparse seems not stable, unfortunately. - // (e.g. caching matrix descriptions is not correctly working.) - // Thus, SpGemm considers a column major layout in computation to make - // C^T = B^T * A^T, where a kernel weight "B" is located at the front. - const void* a_data = B; - const void* b_data = A; - - cusparseOrder_t order = CUSPARSE_ORDER_COL; - - cusparseOperation_t opA = getCusparseOperation(transb); - cusparseOperation_t opB = getCusparseOperation(transa); - - cudaDataType_t a_type = getCublasDataType(Btype); - cudaDataType_t b_type = getCublasDataType(Atype); - cudaDataType_t c_type = getCublasDataType(Ctype); - - const size_t _m = n; - const size_t _n = m; - const size_t _lda = ldb; - const size_t _ldb = lda; - - const size_t a_rows = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? _m : k; - const size_t a_cols = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _m; - const size_t b_rows = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _n; - const size_t b_cols = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? 
_n : k; - const size_t c_rows = _m; - const size_t c_cols = _n; - - const unsigned alignment = 16; - cusparseComputeType compute_type = getCusparseComputeType(compute_type_); - - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - - char mark[256]; - sprintf(mark, "%d_%ld_%ld_%ld_%s_%s", 1, m, n, k, getGemmOpString(transb).c_str(), getGemmOpString(transa).c_str()); - if (a_desc_map_.find(mark) != a_desc_map_.end()) { - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &a_desc_map_[mark], - &b_desc_map_[mark], - &c_desc_map_[mark], - &c_desc_map_[mark], - compute_type)); - } - else { - // initializing MatDesc takes a lot of time - cusparseLtMatDescriptor_t a_desc, b_desc, c_desc; - a_desc_map_[mark] = a_desc; - b_desc_map_[mark] = b_desc; - c_desc_map_[mark] = c_desc; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_, - &a_desc_map_[mark], - a_rows, - a_cols, - _lda, - alignment, - a_type, - order, - CUSPARSELT_SPARSITY_50_PERCENT)); - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &b_desc_map_[mark], b_rows, b_cols, _ldb, alignment, b_type, order)); - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit( - &cusparselt_handle_, &c_desc_map_[mark], c_rows, c_cols, ldc, alignment, c_type, order)); - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_, - &matmul, - opA, - opB, - &a_desc_map_[mark], - &b_desc_map_[mark], - &c_desc_map_[mark], - &c_desc_map_[mark], - compute_type)); - } - - mutex_->lock(); - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)); - int alg = cublas_algo_map_->getSpAlgo(1, a_rows, b_cols, a_cols); - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))); - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size)); - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size)); - - void* d_workspace = nullptr; // Can we use the workspace of the class? - int num_streams = 1; - cudaStream_t streams[1] = {stream_}; - CHECK_CUSPARSE(cusparseLtMatmul( - &cusparselt_handle_, &plan, &alpha, a_data, b_data, &beta, C, C, d_workspace, streams, num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - mutex_->unlock(); - sync_check_cuda_error(); -} -#endif - -/* ************************* End of SpGEMM Impl ************************** */ - -/* ***************************** GEMM utils ****************************** */ - -std::shared_ptr createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse, bool quantized) -{ - TM_LOG_TRACE( - "Create Gemm instance [sparse=%s, quantized=%s]", sparse ? "true" : "false", quantized ? "true" : "false"); - std::shared_ptr gemm; - if (!sparse) { - if (!quantized) { - gemm = std::make_shared(allocator, stream); - } - else { - throw GemmNotSupportedException("Int8 Gemm is not supported yet"); - } - } - else { -#ifdef SPARSITY_ENABLED - if (sparse && !quantized) { - gemm = std::make_shared(allocator, stream); - } - else { - throw GemmNotSupportedException("Int8 Sparse Gemm is not supported yet"); - } -#else - throw GemmNotSupportedException("Sparsity support is not enabled. 
To enabled sparisty, " - "please provide `-DSPARSITY_SUPPORT` flag for compilation."); -#endif - } - return gemm; -} - -cudaDataType_t getCublasDataType(DataType dtype) -{ - switch (dtype) { - case TYPE_FP16: - return CUDA_R_16F; - case TYPE_FP32: - return CUDA_R_32F; - default: - throw GemmNotSupportedException("Not supported data type."); - } -} - -#if (CUDART_VERSION >= 11000) -cublasComputeType_t getCublasComputeType(DataType ctype) -{ - switch (ctype) { - case TYPE_FP16: - return CUBLAS_COMPUTE_16F; - case TYPE_FP32: - return CUBLAS_COMPUTE_32F; - default: - throw GemmNotSupportedException("Not supported cublas compute type."); - } -} -#else -cudaDataType_t getCublasComputeType(DataType ctype) -{ - switch (ctype) { - case TYPE_FP16: - return CUDA_R_16F; - case TYPE_FP32: - return CUDA_R_32F; - default: - throw GemmNotSupportedException("Not supported cublas compute type."); - } -} -#endif - -cublasOperation_t getCublasOperation(GemmOp op) -{ - switch (op) { - case GEMM_OP_N: - return CUBLAS_OP_N; - case GEMM_OP_T: - return CUBLAS_OP_T; - default: - throw GemmNotSupportedException("Unknown GemmOp provided."); - } -} - -std::string getGemmOpString(const GemmOp& op) -{ - switch (op) { - case GEMM_OP_T: - return "T"; - case GEMM_OP_N: - return "N"; - } - throw GemmNotSupportedException("Unknown GemmOp provided."); -} - -#ifdef SPARSITY_ENABLED -cusparseOperation_t getCusparseOperation(GemmOp op) -{ - switch (op) { - case GEMM_OP_N: - return CUSPARSE_OPERATION_NON_TRANSPOSE; - case GEMM_OP_T: - return CUSPARSE_OPERATION_TRANSPOSE; - default: - throw GemmNotSupportedException("Unknown GemmOp provided."); - } -} - -cusparseComputeType getCusparseComputeType(DataType ctype) -{ - if (ctype != TYPE_FP16) { - throw GemmNotSupportedException("Sparse GEMM supports TYPE_FP16 compute type only."); - } - return CUSPARSE_COMPUTE_16F; -} - -void pruneMatrixB(void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans) -{ - TM_LOG_TRACE("Prune matrix B [k=%ld, n=%ld, op=%s]", k, n, getGemmOpString(trans).c_str()); - - // Due to A/B switching, the matrix B will be used as a matrix A. - const cusparseOrder_t order = CUSPARSE_ORDER_COL; - const size_t rows = (trans == GEMM_OP_N) ? n : k; - const size_t cols = (trans == GEMM_OP_N) ? k : n; - const size_t ld = rows; - const unsigned alignment = 16; - - const cusparseLtPruneAlg_t prune_alg = CUSPARSELT_PRUNE_SPMMA_STRIP; - const cusparseOperation_t op = getCusparseOperation(trans); - const cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0. - - // 0: B is sparse, 1: A is sparse - // B matrix will be used as A matrix at the SpGemm::gemm. - const int is_sparse_a = 1; - - // TODO: Let the resource manager handle GPU-related resources later. - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseLtMatDescriptor_t mat_desc; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_desc, rows, cols, ld, alignment, dtype, order, CUSPARSELT_SPARSITY_50_PERCENT)); - CHECK_CUSPARSE(cusparseLtSpMMAPrune2(&handle, &mat_desc, is_sparse_a, op, data, data, prune_alg, stream)); - CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc)); - CHECK_CUSPARSE(cusparseLtDestroy(&handle)); -} - -size_t compressMatrixB(void** output, - IAllocator& allocator, - const cudaStream_t& stream, - const void* input, - const size_t k, - const size_t n, - const GemmOp trans) -{ - TM_LOG_TRACE("compressMatrix [k=%ld, n=%ld, dtype=FP16]", k, n); - - // swap A/B due to column/row major layout mismatch. 
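-    // As a concrete reading of this swap (the same mapping used in SpGemm::gemm above),
-    // take the common GEMM_OP_N/GEMM_OP_N case of a row-major request
-    //     C(m x n) = A(m x k) * B(k x n).
-    // The code evaluates the column-major equivalent
-    //     C^T(n x m) = B^T(n x k) * A^T(k x m),
-    // so cuSPARSELt sees the weight B as its structured "A" operand
-    //     (a_data = B, opA = op(transb), _m = n, _lda = ldb)
-    // and the activation A as its dense "B" operand
-    //     (b_data = A, opB = op(transa), _n = m, _ldb = lda),
-    // while C keeps ldc with CUSPARSE_ORDER_COL. The rows/cols/ld computed below follow
-    // the same convention, with the weight described as the structured matrix (is_sparse_a = 1).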
- cusparseOrder_t order = CUSPARSE_ORDER_COL; - const size_t rows = (trans == GEMM_OP_N) ? n : k; - const size_t cols = (trans == GEMM_OP_N) ? k : n; - const size_t ld = rows; - - cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0. - cusparseLtSparsity_t sparsity = CUSPARSELT_SPARSITY_50_PERCENT; - cusparseOperation_t op = getCusparseOperation(trans); - cusparseLtMatDescriptor_t mat_desc; - const unsigned alignment = 16; - const int is_sparse_a = 1; // 0: B is sparse, 1: A is sparse - - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - - CHECK_CUSPARSE( - cusparseLtStructuredDescriptorInit(&handle, &mat_desc, rows, cols, ld, alignment, dtype, order, sparsity)) - - size_t compressed_size = 0; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_desc, &compressed_size)); - if (compressed_size == 0) { - throw GemmInvalidException("Fail to compute correct compressed_size, got 0. This error may be " - "caused by a too small input matrix."); - } - - *output = allocator.malloc(compressed_size, false); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_desc, is_sparse_a, op, input, *output, stream)) - - CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc)); - CHECK_CUSPARSE(cusparseLtDestroy(&handle)); - return compressed_size; -} - -#endif - -/* ************************* End of GEMM utils **************************** */ - -} // end of namespace turbomind diff --git a/src/turbomind/utils/gemm.h b/src/turbomind/utils/gemm.h deleted file mode 100644 index 7cc5502da9..0000000000 --- a/src/turbomind/utils/gemm.h +++ /dev/null @@ -1,681 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -// TODO: Need to remove the dependency of the layer module. -// e.g. refactor Weight class to some base module. -#include "src/turbomind/layers/DenseWeight.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/allocator.h" -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/logger.h" -#include "src/turbomind/utils/memory_utils.h" - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#endif - -// cublas default workspace size: 32MB. Let me make this as a Gemm property. -#define WORKSPACE_SIZE 33554432 - -namespace turbomind { - -// A wrapper of cublas or cusparse matrix operator. -// - GEMM_OP_N = CUBLAS_OP_N or CUSPARSE_OP_N -// - GEMM_OP_T = CUBLAS_OP_T or CUSPARSE_OP_T -enum GemmOp -{ - GEMM_OP_N, - GEMM_OP_T -}; - -// A base class of the GEMM family. -// In the current version Gemm is as a base class as well as an interface. -class Gemm { - -public: - Gemm() = delete; // Disable a default constructor - /** - * A Gemm class. - * - * NOTE: - * A, B, C are assumed to have a row major layout, while a backend cuda libraries - * assumes a column major layout. 
However, a family of Gemm has already handled - * such discrepancy internally. Please use naively without a trick like switching - * inputs A and B that aligns the matrix layout. - * - * Restriction: Supported in/out data or compute types: TYPE_FP16, TYPE_FP32. - * - * TODO: - * Unify resource allocation/release from a singleton GPU resource managers. - * Thus, allocator, stream can be replaced by a resource handler later. - * E.g. Gemm(std::shared_ptr resource_manager), and - * stream_ = resource_manager.getCudaStream(); - * buffer = resource_manager.malloc(...); - * - * @param allocator Resource allocator. - * @param stream A CUDA stream. - * @param config_file A file path of a GEMM configuration. - */ - Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file = GEMM_CONFIG); - Gemm(Gemm const& other) = delete; - virtual ~Gemm(); - - virtual std::string toString(); - - /** - * @brief Set GEMM compute type. - * - * @param compute_type The data type of accumulation type inside GEMM computation. - * (Choices: TYPE_FP16, TYPE_FP32) - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. - */ - void setComputeType(DataType compute_type); - - /** - * @brief Set matrix data types and compute precision. - * - * Supported data or compute types: TYPE_FP16, TYPE_FP32 - * - * @param a_type The data type of a matrix A. - * @param b_type The data type of a matrix B. - * @param c_type The data type of a matrix C. - * @param compute_type The data type of accumulation type inside GEMM computation. - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. - */ - void setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type); - - /** - * @brief Set matrix data and compute types by default values. - * - * Default configs: - * - T=float : data type=TYPE_FP32, compute type=TYPE_FP32 - * - T=half : data type=TYPE_FP16, compute type=TYPE_FP32 - */ - template - void setDefaultTypes(); - - void loadGemmConfig(std::string config_file); - - void setAllocator(IAllocator* allocator); - void setCudaStream(cudaStream_t& stream); - - // Th APIs below are to see how the interface will change - // if it cooperates with Tensor. To enable it, we need to - // update the Tensor class. For instance, data is need to - // be of type (void*) rather than (const void*) to pass it - // as the output C of gemm. - // virtual void gemm(Tensor& C, - // const GemmOp transa, - // const GemmOp transb, - // const Tensor& A, - // const Tensor& B, - // const float alpha = 1.0f, - // const float beta = 0.0f); - // - // virtual void batchedMatmul(std::vector Carray, - // const GemmOp transa, - // const GemmOp transb, - // const std::vector Aarray, - // const std::vector Barray, - // const float alpha = 1.0f, - // const float beta = 0.0f); - // - // virtual void stridedBatchedGemm(Tensor& C, - // const GemmOp transa, - // const GemmOp transb, - // const Tensor& A, - // const Tensor& B, - // const float alpha = 1.0f, - // const float beta = 0.0f); - - // TODO: - // This function cooperates with a Weight object to simply Gemm calls - // inside layers, computing the following formula - // output(C) = input(A) * weight_kernel(B) - // where weight_kernel can be changed according to Gemm functions. - // DenseWeight is of a template struct, not allowing override the method. 
- // We temperally add an interface here for two cases float/half, - // but to finialze this function, we need an interface of a weight class - // which is not a template class. - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f); - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const void* B, - const size_t ldb, - void* C, - const size_t ldc, - const float alpha = 1.0f, - const float beta = 0.0f); - /** - * @brief Compute the matrix multiplication `C = \alpha * op(A) * op(B) + \beta * C`. - * - * @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T). - * @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T). - * @param m A number of rows of a matrix op(A) and C. - * @param n A number of columns of a matrix op(B) or C. - * @param k A number of columns of op(A) and rows of op(B). - * @param A A device pointer of a matrix A of dimension (m x lda). - * @param Atype A data type of A (TYPE_FP16 or TYPE_FP32) - * @param lda A leading dimension of the matrix A. - * @param B A device pointer of a matrix B of dimension (k x ldb). - * @param Btype A data type of B (TYPE_FP16 or TYPE_FP32) - * @param ldb A leading dimension of the matrix B. - * @param C (Output) A device pointer of a matrix C of dimension (m x ldc). - * @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32) - * @param ldc A leading dimension of the matrix C. - * @param alpha A scale factor for A*B (default: 1.0f). - * @param beta A scale factor for C (default: 0.0f). - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. 
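As a usage sketch of the call shape documented above (illustrative only: it assumes the removed gemm.h is still available, FP16 data with FP32 accumulation, and row-major, non-transposed operands; names such as denseGemmExample are hypothetical):

#include <cuda_fp16.h>
#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// Row-major C(m x n) = A(m x k) * B(k x n); leading dimensions are the row lengths.
void denseGemmExample(IAllocator* allocator, cudaStream_t stream,
                      const half* d_A, const half* d_B, half* d_C,
                      size_t m, size_t n, size_t k)
{
    Gemm gemm(allocator, stream);                                // reads GEMM_CONFIG by default
    gemm.setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP32);   // FP16 in/out, FP32 accumulation
    gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
              d_A, TYPE_FP16, /*lda=*/k,
              d_B, TYPE_FP16, /*ldb=*/n,
              d_C, TYPE_FP16, /*ldc=*/n);                        // default alpha = 1.0f, beta = 0.0f
}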
- */ - virtual void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const void* const* B, - void* const* C, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const size_t lda, - const void* const* B, - const size_t ldb, - void* const* C, - const size_t ldc, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - /** - * @brief Compute the matrix multiplication of batch of matrices As and Bs - * - * For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1, - * `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`. - * - * @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T). - * @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T). - * @param m A number of rows of a matrix op(A) and C. - * @param n A number of columns of a matrix op(B) or C. - * @param k A number of columns of op(A) and rows of op(B). - * @param A An array of device pointers of batch of input A matrices. - * @param Atype A data type of A (TYPE_FP16 or TYPE_FP32) - * @param lda A leading dimension of the matrix A. - * @param B An array of device pointers of batch of input B matrices. - * @param Btype A data type of B (TYPE_FP16 or TYPE_FP32) - * @param ldb A leading dimension of the matrix B. - * @param C (Output) An array of device pointers of batch of output C matrices. - * @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32) - * @param ldc A leading dimension of the matrix C. - * @param alpha A scale factor for A*B (default: 1.0f). - * @param beta A scale factor for C (default: 0.0f). - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. 
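The batched overload takes arrays of per-matrix device pointers; a minimal call-shape sketch follows (names are illustrative, and the pointer arrays are assumed to be prepared the way the removed encoder GEMM test prepares them, i.e. device-resident arrays of device pointers):

#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// C[i] = A[i] * B[i] for i = 0 .. batch_size-1, all FP16 with FP32 accumulation.
void batchedGemmExample(Gemm& gemm,
                        const void* const* dA_array,  // batch_size pointers to the A[i] matrices
                        const void* const* dB_array,  // batch_size pointers to the B[i] matrices
                        void* const*       dC_array,  // batch_size pointers to the C[i] matrices
                        size_t m, size_t n, size_t k, size_t batch_size)
{
    gemm.batchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
                     dA_array, TYPE_FP16, /*lda=*/k,
                     dB_array, TYPE_FP16, /*ldb=*/n,
                     dC_array, TYPE_FP16, /*ldc=*/n,
                     batch_size);
}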
- */ - virtual void batchedGemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* const* A, - const DataType Atype, - const size_t lda, - const void* const* B, - const DataType Btype, - const size_t ldb, - void* const* C, - const DataType Ctype, - const size_t ldc, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const void* B, - void* C, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const int64_t strideA, - const void* B, - const int64_t strideB, - void* C, - const int64_t strideC, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const size_t lda, - const int64_t strideA, - const void* B, - const size_t ldb, - const int64_t strideB, - void* C, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - const float alpha = 1.0f, - const float beta = 0.0f); - /** - * @brief Compute the strided matrix multiplication of batch of matrices As and Bs - * - * For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1, - * `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`. - * - * @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T). - * @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T). - * @param m A number of rows of a matrix op(A) and C. - * @param n A number of columns of a matrix op(B) or C. - * @param k A number of columns of op(A) and rows of op(B). - * @param A An array of device pointers of batch of input A matrices. - * @param Atype A data type of A (TYPE_FP16 or TYPE_FP32) - * @param lda A leading dimension of the matrix A. - * @param strideA An offset in number of elements between matrix A[i] and A[i+1]. - * @param B An array of device pointers of batch of input B matrices. - * @param Btype A data type of B (TYPE_FP16 or TYPE_FP32) - * @param ldb A leading dimension of the matrix B. - * @param strideB An offset in number of elements between matrix B[i] and B[i+1]. - * @param C (Output) An array of device pointers of batch of output C matrices. - * @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32) - * @param ldc A leading dimension of the matrix C. - * @param strideC An offset in number of elements between matrix C[i] and C[i+1]. - * @param compute_type An accumulation type of GEMM. - * @param alpha A scale factor for A*B (default: 1.0f). - * @param beta A scale factor for C (default: 0.0f). - * - * @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32. - * @throw std::runtime_error if any exception inside CUDA. 
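For the strided variant documented above, where batch i lives at a fixed element offset from batch 0, a similar sketch (again illustrative; contiguous, non-transposed FP16 batches with FP32 accumulation are assumed):

#include <cuda_fp16.h>
#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// C[i](m x n) = A[i](m x k) * B[i](k x n), with batch i at A + i*strideA, B + i*strideB, C + i*strideC.
void stridedBatchedGemmExample(Gemm& gemm,
                               const half* d_A, const half* d_B, half* d_C,
                               size_t m, size_t n, size_t k, size_t batch_size)
{
    gemm.stridedBatchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
                            d_A, TYPE_FP16, /*lda=*/k, /*strideA=*/int64_t(m * k),
                            d_B, TYPE_FP16, /*ldb=*/n, /*strideB=*/int64_t(k * n),
                            d_C, TYPE_FP16, /*ldc=*/n, /*strideC=*/int64_t(m * n),
                            batch_size, /*compute_type=*/TYPE_FP32);
}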
- */ - virtual void stridedBatchedGemm(GemmOp transa, - GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - DataType Atype, - const size_t lda, - const int64_t strideA, - const void* B, - DataType Btype, - const size_t ldb, - const int64_t strideB, - void* C, - DataType Ctype, - const size_t ldc, - const int64_t strideC, - const size_t batch_size, - DataType compute_type, - const float alpha = 1.0f, - const float beta = 0.0f); - -protected: - IAllocator* allocator_ = nullptr; - cudaStream_t stream_; - std::mutex* mutex_ = nullptr; - cublasAlgoMap* cublas_algo_map_ = nullptr; - - cublasHandle_t cublas_handle_; - cublasLtHandle_t cublaslt_handle_; - void* workspace_ = nullptr; - - // use FP32 as default - DataType a_type_ = TYPE_FP32; - DataType b_type_ = TYPE_FP32; - DataType c_type_ = TYPE_FP32; - DataType compute_type_ = TYPE_FP32; - - // Check if data and inputs are valid in the Gemm class. - virtual void checkDataTypeValidity(const DataType& type); -}; - -// class Int8Gemm : public Gemm { - -// protected: -// bool use_ORDER_COL32_2R_4R4_; // what is this? -// }; - -#ifdef SPARSITY_ENABLED - -/** - * A Sparse Gemm class. - * - * NOTE: - * A, B, C are assumed to have a row major layout. - * There are two restrictions: - * - It supports the case when the matrix B is sparse. - * - Supported only TYPE_FP16 for in/out data or compute types. - */ -class SpGemm: public Gemm { - -protected: - cusparseLtHandle_t cusparselt_handle_; - std::map a_desc_map_; - std::map b_desc_map_; - std::map c_desc_map_; - bool useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k); - -public: - using Gemm::setComputeType; - using Gemm::setTypes; - using Gemm::setDefaultTypes; - using Gemm::setAllocator; - using Gemm::setCudaStream; - using Gemm::gemm; - using Gemm::batchedGemm; - using Gemm::stridedBatchedGemm; - - /** - * @param allocator Resource allocator. - * @param stream A CUDA stream. - * @param config_file A file path of a GEMM configuration. - */ - // TODO: Let's unify algo map loading part. - SpGemm(IAllocator* allocator, - cudaStream_t stream, - std::string config_file = GEMM_CONFIG, - std::string spconfig_file = SPGEMM_CONFIG); - ~SpGemm(); - std::string toString() override; - void loadGemmConfig(std::string config_file, std::string spconfig_file); - - // Template method cannot be overridden. - void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f) override; - void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha = 1.0f, - const float beta = 0.0f) override; - - void gemm(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* A, - const DataType Atype, - const size_t lda, - const void* B, - const DataType Btype, - const size_t ldb, - void* C, - const DataType Ctype, - const size_t ldc, - const float alpha = 1.0f, - const float beta = 0.0f) override; - -private: - void checkDataTypeValidity(const DataType& type) override; - - // Temporal gemm helper mtehod to use template T. 
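Finally, a sketch of how the removed sparse-weight pieces fit together end to end. This is illustrative only: it assumes a build with SPARSITY_SUPPORT (otherwise createGemm with sparse = true throws), the template arguments elided in this diff text are guessed as half/Gemm, and weight.kernel / weight.sp_kernel are assumed to point at the dense and compressed buffers shown here, as SpGemm::gemm expects:

#include <cuda_fp16.h>
#include "src/turbomind/layers/DenseWeight.h"
#include "src/turbomind/utils/gemm.h"

using namespace turbomind;

// output(m x n) = input(m x k) * kernel(k x n), with the FP16 kernel 2:4-sparse on the GPU.
void sparseWeightExample(IAllocator* allocator, cudaStream_t stream,
                         half* d_kernel,                   // dense [k x n] weight, pruned in place
                         const DenseWeight<half>& weight,  // kernel / sp_kernel assumed to reference
                                                           // d_kernel and the compressed buffer below
                         const half* d_input, half* d_output,
                         size_t m, size_t n, size_t k)
{
    // One-time weight preparation: prune to 2:4 sparsity, then compress for cuSPARSELt.
    void* d_compressed = nullptr;
    pruneMatrixB(d_kernel, stream, k, n, GEMM_OP_N);
    compressMatrixB(&d_compressed, *allocator, stream, d_kernel, k, n, GEMM_OP_N);

    // sparse = true yields an SpGemm; it still falls back to the dense base Gemm when
    // useBaseGemm() deems the problem small or weight.sp_kernel is null.
    std::shared_ptr<Gemm> gemm = createGemm(allocator, stream, /*sparse=*/true);
    gemm->setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP16);  // SpGemm supports FP16 only
    gemm->gemm(GEMM_OP_N, GEMM_OP_N, m, n, k, d_input, weight, d_output);
}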
- template - void weightGemmHelper(const GemmOp transa, - const GemmOp transb, - const size_t m, - const size_t n, - const size_t k, - const void* input, - const DenseWeight& weight, - void* output, - const float alpha, - const float beta); -}; - -// class Int8SpGemm : public Int8Gemm, public SpGemm { - -// }; -#endif - -/* ***************************** GEMM Exceptions ******************************* */ - -class GemmInvalidShapeException: public std::exception { -private: - std::string msg_ = "Invalid matrix shapes."; - -public: - explicit GemmInvalidShapeException() = default; - - template - explicit GemmInvalidShapeException(const std::string format, const Args&... args): msg_(fmtstr(format, args...)) - { - } - - const char* what() const throw() - { - return msg_.c_str(); - } -}; - -class GemmNotSupportedException: public std::exception { -private: - std::string msg_ = "Not supported exception."; - -public: - explicit GemmNotSupportedException() = default; - - template - explicit GemmNotSupportedException(const std::string format, const Args&... args): msg_(fmtstr(format, args...)) - { - } - - const char* what() const throw() - { - return msg_.c_str(); - } -}; - -class GemmInvalidException: public std::exception { -private: - std::string msg_ = "Invalid use of gemm."; - -public: - explicit GemmInvalidException() = default; - - template - explicit GemmInvalidException(const std::string format, const Args&... args): msg_(fmtstr(format, args...)) - { - } - - const char* what() const throw() - { - return msg_.c_str(); - } -}; - -/* ************************ End of GEMM Exceptions ************************ */ - -/* ***************************** GEMM utils ******************************* */ - -/** - * @brief Create method for the Gemm family. - * - * @param allocator Resource allocator. - * @param stream A CUDA stream. - * @param sparse Whether to use sparse GEMM - * @param quantized Whether to use int8 quantized GEMM. - * @return A shared pointer of a GemmCls instance. - */ -std::shared_ptr -createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse = false, bool quantized = false); - -cudaDataType_t getCublasDataType(DataType dtype); -#if (CUDART_VERSION >= 11000) -cublasComputeType_t getCublasComputeType(DataType dtype); -#else -cudaDataType_t getCublasComputeType(DataType dtype); -#endif -cublasOperation_t getCublasOperation(GemmOp op); -std::string getGemmOpString(const GemmOp& op); - -#ifdef SPARSITY_ENABLED -cusparseOperation_t getCusparseOperation(GemmOp op); -cusparseComputeType getCusparseComputeType(DataType dtype); - -/** - * @brief Prune a weight matrix (in-place). - * - * SpGemm supports a case when the sparse matrix is B in C=A*B. - * - * @param data A data pointer - * @param stream A cuda stream object. - * @param k A number of rows of op(B). - * @param n A number of columns of op(B). - * @param trans A transpose operation that will be applied to the matrix - * (default: GEMM_OP_N). - */ -void pruneMatrixB( - void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans = GEMM_OP_N); - -/** - * @brief Compress the B matrix in a specific sparsity format. - * - * @param output A pointer where to allocate memory buffer to store a compressed matrix. - * @param alloactor A resource allocator. - * @param stream A cuda stream object. - * @param input An input matrix to compress. - * @param k A number of rows of op(B). - * @param n A number of columns of op(B). 
- * @param trans A transpose operation that will be applied to the matrix (default: GEMM_OP_N). - * - * @return A size of the allocated device buffer of the compressed matrix. - * - * @throw GemmInvalidException if the input matrix does not have 2:4 sparsity. - * or if fail to compute a correct buffer size to store the compressed matrix. - * @throw std::runtime_error if any exception inside CUDA. - */ -size_t compressMatrixB(void** output, - IAllocator& allocator, - const cudaStream_t& stream, - const void* input, - const size_t k, - const size_t n, - const GemmOp trans = GEMM_OP_N); - -#endif - -/* ************************* End of GEMM utils **************************** */ - -} // end of namespace turbomind diff --git a/src/turbomind/utils/gemm_test/CMakeLists.txt b/src/turbomind/utils/gemm_test/CMakeLists.txt deleted file mode 100644 index 3e65f65a36..0000000000 --- a/src/turbomind/utils/gemm_test/CMakeLists.txt +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -cmake_minimum_required(VERSION 3.8) - -find_package(CUDAToolkit REQUIRED) - -set(gemm_func_files - gemm_func.cc -) - -set(encoder_gemm_func_files - encoder_gemm_func.cc -) - -set(encoder_igemm_func_files - encoder_igemm_func.cc -) - -set(decoding_gemm_func_files - decoding_gemm_func.cc -) - -set(gpt_gemm_func_files - gpt_gemm_func.cc -) - -set(xlnet_gemm_func_files - xlnet_gemm_func.cc -) - -set(t5_gemm_func_files - t5_gemm_func.cc -) - -set(swin_igemm_func_files - swin_igemm_func.cc -) - -set(swin_gemm_func_files - swin_gemm_func.cc -) - -add_library(gemm_func STATIC ${gemm_func_files}) -target_link_libraries(gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger) -set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files}) -target_link_libraries(encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -if (SPARSITY_SUPPORT) -target_link_libraries(encoder_gemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files}) -target_link_libraries(encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger) -if (SPARSITY_SUPPORT) -target_link_libraries(encoder_igemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files}) -target_link_libraries(decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -set_property(TARGET decoding_gemm_func 
PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files}) -target_link_libraries(gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -if (SPARSITY_SUPPORT) - target_link_libraries(gpt_gemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files}) -target_link_libraries(xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(t5_gemm_func STATIC ${t5_gemm_func_files}) -target_link_libraries(t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -if (SPARSITY_SUPPORT) - target_link_libraries(t5_gemm_func PUBLIC CUDA::cusparse -lcusparseLt) -endif() -set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(swin_igemm_func STATIC ${swin_igemm_func_files}) -target_link_libraries(swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger) -set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) - -add_library(swin_gemm_func STATIC ${swin_gemm_func_files}) -target_link_libraries(swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger) -set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/turbomind/utils/gemm_test/decoding_gemm_func.cc b/src/turbomind/utils/gemm_test/decoding_gemm_func.cc deleted file mode 100644 index 068ae98d81..0000000000 --- a/src/turbomind/utils/gemm_test/decoding_gemm_func.cc +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/decoding_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_decoding_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; - -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int hidden_units = head_num * size_per_head; - const int gemm_num = 6; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1}; - char mess[gemm_num][256]; - - // gemm 0 - M[0] = batch_size * beam_width; - K[0] = hidden_units; - N[0] = K[0] * 3; - strcpy(mess[0], "from_tensor * weightQKV"); - - // gemm 1 - M[1] = batch_size * beam_width; - K[1] = hidden_units; - N[1] = K[1]; - strcpy(mess[1], "attr * output_kernel"); - - // gemm2 - M[2] = batch_size * beam_width * max_mem_seq_len; - K[2] = mem_hidden_units; - N[2] = hidden_units; - strcpy(mess[2], "mem_tensor * weightK/V in cross attention"); - - // gemm 3 - M[3] = batch_size * beam_width; - K[3] = hidden_units; - N[3] = inter_size; - strcpy(mess[3], "ffn gemm1 "); - - // gemm 4 - M[4] = batch_size * beam_width; - K[4] = inter_size; - N[4] = hidden_units; - strcpy(mess[4], "ffn gemm2"); - - // gemm5 - M[5] = batch_size * beam_width; - K[5] = hidden_units; - N[5] = ceil(vocab_size / 8.) 
* 8; - strcpy(mess[5], "decoder_output * embedding_kernel -> embedding_output"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - float exec_time = 99999.0f; - int fast_algo = 0; - int seq_len = i == 2 ? 
max_mem_seq_len : 1; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - } - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - if (data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - LtHgemmCustomFind(ltHandle, - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure(batch_size * beam_width, - seq_len, - head_num, - size_per_head, - n, - m, - k, - perfResults[0], - fd, - data_type, - 0); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Decoding Gemm Testing End***\n"); - return; -} - -template void generate_decoding_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); - -template void generate_decoding_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); - -#ifdef ENABLE_BF16 -template void generate_decoding_gemm_config<__nv_bfloat16>(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); -#endif - -size_t calDecodingGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int head_num, - int size_per_head, - int inter_size, - int memory_hidden_units, - int 
vocab_size, - CublasDataType data_type) -{ - size_t buf_size_in_byte = 0; - const size_t tensor_para_size = 1; - const size_t hidden_units = head_num * size_per_head; - const size_t local_head_num = head_num / tensor_para_size; - const size_t local_hidden_units = local_head_num * size_per_head; - - // int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - // Because we always use float for some buffer, set the wordSize to float directly. - int wordSize = sizeof(float); - - size_t m = batch_size * beam_width; - std::vector buff_size; - // for qkv gemm - buff_size.push_back(m * hidden_units + hidden_units * 3 * local_hidden_units + m * 3 * local_hidden_units); - // for attention output gemm - buff_size.push_back(m * hidden_units + hidden_units * local_hidden_units + m * local_hidden_units); - // for memory_tensor gemm - buff_size.push_back(m * max_mem_seq_len * memory_hidden_units + memory_hidden_units * local_hidden_units - + m * max_mem_seq_len * local_hidden_units); - // for context ffn gemm - buff_size.push_back(m * inter_size / tensor_para_size + hidden_units * inter_size / tensor_para_size - + m * hidden_units); - // for vocab - buff_size.push_back(m * hidden_units + hidden_units * ceil(vocab_size / 8.) * 8 / tensor_para_size - + m * ceil(vocab_size / 8.) * 8 / tensor_para_size); - - for (auto t : buff_size) { - buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t; - } - buf_size_in_byte *= wordSize; - buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0); - - return buf_size_in_byte; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/decoding_gemm_func.h b/src/turbomind/utils/gemm_test/decoding_gemm_func.h deleted file mode 100644 index 9f17b358b7..0000000000 --- a/src/turbomind/utils/gemm_test/decoding_gemm_func.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_decoding_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int mem_hidden_units, - void* buffer_in, - bool isAppend); - -size_t calDecodingGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int head_num, - int size_per_head, - int inter_size, - int memory_hidden_units, - int vocab_size, - CublasDataType data_type); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_gemm_func.cc b/src/turbomind/utils/gemm_test/encoder_gemm_func.cc deleted file mode 100644 index 9acd82c6ca..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_gemm_func.cc +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/gemm_test/encoder_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_encoder_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer_in, bool isAppend, int tensor_para_size) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; - -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 7; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num] = {1, 1, 1, 
1, 1, 1, 1}; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - // gemm1 - M[0] = batch_size * seq_len; - K[0] = head_num * size_per_head; - N[0] = (head_num / tensor_para_size) * size_per_head; - strcpy(mess[0], "from_tensor * weightQ/K/V"); - - // gemm2 - M[1] = M[0]; - K[1] = head_num * size_per_head; - N[1] = 4 * head_num * size_per_head / tensor_para_size; - strcpy(mess[1], "attr_output * inter_kernel"); - - // gemm3 - M[2] = M[0]; - K[2] = 4 * head_num * size_per_head / tensor_para_size; - N[2] = head_num * size_per_head; - strcpy(mess[2], "inter_matmul * output_kernel"); - - M[3] = seq_len; - N[3] = seq_len; - K[3] = size_per_head; - batchCount[3] = batch_size * (head_num / tensor_para_size); - strcpy(mess[3], "attention batched Gemm1"); - - M[4] = seq_len; - N[4] = size_per_head; - K[4] = seq_len; - batchCount[4] = batch_size * (head_num / tensor_para_size); - strcpy(mess[4], "attention batched Gemm2"); - - M[5] = batch_size * seq_len; - N[5] = (head_num / tensor_para_size) * size_per_head; - K[5] = head_num * size_per_head; - batchCount[5] = 3; - strcpy(mess[5], "from_tensor * weight_QKV in BatchGemm"); - - M[6] = batch_size * seq_len; - K[6] = (head_num / tensor_para_size) * size_per_head; - N[6] = head_num * size_per_head; - strcpy(mess[6], "attr * output_kernel"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - // if(i != 0 && i != 5) continue; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - // array of pointer for batchedGemm - T* harray[12]; - harray[0] = (T*)buffer; - harray[1] = (T*)((char*)buffer + sizeof(T) * m * k); - harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k); - harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k); - harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n); 
- harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n); - harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n); - harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n); - harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n); - - T** darray = 0; - check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12)); - cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice); - T** dAarray = darray; - T** dBarray = darray + 4; - T** dCarray = darray + 8; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i < 3) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else if (i == 3) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - seq_len, - seq_len, - size_per_head, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - size_per_head, - seq_len * size_per_head, - &beta, - d_C, - CType, - seq_len, - seq_len * seq_len, - batch_size * head_num, - computeType, - static_cast(algo)); - } - else if (i == 4) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_per_head, - seq_len, - seq_len, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - seq_len, - seq_len * seq_len, - &beta, - d_C, - CType, - size_per_head, - seq_len * size_per_head, - batch_size * head_num, - computeType, - static_cast(algo)); - } - else if (i == 5) { - status = cublasGemmBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - (const void* const*)dBarray, - BType, - n, - (const void* const*)dAarray, - AType, - k, - &beta, - (void* const*)dCarray, - CType, - n, - 3, - computeType, - static_cast(algo)); - } - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - } - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - if (i < 3 && data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - LtHgemmCustomFind(ltHandle, - batch_size, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure( - batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0); - exec_time = perfResults[0].time; - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif 
(CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - exec_times[i] = exec_time; - cudaFree(darray); - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Encoder Gemm Testing End***\n"); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) { - do_sparse_test = true; - } - if (do_sparse_test && sizeof(T) == sizeof(half)) { - printf("***cusparseLt Gemm Testing Begin***\n"); - // only first 3 cases can be sparse - const int spgemm_num = 3; - if (!isAppend) { - fd = fopen(SPGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - fprintf( - fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n"); - } - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < spgemm_num; ++i) { - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - T* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int alg = 0; alg < 4; ++alg) { - 
cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order)) - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - exec_time /= ites; - if (exec_time >= exec_times[i]) { - fast_algo = -1; - } - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - batch_size, - seq_len, - head_num, - size_per_head, - HALF_DATATYPE, - batchCount[i], - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); - } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - return; -} - -template void generate_encoder_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size); -template void generate_encoder_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size); -#ifdef ENABLE_BF16 -template void generate_encoder_gemm_config<__nv_bfloat16>( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size); -#endif - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_gemm_func.h b/src/turbomind/utils/gemm_test/encoder_gemm_func.h deleted file mode 100644 index 35c62ca771..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_gemm_func.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_encoder_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - void* buffer, - bool isAppend = true, - int tensor_para_size = 1); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_igemm_func.cc b/src/turbomind/utils/gemm_test/encoder_igemm_func.cc deleted file mode 100644 index c2cf26bf82..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_igemm_func.cc +++ /dev/null @@ -1,1334 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "encoder_igemm_func.h" -#include "src/turbomind/macro.h" -#include - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! 
-#endif - -namespace turbomind { - -int batch_size_; -int seq_len_; -int head_num_; -int size_per_head_; - -static const char* showStatus(cublasStatus_t error) -{ - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - - return ""; -} - -// Utility function to print customMatmulPerf_t structure -int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint) -{ - int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages; - - const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo; - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL); - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); -#else - stages = 0; -#endif - - printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d " - "time %f workspace=%d mathMode=%d waves=%f\n", - algoId, - tile, - matmulTileName[tile], - numSplitsK, - reductionScheme, - swizzle, - customOption, - stages, - perf.status, - perf.time, - (int)perf.workspaceSize, - (int)perf.mathMode, - perf.wavesCount); - - // chose the fastest algo that does not need workspace - if ((int)perf.workspaceSize == 0 && hasPrint == 0) { - fprintf(fout, - "%d %d %d %d %d ### 1 %d %d %d %d %d %d %d %d %d %d %d %f\n", - batch_size_, - seq_len_, - head_num_, - size_per_head_, - INT8_DATATYPE, - m, - n, - k, - algoId, - customOption, - tile, - numSplitsK, - swizzle, - reductionScheme, - (int)perf.workspaceSize, - stages, - perf.time); - return 1; - } - else { - return hasPrint; - } -} - -int printBatchPerfStructure( - int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint) -{ - int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages; - - const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo; - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL); - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL); - 
cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); -#else - stages = 0; -#endif - - printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d " - "time %f workspace=%d mathMode=%d waves=%f\n", - algoId, - tile, - matmulTileName[tile], - numSplitsK, - reductionScheme, - swizzle, - customOption, - stages, - perf.status, - perf.time, - (int)perf.workspaceSize, - (int)perf.mathMode, - perf.wavesCount); - - // chose the fastest algo that does not need workspace - if ((int)perf.workspaceSize == 0 && hasPrint == 0) { - fprintf(fout, - "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d %f\n", - batch_size_, - seq_len_, - head_num_, - size_per_head_, - INT8_DATATYPE, - batchCount, - m, - n, - k, - algoId, - customOption, - tile, - numSplitsK, - swizzle, - reductionScheme, - (int)perf.workspaceSize, - stages, - perf.time); - return 1; - } - else { - return hasPrint; - } -} - -static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b) -{ - return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time)); -} - -static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU) - cublasLtMatmulDesc_t operationDesc, - const void* alpha, /* host or device pointer */ - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, /* host or device pointer */ - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t& algo, - int kernelRepeats, - void* workSpace, - size_t workSpaceSizeInBytes, - customMatmulPerf_t& perfResults, - cudaStream_t stream) -{ - cublasLtMatmulHeuristicResult_t heurResult; - /* Looping over the Algo */ - int repeats = kernelRepeats; - cublasStatus_t algoStatus = - cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult); - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - if (heurResult.workspaceSize <= workSpaceSizeInBytes) { - cublasStatus_t oneRunStatus; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int loop = 0; loop < repeats; loop++) { - oneRunStatus = cublasLtMatmul(ltHandle, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - workSpace, - workSpaceSizeInBytes, - stream); - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (oneRunStatus != CUBLAS_STATUS_SUCCESS) { - algoStatus = oneRunStatus; - } - float time = dur.count(); - // For the moment only add successful findings - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - perfResults.algo = algo; - perfResults.time = time / repeats; - perfResults.workspaceSize = heurResult.workspaceSize; - perfResults.wavesCount = heurResult.wavesCount; - } - } - 
else { - // printf("not enough workspace! %ld\n", heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - } - else { - // printf("check fail!\n"); - } - return algoStatus; -} - -// Sample wrapper running through multiple algo and config attributes combination for INT8 gemm using cublasLt low-level -// API -template -int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout) -{ - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cudaStream_t stream = 0; - // SplitK value that we are going to try when SplitK is supported for a given algo - const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - // Let try a fixed number of combinations -#define ALGO_COMBINATIONS 50000 - int AlgoCombinations = ALGO_COMBINATIONS; - int AlgoCount = 0; - int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - int nbAlgoIds = 0; -#define ALGO_IDS 100 - int algoIdA[ALGO_IDS]; - - cudaDataType_t Atype, Btype, Ctype, scaleType; - Atype = CUDA_R_8I; - Btype = CUDA_R_8I; - - if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_32I; - scaleType = CUDA_R_32I; - } - else if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_8I; - scaleType = CUDA_R_32F; - } - else { - printf("[ERROR] of igemm is invalid\n"); - exit(-1); - } - -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - cublasOperation_t opTranspose = CUBLAS_OP_T; - - bool use_ORDER_COL32_2R_4R4 = false; -#if (CUDART_VERSION >= 11000) - int device{-1}; - cudaGetDevice(&device); - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device); - if (props.major * 10 + props.minor >= 80) { - use_ORDER_COL32_2R_4R4 = true; - } -#endif - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - int ldbTransform; - if (use_ORDER_COL32_2R_4R4) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - - int ldcTransform = 32 * m; - -#if (CUDART_VERSION >= 11000) - status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - status = cublasLtMatmulDescCreate(&operationDesc, scaleType); -#endif - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - - // Create matrix descriptors. 
- status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - // Request AlgoId available for IGEMM - status = cublasLtMatmulAlgoGetIds( - ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - // Loop over the Algo IDs - for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) { - cublasLtMatmulAlgo_t algo; - size_t sizeWritten = 0; - /* Initialize algo structure with given Algp ID */ - status = - cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo); - if (status != CUBLAS_STATUS_SUCCESS) { - continue; - } - // Query the tiles enums supported by that algo - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten); - int nbTiles = int(sizeWritten / sizeof(int)); - int* tileA = new int[nbTiles == 0 ? 1 : nbTiles]; - if (nbTiles == 0) { - tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - nbTiles = 1; - } -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten); - int nbStages = int(sizeWritten / sizeof(int)); - std::vector stagesA(nbStages == 0 ? 
1 : nbStages); - if (nbStages == 0) { - stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - nbStages = 1; - } - else { - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten); - } -#endif - int splitkSupport, redMask, swizzlingMax, customOptionMax; - // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten); - /* Loop over the different tiles */ - for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) { -#if (CUDART_VERSION >= 11000) - /* Loop over different stages count */ - for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx])); -#endif - /* Loop over the different custom option if any */ - for (int customOption = 0; customOption <= customOptionMax; customOption++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption)); - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzlingMax; k++) { - int splitK_trial = 0; - if (splitkSupport) { - splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]); - } - // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case - // where splitK is not enabled - for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) { - /* Setup attribute of the algo to run */ - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx])); - int splitK_val = 0; - int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE; - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int)); - - if (l > 0) { // Split-K case - splitK_val = splitKSequenceA[l - 1]; - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &splitKSequenceA[l - 1], - sizeof(splitKSequenceA[l - 1])); - /* Going over all the reduction scheme */ - for (redScheme = 1; - redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations); - redScheme = redScheme << 1) { - if (redScheme & redMask) { - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &redScheme, - sizeof(redScheme)); - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - 
perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } // end if - } // end for - } - else { // Non-splitK case - /* if user preference is ok with workspace */ - if (AlgoCount < AlgoCombinations) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - } // end l - } // end k - } // end customOption -#if (CUDART_VERSION >= 11000) - } // end stagesIdx -#endif - } // end tileIdx - delete[] tileA; - } // end idx - // Sort the results per run duration - std::sort(perfResults, perfResults + AlgoCount, time_compare); - // Print timing and perf details - for (int i = 0, hasPrint = 0; i < AlgoCount; i++) { - printf("result %03d : ", i); - hasPrint = printPerfStructure(m, n, k, perfResults[i], fout, hasPrint); - } - -CLEANUP: - // Descriptors are no longer needed as all GPU work was already enqueued - if (Cdesc) { - cublasLtMatrixLayoutDestroy(Cdesc); - } - if (Bdesc) { - cublasLtMatrixLayoutDestroy(Bdesc); - } - if (Adesc) { - cublasLtMatrixLayoutDestroy(Adesc); - } - if (operationDesc) { - cublasLtMatmulDescDestroy(operationDesc); - } - return status == CUBLAS_STATUS_SUCCESS ? 0 : 1; -} - -template int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const int* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const int* beta, /* host pointer */ - int32_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const float* beta, /* host pointer */ - int8_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template -int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout) -{ - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; - cudaStream_t stream = 0; - // SplitK value that we are going to try when SplitK is supported for a given algo - const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - // Let try a fixed number of combinations -#define ALGO_COMBINATIONS 50000 - int AlgoCombinations = ALGO_COMBINATIONS; - int AlgoCount = 0; - int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - int nbAlgoIds = 0; -#define ALGO_IDS 100 - int algoIdA[ALGO_IDS]; - - cudaDataType_t Atype, Btype, Ctype, scaleType; - Atype = CUDA_R_8I; - Btype = CUDA_R_8I; - - if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_32I; - scaleType = CUDA_R_32I; - } - else if (std::is_same::value && std::is_same::value) { - Ctype = CUDA_R_8I; - scaleType = CUDA_R_32F; - } - else { - printf("[ERROR] of igemm is invalid\n"); - exit(-1); - } - -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t computeType = CUDA_R_32I; -#endif - 
cublasOperation_t opTranspose = CUBLAS_OP_T; - - bool use_ORDER_COL32_2R_4R4 = false; -#if (CUDART_VERSION >= 11000) - int device{-1}; - cudaGetDevice(&device); - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device); - if (props.major * 10 + props.minor >= 80) { - use_ORDER_COL32_2R_4R4 = true; - } -#endif - cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; - cublasLtOrder_t order_matrixB; -#if (CUDART_VERSION >= 11000) - if (use_ORDER_COL32_2R_4R4) { - order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4; - } - else { - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; - } -#else - order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C; -#endif - - int ldaTransform = 32 * m; - int ldbTransform; - if (use_ORDER_COL32_2R_4R4) { - ldbTransform = 32 * ((n + 32 - 1) / 32) * 32; - } - else { - ldbTransform = 32 * ((n + 8 - 1) / 8) * 8; - } - - int ldcTransform = 32 * m; - - int64_t stridea, strideb, stridec; - stridea = m * k; - strideb = n * k; - stridec = m * n; - -#if (CUDART_VERSION >= 11000) - status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType); -#else - status = cublasLtMatmulDescCreate(&operationDesc, scaleType); -#endif - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t)); - - // Create matrix descriptors. - status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea)); - - status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb)); - - status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)); - cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec)); - - // Request AlgoId available for IGEMM - status = cublasLtMatmulAlgoGetIds( - ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - // Loop over the Algo IDs - for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) { - cublasLtMatmulAlgo_t algo; - size_t sizeWritten = 0; - /* Initialize algo structure with given Algp ID */ - status = - cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, 
algoIdA[idx], &algo); - if (status != CUBLAS_STATUS_SUCCESS) { - continue; - } - // Query the tiles enums supported by that algo - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten); - int nbTiles = int(sizeWritten / sizeof(int)); - int* tileA = new int[nbTiles == 0 ? 1 : nbTiles]; - if (nbTiles == 0) { - tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - nbTiles = 1; - } -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten); - int nbStages = int(sizeWritten / sizeof(int)); - std::vector stagesA(nbStages == 0 ? 1 : nbStages); - if (nbStages == 0) { - stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - nbStages = 1; - } - else { - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten); - } -#endif - int splitkSupport, redMask, swizzlingMax, customOptionMax; - // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten); - /* Loop over the different tiles */ - for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) { -#if (CUDART_VERSION >= 11000) - /* Loop over different stages count */ - for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx])); -#endif - /* Loop over the different custom option if any */ - for (int customOption = 0; customOption <= customOptionMax; customOption++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption)); - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzlingMax; k++) { - int splitK_trial = 0; - if (splitkSupport) { - splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]); - } - // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case - // where splitK is not enabled - for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) { - /* Setup attribute of the algo to run */ - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx])); - int splitK_val = 0; - int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE; - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int)); - - if (l > 0) { // Split-K case - splitK_val = splitKSequenceA[l - 1]; - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &splitKSequenceA[l - 1], - sizeof(splitKSequenceA[l - 1])); - /* Going over 
all the reduction scheme */ - for (redScheme = 1; - redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations); - redScheme = redScheme << 1) { - if (redScheme & redMask) { - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &redScheme, - sizeof(redScheme)); - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } // end if - } // end for - } - else { // Non-splitK case - /* if user preference is ok with workspace */ - if (AlgoCount < AlgoCombinations) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - } // end l - } // end k - } // end customOption -#if (CUDART_VERSION >= 11000) - } // end stagesIdx -#endif - } // end tileIdx - delete[] tileA; - } // end idx - // Sort the results per run duration - std::sort(perfResults, perfResults + AlgoCount, time_compare); - // Print timing and perf details - for (int i = 0, hasPrint = 0; i < AlgoCount; i++) { - printf("result %03d : ", i); - hasPrint = printBatchPerfStructure(batchCount, m, n, k, perfResults[i], fout, hasPrint); - } - -CLEANUP: - // Descriptors are no longer needed as all GPU work was already enqueued - if (Cdesc) { - cublasLtMatrixLayoutDestroy(Cdesc); - } - if (Bdesc) { - cublasLtMatrixLayoutDestroy(Bdesc); - } - if (Adesc) { - cublasLtMatrixLayoutDestroy(Adesc); - } - if (operationDesc) { - cublasLtMatmulDescDestroy(operationDesc); - } - return status == CUBLAS_STATUS_SUCCESS ? 
0 : 1; -} - -template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const int* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const int* beta, /* host pointer */ - int32_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const float* beta, /* host pointer */ - int8_t* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -// initialize matrix in column-major -void matInit(int rows, int cols, int8_t* p, int ld) -{ - srand(time(NULL)); - - for (int c = 0; c < cols; c++) { - for (int r = 0; r < rows; r++) { - int index = r + c * ld; - - p[index] = rand() % 255 - 127; - } - } -} - -int batch_igemm_config(int batchCount, int m, int n, int k, FILE* fout, void* buffer) -{ - printf("batchCount %d m %d n %d k %d\n", batchCount, m, n, k); - int alpha = 1; - int beta = 0; - - int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major - int8_t* d_B = d_A + batchCount * m * k; // k * n, stored in column-major - int32_t* d_C = (int32_t*)(d_B + batchCount * k * n); // m * n, stored in column-major - - cublasLtHandle_t ltHandle; - cublasLtCreate(<Handle); - - LtBatchIgemmCustomFind(ltHandle, - batchCount, - m, - n, - k, - &alpha, /* host pointer */ - d_A, - d_B, - &beta, /* host pointer */ - d_C, - NULL, - 0, - fout); - // free memory - cublasLtDestroy(ltHandle); - return 0; -} - -int igemm_config(int m, int n, int k, FILE* fout, void* buffer) -{ - printf("batchCount %d m %d n %d k %d\n", 1, m, n, k); - int alpha = 1; - int beta = 0; - - int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major - int8_t* d_B = d_A + m * k; // k * n, stored in column-major - int32_t* d_C = (int32_t*)(d_B + k * n); // m * n, stored in column-major - - cublasLtHandle_t ltHandle; - cublasLtCreate(<Handle); - - LtIgemmCustomFind(ltHandle, - m, - n, - k, - &alpha, /* host pointer */ - d_A, - d_B, - &beta, /* host pointer */ - d_C, - NULL, - 0, - fout); - - cublasLtDestroy(ltHandle); - return 0; -} - -int generate_encoder_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend) -{ - - // ensure program running on SM >= 7.5 - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - if (!(prop.major >= 8 || (prop.major >= 7 && prop.minor >= 5))) { - printf("[ERROR] INT8 mode > 0 is only supported on device with sm >= 7.5\n "); - exit(-1); - } - printf("Device %s\n", prop.name); - - // check config - FILE* fout; - if (!isAppend) { - fout = fopen(IGEMM_CONFIG, "w+"); - fprintf( - fout, - "batch_size seq_len head_num size_per_head dataType ### batchCount m n k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n"); - } - else { - fout = fopen(IGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fout) != NULL) { - config.push_back(std::string(line)); - } - if (config.size() >= MAX_CONFIG_NUM * GEMM_NUM) { - int startIdx = config.size() - (MAX_CONFIG_NUM - 1) * GEMM_NUM; - fclose(fout); - fout = fopen(IGEMM_CONFIG, "w+"); - for (int i = startIdx; i < (int)config.size(); i++) { - fprintf(fout, "%s", config[i].c_str()); - } - } - } - - batch_size_ = batch_size; - seq_len_ = seq_len; - head_num_ = head_num; - size_per_head_ = size_per_head; - int m = batch_size * seq_len; - int n = head_num * 
size_per_head; - int k = n; - int batchCount; - - printf("***Encoder IGemm Testing Begin***\n"); - printf("\n-----------------------------\n"); - - batchCount = 3; - m = batch_size * seq_len; - k = head_num * size_per_head; - n = k; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - batch_igemm_config(batchCount, m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = seq_len; - n = seq_len; - k = size_per_head; - batchCount = batch_size * head_num; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - batch_igemm_config(batchCount, m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = seq_len; - n = size_per_head; - k = seq_len; - batchCount = batch_size * head_num; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - batch_igemm_config(batchCount, m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = head_num * size_per_head; - k = head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - n = 4 * n; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - n = k; - k = 4 * n; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config(m, n, k, fout, buffer); - } - - fclose(fout); - printf("\n-----------------------------\n"); - printf("***Encoder IGemm Testing End***\n"); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) { - do_sparse_test = true; - } - if (do_sparse_test) { - printf("***cusparseLt Gemm Testing Begin***\n"); - const int spgemm_num = 3; - FILE* fd; - int line_count = 0; - const int ites = 100; - if (!isAppend) { - fd = fopen(SPIGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPIGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPIGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - fprintf( - fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n"); - } - - int M[spgemm_num]; - int N[spgemm_num]; - int K[spgemm_num]; - // gemm1 - M[0] = batch_size * seq_len; - K[0] = head_num * size_per_head; - N[0] = K[0]; - // gemm2 - M[1] = M[0]; - K[1] = K[0]; - N[1] = 4 * N[0]; - // gemm3 - M[2] = M[0]; - K[2] = 4 * K[0]; - N[2] = N[0]; - - cusparseLtHandle_t handle; - 
CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t col_order = CUSPARSE_ORDER_COL; - cusparseOrder_t row_order = CUSPARSE_ORDER_ROW; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < spgemm_num; ++i) { - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - int8_t* d_A = (int8_t*)buffer; - int8_t* d_B = d_A + m * k; - int8_t* d_C = d_B + k * n; - int8_t* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - cudaDeviceSynchronize(); - cudaError_t result = cudaGetLastError(); - if (result) { - throw std::runtime_error(std::string("[TM][ERROR] CUDA runtime error: ")); - } - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int alg = 0; alg < 4; ++alg) { - cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_8I, col_order)) - CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_8I, col_order)) - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, 
dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - exec_time /= ites; - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### 1 %d %d %d %d %f\n", - batch_size, - seq_len, - head_num, - size_per_head, - HALF_DATATYPE, - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); - } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - return 0; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/encoder_igemm_func.h b/src/turbomind/utils/gemm_test/encoder_igemm_func.h deleted file mode 100644 index 4cadeed026..0000000000 --- a/src/turbomind/utils/gemm_test/encoder_igemm_func.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include -#include - -namespace turbomind { - -/* CAUTION : must match cublasLtMatmulTile_t */ -const char* const matmulTileName[] = {"UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8", - "8x64", "16x32", "32x16", "64x8", "32x32", "32x64", "64x32", - "32x128", "64x64", "128x32", "64x128", "128x64", "64x256", "128x128", - "256x64", "64x512", "128x256", "256x128", "512x64", "64x96", "96*64", - "96x128", "128x160", "160x128", "192x128", "128x192", "128x96", "END"}; - -int generate_encoder_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); - -int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint); - -int printBatchPerfStructure( - int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint); - -template -int LtIgemmCustomFind(cublasLtHandle_t ltHandle, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -template -int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle, - int batchCount, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const int8_t* A, - const int8_t* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout); - -void matInit(int rows, int cols, int8_t* p, int ld); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gemm_func.cc b/src/turbomind/utils/gemm_test/gemm_func.cc deleted file mode 100644 index 0a4645481b..0000000000 --- a/src/turbomind/utils/gemm_test/gemm_func.cc +++ /dev/null @@ -1,990 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "encoder_gemm_func.h" -#include -#include -#include - -#ifndef CUDART_VERSION -#error CUDART_VERSION Undefined! -#endif - -namespace turbomind { - -// Utility function to print customMatmulPerf_t structure -int printPerfStructure(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const customMatmulPerf_t& perf, - FILE* fout, - CublasDataType data_type, - int hasPrint, - int batch_count) -{ - int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages; - - const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo; - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL); - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL); -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL); -#else - stages = 0; -#endif -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - uint16_t inner_shapeId, cluster_shapeId; - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &inner_shapeId, sizeof(inner_shapeId), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &cluster_shapeId, sizeof(cluster_shapeId), NULL); -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - uint16_t mma_shapeId, cga_shapeId, sche_mode; - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &mma_shapeId, sizeof(mma_shapeId), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &cga_shapeId, sizeof(cga_shapeId), NULL); - cublasLtMatmulAlgoConfigGetAttribute( - matmulAlgo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &sche_mode, sizeof(sche_mode), NULL); -#endif - - printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d " -#if (CUDART_VERSION >= 11000) - "stages=%d " -#endif -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "inner_shapeId=%d cluster_shapeId=%d" -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "mma_shapeId=%d cga_shapeId=%d schedule_mode=%d" -#endif - "} status %d " - "time %fms workspace=%d mathMode=%d waves=%f\n", - algoId, - tile, - matmulTileName[tile], - numSplitsK, - 
reductionScheme, - swizzle, - customOption, -#if (CUDART_VERSION >= 11000) - stages, -#endif -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - inner_shapeId, - cluster_shapeId, -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - mma_shapeId, - cga_shapeId, - sche_mode, -#endif - perf.status, - perf.time, - (int)perf.workspaceSize, - (int)perf.mathMode, - perf.wavesCount); - if (hasPrint == 0) { - fprintf(fout, - "%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "%d %d " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "%d %d %d " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batch_count, - m, - n, - k, - algoId, - customOption, - tile, - numSplitsK, - swizzle, - reductionScheme, - (int)perf.workspaceSize, - stages, -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - inner_shapeId, - cluster_shapeId, -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - mma_shapeId, - cga_shapeId, - sche_mode, -#endif - perf.time); - return 1; - } - else { - return hasPrint; - } -} - -static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b) -{ - return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time)); -} - -static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU) - cublasLtMatmulDesc_t operationDesc, - const void* alpha, /* host or device pointer */ - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, /* host or device pointer */ - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t& algo, - int kernelRepeats, - void* workSpace, - size_t workSpaceSizeInBytes, - customMatmulPerf_t& perfResults, - cudaStream_t stream, - cudaEvent_t& startEvent, - cudaEvent_t& stopEvent) -{ - cublasLtMatmulHeuristicResult_t heurResult; - /* Looping over the Algo */ - int repeats = kernelRepeats; - cublasStatus_t algoStatus = - cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult); - - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - if (heurResult.workspaceSize <= workSpaceSizeInBytes) { - cudaError_t err, err1, err2, err3; - err = cudaEventRecord(startEvent, stream); - for (int loop = 0; loop < repeats; loop++) { - cublasStatus_t oneRunStatus = cublasLtMatmul(ltHandle, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - workSpace, - workSpaceSizeInBytes, - stream); - if (oneRunStatus != CUBLAS_STATUS_SUCCESS) { - algoStatus = oneRunStatus; - break; - } - } - err1 = cudaEventRecord(stopEvent, stream); - err2 = cudaEventSynchronize(stopEvent); - float time; - err3 = cudaEventElapsedTime(&time, startEvent, stopEvent); - if ((err != cudaSuccess) || (err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) { - algoStatus = CUBLAS_STATUS_INTERNAL_ERROR; - } - // For the moment only add successful findings - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - perfResults.algo = algo; - perfResults.time = time / repeats; - perfResults.workspaceSize = heurResult.workspaceSize; - perfResults.wavesCount = heurResult.wavesCount; - } - } - else { - // printf("not enough workspace! 
%ld\n", heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - } - - return algoStatus; -} - -template -int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const T* A, - const T* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD) -{ - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - cudaEvent_t startEvent; - cudaEvent_t stopEvent; - CublasDataType data_type; - - cublasLtMatmulDesc_t operationDesc = NULL; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL; - - cudaStream_t stream = 0; - // SplitK value that we are going to try when SplitK is supported for a - // given algo - const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - // Let try a fixed number of combinations - int AlgoCount = 0; - int AlgoCountRestrict = 0; // workspace == 0 - const int maxNumTraversal = 50; // max number of traversal - std::vector algos(AlgoCombinations); // 0 <= workspace <= 32MB - std::vector algosRestrict(AlgoCombinations); // workspace == 0 - const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back - int nbAlgoIds = 0; // Number of algorithms actually returned by - // cublasLtMatmulAlgoGetIds function. -#define ALGO_IDS 100 // Number of algorithms requested. - int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by - // cublasLtMatmulAlgoGetIds function. - cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype; -#if (CUDART_VERSION >= 11000) - cublasComputeType_t computeType; -#else - cudaDataType_t computeType; -#endif - - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - Atype = CUDA_R_32F, Btype = CUDA_R_32F, Ctype = CUDA_R_32F, Dtype = CUDA_R_32F; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - Atype = CUDA_R_16F, Btype = CUDA_R_16F, Ctype = CUDA_R_16F, Dtype = CUDA_R_16F; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - Atype = CUDA_R_16BF, Btype = CUDA_R_16BF, Ctype = CUDA_R_16BF, Dtype = CUDA_R_16BF; - } -#endif -#ifdef ENABLE_FP8 - else if (std::is_same::value) { - data_type = FP8_DATATYPE; - Atype = CUDA_R_8F_E4M3, Btype = CUDA_R_8F_E4M3, Ctype = CUDA_R_16BF; -#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE - Dtype = CUDA_R_16BF; -#else - Dtype = dtype_fp8; -#endif - } -#endif - - if (sizeof(scaleT) == sizeof(float)) { - scaleType = CUDA_R_32F; -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_32F; -#else - computeType = CUDA_R_32F; -#endif - } - else { - scaleType = CUDA_R_16F; -#if (CUDART_VERSION >= 11000) - computeType = CUBLAS_COMPUTE_16F; -#else - computeType = CUDA_R_16F; -#endif - } - - const cublasOperation_t tA = data_type == FP8_DATATYPE ? 
CUBLAS_OP_T : CUBLAS_OP_N; - -// Create operation descriptor; see cublasLtMatmulDescAttributes_t for -// details about defaults; here we just need to set the transforms for A and -// B -#if (CUDART_VERSION >= 11000) - status = cublasLtMatmulDescCreate(&operationDesc, computeType, - scaleType); // creates a matrix multiply descriptor -#else - status = cublasLtMatmulDescCreate(&operationDesc, computeType); -#endif - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - status = cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } -#ifdef ENABLE_FP8 - if (data_type == FP8_DATATYPE) { - const int8_t fastAccuMode = 1; // enable fast imprecise accum - status = cublasLtMatmulDescSetAttribute( - operationDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - } -#endif - - // Create matrix descriptors. We are good with the details here so no need - // to set any extra attributes - if (data_type == FP8_DATATYPE) { - status = cublasLtMatrixLayoutCreate(&Adesc, Atype, k, m, k); - } - else { - status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, m); - } - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, k, n, k); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, m); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - status = cublasLtMatrixLayoutCreate(&Ddesc, Dtype, m, n, m); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - - if (batchCount > 1) { - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount))); - - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD))); - check_cuda_error(cublasLtMatrixLayoutSetAttribute( - Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD))); - } - - // Create CUDA event to time the execution time of each algo - if (cudaEventCreate(&startEvent, cudaEventBlockingSync) != cudaSuccess) { - goto CLEANUP; - } - if (cudaEventCreate(&stopEvent, cudaEventBlockingSync) != cudaSuccess) { - goto CLEANUP; - } - - // Request the 100 first AlgoId available - status = cublasLtMatmulAlgoGetIds( - ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, ALGO_IDS, algoIdA, &nbAlgoIds); - if (status != CUBLAS_STATUS_SUCCESS) { - goto CLEANUP; - } - if (nbAlgoIds > ALGO_IDS) { - printf( - "Warning: the algo id count is not large enough to guarantee the best algo %d, %d\n", nbAlgoIds, ALGO_IDS); - } - - // Loop over the Algo IDs - // This loop doesn't work for fp8 gemm - for (int idx = 0; 
(idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) { - cublasLtMatmulAlgo_t algo; - size_t sizeWritten = 0; - /* Initialize algo structure with given Algp ID */ - status = - cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, algoIdA[idx], &algo); - if (status != CUBLAS_STATUS_SUCCESS) { - continue; - } - // Query the tiles enums supported by that algo - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten); - int nbTiles = int(sizeWritten / sizeof(int)); - int* tileA = new int[nbTiles == 0 ? 1 : nbTiles]; - if (nbTiles == 0) { - tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - nbTiles = 1; - } -#if (CUDART_VERSION >= 11000) - cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten); - int nbStages = int(sizeWritten / sizeof(int)); - std::vector stagesA(nbStages == 0 ? 1 : nbStages); - if (nbStages == 0) { - stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - nbStages = 1; - } - else { - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten); - } -#endif - int splitkSupport, redMask, swizzlingMax, customOptionMax; - // Retrieve Algo Capabilities attributes to be able to setup loop over - // the different combinations - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten); - cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten); - - /* Loop over the different tiles */ - for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) { -#if (CUDART_VERSION >= 11000) - /* Loop over different stages count */ - for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx])); -#endif - /* Loop over the different custom option if any */ - for (int customOption = 0; customOption <= customOptionMax; customOption++) { - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption)); - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzlingMax; k++) { - int splitK_trial = 0; - if (splitkSupport) { - splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]); - } - // Loop over the splitK value over a fixed sequence - // splitKSequenceA in addition to the case where splitK - // is not enabled - for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) { - /* Setup attribute of the algo to run */ - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx])); - int splitK_val = 0; - int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE; - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)); - cublasLtMatmulAlgoConfigSetAttribute( - &algo, 
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int)); - - if (l > 0) { // Split-K case - splitK_val = splitKSequenceA[l - 1]; - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &splitKSequenceA[l - 1], - sizeof(splitKSequenceA[l - 1])); - /* Going over all the reduction scheme */ - for (redScheme = 1; - redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations); - redScheme = redScheme << 1) { - if (redScheme & redMask) { - cublasLtMatmulAlgoConfigSetAttribute(&algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &redScheme, - sizeof(redScheme)); - - cublasLtMatmulHeuristicResult_t heurResult; - cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( - ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult); - if (heurResult.workspaceSize > workSpaceSize) { - // printf("not enough workspace! - // %ld\n", - // heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - else if (heurResult.workspaceSize == 0) { - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algosRestrict[AlgoCountRestrict++] = algo; - } - } - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algos[AlgoCount++] = algo; - } - } // end if - } // end for - } - else { // Non-splitK case - /* if user preference is ok with workspace */ - if (AlgoCount < AlgoCombinations) { - cublasLtMatmulHeuristicResult_t heurResult; - cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck( - ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult); - if (heurResult.workspaceSize > workSpaceSize) { - // printf("not enough workspace! %ld\n", - // heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not - // enough - // workspace - } - else if (heurResult.workspaceSize == 0) { - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algosRestrict[AlgoCountRestrict++] = algo; - } - } - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - algos[AlgoCount++] = algo; - } - } - } - } // end l - } // end k - } // end customOption -#if (CUDART_VERSION >= 11000) - } // end stagesIdx -#endif - } // end tileIdx - delete[] tileA; - } // end idx - - printf("AlgoCount: %d\n", AlgoCount); - if (data_type == FP8_DATATYPE) { - assert(AlgoCount == 0); - } - if (AlgoCount < maxNumTraversal && data_type != FP8_DATATYPE) { - // 0 <= workspacesize <= 32MB - for (int i = 0; i < AlgoCount; i++) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Cdesc, - algos[i], - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[i], - stream, - startEvent, - stopEvent); - perfResults[i].status = status; - // if (status == CUBLAS_STATUS_SUCCESS) AlgoCount++; - } - } - else { - // Heuristic + workspacesize==0 - AlgoCount = 0; - nbAlgoIds = 0; - cublasLtMatmulPreference_t pref; - cublasLtMatmulPreferenceCreate(&pref); - uint64_t maxWorkSpaceSize = workSpaceSize; //(32MB) - cublasLtMatmulPreferenceSetAttribute( - pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &maxWorkSpaceSize, sizeof(maxWorkSpaceSize)); - cublasLtMatmulHeuristicResult_t heuristicResultsArray[maxNumTraversal]; - - cublasLtMatmulAlgoGetHeuristic(ltHandle, - operationDesc, - Adesc, - Bdesc, - Cdesc, - Ddesc, - pref, - maxNumTraversal, - heuristicResultsArray, - &nbAlgoIds); - cublasLtMatmulPreferenceDestroy(pref); - printf("return %d and run heuristic algo\n", nbAlgoIds); - for (int i = 0; i < nbAlgoIds; i++) { - if (heuristicResultsArray[i].state == 
CUBLAS_STATUS_SUCCESS) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Ddesc, - heuristicResultsArray[i].algo, - kernelRepeats, - workSpace, - workSpaceSize, - perfResults[AlgoCount], - stream, - startEvent, - stopEvent); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - - // workspacesize==0 - printf("workspacesize==0, run %d algos\n", AlgoCountRestrict); - for (int i = 0; i < AlgoCountRestrict && i < (maxNumTraversal - nbAlgoIds); i++) { - status = customMatmulRun(ltHandle, - operationDesc, - alpha, /* host or device pointer */ - A, - Adesc, - B, - Bdesc, - beta, /* host or device pointer */ - C, - Cdesc, - C, - Ddesc, - algosRestrict[i], - kernelRepeats, - NULL, - 0, - perfResults[AlgoCount], - stream, - startEvent, - stopEvent); - perfResults[AlgoCount].status = status; - if (status == CUBLAS_STATUS_SUCCESS) { - AlgoCount++; - } - } - } - - // Sort the results per run duration - std::sort(perfResults, perfResults + AlgoCount, time_compare); - // Print timing and perf details - for (int i = 0, hasPrint = 1; i < AlgoCount; i++) { - printf("result %03d : ", i); - hasPrint = printPerfStructure(batch_size, - seq_len, - head_num, - size_per_head, - m, - n, - k, - perfResults[i], - fout, - data_type, - hasPrint, - batchCount); - } - -CLEANUP: - // Descriptors are no longer needed as all GPU work was already enqueued - if (Cdesc) { - cublasLtMatrixLayoutDestroy(Cdesc); - } - if (Bdesc) { - cublasLtMatrixLayoutDestroy(Bdesc); - } - if (Adesc) { - cublasLtMatrixLayoutDestroy(Adesc); - } - if (operationDesc) { - cublasLtMatmulDescDestroy(operationDesc); - } - if (startEvent) { - cudaEventDestroy(startEvent); - } - if (stopEvent) { - cudaEventDestroy(stopEvent); - } - return status == CUBLAS_STATUS_SUCCESS ? 
0 : 1; -} - -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const float* A, - const float* B, - const float* beta, /* host pointer */ - float* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); - -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const half* alpha, /* host pointer */ - const half* A, - const half* B, - const half* beta, /* host pointer */ - half* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); - -#ifdef ENABLE_BF16 -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const __nv_bfloat16* A, - const __nv_bfloat16* B, - const float* beta, /* host pointer */ - __nv_bfloat16* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); -#endif - -#ifdef ENABLE_FP8 -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const __nv_fp8_e4m3* A, - const __nv_fp8_e4m3* B, - const float* beta, /* host pointer */ - __nv_fp8_e4m3* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); -#endif - -template int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const float* alpha, /* host pointer */ - const half* A, - const half* B, - const float* beta, /* host pointer */ - half* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t strideD); - -size_t calGemmTestBufSizeInByte(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int int8_mode, - CublasDataType data_type) -{ - size_t buf_size_in_byte; - if (int8_mode > 0) { - int m = batch_size * seq_len; - int n = head_num * size_per_head; - int k = n; - - size_t size1 = 3 * (m * k * sizeof(int8_t) + k * n * sizeof(int8_t) + m * n * sizeof(int)); - size_t size2 = batch_size * head_num - * (seq_len * size_per_head * sizeof(int8_t) + size_per_head * seq_len * sizeof(int8_t) - + seq_len * seq_len * sizeof(int)); - size_t size3 = batch_size * head_num - * (seq_len * seq_len * sizeof(int8_t) + seq_len * size_per_head * sizeof(int8_t) - + seq_len * size_per_head * sizeof(int)); - size_t size4 = m * k * sizeof(int8_t) + k * inter_size * sizeof(int8_t) + m * inter_size * sizeof(int); - size_t size5 = m * k * sizeof(int8_t) + k * vocab_size 
* sizeof(int8_t) + m * vocab_size * sizeof(int); - buf_size_in_byte = size1 > size2 ? size1 : size2; - buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3; - buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4; - buf_size_in_byte = buf_size_in_byte > size5 ? buf_size_in_byte : size5; - } - else { - size_t m = batch_size * seq_len; - size_t n = head_num * size_per_head; - size_t k = n; - // TODO need to add bfloat16 here - int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - size_t size1 = 3 * (m * k + k * n + m * n) * wordSize; - size_t size2 = (size_t)batch_size * (size_t)head_num - * ((size_t)seq_len * (size_t)seq_len + (size_t)seq_len * (size_t)size_per_head - + (size_t)seq_len * (size_t)size_per_head) - * (size_t)wordSize; - size_t size3 = (m * k + k * inter_size + m * inter_size) * wordSize; - size_t size4 = (m * k + k * vocab_size + m * vocab_size) * wordSize; - buf_size_in_byte = size1 > size2 ? size1 : size2; - buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3; - buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4; - buf_size_in_byte += - ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0); - } - return buf_size_in_byte; -} - -size_t calGemmTestBufSizeInByteXlnet( - int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16) -{ - int M[10] = {0}; - int N[10] = {0}; - int K[10] = {0}; - int batchCount[10] = {0}; - - // gemm1 - M[0] = hidden_units; - N[0] = seq_len * batch_size; - K[0] = hidden_units; - batchCount[0] = 3; - - // gemm2 - M[1] = hidden_units; - N[1] = seq_len * 2; - K[1] = hidden_units; - batchCount[1] = 1; - - // gemm3 - M[2] = seq_len; - N[2] = seq_len; - K[2] = size_per_head; - batchCount[2] = batch_size * head_num; - - // gemm4 - M[3] = seq_len * 2; - N[3] = seq_len; - K[3] = size_per_head; - batchCount[3] = batch_size * head_num; - - // gemm5 - M[4] = 2; - N[4] = seq_len; - K[4] = size_per_head; - batchCount[4] = batch_size * head_num; - - // gemm6 - M[5] = head_num; - N[5] = seq_len; - K[5] = 2; - // gemm7 - M[6] = size_per_head; - N[6] = seq_len; - K[6] = seq_len; - batchCount[6] = batch_size * head_num; - - // gemm8 - M[7] = hidden_units; - N[7] = seq_len; - K[7] = hidden_units; - batchCount[7] = batch_size; - - // gemm9 - M[8] = inter_size; - N[8] = seq_len; - K[8] = hidden_units; - batchCount[8] = batch_size; - - // gemm10 - M[9] = hidden_units; - N[9] = seq_len; - K[9] = inter_size; - batchCount[9] = batch_size; - - size_t max_size = 0; - - for (int i = 0; i < 10; ++i) { - int m = M[i], n = N[i], k = K[i]; - size_t size = (M[i] * N[i] + M[i] * K[i] + N[i] * K[i]) * batchCount[i]; - if (size > max_size) { - max_size = size; - } - } - - int size_per_ele = 4; - if (is_fp16 == true) { - size_per_ele = 2; - } - return max_size * size_per_ele; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gemm_func.h b/src/turbomind/utils/gemm_test/gemm_func.h deleted file mode 100644 index b33ae2132b..0000000000 --- a/src/turbomind/utils/gemm_test/gemm_func.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "encoder_igemm_func.h" // TODO(bhsueh) Remove this include -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include -#include -#include -#ifdef ENABLE_BF16 -#include -#endif -#ifdef ENABLE_FP8 -#include -#endif -#ifdef __linux__ -#include -#include -#endif -#include -#include -#include - -namespace turbomind { - -// Scale Type Converter -// is_fp16_compute_type is only valid when T = half -template -struct ScaleTypeConverter { - using Type = float; -}; - -template<> -struct ScaleTypeConverter { - using Type = half; -}; - -template -int LtHgemmCustomFind(cublasLtHandle_t ltHandle, - int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const scaleT* alpha, /* host pointer */ - const T* A, - const T* B, - const scaleT* beta, /* host pointer */ - T* C, - void* workSpace, - size_t workSpaceSize, - FILE* fout, - customMatmulPerf_t perfResults[], - int AlgoCombinations, - cudaDataType_t dtype_fp8 = CUDA_R_32F, - int batchCount = 1, - int64_t strideA = 0, - int64_t strideB = 0, - int64_t strideD = 0); - -size_t calGemmTestBufSizeInByte(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int int8_mode, - CublasDataType data_type); - -size_t calGemmTestBufSizeInByteXlnet( - int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16); - -int printPerfStructure(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int m, - int n, - int k, - const customMatmulPerf_t& perf, - FILE* fout, - CublasDataType data_type, - int hasPrint, - int batch_count = 1); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gpt_gemm_func.cc b/src/turbomind/utils/gemm_test/gpt_gemm_func.cc deleted file mode 100644 index 68e665930f..0000000000 --- a/src/turbomind/utils/gemm_test/gpt_gemm_func.cc +++ /dev/null @@ -1,811 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -bool isSparseGemmAvailable(size_t m, size_t n, size_t k) -{ - return m % 8 == 0 && n % 8 == 0 && k % 8 == 0; -} - -template -void generate_gpt_gemm_config(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend) -{ - FT_CHECK(head_num % tensor_para_size == 0); - void* cublas_workspace; - void* buffer; - int workSpaceSize; -#if 0 - bool workspace_flag = std::is_same::value; -#ifdef ENABLE_FP8 - workspace_flag = workspace_flag || std::is_same::value; -#endif -#if ENABLE_BF16 - workspace_flag = workspace_flag || std::is_same::value; -#endif -#endif - // algorithms with workspace perform worse than evaluated - const bool workspace_flag = 0; - if (workspace_flag) { - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - // if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - // { - // int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - // fclose(fd); - // fd = fopen(GEMM_CONFIG, "w+"); - // fprintf(fd, "%s", config[0].c_str()); - // for (uint i = startIdx; i < config.size(); i++) { - // fprintf(fd, "%s", config[i].c_str()); - // } - // line_count = config.size() - (GEMM_NUM + 3); - // } - } - - const int hidden_units = head_num * size_per_head; - const int local_head_num = head_num / tensor_para_size; - const int local_hidden_units = local_head_num * size_per_head; - const int max_input_len_padded = (max_input_len + 15) / 16 * 16; - const int gemm_num = 11; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num]; - int64_t strideA[gemm_num]; - int64_t strideB[gemm_num]; - int64_t strideD[gemm_num]; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - // gemm 0 - M[0] = batch_size * beam_width * max_input_len; - K[0] = hidden_units; - N[0] = 3 * local_hidden_units; - batchCount[0] = 1; - strideA[0] = 0; - strideB[0] = 0; - strideD[0] = 0; - strcpy(mess[0], "context from_tensor * weightQKV"); - - // gemm 1 - M[1] = max_input_len_padded; - K[1] = size_per_head; - N[1] = max_input_len_padded; - batchCount[1] = batch_size * beam_width * local_head_num; - strideA[1] = max_input_len_padded * size_per_head; - strideB[1] = max_input_len_padded * size_per_head; - strideD[1] = max_input_len_padded * max_input_len_padded; - strcpy(mess[1], "context batch gemm Q*K^T"); - - // gemm 2 - M[2] = max_input_len_padded; - K[2] = max_input_len_padded; - N[2] = size_per_head; - batchCount[2] = batch_size * beam_width * local_head_num; - strideA[2] = max_input_len_padded * size_per_head; - strideB[2] = max_input_len_padded * max_input_len_padded; - strideD[2] 
= max_input_len_padded * size_per_head; - strcpy(mess[2], "context batch gemm QK*V^T"); - - // gemm 3 - M[3] = batch_size * beam_width * max_input_len; - K[3] = local_hidden_units; - N[3] = hidden_units; - batchCount[3] = 1; - strideA[3] = 0; - strideB[3] = 0; - strideD[3] = 0; - strcpy(mess[3], "context attr * output_kernel"); - - // gemm 4 - M[4] = batch_size * beam_width * max_input_len; - K[4] = hidden_units; - N[4] = inter_size / tensor_para_size; - batchCount[4] = 1; - strideA[4] = 0; - strideB[4] = 0; - strideD[4] = 0; - strcpy(mess[4], "context ffn gemm 1"); - - // gemm 5 - M[5] = batch_size * beam_width * max_input_len; - K[5] = inter_size / tensor_para_size; - N[5] = hidden_units; - batchCount[5] = 1; - strideA[5] = 0; - strideB[5] = 0; - strideD[5] = 0; - strcpy(mess[5], "context ffn gemm 2"); - - // gemm 6 - M[6] = batch_size * beam_width; - K[6] = hidden_units; - N[6] = 3 * local_hidden_units; - batchCount[6] = 1; - strideA[6] = 0; - strideB[6] = 0; - strideD[6] = 0; - strcpy(mess[6], "from_tensor * weightQKV"); - - // gemm 7 - M[7] = batch_size * beam_width; - K[7] = local_hidden_units; - N[7] = hidden_units; - batchCount[7] = 1; - strideA[7] = 0; - strideB[7] = 0; - strideD[7] = 0; - strcpy(mess[7], "attr * output_kernel"); - - // gemm 8 - M[8] = batch_size * beam_width; - K[8] = hidden_units; - N[8] = inter_size / tensor_para_size; - batchCount[8] = 1; - strideA[8] = 0; - strideB[8] = 0; - strideD[8] = 0; - strcpy(mess[8], "ffn gemm 1"); - - // gemm 9 - M[9] = batch_size * beam_width; - K[9] = inter_size / tensor_para_size; - N[9] = hidden_units; - batchCount[9] = 1; - strideA[9] = 0; - strideB[9] = 0; - strideD[9] = 0; - strcpy(mess[9], "ffn gemm 2"); - - // gemm 10 - M[10] = batch_size * beam_width; - K[10] = hidden_units; - N[10] = ceil(vocab_size / 8.) 
* 8 / tensor_para_size; - batchCount[10] = 1; - strideA[10] = 0; - strideB[10] = 0; - strideD[10] = 0; - strcpy(mess[10], "logits gemm"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t DType; - cudaDataType_t DType_FP8[gemm_num]; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - DType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - DType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - DType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif -#ifdef ENABLE_FP8 - else if (std::is_same::value) { - data_type = FP8_DATATYPE; - AType = CUDA_R_8F_E4M3; - BType = CUDA_R_8F_E4M3; - CType = CUDA_R_16BF; -#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE - DType = CUDA_R_16BF -#else - DType_FP8[0] = CUDA_R_8F_E4M3; - DType_FP8[1] = CUDA_R_16BF; - DType_FP8[2] = CUDA_R_8F_E4M3; - DType_FP8[3] = CUDA_R_16BF; - DType_FP8[4] = CUDA_R_16BF; - DType_FP8[5] = CUDA_R_16BF; -#ifdef FP8_MHA - DType_FP8[6] = CUDA_R_8F_E4M3; -#else - DType_FP8[6] = CUDA_R_16BF; -#endif - DType_FP8[7] = CUDA_R_16BF; - DType_FP8[8] = CUDA_R_16BF; - DType_FP8[9] = CUDA_R_16BF; -#endif - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - float alpha = (float)1.0f; - float beta = (float)0.0f; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "inner_shapeId, cluster_shapeId, " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "mma_shapeId, cga_shapeId, schedule_mode, " -#endif - "exec_time\n"); - } - - for (int i = 0; i < gemm_num; ++i) { - // tuning of context gemm and logits gemm is not working yet - if (i <= 5 || i == 10) { - continue; - } - int seq_len = i <= 5 ? 
max_input_len : 1; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i == 1) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - max_input_len, - max_input_len, - size_per_head, - &alpha, - d_B, - BType, - size_per_head, - max_input_len * size_per_head, - d_A, - AType, - size_per_head, - max_input_len * size_per_head, - &beta, - d_C, - CUDA_R_32F, // CType, - max_input_len, - max_input_len * max_input_len, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 2) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_per_head, - max_input_len, - max_input_len, - &alpha, - d_B, - BType, - size_per_head, - max_input_len * size_per_head, - d_A, - AType, - max_input_len, - max_input_len * max_input_len, - &beta, - d_C, - CType, - size_per_head, - max_input_len * size_per_head, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 10) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - k, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - sync_check_cuda_error(); - } - - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - // for fp8, compare cublaslt for all gemm kernels - if ((data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 10) || data_type == FP8_DATATYPE) { - printf("***cublasLt Gemm Testing Beign***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 10000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - // for gpt, computeType & scaleType should be FP32 - LtHgemmCustomFind(ltHandle, - batch_size * beam_width, - i == 1 || i == 2 ? 
max_input_len : 1, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS, - DType_FP8[i], - batchCount[i], - strideA[i], - strideB[i], - strideD[i]); - if (perfResults[0].time < exec_time) { - printPerfStructure(batch_size * beam_width, - seq_len, - head_num, - size_per_head, - n, - m, - k, - perfResults[0], - fd, - data_type, - 0, - batchCount[i]); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - sync_check_cuda_error(); - exec_times[i] = exec_time; - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6) && sizeof(T) == sizeof(half)) { - do_sparse_test = true; - } - if (do_sparse_test) { - printf("***cusparseLt Gemm Testing Begin***\n"); - // Only first 8 cases can be sparse - // - QKV kernel, Projection, FC1, FC2 in context or decoding. - const int spgemm_num = 8; - if (!isAppend) { - fd = fopen(SPGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - // gemm_num configs (cublas/cublasLt), first row is not included - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - // header line - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType " - "### batchCount, m, n, k, algoId, exec_time\n"); - } - - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - // let's make this optional - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < gemm_num; ++i) { - // skip qk or attn or logit gemms. - if (i == 1 || i == 2 || i == 10) { - continue; - } - - // seq_len is always 1 except context gemms. - int seq_len = i <= 5 ? 
max_input_len : 1; - - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - - if (n % 8 != 0) { - n = div_up(n, 8) * 8; // pad n to be multiple of 8 as FT does. - } - - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - T* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - - float exec_time = 99999.0f; - int fast_algo = 0; - if (isSparseGemmAvailable(m, n, k)) { - for (int alg = 0; alg < 4; ++alg) { - cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order)) - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - } - exec_time /= ites; - if (exec_time >= exec_times[i]) { - fast_algo = -1; - } - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); 
- } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - - printf("***GPT Gemm Testing End***\n"); - return; -} - -template void generate_gpt_gemm_config(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); - -template void generate_gpt_gemm_config(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); - -#ifdef ENABLE_BF16 -template void generate_gpt_gemm_config<__nv_bfloat16>(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); -#endif - -#ifdef ENABLE_FP8 -template void generate_gpt_gemm_config<__nv_fp8_e4m3>(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); -#endif - -size_t calGptGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - CublasDataType data_type) -{ - size_t buf_size_in_byte = 0; - const size_t hidden_units = head_num * size_per_head; - const size_t local_head_num = head_num / tensor_para_size; - const size_t local_hidden_units = local_head_num * size_per_head; - - // int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - // Because we always use float for some buffer, set the wordSize to float directly. - int wordSize = sizeof(float); - - size_t m = batch_size * beam_width * max_input_len; - std::vector buff_size; - // for context qkv gemm - buff_size.push_back(m * hidden_units + hidden_units * 3 * local_hidden_units + m * 3 * local_hidden_units); - // for context batch gemm - buff_size.push_back(m * local_hidden_units + m * local_hidden_units - + batch_size * beam_width * head_num * max_input_len * max_input_len); - // for context ffn gemm - buff_size.push_back(m * inter_size / tensor_para_size + hidden_units * inter_size / tensor_para_size - + m * hidden_units); - // for vocab - buff_size.push_back(m * hidden_units + hidden_units * ceil(vocab_size / 8.) * 8 / tensor_para_size - + m * ceil(vocab_size / 8.) * 8 / tensor_para_size); - - for (auto t : buff_size) { - buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t; - } - buf_size_in_byte *= wordSize; - buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE || data_type == FP8_DATATYPE) ? - CUBLAS_WORKSPACE_SIZE : - 0); - - return buf_size_in_byte; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/gpt_gemm_func.h b/src/turbomind/utils/gemm_test/gpt_gemm_func.h deleted file mode 100644 index bcbe131d8b..0000000000 --- a/src/turbomind/utils/gemm_test/gpt_gemm_func.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#ifdef ENABLE_BF16 -#include -#endif -#ifdef ENABLE_FP8 -#include -#endif -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_gpt_gemm_config(int batch_size, - int beam_width, - int seq_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend); - -size_t calGptGemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_input_len, - int head_num, - int size_per_head, - int inter_size, - int vocab_size, - int tensor_para_size, - CublasDataType data_type); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_gemm_func.cc b/src/turbomind/utils/gemm_test/swin_gemm_func.cc deleted file mode 100644 index b43f250b03..0000000000 --- a/src/turbomind/utils/gemm_test/swin_gemm_func.cc +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/swin_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer_in, bool isAppend) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - fprintf( - fd, - "batch_size seq_len head_num size_per_head dataType ### batchCount n m k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 7; - const int NUM_OF_BASIC_LAYERS = 4; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1, 1}; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - for (int basic_layer = 0; basic_layer < NUM_OF_BASIC_LAYERS; basic_layer++) { - // gemm1 - M[0] = batch_size * seq_len; - K[0] = head_num * size_per_head; - N[0] = 3 * K[0]; - strcpy(mess[0], "from_tensor * weightQ/K/V"); - - // gemm2 - M[1] = M[0]; - K[1] = K[0]; - N[1] = K[0]; - strcpy(mess[1], "attr * output_kernel"); - - // gemm3 - M[2] = M[0]; - K[2] = K[0]; - N[2] = 4 * K[0]; - strcpy(mess[2], "attr_output * inter_kernel"); - - // gemm3 - M[3] = M[0]; - K[3] = 4 * K[0]; - N[3] = K[0]; - strcpy(mess[3], "inter_matmul * output_kernel"); - - M[4] = M[0] / 4; - K[4] = 4 * K[0]; - N[4] = 2 * K[0]; - strcpy(mess[4], "patchMerge gemm"); - - M[5] = seq_len; - N[5] = seq_len; - K[5] = size_per_head; - batchCount[5] = batch_size * head_num; - strcpy(mess[5], "attention batched Gemm1"); - - M[6] = seq_len; - N[6] = size_per_head; - K[6] = seq_len; - batchCount[6] = batch_size * head_num; - strcpy(mess[6], "attention batched Gemm2"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - 
CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - for (int i = 0; i < gemm_num; ++i) { - // if(i != 0 && i != 5) continue; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - // array of pointer for batchedGemm - T* harray[12]; - harray[0] = (T*)buffer; - harray[1] = (T*)((char*)buffer + sizeof(T) * m * k); - harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k); - harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k); - harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n); - harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n); - harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n); - harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n); - harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n); - - T** darray = 0; - check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12)); - cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice); - T** dAarray = darray; - T** dBarray = darray + 4; - T** dCarray = darray + 8; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i < 5) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - d_B, - BType, - n, - d_A, - AType, - k, - &beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else if (i == 5) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - seq_len, - seq_len, - size_per_head, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - size_per_head, - seq_len * size_per_head, - &beta, - d_C, - CType, - seq_len, - seq_len * seq_len, - batch_size * head_num, - computeType, - static_cast(algo)); - } - else if (i == 6) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - size_per_head, - seq_len, - seq_len, - &alpha, - d_B, - BType, - size_per_head, - seq_len * size_per_head, - d_A, - AType, - seq_len, - seq_len * seq_len, - &beta, - d_C, - CType, - size_per_head, - seq_len * size_per_head, - batch_size * head_num, - computeType, - static_cast(algo)); - } - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - 
start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - } - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - // for fp16 and bf16, we compare cublasLt - if (i < 5 && data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - LtHgemmCustomFind(ltHandle, - batch_size, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure( - batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0); - exec_time = perfResults[0].time; - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - exec_times[i] = exec_time; - cudaFree(darray); - } - - if (basic_layer != NUM_OF_BASIC_LAYERS - 1) { - batch_size = batch_size / 4; - head_num = head_num * 2; - } - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Encoder Gemm Testing End***\n"); - return; -} - -template void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend); -template void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend); -#ifdef ENABLE_BF16 -template void generate_swin_gemm_config<__nv_bfloat16>( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend); -#endif - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_gemm_func.h b/src/turbomind/utils/gemm_test/swin_gemm_func.h deleted file mode 100644 index 815da7b197..0000000000 --- a/src/turbomind/utils/gemm_test/swin_gemm_func.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_swin_gemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_igemm_func.cc b/src/turbomind/utils/gemm_test/swin_igemm_func.cc deleted file mode 100644 index 08b28b1656..0000000000 --- a/src/turbomind/utils/gemm_test/swin_igemm_func.cc +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "swin_igemm_func.h" -#include - -namespace turbomind { - -static const char* showStatus(cublasStatus_t error) -{ - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - - return ""; -} - -static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b) -{ - return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time)); -} - -static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU) - cublasLtMatmulDesc_t operationDesc, - const void* alpha, /* host or device pointer */ - const void* A, - cublasLtMatrixLayout_t Adesc, - const void* B, - cublasLtMatrixLayout_t Bdesc, - const void* beta, /* host or device pointer */ - const void* C, - cublasLtMatrixLayout_t Cdesc, - void* D, - cublasLtMatrixLayout_t Ddesc, - const cublasLtMatmulAlgo_t& algo, - int kernelRepeats, - void* workSpace, - size_t workSpaceSizeInBytes, - customMatmulPerf_t& perfResults, - cudaStream_t stream) -{ - cublasLtMatmulHeuristicResult_t heurResult; - /* Looping over the Algo */ - int repeats = kernelRepeats; - cublasStatus_t algoStatus = - cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult); - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - if (heurResult.workspaceSize <= workSpaceSizeInBytes) { - cublasStatus_t oneRunStatus; - 
cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int loop = 0; loop < repeats; loop++) { - oneRunStatus = cublasLtMatmul(ltHandle, - operationDesc, - alpha, - A, - Adesc, - B, - Bdesc, - beta, - C, - Cdesc, - D, - Ddesc, - &algo, - workSpace, - workSpaceSizeInBytes, - stream); - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (oneRunStatus != CUBLAS_STATUS_SUCCESS) { - algoStatus = oneRunStatus; - } - float time = dur.count(); - // For the moment only add successful findings - if (algoStatus == CUBLAS_STATUS_SUCCESS) { - perfResults.algo = algo; - perfResults.time = time / repeats; - perfResults.workspaceSize = heurResult.workspaceSize; - perfResults.wavesCount = heurResult.wavesCount; - } - } - else { - // printf("not enough workspace! %ld\n", heurResult.workspaceSize); - algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace - } - } - else { - // printf("check fail!\n"); - } - return algoStatus; -} - -int igemm_config_INT8IO(int m, int n, int k, FILE* fout, void* buffer) -{ - printf("batchCount %d m %d n %d k %d\n", 1, m, n, k); - float alpha = 1.0f; - float beta = 0.0f; - - int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major - int8_t* d_B = d_A + m * k; // k * n, stored in column-major - int8_t* d_C = (int8_t*)(d_B + k * n); // m * n, stored in column-major - - cublasLtHandle_t ltHandle; - cublasLtCreate(<Handle); - - LtIgemmCustomFind(ltHandle, - m, - n, - k, - &alpha, /* host pointer */ - d_A, - d_B, - &beta, /* host pointer */ - d_C, - NULL, - 0, - fout); - - cublasLtDestroy(ltHandle); - return 0; -} - -int generate_swin_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend) -{ - - // ensure program running on SM >= 7.5 - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - if (!(prop.major >= 8 || (prop.major >= 7 && prop.minor >= 5))) { - printf("[ERROR] INT8 mode > 0 is only supported on device with sm >= 7.5\n "); - exit(-1); - } - printf("Device %s\n", prop.name); - - // check config - FILE* fout; - if (!isAppend) { - fout = fopen(IGEMM_CONFIG, "w+"); - fprintf( - fout, - "batch_size seq_len head_num size_per_head dataType ### batchCount m n k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n"); - } - else { - fout = fopen(IGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fout) != NULL) { - config.push_back(std::string(line)); - } - if (config.size() >= MAX_CONFIG_NUM * GEMM_NUM) { - int startIdx = config.size() - (MAX_CONFIG_NUM - 1) * GEMM_NUM; - fclose(fout); - fout = fopen(IGEMM_CONFIG, "w+"); - for (int i = startIdx; i < (int)config.size(); i++) { - fprintf(fout, "%s", config[i].c_str()); - } - } - } - - int m = batch_size * seq_len; - int n = head_num * size_per_head; - int k = n; - int batchCount; - const int NUM_OF_BASIC_LAYERS = 4; - - printf("***Swin IGemm Testing Begin***\n"); - - for (int basic_layer = 0; basic_layer < NUM_OF_BASIC_LAYERS; basic_layer++) { - printf("\n-----------------------------\n"); - batchCount = 1; - m = batch_size * seq_len; - k = head_num * size_per_head; - n = 3 * head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - 
printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = head_num * size_per_head; - k = head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = 4 * head_num * size_per_head; - k = head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - printf("\n-----------------------------\n"); - m = batch_size * seq_len; - n = head_num * size_per_head; - k = 4 * head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - - if (basic_layer != NUM_OF_BASIC_LAYERS - 1) { - printf("\n-----------------------------\n"); - batch_size = batch_size / 4; - head_num = head_num * 2; - m = batch_size * seq_len; - n = head_num * size_per_head; - k = 2 * head_num * size_per_head; - if (n % 32 != 0 || k % 32 != 0) { - printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k); - } - else { - igemm_config_INT8IO(m, n, k, fout, buffer); - } - } - printf("\n-----------------------------\n"); - } - - fclose(fout); - printf("\n-----------------------------\n"); - printf("***Swin IGemm Testing End***\n"); - return 0; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/swin_igemm_func.h b/src/turbomind/utils/gemm_test/swin_igemm_func.h deleted file mode 100644 index 21603dc57d..0000000000 --- a/src/turbomind/utils/gemm_test/swin_igemm_func.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/encoder_igemm_func.h" -#include -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -/* CAUTION : must match cublasLtMatmulTile_t */ -// const char* const matmulTileName[] = { -// "UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8", "8x64", "16x32", -// "32x16", "64x8", "32x32", "32x64", "64x32", "32x128", "64x64", "128x32", "64x128", -// "128x64", "64x256", "128x128", "256x64", "64x512", "128x256", "256x128", "512x64", -// }; - -int generate_swin_igemm_config( - int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/t5_gemm_func.cc b/src/turbomind/utils/gemm_test/t5_gemm_func.cc deleted file mode 100644 index 44d26a37b7..0000000000 --- a/src/turbomind/utils/gemm_test/t5_gemm_func.cc +++ /dev/null @@ -1,837 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "src/turbomind/utils/gemm_test/t5_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -bool isSparseGemmAvailable(size_t m, size_t n, size_t k) -{ - return m % 8 == 0 && n % 8 == 0 && k % 8 == 0; -} - -template -void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type) -{ - FT_CHECK(encoder_head_num % tensor_para_size == 0); - FT_CHECK(decoder_head_num % tensor_para_size == 0); - - void* cublas_workspace; - void* buffer; - int workSpaceSize; -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row 
is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 12; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int batchCount[gemm_num]; - char mess[gemm_num][256]; - float exec_times[gemm_num]; - - // gemm 0 - M[0] = batch_size * max_mem_seq_len; - K[0] = encoder_d_model; - N[0] = encoder_head_num / tensor_para_size * encoder_size_per_head; - batchCount[0] = 3; - strcpy(mess[0], "encoder from_tensor * batched gemm weightQKV"); - - // gemm 1 - M[1] = max_mem_seq_len; - K[1] = encoder_size_per_head; - N[1] = max_mem_seq_len; - batchCount[1] = batch_size * encoder_head_num / tensor_para_size; - strcpy(mess[1], "encoder batch strided gemm Q*K^T"); - - // gemm 2 - M[2] = max_mem_seq_len; - K[2] = max_mem_seq_len; - N[2] = encoder_size_per_head; - batchCount[2] = batch_size * encoder_head_num / tensor_para_size; - strcpy(mess[2], "encoder batch strided gemm QK*V^T"); - - // gemm 3 - M[3] = batch_size * max_mem_seq_len; - K[3] = encoder_head_num / tensor_para_size * encoder_size_per_head; - N[3] = encoder_d_model; - batchCount[3] = 1; - strcpy(mess[3], "encoder attr * output_kernel"); - - // gemm 4 - M[4] = batch_size * max_mem_seq_len; - K[4] = encoder_d_model; - N[4] = encoder_inter_size / tensor_para_size; - batchCount[4] = 1; - strcpy(mess[4], "encoder ffn gemm 1"); - - // gemm 5 - M[5] = batch_size * max_mem_seq_len; - K[5] = encoder_inter_size / tensor_para_size; - N[5] = encoder_d_model; - batchCount[5] = 1; - strcpy(mess[5], "encoder ffn gemm 2"); - - // gemm 6 - M[6] = batch_size * beam_width; - K[6] = decoder_d_model; - N[6] = 3 * decoder_head_num / tensor_para_size * decoder_size_per_head; - batchCount[6] = 1; - strcpy(mess[6], "from_tensor * weightQKV"); - - // gemm 7 - M[7] = batch_size * beam_width; - K[7] = decoder_head_num / tensor_para_size * decoder_size_per_head; - N[7] = decoder_d_model; - batchCount[7] = 1; - strcpy(mess[7], "attr * output_kernel"); - - // gemm 8 - M[8] = batch_size * beam_width; - K[8] = decoder_d_model; - N[8] = decoder_inter_size / tensor_para_size; - batchCount[8] = 1; - strcpy(mess[8], "ffn gemm 1"); - - // gemm 9 - M[9] = batch_size * beam_width; - K[9] = decoder_inter_size / tensor_para_size; - N[9] = decoder_d_model; - batchCount[9] = 1; - strcpy(mess[9], "ffn gemm 2"); - - // gemm 10 - size_t decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size / 1. / tensor_para_size) * tensor_para_size); - if (!std::is_same::value) { - decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size_padded / 8.) 
* 8); - } - M[10] = batch_size * beam_width; - K[10] = decoder_d_model; - N[10] = decoder_vocab_size_padded / tensor_para_size; - batchCount[10] = 1; - strcpy(mess[10], "logits gemm"); - - // gemm 11 - M[11] = batch_size * max_mem_seq_len; - K[11] = encoder_d_model; - N[11] = encoder_head_num / tensor_para_size * encoder_size_per_head; - batchCount[11] = 1; - strcpy(mess[11], "encoder from_tensor * splited qkv weight"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - float f_alpha = (float)1.0f; - float f_beta = (float)0.0f; - - half h_alpha = (half)(1.0f); - half h_beta = (half)(0.0f); - - void* alpha = computeType == CUDA_R_16F ? (void*)(&h_alpha) : (void*)(&f_alpha); - void* beta = computeType == CUDA_R_16F ? (void*)(&h_beta) : (void*)(&f_beta); - - printf("***Encoder Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - int seq_len = (i <= 5 || i == 11) ? max_mem_seq_len : 1; - int head_num = ((i <= 5 || i == 11) ? encoder_head_num : decoder_head_num) / tensor_para_size; - int size_per_head = (i <= 5 || i == 11) ? 
encoder_size_per_head : decoder_size_per_head; - - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - // array of pointer for batchedGemm - T* harray[12]; - harray[0] = (T*)buffer; - harray[1] = (T*)((char*)buffer + sizeof(T) * m * k); - harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k); - harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k); - harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n); - harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n); - harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n); - harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n); - harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n); - - T** darray = 0; - check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12)); - cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice); - T** dAarray = darray; - T** dBarray = darray + 4; - T** dCarray = darray + 8; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i == 0) { - status = cublasGemmBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - alpha, - (const void* const*)dBarray, - BType, - n, - (const void* const*)dAarray, - AType, - k, - beta, - (void* const*)dCarray, - CType, - n, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 1) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - max_mem_seq_len, - max_mem_seq_len, - encoder_size_per_head, - alpha, - d_B, - BType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - d_A, - AType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - beta, - d_C, - CType, // CType, - max_mem_seq_len, - max_mem_seq_len * max_mem_seq_len, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 2) { - status = cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - encoder_size_per_head, - max_mem_seq_len, - max_mem_seq_len, - alpha, - d_B, - BType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - d_A, - AType, - max_mem_seq_len, - max_mem_seq_len * max_mem_seq_len, - beta, - d_C, - CType, - encoder_size_per_head, - max_mem_seq_len * encoder_size_per_head, - batchCount[i], - computeType, - static_cast(algo)); - } - else if (i == 10) { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n, - m, - k, - alpha, - d_B, - BType, - k, - d_A, - AType, - k, - beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - else { - status = cublasGemmEx(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - alpha, - d_B, - BType, - n, - d_A, - AType, - k, - beta, - d_C, - CType, - n, - computeType, - static_cast(algo)); - } - - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); 
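        /* Autotuning flow for this GEMM shape (descriptive note, not part of the
         * original file): every cublas algo id in [startAlgo, endAlgo] is timed over
         * `ites` launches and the fastest becomes fast_algo; for most half/bf16 cases
         * a cublasLt search follows, and whichever wins is recorded in GEMM_CONFIG. */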
- if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } - } - sync_check_cuda_error(); - } - - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - using scaleT = float; - - if (is_fp16_compute_type) { - using scaleT = typename ScaleTypeConverter::Type; - } - - // for fp16 and bf16, we compare cublasLt - if (data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 0 && i != 10) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - // for t5, computeType & scaleType should be FP32 - if (is_fp16_compute_type) { - using scaleT = typename ScaleTypeConverter::Type; - scaleT alpha_scale = (scaleT)1.0f; - scaleT beta_scale = (scaleT)0.0f; - - LtHgemmCustomFind(ltHandle, - m, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &(alpha_scale), - d_B, - d_A, - &(beta_scale), - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - } - else { - LtHgemmCustomFind(ltHandle, - m, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &(f_alpha), - d_B, - d_A, - &(f_beta), - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - } - - if (perfResults[0].time < exec_time) { - printPerfStructure(batch_size * (i <= 5 || i == 1 ? 1 : beam_width), - seq_len, - head_num, - size_per_head, - n, - m, - k, - perfResults[0], - fd, - data_type, - 0); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * (i <= 5 || i == 1 ? 1 : beam_width), - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size * (i <= 5 || i == 1 ? 1 : beam_width), - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - sync_check_cuda_error(); - exec_times[i] = exec_time; - } - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - -#ifdef SPARSITY_ENABLED - bool do_sparse_test = false; - if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6) && sizeof(T) == sizeof(half)) { - do_sparse_test = true; - } - if (do_sparse_test) { - printf("***cusparseLt Gemm Testing Begin***\n"); - // Only first 8 cases can be sparse - // - QKV kernel, Projection, FC1, FC2 in context or decoding. 
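    /* Sparse path summary (descriptive note, not part of the original file): the
     * weight operand of each remaining GEMM is pruned to 2:4 structured sparsity and
     * compressed, cuSPARSELt alg ids 0-3 are timed over `ites` launches, and the
     * result is written to SPGEMM_CONFIG with algo -1 whenever the dense cublas
     * result measured above is still faster. */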
- const int spgemm_num = 8; - if (!isAppend) { - fd = fopen(SPGEMM_CONFIG, "w+"); - } - else { - fd = fopen(SPGEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - // gemm_num configs (cublas/cublasLt), first row is not included - if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num); - fclose(fd); - fd = fopen(SPGEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (spgemm_num + 3); - } - } - if (line_count == 0) { - // header line - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType " - "### batchCount, m, n, k, algoId, exec_time\n"); - } - - cusparseLtHandle_t handle; - CHECK_CUSPARSE(cusparseLtInit(&handle)); - cusparseOrder_t order = CUSPARSE_ORDER_COL; - cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; - cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE; - // let's make this optional - cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F; - unsigned alignment = 16; - cudaStream_t stream = 0; - float alpha2 = 1.0f; - float beta2 = 0.0f; - for (int i = 0; i < gemm_num; ++i) { - // skip qk or attn or logit gemms. - if (i == 1 || i == 2 || i == 10) { - continue; - } - - // seq_len is always 1 except context gemms. - int seq_len = i <= 5 ? max_mem_seq_len : 1; - int head_num = (i <= 5 ? encoder_head_num : decoder_head_num) / tensor_para_size; - int size_per_head = i <= 5 ? encoder_size_per_head : decoder_size_per_head; - - // to be compatible with spgemm wrapper, we let A be the weight matrix - // so m and n are swapped - // A: mxk B: kxn C:mxn - int m = N[i], n = M[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - T* dA_compressed; - { - cusparseLtMatDescriptor_t mat_A; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream)) - size_t compressed_size; - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size)) - check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size)); - CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream)) - } - - float exec_time = 99999.0f; - int fast_algo = 0; - if (isSparseGemmAvailable(m, n, k)) { - for (int alg = 0; alg < 4; ++alg) { - cudaDeviceSynchronize(); - cusparseLtMatDescriptor_t mat_A, mat_B, mat_C; - void* d_workspace = nullptr; - int num_streams = 1; - cudaStream_t streams[1] = {stream}; - CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit( - &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order)) - CHECK_CUSPARSE( - cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order)) - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - // initializing MatDesc takes a lot of time - // and these descs can be 
stored to other place - // whereas storing MatMulPlan to other place will cause errors - cusparseLtMatmulDescriptor_t matmul; - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type)) - CHECK_CUSPARSE( - cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - size_t workspace_size; - CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size)) - CHECK_CUSPARSE(cusparseLtMatmul(&handle, - &plan, - &alpha2, - dA_compressed, - d_B, - &beta2, - d_C, - d_C, - d_workspace, - streams, - num_streams)) - CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan)) - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - printf("algo_%d costs %.3fms \n", alg, dur.count() / ites); - if (dur.count() < exec_time) { - exec_time = dur.count(); - fast_algo = alg; - } - } - } - exec_time /= ites; - if (exec_time >= exec_times[i]) { - fast_algo = -1; - } - printf("fast_algo %d\n", fast_algo); - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d %f\n", - batch_size * beam_width, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - m, - n, - k, - fast_algo, - exec_time); - cudaFree(dA_compressed); - } - CHECK_CUSPARSE(cusparseLtDestroy(&handle)) - fclose(fd); - printf("***cusparseLt Gemm Testing End***\n"); - } -#endif - - printf("***T5 Gemm Testing End***\n"); - return; -} - -template void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); - -template void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); - -#ifdef ENABLE_BF16 -template void generate_t5_gemm_config<__nv_bfloat16>(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); -#endif - -size_t calT5GemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - CublasDataType data_type) -{ - const size_t local_encoder_head_num = encoder_head_num / tensor_para_size; - const size_t 
local_encoder_hidden_units = local_encoder_head_num * encoder_size_per_head; - const size_t local_encoder_inter_size = encoder_inter_size / tensor_para_size; - const size_t local_decoder_head_num = decoder_head_num / tensor_para_size; - const size_t local_decoder_hidden_units = local_decoder_head_num * decoder_size_per_head; - const size_t local_decoder_inter_size = decoder_inter_size / tensor_para_size; - - size_t m = batch_size * max_mem_seq_len; - std::vector buff_size; - - // encoder qkv gemm - buff_size.push_back( - 3 * (m * encoder_d_model + encoder_d_model * local_encoder_hidden_units + m * local_encoder_hidden_units)); - // encoder batch gemm - buff_size.push_back(m * local_encoder_hidden_units + m * local_encoder_hidden_units - + batch_size * beam_width * local_encoder_head_num * max_mem_seq_len * max_mem_seq_len); - // encoder ffn gemm - buff_size.push_back(m * local_encoder_inter_size + encoder_d_model * local_encoder_inter_size - + m * encoder_d_model); - - m = batch_size * beam_width; - // decoder qkv gemm - buff_size.push_back(m * decoder_d_model + decoder_d_model * 3 * local_decoder_hidden_units - + 3 * m * local_decoder_hidden_units); - // decoder cross mem gemm - buff_size.push_back(m * max_mem_seq_len * encoder_d_model + encoder_d_model * local_decoder_hidden_units - + m * max_mem_seq_len * local_decoder_hidden_units); - // decoder ffn gemm - buff_size.push_back(m * local_decoder_inter_size + decoder_d_model * local_decoder_inter_size - + m * decoder_d_model); - // decoder vocab gemm - size_t decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size / 1. / tensor_para_size) * tensor_para_size); - if (data_type != FLOAT_DATATYPE) { - decoder_vocab_size_padded = ((size_t)ceil(decoder_vocab_size_padded / 8.) * 8); - } - buff_size.push_back(m * decoder_d_model + decoder_d_model * decoder_vocab_size_padded / tensor_para_size - + m * decoder_vocab_size_padded / tensor_para_size); - - size_t buf_size_in_byte = 0; - // int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half)); - // Because we always use float for some buffer, set the wordSize to float directly. - int wordSize = sizeof(float); - for (auto t : buff_size) { - buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t; - } - buf_size_in_byte *= wordSize; - buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0); - - return buf_size_in_byte; -} - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/t5_gemm_func.h b/src/turbomind/utils/gemm_test/t5_gemm_func.h deleted file mode 100644 index e0883095ae..0000000000 --- a/src/turbomind/utils/gemm_test/t5_gemm_func.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_t5_gemm_config(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - void* buffer_in, - bool isAppend, - bool is_fp16_compute_type); - -size_t calT5GemmTestBufSizeInByte(int batch_size, - int beam_width, - int max_mem_seq_len, - int encoder_d_model, - int encoder_head_num, - int encoder_size_per_head, - int encoder_inter_size, - int decoder_d_model, - int decoder_head_num, - int decoder_size_per_head, - int decoder_inter_size, - int decoder_vocab_size, - int tensor_para_size, - CublasDataType data_type); - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/xlnet_gemm_func.cc b/src/turbomind/utils/gemm_test/xlnet_gemm_func.cc deleted file mode 100644 index 885b693c29..0000000000 --- a/src/turbomind/utils/gemm_test/xlnet_gemm_func.cc +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/gemm_test/xlnet_gemm_func.h" -#include "src/turbomind/macro.h" -#include - -namespace turbomind { - -template -void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend) -{ - void* cublas_workspace; - void* buffer; - int workSpaceSize; - -#ifdef ENABLE_BF16 - if (std::is_same::value || std::is_same::value) { -#else - if (std::is_same::value) { -#endif // ENABLE_BF16 - // cublas_workspace_ should be the start pointer of cudaMalloc() - // to ensure 16B alignemnet - cublas_workspace = buffer_in; - buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE); - workSpaceSize = CUBLAS_WORKSPACE_SIZE; - } - else { - cublas_workspace = nullptr; - buffer = buffer_in; - workSpaceSize = 0; - } - - struct cudaDeviceProp prop; - check_cuda_error(cudaGetDeviceProperties(&prop, 0)); - printf("Device %s\n", prop.name); - - // check config - FILE* fd; - int line_count = 0; - if (!isAppend) { - fd = fopen(GEMM_CONFIG, "w+"); - } - else { - fd = fopen(GEMM_CONFIG, "a+"); - std::vector config; - char line[1024]; - while (fgets(line, 1024, fd) != NULL) { - config.push_back(std::string(line)); - } - line_count = config.size(); - if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included - { - int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM); - fclose(fd); - fd = fopen(GEMM_CONFIG, "w+"); - fprintf(fd, "%s", config[0].c_str()); - for (uint i = startIdx; i < config.size(); i++) { - fprintf(fd, "%s", config[i].c_str()); - } - line_count = config.size() - (GEMM_NUM + 3); - } - } - - const int gemm_num = 10; - int M[gemm_num]; - int N[gemm_num]; - int K[gemm_num]; - int lda[gemm_num]; - int strideA[gemm_num]; - int ldb[gemm_num]; - int strideB[gemm_num]; - int ldc[gemm_num]; - int strideC[gemm_num]; - cublasOperation_t transa[gemm_num] = {CUBLAS_OP_N, - CUBLAS_OP_N, - CUBLAS_OP_T, - CUBLAS_OP_T, - CUBLAS_OP_T, - CUBLAS_OP_T, - CUBLAS_OP_N, - CUBLAS_OP_T, - CUBLAS_OP_N, - CUBLAS_OP_N}; - cublasOperation_t transb[gemm_num] = {CUBLAS_OP_N}; - int batchCount[gemm_num] = {1}; - char mess[gemm_num][256]; - - // gemm1 - M[0] = hidden_units_; - N[0] = seq_len * batch_size; - K[0] = hidden_units_; - lda[0] = hidden_units_; - strideA[0] = hidden_units_ * hidden_units_; - ldb[0] = hidden_units_; - strideB[0] = 0; - ldc[0] = hidden_units_; - strideC[0] = seq_len * batch_size * hidden_units_; - batchCount[0] = 3; - strcpy(mess[0], "from_tensor * weightQ/K/V"); - - // gemm2 - M[1] = hidden_units_; - N[1] = seq_len * 2; - K[1] = hidden_units_; - batchCount[1] = 1; - strcpy(mess[1], " k_head_r_"); - - // gemm3 - M[2] = seq_len; - N[2] = seq_len; - K[2] = size_per_head; - lda[2] = size_per_head; - strideA[2] = seq_len * size_per_head; - ldb[2] = size_per_head; - strideB[2] = seq_len * size_per_head; - ldc[2] = seq_len; - strideC[2] = seq_len * seq_len; - batchCount[2] = batch_size * head_num; - strcpy(mess[2], "ac"); - - // gemm4 - M[3] = seq_len * 2; - N[3] = seq_len; - K[3] = size_per_head; - lda[3] = size_per_head; - strideA[3] = seq_len * 2 * size_per_head; - ldb[3] = size_per_head; - strideB[3] = seq_len * size_per_head; - ldc[3] = seq_len * 2; - strideC[3] = seq_len * seq_len * 2; - - batchCount[3] = batch_size * head_num; - strcpy(mess[3], "bd"); - - // gemm5 - M[4] = 2; - N[4] = seq_len; - K[4] = size_per_head; - lda[4] = size_per_head; - strideA[4] = 2 * size_per_head; - ldb[4] = size_per_head; - strideB[4] 
= seq_len * size_per_head; - ldc[4] = 2; - strideC[4] = seq_len * 2; - batchCount[4] = batch_size * head_num; - strcpy(mess[4], "ef"); - - // gemm6 - M[5] = head_num; - N[5] = seq_len; - K[5] = 2; - lda[5] = 2; - strideA[5] = 2 * head_num; - ldb[5] = 2; - strideB[5] = seq_len * 2; - ldc[5] = head_num; - strideC[5] = seq_len * head_num; - - batchCount[5] = batch_size * seq_len; - strcpy(mess[5], "seg_mat"); - // gemm7 - M[6] = size_per_head; - N[6] = seq_len; - K[6] = seq_len; - lda[6] = size_per_head; - strideA[6] = seq_len * size_per_head; - ldb[6] = seq_len; - strideB[6] = seq_len * seq_len; - ldc[6] = size_per_head; - strideC[6] = seq_len * size_per_head; - - batchCount[6] = batch_size * head_num; - strcpy(mess[6], "attn_vec"); - - // gemm8 - M[7] = hidden_units_; - N[7] = seq_len * batch_size; - K[7] = hidden_units_; - lda[7] = hidden_units_; - batchCount[7] = 1; - strcpy(mess[7], "attn_out"); - - // gemm9 - M[8] = inter_size_; - N[8] = seq_len * batch_size; - K[8] = hidden_units_; - batchCount[8] = 1; - strcpy(mess[8], "output_fc1_"); - - // gemm10 - M[9] = hidden_units_; - N[9] = seq_len * batch_size; - K[9] = inter_size_; - batchCount[9] = 1; - - strcpy(mess[9], "output_fc2_"); - - cublasHandle_t cublas_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - cublasLtHandle_t ltHandle; - check_cuda_error(cublasLtCreate(<Handle)); - - cudaDataType_t AType; - cudaDataType_t BType; - cudaDataType_t CType; - cudaDataType_t computeType; - int startAlgo, endAlgo; - const int ites = 100; - - CublasDataType data_type; - if (std::is_same::value) { - data_type = FLOAT_DATATYPE; - AType = CUDA_R_32F; - BType = CUDA_R_32F; - CType = CUDA_R_32F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT; - endAlgo = (int)CUBLAS_GEMM_ALGO23; - } - else if (std::is_same::value) { - data_type = HALF_DATATYPE; - AType = CUDA_R_16F; - BType = CUDA_R_16F; - CType = CUDA_R_16F; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#ifdef ENABLE_BF16 - else if (std::is_same::value) { - data_type = BFLOAT16_DATATYPE; - AType = CUDA_R_16BF; - BType = CUDA_R_16BF; - CType = CUDA_R_16BF; - computeType = CUDA_R_32F; - startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; - } -#endif - - using scaleT = typename ScaleTypeConverter::Type; - - scaleT alpha = (scaleT)1.0f; - scaleT beta = (scaleT)0.0f; - - printf("***Xlnet Gemm Testing Begin***\n"); - printf("***Cublas Gemm Testing Begin***\n"); - if (line_count == 0) { - fprintf(fd, - "batch_size, seq_len, head_num, size_per_head dataType ### " - "batchCount, n, m, k, algoId, " - "customOption, tile, numSplitsK, swizzle, reductionScheme, " - "workspaceSize, stages, exec_time\n"); - } - for (int i = 0; i < gemm_num; ++i) { - int m = M[i], n = N[i], k = K[i]; - printf("\n-----------------------------\n"); - printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]); - T* d_A = (T*)buffer; - T* d_B = d_A + m * k * batchCount[i]; - T* d_C = d_B + k * n * batchCount[i]; - - float exec_time = 99999.0f; - int fast_algo = 0; - for (int algo = startAlgo; algo <= endAlgo; algo++) { - cublasStatus_t status; - cudaDeviceSynchronize(); - auto start = std::chrono::high_resolution_clock::now(); - for (int ite = 0; ite < ites; ++ite) { - if (i == 1 || i == 7 || i == 8 || i == 9) { - status = cublasGemmEx(cublas_handle, - transa[i], - transb[i], - n, - m, - k, - &alpha, - d_A, - AType, - n, - d_B, - AType, - k, - &beta, - d_C, - CType, - n, 
- computeType, - static_cast(algo)); - } - else { - status = cublasGemmStridedBatchedEx(cublas_handle, - transa[i], - transb[i], - m, - n, - k, - &alpha, - d_A, - BType, - lda[i], - strideA[i], - d_B, - AType, - ldb[i], - strideB[i], - &beta, - d_C, - CType, - ldc[i], - strideC[i], - batchCount[i], - computeType, - static_cast(algo)); - } - if (status != CUBLAS_STATUS_SUCCESS) { - break; - } - } - cudaDeviceSynchronize(); - auto end = std::chrono::high_resolution_clock::now(); - auto dur = std::chrono::duration(end - start); - if (status == CUBLAS_STATUS_SUCCESS) { - printf("algo_%d costs %.3fms \n", algo, dur.count() / ites); - if (dur.count() / ites < exec_time) { - exec_time = dur.count() / ites; - fast_algo = algo; - } // end if diffTime - } // end status - } // end for algo - - printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time); - - if ((i == 1 || i == 7 || i == 8 || i == 9) && data_type != FLOAT_DATATYPE) { - printf("***cublasLt Gemm Testing Begin***\n"); - // Let try a fixed number of combinations - const int ALGO_COMBINATIONS = 5000; - customMatmulPerf_t perfResults[ALGO_COMBINATIONS]; - - LtHgemmCustomFind(ltHandle, - batch_size, - seq_len, - head_num, - size_per_head, - n, - m, - k, - &alpha, - d_B, - d_A, - &beta, - d_C, - cublas_workspace, - workSpaceSize, - fd, - perfResults, - ALGO_COMBINATIONS); - if (perfResults[0].time < exec_time) { - printPerfStructure( - batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0); - exec_time = perfResults[0].time; - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } - printf("***cublasLt Gemm Testing End***\n"); - } - else { - fprintf(fd, - "%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 " -#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3) - "-1 -1 " -#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3) - "-1 -1 -1 " -#endif - "%f\n", - batch_size, - seq_len, - head_num, - size_per_head, - data_type, - batchCount[i], - n, - m, - k, - fast_algo, - exec_time); - } // end else fp16 - } // end i - printf("***cublas Gemm Testing End***\n\n"); - fclose(fd); - printf("***Xlnet Gemm Testing End***\n"); - - return; -} - -template void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend); -template void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend); -#ifdef ENABLE_BF16 -template void generate_xlnet_gemm_config<__nv_bfloat16>(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend); -#endif - -} // namespace turbomind diff --git a/src/turbomind/utils/gemm_test/xlnet_gemm_func.h b/src/turbomind/utils/gemm_test/xlnet_gemm_func.h deleted file mode 100644 index 240805af4b..0000000000 --- a/src/turbomind/utils/gemm_test/xlnet_gemm_func.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/cublasAlgoMap.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/gemm_test/gemm_func.h" - -#include -#include -#include -#include -#include -#include -#ifdef __linux__ -#include -#include -#endif -#include - -namespace turbomind { - -template -void generate_xlnet_gemm_config(int batch_size, - int seq_len, - int head_num, - int size_per_head, - int hidden_units_, - int inter_size_, - void* buffer_in, - bool isAppend = true); - -} // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.cu b/src/turbomind/utils/memory_utils.cu index e9a79ea5a1..a31bfd631d 100644 --- a/src/turbomind/utils/memory_utils.cu +++ b/src/turbomind/utils/memory_utils.cu @@ -15,687 +15,11 @@ */ #include "src/turbomind/macro.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_type_utils.cuh" -#include "src/turbomind/utils/logger.h" +#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/memory_utils.h" -#include -#include -#include namespace turbomind { -template -void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize) -{ - check_cuda_error(cudaMallocAsync((void**)(ptr), sizeof(T) * size, st)); - if (is_random_initialize) { - cudaRandomUniform(*ptr, size, st); - } -} - -template void deviceMalloc(float** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(half** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#ifdef ENABLE_BF16 -template void deviceMalloc(__nv_bfloat16** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#endif -template void deviceMalloc(uint16_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(int** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(bool** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(char** ptr, size_t size, cudaStream_t, bool is_random_initialize); -template void deviceMalloc(int8_t** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#ifdef ENABLE_FP8 -template void deviceMalloc(__nv_fp8_e4m3** ptr, size_t size, cudaStream_t, bool is_random_initialize); -#endif - -template -void deviceFree(T*& ptr, cudaStream_t st) -{ - if (ptr != NULL) { - check_cuda_error(cudaFreeAsync(ptr, st)); - ptr = NULL; - } -} - -template void deviceFree(float*& ptr, cudaStream_t); -template void deviceFree(half*& ptr, cudaStream_t); -#ifdef ENABLE_BF16 -template void deviceFree(__nv_bfloat16*& ptr, cudaStream_t); -#endif -template void deviceFree(unsigned short*& ptr, cudaStream_t); -template void deviceFree(int*& ptr, cudaStream_t); -template void deviceFree(bool*& ptr, cudaStream_t); -template void deviceFree(char*& ptr, cudaStream_t); -template void deviceFree(int8_t*& ptr, cudaStream_t); -template void deviceFree(void*& ptr, cudaStream_t); 
-#ifdef ENABLE_FP8 -template void deviceFree(__nv_fp8_e4m3*& ptr, cudaStream_t); -#endif - -namespace { - -template -__global__ void fill_kernel(T* devptr, size_t size, T value) -{ - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - for (size_t i = idx; i < size; i += blockDim.x * gridDim.x) { - devptr[i] = value; - } -} - -} // namespace - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream) -{ - constexpr int threads = 512; - const int blocks = (size + threads - 1) / threads; - fill_kernel<<>>(devptr, size, value); -} - -template void deviceFill(float* devptr, size_t size, float value, cudaStream_t stream); -template void deviceFill(half* devptr, size_t size, half value, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void deviceFill(__nv_bfloat16* devptr, size_t size, __nv_bfloat16 value, cudaStream_t stream); -#endif -template void deviceFill(int* devptr, size_t size, int value, cudaStream_t stream); -template void deviceFill(bool* devptr, size_t size, bool value, cudaStream_t stream); - -template -void cudaD2Hcpy(T* tgt, const T* src, const size_t size) -{ - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToHost)); -} - -template void cudaD2Hcpy(float* tgt, const float* src, size_t size); -template void cudaD2Hcpy(half* tgt, const half* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaD2Hcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); -#endif -template void cudaD2Hcpy(int* tgt, const int* src, size_t size); -template void cudaD2Hcpy(bool* tgt, const bool* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaD2Hcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t size); -#endif -template void cudaD2Hcpy(unsigned long long* tgt, const unsigned long long* src, size_t size); -template void cudaD2Hcpy(unsigned int* tgt, const unsigned int* src, size_t size); -template void cudaD2Hcpy(int8_t* tgt, const int8_t* src, size_t size); - -template -void cudaH2Dcpy(T* tgt, const T* src, const size_t size) -{ - if (tgt == nullptr || src == nullptr) { - TM_LOG_ERROR("cudaH2Dcpy: dst=%p src=%p, size=%d", tgt, src, (int)(sizeof(T) * size)); - } - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice)); -} - -template void cudaH2Dcpy(float* tgt, const float* src, size_t size); -template void cudaH2Dcpy(half* tgt, const half* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaH2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); -#endif -template void cudaH2Dcpy(int* tgt, const int* src, size_t size); -template void cudaH2Dcpy(bool* tgt, const bool* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaH2Dcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t size); -#endif -template void cudaH2Dcpy(unsigned long long* tgt, const unsigned long long* src, size_t size); -template void cudaH2Dcpy(unsigned int* tgt, const unsigned int* src, size_t size); -template void cudaH2Dcpy(int8_t* tgt, const int8_t* src, size_t size); - -template -void cudaD2Dcpy(T* tgt, const T* src, const size_t size) -{ - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDeviceToDevice)); -} - -template void cudaD2Dcpy(float* tgt, const float* src, size_t size); -template void cudaD2Dcpy(half* tgt, const half* src, size_t size); -#ifdef ENABLE_BF16 -template void cudaD2Dcpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size); -#endif -template void cudaD2Dcpy(int* tgt, const int* src, size_t size); -template void cudaD2Dcpy(bool* tgt, const bool* src, size_t size); 
-template void cudaD2Dcpy(int8_t* tgt, const int8_t* src, size_t size); -#ifdef ENABLE_FP8 -template void cudaD2Dcpy(__nv_fp8_e4m3* tgt, const __nv_fp8_e4m3* src, size_t size); -#endif -template void cudaD2Dcpy(unsigned long long* tgt, const unsigned long long* src, size_t size); - -template -__global__ void cudaCast(T_OUT* dst, T_IN* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (T_OUT)((float)(src[tid])); - } -} - -template -void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream) -{ - cudaCast<<<256, 256, 0, stream>>>(dst, src, size); -} - -template void invokeCudaCast(float* dst, half const* const src, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeCudaCast(float* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_bfloat16* dst, float const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_bfloat16* dst, half const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(half* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeCudaCast(float* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void -invokeCudaCast(__nv_bfloat16* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(half* dst, __nv_fp8_e4m3 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_fp8_e4m3* dst, float const* const src, const size_t size, cudaStream_t stream); -template void -invokeCudaCast(__nv_fp8_e4m3* dst, __nv_bfloat16 const* const src, const size_t size, cudaStream_t stream); -template void invokeCudaCast(__nv_fp8_e4m3* dst, half const* const src, const size_t size, cudaStream_t stream); -#endif - -template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream) -{ - if (stream != NULL) { - check_cuda_error(cudaMemcpyAsync(tgt, src, sizeof(T) * size, cudaMemcpyDefault, stream)); - } - else { - check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyDefault)); - } -} - -template void cudaAutoCpy(float* tgt, const float* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(half* tgt, const half* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void cudaAutoCpy(__nv_bfloat16* tgt, const __nv_bfloat16* src, size_t size, cudaStream_t stream); -#endif -template void cudaAutoCpy(int* tgt, const int* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(bool* tgt, const bool* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(int8_t* tgt, const int8_t* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(uint* tgt, const uint* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(unsigned long long* tgt, const unsigned long long* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(char* tgt, const char* src, size_t size, cudaStream_t stream); - -template void cudaAutoCpy(float const** tgt, float const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(half const** tgt, half const* const* src, size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void cudaAutoCpy(__nv_bfloat16 const** tgt, __nv_bfloat16 const* const* src, size_t size, cudaStream_t 
stream); -#endif -template void cudaAutoCpy(int const** tgt, int const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(bool const** tgt, bool const* const* src, size_t size, cudaStream_t stream); -template void cudaAutoCpy(int8_t const** tgt, int8_t const* const* src, size_t size, cudaStream_t stream); -template void -cudaAutoCpy(unsigned long long const** tgt, unsigned long long const* const* src, size_t size, cudaStream_t stream); - -template -__global__ void cuda_random_uniform_kernel(T* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((unsigned long long int)1337, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = (T)(curand_uniform(&local_state) * 0.2f - 0.1f); - } -} - -template<> -__global__ void cuda_random_uniform_kernel(int* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float)1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = curand(&local_state); - } -} - -template<> -__global__ void cuda_random_uniform_kernel(bool* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float)1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = (curand(&local_state) % 2 == 0); - } -} - -template<> -__global__ void cuda_random_uniform_kernel(char* buffer, const size_t size, const int seq_offset) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - curandState_t local_state; - curand_init((float)1337.f, idx + seq_offset, 0, &local_state); - for (size_t index = idx; index < size; index += blockDim.x * gridDim.x) { - buffer[index] = curand(&local_state) % 0xFF; - } -} - -template -void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t st) -{ - static int seq_offset = 0; - cuda_random_uniform_kernel<<<256, 256, 0, st>>>(buffer, size, seq_offset); - seq_offset += 256 * 256; -} - -template void cudaRandomUniform(float* buffer, const size_t size, cudaStream_t); -template void cudaRandomUniform(half* buffer, const size_t size, cudaStream_t); -#ifdef ENABLE_BF16 -template void cudaRandomUniform(__nv_bfloat16* buffer, const size_t size, cudaStream_t); -#endif -template void cudaRandomUniform(int* buffer, const size_t size, cudaStream_t); -template void cudaRandomUniform(bool* buffer, const size_t size, cudaStream_t); -template void cudaRandomUniform(char* buffer, const size_t size, cudaStream_t); -#ifdef ENABLE_FP8 -template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size, cudaStream_t); -#endif - -// loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or -// the product of the elements in shape is 0, this function will return an empty vector. 
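/* Typical call-site sketch for the loaders below (file name and tensor shape are
 * illustrative only, not taken from the original sources): allocate device memory,
 * then fill it from an on-disk FP32 tensor, converting to the runtime dtype.
 *
 *     half* d_w = nullptr;
 *     deviceMalloc(&d_w, 1024 * 1024, nullptr, false);   // default stream, no random init
 *     loadWeightFromBin(d_w, {1024, 1024}, "weights/layer0.qkv.bin",
 *                       FtCudaDataType::FP32);           // reads float, stores half on device
 */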
-template -std::vector loadWeightFromBinHelper(std::vector shape, std::string filename) -{ - if (shape.size() > 2) { - printf("[ERROR] shape should have less than two dims \n"); - return std::vector(); - } - - size_t dim0 = shape[0], dim1 = 1; - if (shape.size() == 2) { - dim1 = shape[1]; - } - - size_t size = dim0 * dim1; - if (size == 0) { - TM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str()); - return std::vector(); - } - - std::vector host_array(size); - std::ifstream in(filename, std::ios::in | std::ios::binary); - if (!in.is_open()) { - TM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str()); - return std::vector(); - } - - size_t loaded_data_size = sizeof(T) * size; - in.seekg(0, in.end); - const auto file_size_in_bytes = (size_t)in.tellg(); - in.seekg(0, in.beg); - - TM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename); - in.read((char*)host_array.data(), loaded_data_size); - - if (file_size_in_bytes != loaded_data_size) { - TM_LOG_WARNING("file %s has %ld, but request %ld, loading model fails!", - filename.c_str(), - file_size_in_bytes, - loaded_data_size); - return std::vector(); - } - in.close(); - // If we succeed, return an array with values. - return host_array; -} - -std::vector loadArrayFromBin(std::vector shape, std::string filename) -{ - return loadWeightFromBinHelper(shape, filename); -} - -template -int loadWeightFromBinFunc(T* ptr, std::vector shape, std::string filename) -{ - std::vector host_array = loadWeightFromBinHelper(shape, filename); - - if (host_array.empty()) { - return 0; - } - - if (std::is_same::value == true) { - cudaH2Dcpy(ptr, (T*)host_array.data(), host_array.size()); - } - else { - T_IN* ptr_2 = nullptr; - deviceMalloc(&ptr_2, host_array.size(), nullptr, false); - cudaH2Dcpy(ptr_2, host_array.data(), host_array.size()); - invokeCudaD2DcpyConvert(ptr, ptr_2, host_array.size()); - deviceFree(ptr_2, nullptr); - } - return 0; -} - -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(int8_t* ptr, std::vector shape, std::string filename); -#ifdef ENABLE_BF16 -template int -loadWeightFromBinFunc<__nv_bfloat16, float>(__nv_bfloat16* ptr, std::vector shape, std::string filename); -template int -loadWeightFromBinFunc<__nv_bfloat16, half>(__nv_bfloat16* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(float* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc(half* ptr, std::vector shape, std::string filename); -template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* ptr, - std::vector shape, - std::string filename); -#endif // ENABLE_BF16 -template int loadWeightFromBinFunc(int* ptr, std::vector shape, std::string filename); -#ifdef ENABLE_FP8 -template int -loadWeightFromBinFunc<__nv_fp8_e4m3, float>(__nv_fp8_e4m3* ptr, std::vector shape, std::string filename); -#endif // ENABLE_FP8 - -template -int loadWeightFromBin(T* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type) -{ - switch (model_file_type) { - case FtCudaDataType::FP32: - loadWeightFromBinFunc(ptr, shape, filename); - break; - case FtCudaDataType::FP16: - 
loadWeightFromBinFunc(ptr, shape, filename); - break; - case FtCudaDataType::INT8: - loadWeightFromBinFunc(ptr, shape, filename); - break; -#ifdef ENABLE_BF16 - case FtCudaDataType::BF16: - loadWeightFromBinFunc(ptr, shape, filename); - break; -#endif -#ifdef ENABLE_FP8 - case FtCudaDataType::FP8: - loadWeightFromBinFunc(ptr, shape, filename); - break; -#endif - default: - TM_LOG_ERROR("Does not support FtCudaDataType=%d", model_file_type); - FT_CHECK(false); - } - return 0; -} - -template<> -int loadWeightFromBin(int* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type) -{ - loadWeightFromBinFunc(ptr, shape, filename); - return 0; -} - -template int -loadWeightFromBin(float* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -template int -loadWeightFromBin(half* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -template int -loadWeightFromBin(int8_t* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -#ifdef ENABLE_BF16 -template int -loadWeightFromBin(__nv_bfloat16* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -#endif -#ifdef ENABLE_FP8 -template int -loadWeightFromBin(__nv_fp8_e4m3* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); -#endif -template int -loadWeightFromBin(int* ptr, std::vector shape, std::string filename, FtCudaDataType model_file_type); - -template -__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = cuda_cast(src[tid]); - } -} - -template -void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyConvert<<<256, 256, 0, stream>>>(tgt, src, size); -} - -template void invokeCudaD2DcpyConvert(int8_t* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const int8_t* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(half* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(half* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const half* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(uint* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const uint* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const half* src, const size_t size, cudaStream_t stream); - -#ifdef ENABLE_BF16 -template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, const float* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(__nv_bfloat16* tgt, const int* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(float* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t stream); -template void invokeCudaD2DcpyConvert(int* tgt, const __nv_bfloat16* src, const size_t size, cudaStream_t 
stream); -#endif // ENABLE_BF16 - -template -__global__ void -cudaD2DScaleCpyConvert(T_OUT* dst, const T_IN* src, const float* scale, bool invert_scale, const size_t size) -{ - const float scale_value = invert_scale ? 1.0f / scale[0] : scale[0]; - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = cuda_cast(cuda_cast(src[tid]) * scale_value); - } -} - -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream) -{ - cudaD2DScaleCpyConvert<<<256, 256, 0, stream>>>(tgt, src, scale, invert_scale, size); -} - -// clang-format off -template void invokeCudaD2DScaleCpyConvert(float* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const float* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(half* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const half* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeCudaD2DScaleCpyConvert(__nv_bfloat16* tgt, const int32_t* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -template void invokeCudaD2DScaleCpyConvert(int32_t* tgt, const __nv_bfloat16* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_BF16 -#ifdef ENABLE_FP8 -template void invokeCudaD2DScaleCpyConvert(float* tgt, const __nv_fp8_e4m3* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream); -#endif // ENABLE_FP8 -// clang-format on - -void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream) -{ - invokeCudaD2DcpyConvert(dst, src, size, stream); -} - -template -void saveToBinary(const T* ptr, const size_t size, std::string filename) -{ - - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::vector float_ptr(size); - for (size_t i = 0; i < size; i++) { - float_ptr[i] = (float)h_ptr[i]; - } - - std::ofstream out(filename, std::ios::out | std::ios::binary); - FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - - out.write((char*)float_ptr.data(), size * sizeof(float)); -} - -template void saveToBinary(const float* ptr, const size_t size, std::string filename); -template void saveToBinary(const half* ptr, const size_t size, std::string filename); -#ifdef ENABLE_BF16 -template void saveToBinary(const __nv_bfloat16* ptr, const size_t size, std::string filename); -#endif // ENABLE_BF16 - -template<> -void saveToBinary(const int* ptr, const size_t size, std::string filename) -{ - std::vector h_ptr(size); - cudaD2Hcpy(h_ptr.data(), ptr, size); - std::ofstream out(filename, std::ios::out | std::ios::binary); - FT_CHECK_WITH_INFO(out.is_open(), "Fail to open file " + filename); - out.write((char*)h_ptr.data(), size * sizeof(int)); -} - -template -__global__ void fakeCast(T_IN* input_ptr, const size_t size) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { - T_fake_type tmp_val = 
(T_fake_type)((float)input_ptr[i]); - input_ptr[i] = (T_IN)((float)tmp_val); - } -} - -template -void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream) -{ - dim3 block(256); - dim3 grid((size + 255) / 256); - fakeCast<<>>(input_ptr, size); -} - -#ifdef ENABLE_FP8 -__global__ void cudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (float)(src[tid]); - } -} - -void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Float<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (half)((float)(src[tid])); - } -} - -void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream) -{ - cudaD2Dcpyfp82Half<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, float* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyFloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyHalf2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -__global__ void cudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) { - dst[tid] = (__nv_fp8_e4m3)src[tid]; - } -} - -void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream) -{ - cudaD2DcpyBfloat2fp8<<<256, 256, 0, stream>>>(dst, src, size); -} - -#endif // ENABLE_FP8 - -template -__global__ void transpose(T_OUT* dst, T_IN* src, const int dim0, const int dim1) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1; tid += blockDim.x * gridDim.x) { - const int src_col_id = tid % dim1; - const int src_row_id = tid / dim1; - dst[src_col_id * dim0 + src_row_id] = (T_OUT)(src[tid]); - } -} - -template -void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1) -{ - // copy data to workspace, and then transpose from workspace to data - cudaD2Dcpy(workspace, data, dim0 * dim1); - transpose<<<256, 256>>>(data, workspace, dim0, dim1); -} - -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose(__nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose(__nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose(float* data, float* workspace, const int dim0, const int dim1); - -template -__global__ void transpose0213(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const 
int dim2, const int dim3)
-{
-    // src permutation: [0, 1, 2, 3]
-    // dst permutation: [0, 2, 1, 3]
-    for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < dim0 * dim1 * dim2 * dim3;
-         tid += blockDim.x * gridDim.x) {
-        int tmp_idx = tid;
-        const int dim_3_idx = tmp_idx % dim3;
-        tmp_idx = (tmp_idx - dim_3_idx) / dim3;
-        const int dim_2_idx = tmp_idx % dim2;
-        tmp_idx = (tmp_idx - dim_2_idx) / dim2;
-        const int dim_1_idx = tmp_idx % dim1;
-        tmp_idx = (tmp_idx - dim_1_idx) / dim1;
-        const int dim_0_idx = tmp_idx % dim0;
-        dst[dim_0_idx * dim1 * dim2 * dim3 + dim_2_idx * dim1 * dim3 + dim_1_idx * dim3 + dim_3_idx] = src[tid];
-    }
-}
-
-template<typename T>
-void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3)
-{
-    // copy data to workspace, and then transpose from workspace to data
-    // Note that this kernel is used for pre-processing and not very efficient.
-    cudaD2Dcpy(workspace, data, dim0 * dim1 * dim2 * dim3);
-    transpose0213<<<256, 256>>>(data, workspace, dim0, dim1, dim2, dim3);
-}
-
-#ifdef ENABLE_FP8
-template void invokeInPlaceTranspose0213(
-    __nv_fp8_e4m3* data, __nv_fp8_e4m3* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-#endif // ENABLE_FP8
-#ifdef ENABLE_BF16
-template void invokeInPlaceTranspose0213(
-    __nv_bfloat16* data, __nv_bfloat16* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-#endif // ENABLE_BF16
-template void invokeInPlaceTranspose0213(
-    float* data, float* workspace, const int dim0, const int dim1, const int dim2, const int dim3);
-
 template<typename T_OUT, typename T_IN>
 __global__ void transpose102(T_OUT* dst, T_IN* src, const int dim0, const int dim1, const int dim2)
 {
@@ -720,139 +44,19 @@ void invokeInPlaceTranspose102(
     // Note that this kernel is used for pre-processing and not very efficient.
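// Usage sketch (illustrative, not part of the patched sources): these "in-place" transpose helpers
// take a caller-provided scratch buffer with the same element count; the data is staged into the
// workspace and the kernel then writes the permuted layout back into `data`. Assuming a device
// buffer `d_data` of shape [d0, d1, d2] and an equally sized scratch buffer `d_work`:
//
//     invokeInPlaceTranspose102(d_data, d_work, d0, d1, d2, /*copy=*/true, stream);
//
// With copy=false the staging copy is skipped, so the workspace must already hold the source data.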
const size_t count = dim0 * dim1 * dim2; if (copy) { - cudaAutoCpy(workspace, data, count, stream); + check_cuda_error(cudaMemcpyAsync(workspace, data, sizeof(T) * count, cudaMemcpyDefault, stream)); } const int block = 512; const int grid = std::min((count + block - 1) / block, (size_t)8192); transpose102<<>>(data, workspace, dim0, dim1, dim2); } -#ifdef ENABLE_FP8 -template void invokeInPlaceTranspose102(__nv_fp8_e4m3* data, - __nv_fp8_e4m3* workspace, - const int dim0, - const int dim1, - const int dim2, - bool copy, - cudaStream_t stream); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -template void invokeInPlaceTranspose102(__nv_bfloat16* data, - __nv_bfloat16* workspace, - const int dim0, - const int dim1, - const int dim2, - bool copy, - cudaStream_t stream); -#endif // ENABLE_BF16 -template void invokeInPlaceTranspose102( - half* data, half* workspace, const int dim0, const int dim1, const int dim2, bool copy, cudaStream_t stream); -template void invokeInPlaceTranspose102( - float* data, float* workspace, const int dim0, const int dim1, const int dim2, bool copy, cudaStream_t stream); - -template -void __global__ multiplyScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { - tensor[index] = (T)(((float)tensor[index]) * scale); - } -} - -template -void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - multiplyScale<<>>(tensor, scale, size); -} - -template void invokeMultiplyScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeMultiplyScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeMultiplyScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeMultiplyScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif - -template -void __global__ divideScale(T* tensor, float scale, const size_t size) -{ - for (size_t index = threadIdx.x + blockIdx.x * blockDim.x; index < size; index += blockDim.x * gridDim.x) { - tensor[index] = (T)(((float)tensor[index]) / scale); - } -} - -template -void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream) -{ - int block = 256; - int grid = (size + 255) / 256; - divideScale<<>>(tensor, scale, size); -} - -template void invokeDivideScale(float* tensor, float scale, const size_t size, cudaStream_t stream); -template void invokeDivideScale(half* tensor, float scale, const size_t size, cudaStream_t stream); -#ifdef ENABLE_BF16 -template void invokeDivideScale(__nv_bfloat16* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_FP8 -template void invokeDivideScale(__nv_fp8_e4m3* tensor, float scale, const size_t size, cudaStream_t stream); -#endif -#ifdef ENABLE_BF16 -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void -invokeFakeCast<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -#endif -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -#ifdef ENABLE_FP8 -template void 
invokeFakeCast(float* input_ptr, const size_t size, cudaStream_t stream); -template void invokeFakeCast(half* input_ptr, const size_t size, cudaStream_t stream); -template void -invokeFakeCast<__nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* input_ptr, const size_t size, cudaStream_t stream); -#endif - -size_t cuda_datatype_size(FtCudaDataType dt) -{ - static const std::unordered_map sizes{{FtCudaDataType::FP32, sizeof(float)}, - {FtCudaDataType::FP16, sizeof(half)} -#ifdef ENABLE_BF16 - , - {FtCudaDataType::BF16, sizeof(__nv_bfloat16)} -#endif - }; - - return sizes.at(dt); -} - -template -__global__ void check_range(T* buffer, size_t size, T min, T max, bool* d_within_range) -{ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { - const T val = buffer[i]; - if (val < min || val > max) { - *d_within_range = false; - } - } -} - -template -bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream) -{ - cudaMemsetAsync(d_within_range, true, sizeof(bool), stream); - - dim3 block(256); - dim3 grid((size + 255) / 256); - check_range<<>>(buffer, size, min, max, d_within_range); - - bool result; - cudaD2Hcpy(&result, d_within_range, 1); - return result; -} - -template bool -invokeCheckRange(int* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream); +template void invokeInPlaceTranspose102(uint16_t* data, + uint16_t* workspace, + const int dim0, + const int dim1, + const int dim2, + bool copy, + cudaStream_t stream); } // namespace turbomind diff --git a/src/turbomind/utils/memory_utils.h b/src/turbomind/utils/memory_utils.h index 03a0ef7b33..a61408281f 100644 --- a/src/turbomind/utils/memory_utils.h +++ b/src/turbomind/utils/memory_utils.h @@ -16,130 +16,12 @@ #pragma once -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_fp8_utils.h" -#include "src/turbomind/utils/cuda_utils.h" +#include namespace turbomind { -template -void deviceMalloc(T** ptr, size_t size, cudaStream_t st, bool is_random_initialize = false); - -template -void deviceFree(T*& ptr, cudaStream_t st); - -template -void deviceFill(T* devptr, size_t size, T value, cudaStream_t stream = {}); - -template -void cudaD2Hcpy(T* tgt, const T* src, const size_t size); - -template -void cudaH2Dcpy(T* tgt, const T* src, const size_t size); - -template -void cudaD2Dcpy(T* tgt, const T* src, const size_t size); - -template -void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream = {}); - -template -void cudaRandomUniform(T* buffer, const size_t size, cudaStream_t stream = {}); - -template -int loadWeightFromBin(T* ptr, - std::vector shape, - std::string filename, - FtCudaDataType model_file_type = FtCudaDataType::FP32); - -std::vector loadArrayFromBin(std::vector shape, std::string filename); - -// template -// int loadWeightFromBinAndQuantizeForWeightOnly(int8_t* quantized_weight_ptr, -// T* scale_ptr, -// std::vector shape, -// std::string filename, -// FtCudaDataType model_file_type = FtCudaDataType::FP32); - -void invokeCudaD2DcpyHalf2Float(float* dst, half* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyFloat2Half(half* dst, float* src, const size_t size, cudaStream_t stream); -#ifdef ENABLE_FP8 -void invokeCudaD2Dcpyfp82Float(float* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream); -void invokeCudaD2Dcpyfp82Half(half* dst, __nv_fp8_e4m3* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyFloat2fp8(__nv_fp8_e4m3* dst, 
float* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyHalf2fp8(__nv_fp8_e4m3* dst, half* src, const size_t size, cudaStream_t stream); -void invokeCudaD2DcpyBfloat2fp8(__nv_fp8_e4m3* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream); -#endif // ENABLE_FP8 -#ifdef ENABLE_BF16 -void invokeCudaD2DcpyBfloat2Float(float* dst, __nv_bfloat16* src, const size_t size, cudaStream_t stream); -#endif // ENABLE_BF16 - -template -void invokeCudaCast(T_OUT* dst, T_IN const* const src, const size_t size, cudaStream_t stream); - -template -__inline__ __host__ __device__ size_t dim2flat(const T (&idx)[n_dims], const T (&dims)[n_dims]) -{ - size_t flat_idx = 0; - for (size_t i = 0; i < n_dims; i++) { - flat_idx += idx[i]; - if (i + 1 < n_dims) - flat_idx *= dims[i + 1]; - } - return flat_idx; -} - -template -__inline__ __host__ __device__ void flat2dim(T1 flat_idx, const T2 (&dims)[n_dims], T2 (&idx)[n_dims]) -{ - for (int i = n_dims - 1; i >= 0; i--) { - idx[i] = flat_idx % dims[i]; - flat_idx /= dims[i]; - } -} - -template -void invokeInPlaceTranspose(T* data, T* workspace, const int dim0, const int dim1); - -template -void invokeInPlaceTranspose0213(T* data, T* workspace, const int dim0, const int dim1, const int dim2, const int dim3); - template void invokeInPlaceTranspose102( T* data, T* workspace, const int dim0, const int dim1, const int dim2, bool copy = true, cudaStream_t stream = 0); -template -void invokeMultiplyScale(T* tensor, float scale, const size_t size, cudaStream_t stream); - -template -void invokeDivideScale(T* tensor, float scale, const size_t size, cudaStream_t stream); - -template -void invokeCudaD2DcpyConvert(T_OUT* tgt, const T_IN* src, const size_t size, cudaStream_t stream = 0); - -template -void invokeCudaD2DScaleCpyConvert( - T_OUT* tgt, const T_IN* src, const float* scale, bool invert_scale, const size_t size, cudaStream_t stream = 0); - -inline bool checkIfFileExist(const std::string& file_path) -{ - std::ifstream in(file_path, std::ios::in | std::ios::binary); - if (in.is_open()) { - in.close(); - return true; - } - return false; -} - -template -void saveToBinary(const T* ptr, const size_t size, std::string filename); - -template -void invokeFakeCast(T_IN* input_ptr, const size_t size, cudaStream_t stream); - -size_t cuda_datatype_size(FtCudaDataType dt); - -template -bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream); - } // namespace turbomind diff --git a/src/turbomind/utils/mpi_utils.cc b/src/turbomind/utils/mpi_utils.cc deleted file mode 100644 index 737e428d04..0000000000 --- a/src/turbomind/utils/mpi_utils.cc +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/turbomind/utils/mpi_utils.h" - -namespace turbomind { -namespace mpi { - -#ifdef BUILD_MULTI_GPU -MPI_Datatype getMpiDtype(MpiType dtype) -{ - static const std::unordered_map dtype_map{ - {MPI_TYPE_BYTE, MPI_BYTE}, - {MPI_TYPE_CHAR, MPI_CHAR}, - {MPI_TYPE_INT, MPI_INT}, - {MPI_TYPE_INT64_T, MPI_INT64_T}, - {MPI_TYPE_UINT32_T, MPI_UINT32_T}, - {MPI_TYPE_UNSIGNED_LONG_LONG, MPI_UNSIGNED_LONG_LONG}, - }; - return dtype_map.at(dtype); -} -#endif - -void initialize(int* argc, char*** argv) -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Init(argc, argv)); -#endif -} - -void finalize() -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Finalize()); -#endif -} - -bool isInitialized() -{ - int mpi_initialized = 0; -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Initialized(&mpi_initialized)); -#endif - return static_cast(mpi_initialized); -} - -void initThread(int* argc, char*** argv, MpiThreadSupport required, int* provided) -{ -#ifdef BUILD_MULTI_GPU - switch (required) { - case THREAD_SINGLE: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_SINGLE, provided)); - break; - case THREAD_FUNNELED: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_FUNNELED, provided)); - break; - case THREAD_SERIALIZED: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_SERIALIZED, provided)); - break; - case THREAD_MULTIPLE: - MPICHECK(MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, provided)); - break; - default: - break; - } -#endif -} - -int getCommWorldRank() -{ - int rank = 0; -#ifdef BUILD_MULTI_GPU - MPI_Comm_rank(MPI_COMM_WORLD, &rank); -#endif - return rank; -} - -int getCommWorldSize() -{ - int world_size = 1; -#ifdef BUILD_MULTI_GPU - MPI_Comm_size(MPI_COMM_WORLD, &world_size); -#endif - return world_size; -} - -void barrier(MpiComm comm) -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Barrier(comm.group)); -#endif -} - -void barrier() -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); -#endif -} - -void bcast(void* buffer, size_t size, MpiType dtype, int root, MpiComm comm) -{ -#ifdef BUILD_MULTI_GPU - MPICHECK(MPI_Bcast(buffer, size, getMpiDtype(dtype), root, comm.group)); -#endif -} - -} // namespace mpi -} // namespace turbomind diff --git a/src/turbomind/utils/mpi_utils.h b/src/turbomind/utils/mpi_utils.h deleted file mode 100644 index 0eef1f2cc1..0000000000 --- a/src/turbomind/utils/mpi_utils.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "src/turbomind/utils/logger.h" - -#ifdef BUILD_MULTI_GPU -#include -#endif -#include -#include - -namespace turbomind { - -#ifdef BUILD_MULTI_GPU -#define MPICHECK(cmd) \ - do { \ - int e = cmd; \ - if (e != MPI_SUCCESS) { \ - printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) -#else -#define MPICHECK(cmd) printf("[WARNING] No MPI\n"); -#endif - -// A wrapper module of the MPI library. -namespace mpi { - -// A wrapper of MPI data type. 
MPI_TYPE_{data_type} -enum MpiType -{ - MPI_TYPE_BYTE, - MPI_TYPE_CHAR, - MPI_TYPE_INT, - MPI_TYPE_INT64_T, - MPI_TYPE_UINT32_T, - MPI_TYPE_UNSIGNED_LONG_LONG, -}; - -// A wrapper of the level of MPI thread support -enum MpiThreadSupport -{ - THREAD_SINGLE, - THREAD_FUNNELED, - THREAD_SERIALIZED, - THREAD_MULTIPLE -}; - -struct MpiComm { -#ifdef BUILD_MULTI_GPU - MPI_Comm group; - MpiComm(){}; - MpiComm(MPI_Comm g): group(g){}; -#endif -}; - -#ifdef BUILD_MULTI_GPU -#define COMM_WORLD MpiComm(MPI_COMM_WORLD) -#else -#define COMM_WORLD MpiComm() -#endif - -#ifdef BUILD_MULTI_GPU -MPI_Datatype getMpiDtype(MpiType dtype); -#endif - -void initialize(int* argc, char*** argv); -void initThread(int* argc, char*** argv, MpiThreadSupport required, int* provided); -void finalize(); -bool isInitialized(); -void barrier(MpiComm comm); -void barrier(); - -int getCommWorldRank(); -int getCommWorldSize(); - -void bcast(void* buffer, size_t size, MpiType dtype, int root, MpiComm comm); - -} // namespace mpi -} // namespace turbomind diff --git a/tests/csrc/CMakeLists.txt b/tests/csrc/CMakeLists.txt index 61a9b7383d..82fae2ea3d 100644 --- a/tests/csrc/CMakeLists.txt +++ b/tests/csrc/CMakeLists.txt @@ -13,7 +13,3 @@ # limitations under the License. add_subdirectory(unittests) -if(BUILD_PYT) - add_subdirectory(gemm_dequantize) - add_subdirectory(int8_gemm) -endif() diff --git a/tests/csrc/gemm_dequantize/CMakeLists.txt b/tests/csrc/gemm_dequantize/CMakeLists.txt deleted file mode 100644 index dd02ecdc61..0000000000 --- a/tests/csrc/gemm_dequantize/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if (TORCH_VERSION VERSION_GREATER_EQUAL "1.9.0") - set(gemm_dq_test_files - th_gemm_dequantize.cc - ) - - add_definitions(-DTORCH_CUDA=1) - - set(LIB_NAME "gemm_dq_unit_ops") - add_library(${LIB_NAME} SHARED ${gemm_dq_test_files}) - set_target_properties(${LIB_NAME} PROPERTIES - CUDA_RESOLVE_DEVICE_SYMBOLS ON) - target_link_libraries(${LIB_NAME} "${TORCH_LIBRARIES}" fpA_intB_gemm logger) -else() - message("TORCH_VERSION ${TORCH_VERSION} < 1.9.0, skipping compiling th_moe_ops.cc because QUInt4x2 is supported after torch 1.9.0") -endif() diff --git a/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc b/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc deleted file mode 100644 index e00a4eceef..0000000000 --- a/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include "torch/csrc/cuda/Stream.h" -#include -#include - -#include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" -#include "src/turbomind/th_op/th_utils.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" - -#include "cutlass/numeric_types.h" - -using torch::Tensor; - -namespace torch_ext { - -namespace ft = turbomind; - -template -Tensor fused_gemm_dq_helper( - Tensor input_activations, Tensor weight, Tensor scales, const int64_t timing_iterations, float& avg_time) -{ - const at::ScalarType _st = input_activations.scalar_type(); - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - auto stream = at::cuda::getCurrentCUDAStream().stream(); - - const T* input_act_ptr = get_ptr(input_activations); - const WeightType* weight_ptr = get_ptr(weight); - const T* scales_ptr = get_ptr(scales); - - turbomind::CutlassFpAIntBGemmRunner fused_gemm_dq_runner; - const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k); - - auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); - auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); - - T* output_tensor_ptr = get_ptr(output_tensor); - char* ws_ptr = get_ptr(ws_tensor); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start, stream); - for (int64_t iter = 0; iter < timing_iterations; ++iter) { - fused_gemm_dq_runner.gemm( - input_act_ptr, weight_ptr, scales_ptr, output_tensor_ptr, m, n, k, ws_ptr, ws_bytes, stream); - } - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - float total_time_ms = 0; - cudaEventElapsedTime(&total_time_ms, start, stop); - avg_time = total_time_ms / float(timing_iterations); - - return output_tensor; -} - -Tensor -_fused_gemm_dq(Tensor input_activations, Tensor weight, Tensor scales, int64_t timing_iterations, float& avg_time) -{ - const at::ScalarType _st = input_activations.scalar_type(); - CHECK_INPUT(scales, _st); - - TORCH_CHECK(input_activations.dim() == 2, "Invalid rank for activations"); - TORCH_CHECK(weight.dim() == 2, "Invalid rank for weight"); - TORCH_CHECK(scales.dim() == 1, "Invalid rank for scales"); - - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - - TORCH_CHECK(input_activations.size(1) == weight.size(0), "dim 1 of act and dim 0 of weight must be equal"); - - // We signal int4 by having the last weight dim be half the size of the scales. - // This is because int4 elements are packed into a single byte. 
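// Illustrative shapes (hypothetical numbers, not taken from the test): an int8 weight keeps its
// logical last dim, e.g. weight [4096, 1024] (int8) with scales [1024], so the last dims match.
// For int4, two 4-bit values are packed per byte, so the stored tensor becomes [4096, 512] while
// scales stay [1024]; the `weight.size(-1) == scales.size(-1) / 2` check below is what selects
// at::ScalarType::QUInt4x2.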
- torch::ScalarType quant_type = weight.scalar_type(); - if (weight.size(-1) == scales.size(-1) / 2) { - quant_type = at::ScalarType::QUInt4x2; - } - else { - TORCH_CHECK(weight.size(-1) == scales.size(-1), - "Last dim of weight and scales must be equal for int8 " - "or last dim of scale must be 2x last dim of weight for int4."); - } - - Tensor output_tensor; - switch (_st) { - case at::ScalarType::Half: { - if (quant_type == torch::kInt8) { - output_tensor = - fused_gemm_dq_helper(input_activations, weight, scales, timing_iterations, avg_time); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_helper( - input_activations, weight, scales, timing_iterations, avg_time); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#ifdef ENABLE_BF16 - case at::ScalarType::BFloat16: { - if (quant_type == torch::kInt8) { - output_tensor = fused_gemm_dq_helper<__nv_bfloat16, uint8_t>( - input_activations, weight, scales, timing_iterations, avg_time); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_helper<__nv_bfloat16, cutlass::uint4b_t>( - input_activations, weight, scales, timing_iterations, avg_time); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#endif - default: - throw std::runtime_error("Unsupported tensor type. Got " + std::string(at::toString(_st))); - } - return output_tensor; -} - -Tensor fused_gemm_dq(Tensor input_activations, Tensor weight, Tensor scales) -{ - float dummy = 0.f; - return _fused_gemm_dq(input_activations, weight, scales, 1, dummy); -} - -Tensor -bench_cublas(Tensor input_activations, Tensor weight_dequantized, const int64_t timing_iterations, float& avg_time) -{ - using namespace turbomind; - const int m = input_activations.size(0); - const int n = weight_dequantized.size(1); - const int k = input_activations.size(1); - - const void* input_act_ptr = get_ptr(input_activations); - const void* weight_ptr = get_ptr(weight_dequantized); - - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - const at::ScalarType _st = input_activations.scalar_type(); - - TORCH_CHECK(input_activations.size(1) == weight_dequantized.size(0), - "CUBLAS_BENCH: dim 1 of act and dim 0 of weight must be equal"); - CHECK_INPUT(input_activations, _st); - CHECK_INPUT(weight_dequantized, _st); - - auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); - void* output_tensor_ptr = get_ptr(output_tensor); - - TORCH_CHECK(_st == at::ScalarType::Half || _st == at::ScalarType::BFloat16, "Input type must be float or bfloat"); - cudaDataType_t cublasType = _st == at::ScalarType::Half ? 
CUDA_R_16F : CUDA_R_16BF; - - float alpha = 1.0f; - float beta = 0.0f; - - auto stream = at::cuda::getCurrentCUDAStream().stream(); - cublasSetStream(handle, stream); - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - cudaEventRecord(start, stream); - for (int64_t iter = 0; iter < timing_iterations; ++iter) { - status = cublasGemmEx(handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - n, - m, - k, - &alpha, - weight_ptr, - cublasType, - n, - input_act_ptr, - cublasType, - k, - &beta, - output_tensor_ptr, - cublasType, - n, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT); - } - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - float total_time_ms = 0; - cudaEventElapsedTime(&total_time_ms, start, stop); - avg_time = total_time_ms / float(timing_iterations); - check_cuda_error(status); - return output_tensor; -} - -std::vector> benchmark_against_cublas_fp(Tensor input_activations, - Tensor weight_quantized, - Tensor scales, - Tensor weight_dequantized, - const int64_t timing_iterations) -{ - float cublas_time = 0.f; - float ft_time = 0.f; - Tensor cublas_result = bench_cublas(input_activations, weight_dequantized, timing_iterations, cublas_time); - Tensor ft_result = _fused_gemm_dq(input_activations, weight_quantized, scales, timing_iterations, ft_time); - - auto timing_tensor = - torch::empty({2}, torch::dtype(at::ScalarType::Float).device(torch::kCPU).requires_grad(false)); - timing_tensor[0] = cublas_time; - timing_tensor[1] = ft_time; - - // const int m = input_activations.size(0); - // const int n = weight_dequantized.size(1); - // const int k = input_activations.size(1); - // std::cout << "m, n, k" << m << ", " << n << ", " << k << std::endl; - // std::cout << "cuBLAS time (ms) " << cublas_time << std::endl; - // std::cout << "FT time (ms) " << ft_time << std::endl; - - return {{timing_tensor}, {cublas_result, ft_result}}; -} - -template -Tensor fused_gemm_dq_bias_act_helper( - Tensor input_activations, Tensor weight, Tensor scales, Tensor bias, ft::ActivationType activation_type) -{ - const at::ScalarType _st = input_activations.scalar_type(); - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - auto stream = at::cuda::getCurrentCUDAStream().stream(); - - const T* input_act_ptr = get_ptr(input_activations); - const WeightType* weight_ptr = get_ptr(weight); - const T* scales_ptr = get_ptr(scales); - const T* bias_ptr = get_ptr(bias); - - turbomind::CutlassFpAIntBGemmRunner fused_gemm_dq_runner; - const int ws_bytes = fused_gemm_dq_runner.getWorkspaceSize(m, n, k); - - auto output_tensor = torch::empty({m, n}, torch::dtype(_st).device(torch::kCUDA).requires_grad(false)); - auto ws_tensor = torch::empty({ws_bytes}, torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); - - T* output_tensor_ptr = get_ptr(output_tensor); - char* ws_ptr = get_ptr(ws_tensor); - - fused_gemm_dq_runner.gemm_bias_act(input_act_ptr, - weight_ptr, - scales_ptr, - bias_ptr, - output_tensor_ptr, - m, - n, - k, - activation_type, - ws_ptr, - ws_bytes, - stream); - - return output_tensor; -} - -Tensor fused_gemm_dq_bias_act( - Tensor input_activations, Tensor weight, Tensor scales, Tensor bias, std::string activation_type_str) -{ - const at::ScalarType _st = input_activations.scalar_type(); - CHECK_INPUT(scales, _st); - CHECK_INPUT(bias, _st); - - TORCH_CHECK(input_activations.dim() == 2, "Invalid rank for activations"); - TORCH_CHECK(weight.dim() == 2, 
"Invalid rank for weight"); - TORCH_CHECK(scales.dim() == 1, "Invalid rank for scales"); - TORCH_CHECK(bias.dim() == 1, "Invalid rank for bias"); - - const int m = input_activations.size(0); - const int n = scales.size(0); - const int k = input_activations.size(1); - - TORCH_CHECK(bias.size(0) == n, "Must have 1 bias value for each output column"); - TORCH_CHECK(input_activations.size(1) == weight.size(0), "dim 1 of act and dim 0 of weight must be equal"); - - // We signal int4 by having the last weight dim be half the size of the scales. - // This is because int4 elements are packed into a single byte. - torch::ScalarType quant_type = weight.scalar_type(); - if (weight.size(-1) == scales.size(-1) / 2) { - quant_type = at::ScalarType::QUInt4x2; - } - else { - TORCH_CHECK(weight.size(-1) == scales.size(-1), - "Last dim of weight and scales must be equal for int8 " - "or last dim of scale must be 2x last dim of weight for int4."); - } - - ft::ActivationType activation_type = ft::ActivationType::InvalidType; - if (activation_type_str == "identity") { - activation_type = ft::ActivationType::Identity; - } - else { - activation_type = ft::getActivationType(activation_type_str); - } - - TORCH_CHECK(!isGatedActivation(activation_type), "Fused gated activations not supported."); - - Tensor output_tensor; - switch (_st) { - case at::ScalarType::Half: { - if (quant_type == torch::kInt8) { - output_tensor = fused_gemm_dq_bias_act_helper( - input_activations, weight, scales, bias, activation_type); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_bias_act_helper( - input_activations, weight, scales, bias, activation_type); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#ifdef ENABLE_BF16 - case at::ScalarType::BFloat16: { - if (quant_type == torch::kInt8) { - output_tensor = fused_gemm_dq_bias_act_helper<__nv_bfloat16, uint8_t>( - input_activations, weight, scales, bias, activation_type); - } - else if (quant_type == at::ScalarType::QUInt4x2) { - output_tensor = fused_gemm_dq_bias_act_helper<__nv_bfloat16, cutlass::uint4b_t>( - input_activations, weight, scales, bias, activation_type); - } - else { - std::string err_msg = "Unsupported weight type " + std::string(at::toString(quant_type)); - throw std::runtime_error(err_msg); - } - break; - } -#endif - default: - throw std::runtime_error("Unsupported tensor type. 
Got " + std::string(at::toString(_st))); - } - return output_tensor; -} - -TORCH_LIBRARY(gemm_dq_unit_ops, m) -{ - m.def("fused_gemm_dq", fused_gemm_dq); - m.def("benchmark_against_cublas_fp", benchmark_against_cublas_fp); - m.def("fused_gemm_dq_bias_act", fused_gemm_dq_bias_act); -} -} // namespace torch_ext diff --git a/tests/csrc/gemm_dequantize/th_gemm_dequantize.py b/tests/csrc/gemm_dequantize/th_gemm_dequantize.py deleted file mode 100644 index 0946fe3191..0000000000 --- a/tests/csrc/gemm_dequantize/th_gemm_dequantize.py +++ /dev/null @@ -1,247 +0,0 @@ -# flake8: noqa -import unittest - -import torch - - -def random_tensor(shape, dtype, device, mean=0, std=1): - return torch.empty(shape, dtype=dtype, device=device).normal_(mean, std) - - -class TestGemmDequantize(unittest.TestCase): - - def setUp(self) -> None: - torch.classes.load_library('lib/libth_transformer.so') - torch.classes.load_library('lib/libgemm_dq_unit_ops.so') - self.unpack_packed_int4s = torch.ops.turbomind.unpack_int4_packed_tensor_to_int8 - self.pack_int4s = torch.ops.turbomind.pack_int8_tensor_to_packed_int4 - self.fused_gemm_dq = torch.ops.gemm_dq_unit_ops.fused_gemm_dq - self.fused_gemm_dq_bias_act = torch.ops.gemm_dq_unit_ops.fused_gemm_dq_bias_act - self.bench = torch.ops.gemm_dq_unit_ops.benchmark_against_cublas_fp - self.preprocess_weights_for_mixed_gemm = torch.ops.turbomind.preprocess_weights_for_mixed_gemm - - self.symmetric_quantizer = torch.ops.turbomind._symmetric_quantize_last_axis_of_batched_matrix - - torch.manual_seed(734876213) - - def dequantize_test_helper(self, weight_type, quant_type): - assert quant_type == torch.int8 or quant_type == torch.quint4x2 - - lower_bound = -128 if quant_type == torch.int8 else -8 - upper_bound = 127 if quant_type == torch.int8 else 7 - - m, n, k = 64, 128, 64 - weights = torch.randint(lower_bound, upper_bound, [k, n], dtype=torch.int8, device='cpu') - - packed_weight = self.pack_int4s(weights) if quant_type == torch.quint4x2 else weights - cuda_weights = self.preprocess_weights_for_mixed_gemm(packed_weight, quant_type).to('cuda') - weights = weights.to('cuda') - - act = torch.eye(m, dtype=weight_type, device='cuda') - scales = torch.ones([n], dtype=weight_type, device='cuda') - - actual = self.fused_gemm_dq(act, cuda_weights, scales) - torch.testing.assert_close(actual, weights, atol=0, rtol=0, check_dtype=False) - - def test_fp16_int8_dequantize(self): - self.dequantize_test_helper(torch.float16, torch.int8) - - def test_bf16_int8_dequantize(self): - self.dequantize_test_helper(torch.bfloat16, torch.int8) - - def test_fp16_int4_dequantize(self): - self.dequantize_test_helper(torch.float16, torch.quint4x2) - - def test_bf16_int4_dequantize(self): - self.dequantize_test_helper(torch.bfloat16, torch.quint4x2) - - def apply_act(self, inp, act_str): - if act_str == 'identity': - return inp - elif act_str == 'silu': - return torch.nn.SiLU()(inp) - elif act_str == 'relu': - return torch.nn.ReLU()(inp) - elif act_str == 'gelu': - return torch.nn.GELU(approximate='tanh')(inp) - else: - assert False, 'Unsupported activation' - - def gemm_dequant_test_helper(self, - compute_type, - weight_dtype, - gemm_ms, - gemm_ns, - gemm_ks, - rtol, - atol, - act_str='only_gemm', - benchmark=False): - assert weight_dtype == torch.int8 or weight_dtype == torch.quint4x2, 'Weight must be quantized' - - for gemm_k in gemm_ks: - for gemm_n in gemm_ns: - torch_weights_cpu = random_tensor((gemm_k, gemm_n), dtype=compute_type, device='cpu', mean=0, std=0.002) - ref_torch_weights, 
processed_torch_weights, torch_weight_scales = self.symmetric_quantizer( - torch_weights_cpu, weight_dtype) - ref_torch_weights = self.unpack_packed_int4s( - ref_torch_weights) if weight_dtype == torch.quint4x2 else ref_torch_weights - ref_torch_weights = ref_torch_weights.to('cuda') - processed_torch_weights = processed_torch_weights.to('cuda') - torch_weight_scales = torch_weight_scales.to('cuda') - torch_biases = random_tensor((gemm_n), dtype=compute_type, device='cuda', mean=0, std=0.1) - - for num_rows in gemm_ms: - torch_activations = torch.randn(size=(num_rows, gemm_k), dtype=compute_type, device='cuda') - - scales_unsqueezed = torch_weight_scales.unsqueeze(0) - casted_weights = ref_torch_weights.to(torch_activations.dtype) - dequantized_weights = torch.multiply(casted_weights, scales_unsqueezed) - if benchmark: - assert act_str == 'only_gemm', 'Benchmarks against cublas must use just GEMM.' - torch.cuda.profiler.start() - times, results = self.bench(torch_activations, processed_torch_weights, torch_weight_scales, - dequantized_weights, 200) - torch.cuda.profiler.stop() - times = times[0] - cublas_time = times[0].item() - ft_time = times[1].item() - ft_speedup = cublas_time / ft_time - print('{},{},{},{},{},{}'.format(num_rows, gemm_n, gemm_k, cublas_time, ft_time, ft_speedup)) - reference_result = results[0] - ft_result = results[1] - else: - if act_str == 'only_gemm': - reference_result = torch.matmul(torch_activations, dequantized_weights) - ft_result = self.fused_gemm_dq(torch_activations, processed_torch_weights, - torch_weight_scales) - else: - reference_result = torch.matmul(torch_activations, dequantized_weights) - reference_result += torch_biases.unsqueeze(0) - reference_result = self.apply_act(reference_result, act_str) - - ft_result = self.fused_gemm_dq_bias_act(torch_activations, processed_torch_weights, - torch_weight_scales, torch_biases, act_str) - - msg = 'FC1 Failed on m={}, n={}, k={}'.format(num_rows, gemm_n, gemm_k) - torch.testing.assert_close(ft_result, - reference_result, - rtol=rtol, - atol=atol, - msg=msg, - check_dtype=False) - - def test_fp16_int8_gemm(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.001, - atol=0.002) - - def test_fp16_int4_gemm(self): - self.gemm_dequant_test_helper(torch.float16, - torch.quint4x2, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.001, - atol=0.002) - - def test_bf16_int8_gemm(self): - self.gemm_dequant_test_helper(torch.bfloat16, - torch.int8, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.01, - atol=0.01) - - def test_bf16_int4_gemm(self): - self.gemm_dequant_test_helper(torch.bfloat16, - torch.quint4x2, - gemm_ms=[256, 177, 195, 125, 66, 33, 8, 2, 1], - gemm_ns=[1024, 2048, 4096], - gemm_ks=[4096, 8192, 16384], - rtol=0.01, - atol=0.01) - - def test_fp16_int8_gemm_bias(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='identity') - - def test_fp16_int8_gemm_bias_relu(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='relu') - - def test_fp16_int8_gemm_bias_gelu(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - 
gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='gelu') - - def test_fp16_int8_gemm_bias_silu(self): - self.gemm_dequant_test_helper(torch.float16, - torch.int8, - gemm_ms=[256], - gemm_ns=[1024], - gemm_ks=[8192], - rtol=0.001, - atol=0.002, - act_str='silu') - - def bench_helper(self, act_type, quant_type, rtol, atol): - # Warm, using bfloat here since it seems to reliably use cublas. - x = random_tensor([20480, 20480], torch.bfloat16, device='cuda') - warm_iters = 30 - for iter in range(warm_iters): - res = x @ x - - m_shapes = torch.arange(0, 12) - m_shapes = 2**m_shapes - - self.gemm_dequant_test_helper(act_type, - quant_type, - gemm_ms=[128], - gemm_ns=[1536], - gemm_ks=[12288], - rtol=rtol, - atol=atol, - benchmark=True) - - @unittest.skip("This is a benchmark so don't run by default") - def test_fp16_int8_cublas(self): - self.bench_helper(torch.float16, torch.int8, 1e-3, 0.002) - - @unittest.skip("This is a benchmark so don't run by default") - def test_bf16_int8_cublas(self): - self.bench_helper(torch.bfloat16, torch.int8, 1e-2, 1e-2) - - @unittest.skip("This is a benchmark so don't run by default") - def test_fp16_int4_cublas(self): - self.bench_helper(torch.float16, torch.quint4x2, 1e-3, 0.002) - - @unittest.skip("This is a benchmark so don't run by default") - def test_bf16_int4_cublas(self): - self.bench_helper(torch.bfloat16, torch.quint4x2, 1e-2, 1e-2) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/csrc/int8_gemm/CMakeLists.txt b/tests/csrc/int8_gemm/CMakeLists.txt deleted file mode 100644 index fe8b14455a..0000000000 --- a/tests/csrc/int8_gemm/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set(int8_test_files - int8_gemm_test.cu -) - -add_definitions(-DTORCH_CUDA=1) - -set(EXE_NAME "int8_gemm_test") -add_executable(${EXE_NAME} ${int8_test_files}) -set_target_properties(${EXE_NAME} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(${EXE_NAME} PUBLIC "${TORCH_LIBRARIES}" int8_gemm tensor logger) diff --git a/tests/csrc/int8_gemm/int8_gemm_test.cu b/tests/csrc/int8_gemm/int8_gemm_test.cu deleted file mode 100644 index 0dc10b214d..0000000000 --- a/tests/csrc/int8_gemm/int8_gemm_test.cu +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -#include "torch/csrc/cuda/Stream.h" -#include -#include - -#include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" -#include "src/turbomind/th_op/th_utils.h" -#include "src/turbomind/utils/cuda_bf16_wrapper.h" -#include "src/turbomind/utils/logger.h" - -#include "cutlass/numeric_types.h" - -using torch::Tensor; -using torch_ext::get_ptr; - -namespace ft = turbomind; - -template -void int8_gemm_test(const int m, - const int n, - const int k, - const at::ScalarType output_data_type, - const QuantMode quant_mode, - const int iters) -{ - const bool per_token_quant = - quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerTokenQuant; - const bool per_channel_quant = - quant_mode == QuantMode::PerTokenChannelQuant || quant_mode == QuantMode::PerChannelQuant; - const int row_scale_size = per_token_quant ? m : 1; - const int col_scale_size = per_channel_quant ? n : 1; - - const at::ScalarType at_int32 = at::ScalarType::Int; - const at::ScalarType at_int8 = at::ScalarType::Char; - const at::ScalarType at_fp16 = at::ScalarType::Half; - const at::ScalarType at_bf16 = at::ScalarType::BFloat16; - const at::ScalarType at_fp32 = at::ScalarType::Float; - - using std::chrono::high_resolution_clock; - using std::chrono::duration_cast; - using std::chrono::microseconds; - - torch::manual_seed(0); - - auto x = torch::randint(-128, 128, {m, k}, torch::dtype(at_int32).requires_grad(false)); - auto w = torch::randint(-128, 128, {k, n}, torch::dtype(at_int32).requires_grad(false)); - - ft::FT_CHECK(torch::allclose(x, x.to(at_int8).to(at_int32))); - ft::FT_CHECK(torch::allclose(w, w.to(at_int8).to(at_int32))); - - auto y = torch::matmul(x, w); - - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)k}, get_ptr(x)}.saveNpy("x.npy"); - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)k, (size_t)n}, get_ptr(w)}.saveNpy("w.npy"); - ft::Tensor{ft::MEMORY_CPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr(y)}.saveNpy("y.npy"); - - auto x_gpu = x.to(at_int8).to(torch::kCUDA); - auto w_T_gpu = w.to(at_int8).to(torch::kCUDA).t().contiguous(); - auto w_gpu = w.to(at_int8).to(torch::kCUDA); - auto y_gpu = torch::zeros({m, n}, torch::dtype(output_data_type).device(torch::kCUDA).requires_grad(false)); - auto y_gpu_int32 = torch::zeros({m, n}, torch::dtype(at_int32).device(torch::kCUDA).requires_grad(false)); - - auto alpha_row_cultass = torch::ones({row_scale_size, 1}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) - * torch::randint(1, 10, {row_scale_size, 1}, torch::dtype(at_fp32)); - auto alpha_col_cutlass = torch::ones({1, col_scale_size}, torch::dtype(at_fp32).requires_grad(false)) * (1.0 / 100) - * torch::randint(1, 10, {1, col_scale_size}, torch::dtype(at_fp32)); - - auto alpha_row_torch = alpha_row_cultass.expand({m, 1}); - auto alpha_col_torch = alpha_col_cutlass.expand({1, n}); - - // std::cout << alpha_row << std::endl; - auto alpha_row_gpu = alpha_row_cultass.to(torch::kCUDA); - auto alpha_col_gpu = alpha_col_cutlass.to(torch::kCUDA); - - auto alpha_row_col_scale_gpu = torch::matmul(alpha_row_torch, alpha_col_torch).to(torch::kCUDA); - - ft::CutlassInt8GemmRunner cutlass_runner_half; - - auto stream = at::cuda::getCurrentCUDAStream().stream(); - // warm_up - cutlass_runner_half.gemm(get_ptr(x_gpu), - get_ptr(w_T_gpu), - quant_mode, - get_ptr(alpha_col_gpu), - get_ptr(alpha_row_gpu), - get_ptr(y_gpu), - m, - n, - k, - nullptr, - 0, - stream); - - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, 
{(size_t)m, (size_t)k}, get_ptr(x_gpu)}.saveNpy("x_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)n, (size_t)k}, get_ptr(w_T_gpu)}.saveNpy("w_T_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT8, {(size_t)k, (size_t)n}, get_ptr(w_gpu)}.saveNpy("w_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_FP16, {(size_t)m, (size_t)n}, get_ptr(y_gpu)}.saveNpy("y_gpu.npy"); - ft::Tensor{ft::MEMORY_GPU, ft::TYPE_INT32, {(size_t)m, (size_t)n}, get_ptr(y_gpu_int32)}.saveNpy( - "y_gpu_int32.npy"); - - ft::check_cuda_error(cudaStreamSynchronize(stream)); - auto start = high_resolution_clock::now(); - - for (int i = 0; i < iters; ++i) { - cutlass_runner_half.gemm(get_ptr(x_gpu), - get_ptr(w_T_gpu), - quant_mode, - get_ptr(alpha_col_gpu), - get_ptr(alpha_row_gpu), - get_ptr(y_gpu), - m, - n, - k, - nullptr, - 0, - stream); - } - - ft::check_cuda_error(cudaStreamSynchronize(stream)); - auto end = high_resolution_clock::now(); - - auto duration = duration_cast(end - start); - - if (torch::allclose( - (y.to(torch::kCUDA).to(at_fp32) * alpha_row_col_scale_gpu.to(torch::kCUDA)).to(output_data_type), y_gpu)) { - TM_LOG_INFO("SUCCESS " + std::to_string((double(duration.count()) / iters) / 1000) + " ms"); - } - else { - TM_LOG_ERROR("FAILED " + std::to_string((double(duration.count()) / iters) / 1000) + " ms"); - // std::cout << "diff " << (y.to(torch::kCUDA).to(at_fp32) * - // alpha_row_col_scale_gpu.to(torch::kCUDA)).to(at_fp16) - y_gpu << std::endl; - } -} - -int main(int argc, char** argv) -{ - if (argc != 7) { - TM_LOG_ERROR( - "arguments missing, needs m, n, k, data_type(fp16=0, bf16=1), quant_mode (perTensor=0, perToken=1, perChannel=2, perTokenChannel=3), iters."); - return 0; - } - - const int m = atoi(argv[1]); - const int n = atoi(argv[2]); - const int k = atoi(argv[3]); - const at::ScalarType output_data_type = atoi(argv[4]) == 0 ? 
at::ScalarType::Half : at::ScalarType::BFloat16;
-    const QuantMode quant_mode = static_cast<QuantMode>(atoi(argv[5]));
-    if (quant_mode == QuantMode::PerChannelQuant) {
-        printf("per channel quant \n");
-    }
-    const int iters = atoi(argv[6]);
-
-    if (output_data_type == at::ScalarType::Half) {
-        int8_gemm_test<half>(m, n, k, output_data_type, quant_mode, iters);
-    }
-    else {
-#if ENABLE_BF16
-        int8_gemm_test<__nv_bfloat16>(m, n, k, output_data_type, quant_mode, iters);
-#endif
-    }
-
-    return 0;
-}
diff --git a/tests/csrc/unittests/CMakeLists.txt b/tests/csrc/unittests/CMakeLists.txt
index 01f926de60..454f9476f5 100644
--- a/tests/csrc/unittests/CMakeLists.txt
+++ b/tests/csrc/unittests/CMakeLists.txt
@@ -36,7 +36,6 @@ add_executable(unittest
     test_penalty_kernels.cu
     test_sampling_kernels.cu
     test_sampling_layer.cu
-    test_tensor.cu
 )
 
 # automatic discovery of unit tests
@@ -64,11 +63,7 @@ target_link_libraries(
     # Libs for test_sampling_layer
     unittest PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cublasMMWrapper memory_utils
-    DynamicDecodeLayer tensor cuda_utils logger
+    DynamicDecodeLayer cuda_utils logger
 )
 
 target_link_libraries( # Libs for test_tensor
-    unittest PUBLIC tensor cuda_utils logger)
-
-remove_definitions(-DTORCH_CUDA=1)
-add_executable(test_gemm test_gemm.cu)
-target_link_libraries(test_gemm PUBLIC CUDA::cublas CUDA::cudart CUDA::curand gemm cublasMMWrapper tensor cuda_utils logger)
+    unittest PUBLIC cuda_utils logger)
diff --git a/tests/csrc/unittests/test_gemm.cu b/tests/csrc/unittests/test_gemm.cu
deleted file mode 100644
index be7fed531d..0000000000
--- a/tests/csrc/unittests/test_gemm.cu
+++ /dev/null
@@ -1,1023 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "src/turbomind/layers/DenseWeight.h"
-#include "src/turbomind/utils/allocator.h"
-#include "src/turbomind/utils/cublasMMWrapper.h"
-#include "src/turbomind/utils/cuda_utils.h"
-#include "src/turbomind/utils/gemm.h"
-#include "src/turbomind/utils/logger.h"
-#include "src/turbomind/utils/memory_utils.h"
-
-using namespace turbomind;
-
-// Can be replaced by the function provided by a test framework
-
-class TestFailureError: public std::exception {
-private:
-    std::string msg_;
-
-public:
-    explicit TestFailureError() = default;
-    explicit TestFailureError(std::string name, std::string msg = "")
-    {
-        msg_ = fmtstr("TEST FAIL [%s] %s", name.c_str(), msg.c_str());
-    }
-    const char* what() const throw()
-    {
-        return msg_.c_str();
-    }
-};
-
-#define EXPECT_TRUE(cond) \
-    do { \
-        if (!(cond)) { \
-            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
-            throw TestFailureError(__func__); \
-        } \
-    } while (false)
-
-#define EXPECT_ALMOST_EQUAL(name, dtype, ctype, out, ref) \
-    do { \
-        bool is_ok = checkResult<dtype, ctype>(name, out, ref); \
-        if (!is_ok) { \
-            TM_LOG_ERROR("TEST FAIL [%s] at %s:%d", __func__, __FILE__, __LINE__); \
-            throw TestFailureError(__func__); \
-        } \
-    } while (false)
-
-////////////////////////////////////////////////////////////////////////////////////
-
-// TensorWrapper is to handle a tensor object as well as its memory buffer,
-// because tensor.data is const we cannot set values.
-class TensorWrapper {
-private:
-    IAllocator* allocator;
-
-public:
-    std::vector<size_t> shape;
-    DataType type;
-    Tensor* tensor;
-    void* data;
-
-    TensorWrapper(IAllocator* allocator, DataType dtype, std::vector<size_t> shape, bool zero_init = false)
-    {
-        this->allocator = allocator;
-        this->type = dtype;
-        this->shape = shape;
-
-        size_t tensor_memsize = this->memsize();
-        this->data = this->allocator->malloc(tensor_memsize, false);
-        if (zero_init) {
-            check_cuda_error(cudaMemset(data, 0x0, tensor_memsize));
-        }
-        else {
-            setRandomValues();
-        }
-        this->tensor = new Tensor(MEMORY_GPU, dtype, shape, data);
-    }
-
-    TensorWrapper(TensorWrapper const& other):
-        allocator(other.allocator), shape(other.shape), type(other.type), data(other.data), tensor(other.tensor)
-    {
-        TM_LOG_DEBUG("TensorWrapper copy: this=%p other=%p", data, other.data);
-    }
-    ~TensorWrapper()
-    {
-        delete tensor;
-        allocator->free((void**)(&data));
-    }
-
-    void setInvalidValues()
-    {
-        size_t type_size = tensor->type == TYPE_FP32 ? sizeof(float) : sizeof(half);
-        size_t tensor_size = type_size * tensor->size();
-        // Fill by a random number to guarantee invalid values
-        check_cuda_error(cudaMemset(data, 0xdc, tensor_size));
-    }
-
-    void setRandomValues()
-    {
-        // random initialization
-        size_t num_elements = this->size();
-        switch (this->type) {
-            case TYPE_FP32:
-                cudaRandomUniform((float*)data, num_elements);
-                break;
-            case TYPE_FP16:
-                cudaRandomUniform((half*)data, num_elements);
-                break;
-            default:
-                // Will be added more if needed.
-                throw std::runtime_error("Not supported data type");
-        }
-    }
-
-    size_t size()
-    {
-        size_t n_elements = 1;
-        for (size_t s : this->shape) {
-            n_elements *= s;
-        }
-        return n_elements;
-    }
-
-    size_t memsize()
-    {
-        size_t type_size = 0;
-        switch (this->type) {
-            case TYPE_FP32:
-                type_size = sizeof(float);
-                break;
-            case TYPE_FP16:
-                type_size = sizeof(half);
-                break;
-            default:
-                throw std::runtime_error("Not supported data type.");
-        }
-        return type_size * this->size();
-    }
-};
-
-template<typename T, DataType computeType>
-void computeReference(GemmOp transa,
-                      GemmOp transb,
-                      TensorWrapper& C,
-                      TensorWrapper& A,
-                      TensorWrapper& B,
-                      float alpha = 1.0f,
-                      float beta = 0.0f)
-{
-    size_t m = C.shape[0];
-    size_t n = C.shape[1];
-    size_t k = A.shape[1];
-
-    size_t lda = (transa == GEMM_OP_N) ? k : m;
-    size_t ldb = (transb == GEMM_OP_N) ? n : k;
-    size_t ldc = n;
-
-    cudaDataType_t atype = (A.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-    cudaDataType_t btype = (B.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-    cudaDataType_t ctype = (C.type == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-    cudaDataType_t compute_type = (computeType == TYPE_FP16) ? CUDA_R_16F : CUDA_R_32F;
-
-    cublasHandle_t cublas_handle;
-    check_cuda_error(cublasCreate(&cublas_handle));
-
-    half h_alpha = (half)alpha;
-    half h_beta = (half)beta;
-    const void* _alpha = (computeType == TYPE_FP16) ? (const void*)&h_alpha : (const void*)&alpha;
-    const void* _beta = (computeType == TYPE_FP16) ? (const void*)&h_beta : (const void*)&beta;
-
-    check_cuda_error(cublasGemmEx(cublas_handle,
-                                  getCublasOperation(transb),
-                                  getCublasOperation(transa),
-                                  n,
-                                  m,
-                                  k,
-                                  _alpha,
-                                  (const void*)B.data,
-                                  btype,
-                                  ldb,
-                                  (const void*)A.data,
-                                  atype,
-                                  lda,
-                                  _beta,
-                                  (void*)C.data,
-                                  ctype,
-                                  ldc,
-                                  compute_type,
-                                  CUBLAS_GEMM_DEFAULT));
-    check_cuda_error(cublasDestroy(cublas_handle));
-    cudaDeviceSynchronize();
-}
-
-bool almostEqual(float a, float b, float atol = 1e-5, float rtol = 1e-8)
-{
-    // Params: a = value to compare and b = reference
-    // This function follows implementation of numpy.isclose(), which checks
-    // abs(a - b) <= (atol + rtol * abs(b)).
-    // Note that the inequality above is asymmetric where b is considered as
-    // a reference value. To account into both absolute/relative errors, it
-    // uses absolute tolerance and relative tolerance at the same time. The
-    // default values of atol and rtol borrowed from numpy.isclose(). For the
-    // case of nan value, the result will be true.
-    if (isnan(a) && isnan(b)) {
-        return true;
-    }
-    return fabs(a - b) <= (atol + rtol * fabs(b));
-}
-
-template<typename T>
-bool _checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref, float atol, float rtol)
-{
-    assert(out.type == ref.type);
-
-    size_t out_size = out.size();
-    size_t ref_size = ref.size();
-    T* h_out = reinterpret_cast<T*>(malloc(sizeof(T) * out_size));
-    T* h_ref = reinterpret_cast<T*>(malloc(sizeof(T) * ref_size));
-
-    cudaMemcpy(h_out, out.data, sizeof(T) * out_size, cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_ref, ref.data, sizeof(T) * ref_size, cudaMemcpyDeviceToHost);
-    cudaDeviceSynchronize();
-
-    size_t failures = 0;
-    for (size_t i = 0; i < out_size; ++i) {
-        // The values for the output and the reference.
-        float a = (float)h_out[i];
-        float b = (float)h_ref[i];
-
-        bool ok = almostEqual(a, b, atol, rtol);
-        // Print the error.
-        if (!ok && failures < 4) {
-            TM_LOG_ERROR(">> invalid result for i=%lu:", i);
-            TM_LOG_ERROR(">> found......: %10.6f", a);
-            TM_LOG_ERROR(">> expected...: %10.6f", b);
-            TM_LOG_ERROR(">> error......: %.6f", fabsf(a - b));
-            TM_LOG_ERROR(">> tol........: %.6f", atol + rtol * fabs(b));
-        }
-
-        // Update the number of failures.
-        failures += ok ? 0 : 1;
-    }
-
-    // Allow not matched up to 1% elements.
-    size_t tol_failures = (size_t)(0.01 * out_size);
-    TM_LOG_INFO("check....... %30s : %s (failures: %.2f%% atol: %.2e rtol: %.2e)",
-                name.c_str(),
-                failures <= tol_failures ? "OK" : "FAILED",
-                100. * failures / out_size,
-                atol,
-                rtol);
-    return failures <= tol_failures;
-}
-
-template<typename T, DataType computeType>
-bool checkResult(std::string name, TensorWrapper& out, TensorWrapper& ref)
-{
-    float atol = (computeType == TYPE_FP32) ? 1e-6f : 1e-3f;
-    float rtol = (computeType == TYPE_FP32) ? 1e-4f : 1e-1f;
-    bool is_ok = false;
-    if (sizeof(T) == 4) {
-        is_ok = _checkResult<float>(name, out, ref, atol, rtol);
-    }
-    else {
-        is_ok = _checkResult<half>(name, out, ref, atol, rtol);
-    }
-    return is_ok;
-}
-
-template<typename T, DataType computeType>
-bool checkResult(TensorWrapper& out, TensorWrapper& ref)
-{
-    return checkResult<T, computeType>("", out, ref);
-}
-
-template<typename T>
-std::string toString()
-{
-    std::string str = "dtype=";
-    str += std::is_same<T, float>::value ? "FP32" : "FP16";
-    return str;
-}
-
-template<typename T, DataType ctype>
-std::string toString()
-{
-    std::string str = "dtype=";
-    str += std::is_same<T, float>::value ? "FP32" : "FP16";
-    str += ", compute_type=";
-    str += (ctype == TYPE_FP32) ? "FP32" : "FP16";
-    return str;
-}
-
-std::string toString(GemmOp op)
-{
-    return op == GEMM_OP_N ?
"N" : "T"; -} - -struct GemmOpPair { - GemmOp transa; - GemmOp transb; -}; - -static const std::vector op_pairs{ - {GEMM_OP_N, GEMM_OP_N}, {GEMM_OP_N, GEMM_OP_T}, {GEMM_OP_T, GEMM_OP_N}, {GEMM_OP_T, GEMM_OP_T}}; - -static inline std::string getTestName(const char* func_name, GemmOp transa, GemmOp transb, size_t m, size_t n, size_t k) -{ - return fmtstr("%s [opA=%s, opB=%s, m=%ld, n=%ld, k=%ld]", - func_name, - getGemmOpString(transa).c_str(), - getGemmOpString(transb).c_str(), - m, - n, - k); -} - -static inline std::string getTestName(const char* func_name, GemmOpPair op_pairs, size_t m, size_t n, size_t k) -{ - return getTestName(func_name, op_pairs.transa, op_pairs.transb, m, n, k); -} - -/////////////////////////////////// Unittests ////////////////////////////////////////// - -template -void testGemmCorrectnessMatmul(size_t m, size_t n, size_t k) -{ - TM_LOG_INFO( - "Matmul function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString().c_str()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - Allocator allocator(getDevice()); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - computeReference(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor); - - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_tensor.data, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - DenseWeight{(const T*)b_tensor.data, nullptr, nullptr}, - c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - } - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testGemmConsistencyMatmul(size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO( - "Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t cuda_dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t cuda_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(cuda_dtype, cuda_dtype, cuda_dtype, cuda_ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - - // Switch A/B because Gemm expects column major layout as cublas does. - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? 
n : k; - size_t ldc = n; - cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), - getCublasOperation(op_pair.transa), - n, - m, - k, - b_tensor.data, - ldb, - a_tensor.data, - lda, - expected.data, - ldc); - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_tensor.data, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_tensor.data, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_tensor.data, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - DenseWeight{(const T*)b_tensor.data, nullptr, nullptr}, - c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - } - - delete cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testGemmConsistencyBatchedMatmul(size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO("Batched gemm function consistency test [m=%ld, n=%ld, k=%ld, %s]", - m, - n, - k, - toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - // batch of in/out tensors - DataType a_type = getTensorType(); - DataType b_type = getTensorType(); - DataType c_type = getTensorType(); - std::vector a_tensors; - std::vector b_tensors; - std::vector c_tensors; - std::vector expecteds; - const size_t batch_size = 3; - for (size_t i = 0; i < batch_size; ++i) { - a_tensors.push_back(new TensorWrapper(&allocator, a_type, {m, k}, false)); - b_tensors.push_back(new TensorWrapper(&allocator, b_type, {k, n}, false)); - c_tensors.push_back(new TensorWrapper(&allocator, c_type, {m, n}, true)); - expecteds.push_back(new TensorWrapper(&allocator, c_type, {m, n}, true)); - } - - const T* hA[]{(const T*)a_tensors[0]->data, - (const T*)a_tensors[1]->data, - (const T*)a_tensors[2]->data, - nullptr, // for memory alignment. - (const T*)b_tensors[0]->data, - (const T*)b_tensors[1]->data, - (const T*)b_tensors[2]->data, - nullptr, // for memory alignment. - (const T*)c_tensors[0]->data, - (const T*)c_tensors[1]->data, - (const T*)c_tensors[2]->data, - nullptr, // for memory alignment. 
- (const T*)expecteds[0]->data, - (const T*)expecteds[1]->data, - (const T*)expecteds[2]->data}; - - T** batch_tensor_ptrs = reinterpret_cast(allocator.malloc(sizeof(T*) * 16, false)); - check_cuda_error(cudaMemcpyAsync((void*)batch_tensor_ptrs, hA, sizeof(T*) * 16, cudaMemcpyHostToDevice, stream)); - const void* const* batch_a = reinterpret_cast(batch_tensor_ptrs); - const void* const* batch_b = reinterpret_cast(batch_tensor_ptrs + 4); - void* const* batch_c = reinterpret_cast(batch_tensor_ptrs + 8); - void* const* batch_expected = reinterpret_cast(batch_tensor_ptrs + 12); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(dtype, dtype, dtype, ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_type, b_type, c_type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - - // Switch A/B because Gemm expects column major layout as cublas does. - cublas_wrapper.batchedGemm(getCublasOperation(op_pair.transb), // N - getCublasOperation(op_pair.transa), // T - n, - m, - k, - (const void* const*)batch_b, - ldb, - (const void* const*)batch_a, - lda, - (void* const*)batch_expected, - ldc, - batch_size); - - gemm->batchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - batch_a, - a_type, - lda, - batch_b, - b_type, - ldb, - batch_c, - c_type, - ldc, - batch_size); - for (size_t i = 0; i < batch_size; ++i) { - EXPECT_ALMOST_EQUAL( - tc_name + " api1 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]); - } - - for (size_t i = 0; i < batch_size; ++i) { - c_tensors[i]->setInvalidValues(); - } - gemm->batchedGemm( - op_pair.transa, op_pair.transb, m, n, k, batch_a, lda, batch_b, ldb, batch_c, ldc, batch_size); - for (size_t i = 0; i < batch_size; ++i) { - EXPECT_ALMOST_EQUAL( - tc_name + " api2 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]); - } - - for (size_t i = 0; i < batch_size; ++i) { - c_tensors[i]->setInvalidValues(); - } - gemm->batchedGemm(op_pair.transa, op_pair.transb, m, n, k, batch_a, batch_b, batch_c, batch_size); - for (size_t i = 0; i < batch_size; ++i) { - EXPECT_ALMOST_EQUAL( - tc_name + " api3 batch" + std::to_string(i), T, computeType, *c_tensors[i], *expecteds[i]); - } - } - a_tensors.clear(); - b_tensors.clear(); - c_tensors.clear(); - expecteds.clear(); - delete cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testGemmConsistencyStridedBatchedMatmul(size_t batch_size, size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO("Strided batched gemm function consistency test [bsz=%ld, m=%ld, 
n=%ld, k=%ld, %s]", - batch_size, - m, - n, - k, - toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - DataType data_type = getTensorType(); - TensorWrapper a_tensor(&allocator, data_type, {batch_size, m, k}, false); - TensorWrapper b_tensor(&allocator, data_type, {batch_size, k, n}, false); - TensorWrapper c_tensor(&allocator, data_type, {batch_size, m, n}, true); - TensorWrapper expected(&allocator, data_type, {batch_size, m, n}, true); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t ctype = (computeType == DataType::TYPE_FP32) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(dtype, dtype, dtype, ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, false, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - - // Switch A/B because Gemm expects column major layout as cublas does. - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - - int64_t stridea = m * k; - int64_t strideb = k * n; - int64_t stridec = m * n; - - float alpha = 1.0f; - float beta = 0.0f; - - cublas_wrapper.stridedBatchedGemm(getCublasOperation(op_pair.transb), - getCublasOperation(op_pair.transa), - n, - m, - k, - alpha, - b_tensor.data, - getCublasDataType(b_tensor.type), - ldb, - strideb, - a_tensor.data, - getCublasDataType(a_tensor.type), - lda, - stridea, - beta, - expected.data, - getCublasDataType(expected.type), - ldc, - stridec, - batch_size, - getCublasDataType(computeType)); - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - stridea, - b_tensor.data, - b_tensor.type, - ldb, - strideb, - c_tensor.data, - c_tensor.type, - ldc, - stridec, - batch_size, - computeType, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - lda, - stridea, - b_tensor.data, - ldb, - strideb, - c_tensor.data, - ldc, - stridec, - batch_size, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - stridea, - b_tensor.data, - strideb, - c_tensor.data, - stridec, - batch_size, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->stridedBatchedGemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - b_tensor.data, - c_tensor.data, - batch_size, - alpha, - beta); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - } - - delete 
cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} - -#ifdef SPARSITY_ENABLED -// The current SpGemm only supports TYPE_FP16 for T, computeType, -// but let us keep these template variables for later use. -template -void testSpGemmCorrectnessMatmul(size_t m, size_t n, size_t k) -{ - TM_LOG_INFO( - "Sparse gemm function correctness test [m=%ld, n=%ld, k=%ld, %s]", m, n, k, toString().c_str()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - Allocator allocator(getDevice()); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - std::shared_ptr gemm = createGemm(&allocator, stream, true, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - // A/B will be switched in SpGemm. - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - - b_tensor.setRandomValues(); - pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - computeReference(op_pair.transa, op_pair.transb, expected, a_tensor, b_tensor); - - void* b_compressed; - compressMatrixB( - &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_compressed, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api2", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - DenseWeight{(const T*)b_tensor.data, nullptr, (const T*)b_compressed}, - c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api4", T, computeType, c_tensor, expected); - - allocator.free((void**)(&b_compressed)); - } - check_cuda_error(cudaStreamDestroy(stream)); -} - -template -void testSpGemmConsistencyMatmul(size_t m, size_t n, size_t k) -{ - // Test if Gemm is consistent with cublasWrapper - TM_LOG_INFO("Sparse Matmul function consistency test [m=%ld, n=%ld, k=%ld, %s]", - m, - n, - k, - toString().c_str()); - - Allocator allocator(getDevice()); - cudaStream_t stream; - check_cuda_error(cudaStreamCreate(&stream)); - - DataType dtype = getTensorType(); - TensorWrapper a_tensor(&allocator, dtype, {m, k}, false); - TensorWrapper b_tensor(&allocator, dtype, {k, n}, false); - TensorWrapper c_tensor(&allocator, dtype, {m, n}, true); - TensorWrapper expected(&allocator, dtype, {m, n}, true); - - cublasHandle_t cublas_handle; - cublasLtHandle_t cublaslt_handle; - 
check_cuda_error(cublasCreate(&cublas_handle)); - check_cuda_error(cublasLtCreate(&cublaslt_handle)); - check_cuda_error(cublasSetStream(cublas_handle, stream)); - cublasAlgoMap cublas_algo_map(GEMM_CONFIG); - std::mutex* cublas_wrapper_mutex = new std::mutex(); - cublasMMWrapper cublas_wrapper( - cublas_handle, cublaslt_handle, stream, &cublas_algo_map, cublas_wrapper_mutex, &allocator); - - cudaDataType_t cu_dtype = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; - cudaDataType_t cu_ctype = (DataType::TYPE_FP32 == computeType) ? CUDA_R_32F : CUDA_R_16F; - cublas_wrapper.setGemmConfig(cu_dtype, cu_dtype, cu_dtype, cu_ctype); - - std::shared_ptr gemm = createGemm(&allocator, stream, true, false); - gemm->setTypes(a_tensor.type, b_tensor.type, c_tensor.type, computeType); - - for (auto& op_pair : op_pairs) { - std::string tc_name = getTestName(__func__, op_pair, m, n, k); - TM_LOG_DEBUG(tc_name); - - b_tensor.setRandomValues(); - pruneMatrixB(b_tensor.data, stream, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - - // Switch A/B because Gemm expects column major layout as cublas does. - size_t lda = (op_pair.transa == GEMM_OP_N) ? k : m; - size_t ldb = (op_pair.transb == GEMM_OP_N) ? n : k; - size_t ldc = n; - cublas_wrapper.Gemm(getCublasOperation(op_pair.transb), - getCublasOperation(op_pair.transa), - n, - m, - k, - b_tensor.data, - ldb, - a_tensor.data, - lda, - expected.data, - ldc); - - void* b_compressed; - compressMatrixB( - &b_compressed, allocator, stream, b_tensor.data, b_tensor.shape[0], b_tensor.shape[1], op_pair.transb); - - c_tensor.setInvalidValues(); // to guarantee C has invalid data - gemm->gemm(op_pair.transa, - op_pair.transb, - m, - n, - k, - a_tensor.data, - a_tensor.type, - lda, - b_compressed, - b_tensor.type, - ldb, - c_tensor.data, - c_tensor.type, - ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, lda, b_compressed, ldb, c_tensor.data, ldc); - EXPECT_ALMOST_EQUAL(tc_name + " api1", T, computeType, c_tensor, expected); - - c_tensor.setInvalidValues(); - gemm->gemm(op_pair.transa, op_pair.transb, m, n, k, a_tensor.data, b_compressed, c_tensor.data); - EXPECT_ALMOST_EQUAL(tc_name + " api3", T, computeType, c_tensor, expected); - } - - delete cublas_wrapper_mutex; - check_cuda_error(cublasLtDestroy(cublaslt_handle)); - check_cuda_error(cublasDestroy(cublas_handle)); - check_cuda_error(cudaStreamDestroy(stream)); -} -#endif - -int main(int argc, char* argv[]) -{ - // testGemmCreate(); - using testcase_t = std::tuple; - - std::vector testcases = { - {16, 32, 64}, {255, 255, 255}, {1041, 2047, 9999}, {1041, 1, 9999}, {1041, 999, 1}}; - - // Computation correctness tests - for (testcase_t& tc : testcases) { - size_t m = std::get<0>(tc); - size_t n = std::get<1>(tc); - size_t k = std::get<2>(tc); - - testGemmCorrectnessMatmul(m, n, k); - testGemmCorrectnessMatmul(m, n, k); - testGemmCorrectnessMatmul(m, n, k); - - testGemmConsistencyMatmul(m, n, k); - testGemmConsistencyMatmul(m, n, k); - testGemmConsistencyMatmul(m, n, k); - - testGemmConsistencyBatchedMatmul(m, n, k); - testGemmConsistencyBatchedMatmul(m, n, k); - testGemmConsistencyBatchedMatmul(m, n, k); - - testGemmConsistencyStridedBatchedMatmul(7, m, n, k); - testGemmConsistencyStridedBatchedMatmul(7, m, n, k); - testGemmConsistencyStridedBatchedMatmul(7, m, n, k); - } - -#ifdef SPARSITY_ENABLED - // Reset for SpGemm test. 
- testcases.clear(); - testcases.insert(testcases.end(), - {{8, 32, 32}, // minimum possible example. - {8, 32, 64}, - {64, 64, 64}, - {16, 32, 64}, - {1024, 32, 1024}, - {1024, 1024, 32}, - {16, 1024, 1024}, - {1024, 1024, 1024}}); - - for (testcase_t& tc : testcases) { - size_t m = std::get<0>(tc); - size_t n = std::get<1>(tc); - size_t k = std::get<2>(tc); - testSpGemmCorrectnessMatmul(m, n, k); - testSpGemmConsistencyMatmul(m, n, k); - } -#endif - TM_LOG_INFO("Test done"); - return 0; -} diff --git a/tests/csrc/unittests/test_int8.cu b/tests/csrc/unittests/test_int8.cu deleted file mode 100644 index 6831c56ea1..0000000000 --- a/tests/csrc/unittests/test_int8.cu +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "src/turbomind/kernels/transpose_int8_kernels.h" -#include "src/turbomind/utils/Tensor.h" -#include "src/turbomind/utils/cuda_utils.h" -#include "src/turbomind/utils/memory_utils.h" - -#include -#include -#include - -#include "gtest_utils.h" - -using namespace turbomind; - -class Int8TestSuite: public FtTestBase { - -public: - void SetUp() override - { - FtTestBase::SetUp(); - } - void TearDown() override - { - FtTestBase::TearDown(); - } - -protected: - using FtTestBase::stream; - using FtTestBase::allocator; - - struct cudaDeviceProp prop; - - void testTransposition(); -}; - -void fill_tensor_random(Tensor a) -{ - const size_t num_elems = a.size(); - std::vector host_values(num_elems); - std::uniform_int_distribution int8_random(-128, 127); - std::mt19937 rng(0); - - std::generate(host_values.begin(), host_values.end(), [&int8_random, &rng]() { return int8_random(rng); }); - cudaH2Dcpy(a.getPtr(), host_values.data(), num_elems); -} - -void reference_transpose_host(std::vector& a_t_host, const Tensor& a) -{ - std::vector a_host(a.size()); - cudaD2Hcpy(a_host.data(), a.getPtr(), a.size()); - - for (unsigned int i = 0; i < a.shape[0]; i++) { - for (unsigned int j = 0; j < a.shape[1]; j++) { - a_t_host[j * a.shape[0] + i] = a_host[i * a.shape[1] + j]; - } - } -} - -void Int8TestSuite::testTransposition() -{ - const int m = 32; - const int k = 2048; - const int n = 2048; - - int8_t *a_data, *a_t_data; - - cudaMalloc(&a_data, m * k * sizeof(int8_t)); - Tensor a{MEMORY_GPU, TYPE_INT8, {32, 2048}, a_data}; - fill_tensor_random(a); - - cudaMalloc(&a_t_data, k * m * sizeof(int8_t)); - Tensor a_t{MEMORY_GPU, TYPE_INT8, {2048, 32}, a_t_data}; - - std::vector a_t_host_ref(a_t.size()); - reference_transpose_host(a_t_host_ref, a); - - invokeTransposeInt8Tensor(a_t, a); - bool result = checkResult("", a_t.getPtr(), a_t_host_ref.data(), a_t.size()); - - cudaFree(a_data); - cudaFree(a_t_data); - - EXPECT_TRUE(result); -} - -TEST_F(Int8TestSuite, TranspositionCorrectness) -{ - this->testTransposition(); -} diff --git a/tests/csrc/unittests/test_tensor.cu b/tests/csrc/unittests/test_tensor.cu deleted file mode 100644 index 4211ed3409..0000000000 --- a/tests/csrc/unittests/test_tensor.cu +++ /dev/null @@ -1,256 +0,0 @@ -#include -#include -#include - -#include - -#include "src/turbomind/utils/Tensor.h" - -using namespace turbomind; - -namespace { - -#define EXPECT_EQUAL_TENSORS(t1, t2) \ - do { \ - EXPECT_TRUE(t1.where == t2.where); \ - EXPECT_TRUE(t1.type == t2.type); \ - EXPECT_TRUE(t1.shape == t2.shape); \ - EXPECT_TRUE(t1.data == t2.data); \ - } while (false) - -TEST(TensorMapTest, HasKeyCorrectness) -{ - bool* v1 = new bool(true); - float* v2 = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; - Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, 
v1}; - Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, v2}; - - TensorMap map({{"t1", t1}, {"t2", t2}}); - EXPECT_TRUE(map.isExist("t1")); - EXPECT_TRUE(map.isExist("t2")); - EXPECT_FALSE(map.isExist("t3")); - - delete v1; - delete[] v2; -} - -TEST(TensorMapTest, InsertCorrectness) -{ - int* v1 = new int[4]{1, 10, 20, 30}; - float* v2 = new float[2]{1.0f, 2.0f}; - Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); - Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v2); - - TensorMap map({{"t1", t1}}); - EXPECT_TRUE(map.size() == 1); - EXPECT_TRUE(map.isExist("t1")); - EXPECT_EQUAL_TENSORS(map.at("t1"), t1); - EXPECT_FALSE(map.isExist("t2")); -} - -TEST(TensorMapTest, InsertDoesNotAllowNoneTensor) -{ - TensorMap map; - EXPECT_TRUE(map.size() == 0); - // forbid a none tensor. - EXPECT_THROW(map.insert("none", {}), std::runtime_error); - - // forbid a tensor having null data pointer. - Tensor none_data_tensor = Tensor(MEMORY_CPU, TYPE_INT32, {}, nullptr); - EXPECT_THROW(map.insert("empty", none_data_tensor), std::runtime_error); -} - -TEST(TensorMapTest, InsertDoesNotAllowDuplicatedKey) -{ - int* v1 = new int[4]{1, 10, 20, 30}; - Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); - Tensor t2 = Tensor(MEMORY_CPU, TYPE_INT32, {2}, v1); - TensorMap map({{"t1", t1}}); - EXPECT_TRUE(map.size() == 1); - // forbid a duplicated key. - EXPECT_THROW(map.insert("t1", t2), std::runtime_error); - delete[] v1; -} - -TEST(TensorMapTest, GetValCorrectness) -{ - int* v1 = new int[4]{1, 10, 20, 30}; - Tensor t1 = Tensor(MEMORY_CPU, TYPE_INT32, {4}, v1); - - TensorMap map({{"t1", t1}}); - EXPECT_TRUE(map.size() == 1); - // throw exception since the map doesn't have a key "t3". - EXPECT_THROW(map.getVal("t3"), std::runtime_error); - EXPECT_TRUE(map.getVal("t1") == 1); - EXPECT_TRUE(map.getVal("t1", 3) == 1); - - // map doesn't have t2 so return the default value 3. - EXPECT_TRUE(map.getVal("t2", 3) == 3); - - v1[0] += 1; // update value. 
- EXPECT_TRUE(map.getVal("t1") == 2); - EXPECT_TRUE(map.getVal("t1", 3) == 2); - - size_t index = 2; - EXPECT_TRUE(map.getValWithOffset("t1", index) == 20); - EXPECT_TRUE(map.getValWithOffset("t1", index, 3) == 20); - EXPECT_TRUE(map.getValWithOffset("t2", index, 3) == 3); - delete[] v1; -} - -TEST(TensorMapTest, GetTensorCorrectness) -{ - bool* t1_val = new bool(true); - float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; - Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val}; - Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val}; - - int* default_val = new int[4]{0, 1, 2, 3}; - Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val}; - - TensorMap map({{"t1", t1}, {"t2", t2}}); - EXPECT_THROW(map.at("t3"), std::runtime_error); - EXPECT_EQUAL_TENSORS(map.at("t1", default_tensor), t1); - EXPECT_EQUAL_TENSORS(map.at("t2", default_tensor), t2); - EXPECT_EQUAL_TENSORS(map.at("t3", default_tensor), default_tensor); - EXPECT_EQUAL_TENSORS(map.at("t3", {}), Tensor()); - - delete[] default_val; - delete[] t2_val; - delete[] t1_val; -} - -TEST(TensorMapTest, GetTensorCorrectnessAtConstTensorMap) -{ - bool* t1_val = new bool(true); - float* t2_val = new float[6]{1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f}; - Tensor t1 = Tensor{MEMORY_CPU, TYPE_BOOL, {1}, t1_val}; - Tensor t2 = Tensor{MEMORY_CPU, TYPE_FP32, {3, 2}, t2_val}; - - int* default_val = new int[4]{0, 1, 2, 3}; - Tensor default_tensor = Tensor{MEMORY_CPU, TYPE_INT32, {4}, default_val}; - - const TensorMap map({{"t1", t1}, {"t2", t2}}); - EXPECT_THROW(map.at("t3"), std::runtime_error); - EXPECT_EQUAL_TENSORS(map.at("t1", default_tensor), t1); - EXPECT_EQUAL_TENSORS(map.at("t2", default_tensor), t2); - EXPECT_EQUAL_TENSORS(map.at("t3", default_tensor), default_tensor); - EXPECT_EQUAL_TENSORS(map.at("t3", {}), Tensor()); - - delete[] default_val; - delete[] t2_val; - delete[] t1_val; -} - -TEST(TensorTest, EmptyTensorMinMaxRaiseError) -{ - Tensor t1; - EXPECT_THROW(t1.min(), std::runtime_error); - EXPECT_THROW(t1.max(), std::runtime_error); - - Tensor t2 = Tensor{MEMORY_CPU, TYPE_INT32, {}, nullptr}; - EXPECT_THROW(t2.min(), std::runtime_error); - EXPECT_THROW(t2.max(), std::runtime_error); -} - -using TensorTypes = testing::Types; - -template -class TensorFuncTest: public testing::Test {}; - -TYPED_TEST_SUITE(TensorFuncTest, TensorTypes); - -TYPED_TEST(TensorFuncTest, MaxCorrectness) -{ - using T = TypeParam; - - size_t size = 4; - - T* v1 = new T[size]{T(1), T(2), T(3), T(4)}; - T* v2 = new T[size]{T(4), T(3), T(2), T(1)}; - T* v3 = new T[size]{T(1), T(2), T(4), T(3)}; - - Tensor t1 = Tensor(MEMORY_CPU, getTensorType(), {size}, v1); - Tensor t2 = Tensor(MEMORY_CPU, getTensorType(), {size}, v2); - Tensor t3 = Tensor(MEMORY_CPU, getTensorType(), {size}, v3); - - EXPECT_EQ(t1.max(), T(4)); - EXPECT_EQ(t2.max(), T(4)); - EXPECT_EQ(t3.max(), T(4)); - - delete[] v1; - delete[] v2; - delete[] v3; -} - -TYPED_TEST(TensorFuncTest, MinCorrectness) -{ - using T = TypeParam; - - size_t size = 4; - - T* v1 = new T[size]{T(1), T(2), T(3), T(4)}; - T* v2 = new T[size]{T(4), T(3), T(2), T(1)}; - T* v3 = new T[size]{T(1), T(2), T(4), T(3)}; - - Tensor t1 = Tensor(MEMORY_CPU, getTensorType(), {size}, v1); - Tensor t2 = Tensor(MEMORY_CPU, getTensorType(), {size}, v2); - Tensor t3 = Tensor(MEMORY_CPU, getTensorType(), {size}, v3); - - EXPECT_EQ(t1.min(), T(1)); - EXPECT_EQ(t2.min(), T(1)); - EXPECT_EQ(t3.min(), T(1)); - - delete[] v1; - delete[] v2; - delete[] v3; -} - -TYPED_TEST(TensorFuncTest, AnyCorrectness) -{ - using T = 
TypeParam; - - T* v = new T[4]{T(1), T(2), T(3), T(4)}; - Tensor t = Tensor{MEMORY_CPU, getTensorType(), {4}, v}; - EXPECT_TRUE(t.any(T(1))); - EXPECT_FALSE(t.any(T(5))); - delete[] v; -} - -TYPED_TEST(TensorFuncTest, AllCorrectness) -{ - using T = TypeParam; - - constexpr size_t size = 4; - T* v1 = new T[size]{T(1), T(1), T(1), T(1)}; - T* v2 = new T[size]{T(1), T(1), T(1), T(2)}; - Tensor t1 = Tensor{MEMORY_CPU, getTensorType(), {size}, v1}; - Tensor t2 = Tensor{MEMORY_CPU, getTensorType(), {size}, v2}; - EXPECT_TRUE(t1.all(T(1))); - EXPECT_FALSE(t2.all(T(2))); - delete[] v1; - delete[] v2; -} - -TYPED_TEST(TensorFuncTest, SliceCorrectness) -{ - using T = TypeParam; - - constexpr int size = 12; - T* v = new T[size]; - for (int i = 0; i < size; ++i) { - v[i] = i; - } - - DataType dtype = getTensorType(); - Tensor t1 = Tensor(MEMORY_CPU, dtype, {3, 4}, v); - Tensor t2 = t1.slice({2, 4}, 4); - - EXPECT_EQUAL_TENSORS(t2, Tensor(MEMORY_CPU, dtype, {2, 4}, &v[4])); - // An overflowed tensor throws an exception. - EXPECT_THROW(t1.slice({2, 4}, 5), std::runtime_error); - - delete[] v; -} - -} // end of namespace
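For reference, the core check exercised by the deleted int8 GEMM benchmark above is the per-token/per-channel dequantization convention: the kernel scales the int32 accumulator by alpha_row[i] * alpha_col[j], while the torch reference scales the int32 matmul by the outer product of the two scale vectors, and the results are compared with a numpy.isclose-style tolerance. The stand-alone CPU sketch below is illustrative only (plain C++, hypothetical names, no torch/CUDA dependency, not part of this diff); it spells out that convention on toy data.

// Illustrative CPU sketch of the dequantization check from the deleted int8 GEMM test.
// All names here are hypothetical; the real test ran the cutlass int8 kernel on GPU.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// numpy.isclose-style check: abs(a - b) <= atol + rtol * abs(b)
static bool almost_equal(float a, float b, float atol = 1e-5f, float rtol = 1e-3f)
{
    return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}

int main()
{
    const int m = 4, n = 8, k = 16;
    std::vector<int8_t> x(m * k), w(k * n);
    std::vector<float>  alpha_row(m), alpha_col(n);  // per-token / per-channel scales

    // Deterministic toy data in place of torch::randint.
    for (int i = 0; i < m * k; ++i) x[i] = static_cast<int8_t>((i * 7) % 255 - 127);
    for (int i = 0; i < k * n; ++i) w[i] = static_cast<int8_t>((i * 5) % 255 - 127);
    for (int i = 0; i < m; ++i) alpha_row[i] = 0.01f * (1 + i % 9);
    for (int j = 0; j < n; ++j) alpha_col[j] = 0.01f * (1 + j % 9);

    // Outer product of the scale vectors, analogous to the reference's
    // matmul(alpha_row, alpha_col) scale matrix.
    std::vector<float> scale(m * n);
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j)
            scale[i * n + j] = alpha_row[i] * alpha_col[j];

    bool ok = true;
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            int32_t acc = 0;  // int8 x int8 products accumulated in int32
            for (int p = 0; p < k; ++p)
                acc += int32_t(x[i * k + p]) * int32_t(w[p * n + j]);
            // Kernel-side convention: scale the accumulator by the two 1-D scale vectors.
            float y_kernel = float(acc) * alpha_row[i] * alpha_col[j];
            // Reference-side convention: scale the int32 matmul by the outer-product matrix.
            float y_ref = float(acc) * scale[i * n + j];
            ok = ok && almost_equal(y_kernel, y_ref);
        }
    }
    std::printf("%s\n", ok ? "SUCCESS" : "FAILED");
    return ok ? 0 : 1;
}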