Gradlib torch extension cmake #282

Merged · 7 commits · Nov 15, 2024
14 changes: 0 additions & 14 deletions .github/workflows/publish.yml
@@ -68,12 +68,8 @@ jobs:
bash -x .github/workflows/scripts/build.sh
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
gradlib_wheel_name=$(find gradlib/dist -name "*whl" -print0 | xargs -0 -n 1 basename)
gradlib_asset_name=${gradlib_wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
echo "gradlib_wheel_name=${gradlib_wheel_name}" >> "$GITHUB_ENV"
echo "gradlib_asset_name=${gradlib_asset_name}" >> "$GITHUB_ENV"

- name: Upload vllm Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
@@ -84,13 +80,3 @@ jobs:
asset_path: ./dist/${{ env.wheel_name }}
asset_name: ${{ env.asset_name }}
asset_content_type: application/*
- name: Upload gradlib Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./gradlib/dist/${{ env.gradlib_wheel_name }}
asset_name: ${{ env.gradlib_asset_name }}
asset_content_type: application/*

3 changes: 0 additions & 3 deletions .github/workflows/scripts/build.sh
@@ -18,6 +18,3 @@ export MAX_JOBS=32

# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
cd gradlib
$python_executable setup.py bdist_wheel --dist-dir=dist
cd ..
18 changes: 18 additions & 0 deletions CMakeLists.txt
@@ -508,6 +508,24 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)

#
# _gradlib_C extension
#
set(VLLM_GRADLIB_EXT_SRC
"csrc/gradlib/torch_bindings.cpp"
"csrc/gradlib/hipbsolgemm.cu"
"csrc/gradlib/rocsolgemm.cu")

define_gpu_extension_target(
_gradlib_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_GRADLIB_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
endif()

# vllm-flash-attn currently only supported on CUDA
12 changes: 4 additions & 8 deletions Dockerfile.rocm
@@ -148,7 +148,7 @@ FROM scratch AS export_flash_attn_0
FROM export_flash_attn_${BUILD_FA} AS export_flash_attn

# -----------------------
# vLLM (and gradlib) fetch stages
# vLLM fetch stages
FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
@@ -160,7 +160,7 @@ ONBUILD RUN git clone ${VLLM_REPO} \
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm

# -----------------------
# vLLM (and gradlib) build stages
# vLLM build stages
FROM fetch_vllm AS build_vllm
ARG COMMON_WORKDIR
ARG USE_CYTHON
@@ -184,13 +184,9 @@ RUN cd vllm \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
# Build gradlib
RUN cd vllm/gradlib \
&& python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/gradlib/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/rocm_patch /rocm_patch
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
@@ -265,7 +261,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \
&& make && make install \
&& cd hipMarker && python setup.py install ; fi

# Install vLLM (and gradlib)
# Install vLLM
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
@@ -277,7 +273,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
*"rocm-6.1"*) \
cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6;; \
*) ;; esac \
&& pip uninstall -y vllm gradlib \
&& pip uninstall -y vllm \
&& pip install *.whl

# Copy over the benchmark scripts as well
45 changes: 12 additions & 33 deletions gradlib/csrc/hipbsolgemm.cu → csrc/gradlib/hipbsolgemm.cu
@@ -6,7 +6,6 @@
// __HIP_NO_HALF_CONVERSIONS__ #endif

#include <torch/torch.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/autocast_mode.h>
#include <ATen/cuda/CUDABlas.h>
@@ -119,7 +118,7 @@ std::map<at::ScalarType, hipDataType> dtype_map{
} // namespace

// find all hipblaslt solutions for given gemm problem
std::vector<int> hipblasLtMatmul_findallsols_wrapper(
std::vector<int64_t> hipblasLtMatmul_findallsols_wrapper(
hipblasLtHandle_t handle, hipblasOperation_t op_A, hipblasOperation_t op_B,
int m, int n, int k, const void* alpha, const void* a, int lda,
const void* b, int ldb, const void* beta, void* c, int ldc,
@@ -163,7 +162,7 @@ std::vector<int> hipblasLtMatmul_findallsols_wrapper(
handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, op_A, op_B, intype,
intype, outtype, outtype, HIPBLAS_COMPUTE_32F, heuristicResult));

std::vector<int> algoIndex;
std::vector<int64_t> algoIndex;
int returned_algo_count = heuristicResult.size();
// for (int i = 0; i < returnedAlgoCount; i++) {
for (int i = 0; i < returned_algo_count; i++) {
@@ -290,12 +289,12 @@ hipblasStatus_t hipblasLtMatmul_sol_wrapper(
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
const int solution_index,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<py::object> out_dtype = at::nullopt,
at::optional<torch::Tensor> scale1 = at::nullopt,
at::optional<torch::Tensor> scale2 = at::nullopt,
at::optional<torch::Tensor> scaleOut = at::nullopt) {
const int64_t solution_index,
at::optional<torch::Tensor> bias,
at::optional<c10::ScalarType> out_dtype,
at::optional<torch::Tensor> scale1,
at::optional<torch::Tensor> scale2,
at::optional<torch::Tensor> scaleOut) {
auto mat1_strides{mat1.strides()};
auto mat2_strides{mat2.strides()};
auto mat1_sizes{mat1.sizes()};
@@ -309,10 +308,7 @@ torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
"mat1 dim 1 must match mat2 dim 0");

auto inDtype{mat1.options().dtype().toScalarType()};
auto outDtype{
out_dtype.has_value()
? torch::python::detail::py_object_to_dtype(out_dtype.value())
: inDtype};
auto outDtype{out_dtype.has_value() ? out_dtype.value() : inDtype};
auto options{at::TensorOptions().dtype(outDtype).device(at::kCUDA)};
auto result{torch::empty({mat1_sizes[0], mat2_sizes[1]}, options)};

@@ -392,10 +388,10 @@ torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
}

// find all hipblas solutions and return them to python land
std::vector<int> hipb_findallsols(
std::vector<int64_t> hipb_findallsols(
const torch::Tensor& mat1, const torch::Tensor& mat2,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<py::object> out_dtype = at::nullopt) {
at::optional<c10::ScalarType> out_dtype = at::nullopt) {
auto mat1_strides{mat1.strides()};
auto mat2_strides{mat2.strides()};
auto mat1_sizes{mat1.sizes()};
@@ -408,10 +404,7 @@ std::vector<int> hipb_findallsols(
"mat1 dim 1 must match mat2 dim 0");

auto inType{mat1.options().dtype().toScalarType()};
auto outType{
out_dtype.has_value()
? torch::python::detail::py_object_to_dtype(out_dtype.value())
: inType};
auto outType{out_dtype.has_value() ? out_dtype.value() : inType};

auto options{at::TensorOptions().dtype(outType).device(at::kCUDA)};
auto result{torch::empty({mat1_sizes[0], mat2_sizes[1]}, options)};
@@ -504,17 +497,3 @@ void hipb_destroy_extension() {
// CHECK_HIP_ERROR(hipEventDestroy(start));
// CHECK_HIP_ERROR(hipEventDestroy(stop));
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hipb_create_extension", &hipb_create_extension, "create_extension");
m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension");
m.def("hipb_mm", &hipb_mm, "hipb_mm", py::arg("mat1"), py::arg("mat2"),
py::arg("solution_index"), py::arg("bias") = at::nullopt,
py::arg("out_dtype") = at::nullopt, py::arg("scale1") = at::nullopt,
py::arg("scale2") = at::nullopt, py::arg("scaleOut") = at::nullopt);
m.def("hipb_findallsols", &hipb_findallsols, "hipb_findallsols",
py::arg("mat1"), py::arg("mat2"), py::arg("bias") = at::nullopt,
py::arg("out_dtype") = at::nullopt);
}
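
Note: the deleted PYBIND11_MODULE block above is superseded by the new csrc/gradlib/torch_bindings.cpp listed in VLLM_GRADLIB_EXT_SRC, which is not shown in this diff. A minimal sketch of what such a registration could look like, assuming the plain TORCH_LIBRARY macro and the declarations from csrc/gradlib/ops.h (the real file may use project-specific registration helpers instead):

```cpp
// Hypothetical sketch only: the actual csrc/gradlib/torch_bindings.cpp is not
// part of this diff.
#include <torch/library.h>

#include "ops.h"  // hipb_* / rocb_* declarations added in this PR

TORCH_LIBRARY(_gradlib_C, ops) {
  // Stateless setup/teardown helpers; the schema is inferred from the C++ signature.
  ops.def("hipb_create_extension", &hipb_create_extension);
  ops.def("hipb_destroy_extension", &hipb_destroy_extension);

  // The GEMM entry points take optional tensors and an optional dtype, so an
  // explicit schema is declared and the HIP kernel (CUDA dispatch key under
  // ROCm) is bound separately.
  ops.def(
      "hipb_mm(Tensor mat1, Tensor mat2, int solution_index, Tensor? bias=None,"
      " ScalarType? out_dtype=None, Tensor? scale1=None, Tensor? scale2=None,"
      " Tensor? scaleOut=None) -> Tensor");
  ops.impl("hipb_mm", torch::kCUDA, &hipb_mm);

  ops.def(
      "hipb_findallsols(Tensor mat1, Tensor mat2, Tensor? bias=None,"
      " ScalarType? out_dtype=None) -> int[]");
  ops.impl("hipb_findallsols", torch::kCUDA, &hipb_findallsols);

  // The rocsolgemm ops (rocb_create_extension, RocSolIdxBlas,
  // RocFindAllSolIdxBlas) would be registered the same way.
}
```

This is also why the int/py::object parameters above become int64_t/c10::ScalarType: torch library schemas only accept dispatcher-compatible types, not pybind11 objects.
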
27 changes: 27 additions & 0 deletions csrc/gradlib/ops.h
@@ -0,0 +1,27 @@
#pragma once

#include <torch/all.h>

void hipb_create_extension();
void hipb_destroy_extension();
torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
const int64_t solution_index,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<c10::ScalarType> out_dtype = at::nullopt,
at::optional<torch::Tensor> scale1 = at::nullopt,
at::optional<torch::Tensor> scale2 = at::nullopt,
at::optional<torch::Tensor> scaleOut = at::nullopt);

std::vector<int64_t> hipb_findallsols(const torch::Tensor& mat1,
const torch::Tensor& mat2,
at::optional<torch::Tensor> bias,
at::optional<c10::ScalarType> out_dtype);

void rocb_create_extension();
void rocb_destroy_extension();
torch::Tensor RocSolIdxBlas(const torch::Tensor& mat1,
const torch::Tensor& mat2,
const int64_t solution_index);

std::vector<int64_t> RocFindAllSolIdxBlas(const torch::Tensor& mat1,
const torch::Tensor& mat2);
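
With the bindings registered through the torch library mechanism rather than a pybind11 module, callers resolve the ops by qualified name through the dispatcher. A hedged caller-side illustration (the helper name call_hipb_mm and the call site are invented for this example and are not part of the PR; on the Python side the equivalent would be calling the op via torch.ops, assuming the registration sketched above):

```cpp
// Hypothetical caller-side sketch: resolves the registered op through the
// PyTorch dispatcher instead of importing a pybind11 extension module.
#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/torch.h>

torch::Tensor call_hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
                           int64_t solution_index) {
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("_gradlib_C::hipb_mm", /*overload_name=*/"")
          .typed<torch::Tensor(
              const torch::Tensor&, const torch::Tensor&, int64_t,
              at::optional<torch::Tensor>, at::optional<c10::ScalarType>,
              at::optional<torch::Tensor>, at::optional<torch::Tensor>,
              at::optional<torch::Tensor>)>();
  // Schema defaults are not applied on this path, so every optional is passed explicitly.
  return op.call(mat1, mat2, solution_index, at::nullopt, at::nullopt,
                 at::nullopt, at::nullopt, at::nullopt);
}
```
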