Gradlib torch extension cmake #282

Merged · 7 commits · Nov 15, 2024
14 changes: 0 additions & 14 deletions .github/workflows/publish.yml
@@ -68,12 +68,8 @@ jobs:
bash -x .github/workflows/scripts/build.sh
wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
asset_name=${wheel_name//"linux"/"manylinux1"}
gradlib_wheel_name=$(find gradlib/dist -name "*whl" -print0 | xargs -0 -n 1 basename)
gradlib_asset_name=${gradlib_wheel_name//"linux"/"manylinux1"}
echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
echo "gradlib_wheel_name=${gradlib_wheel_name}" >> "$GITHUB_ENV"
echo "gradlib_asset_name=${gradlib_asset_name}" >> "$GITHUB_ENV"

- name: Upload vllm Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
@@ -84,13 +80,3 @@ jobs:
asset_path: ./dist/${{ env.wheel_name }}
asset_name: ${{ env.asset_name }}
asset_content_type: application/*
- name: Upload gradlib Release Asset
uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ needs.release.outputs.upload_url }}
asset_path: ./gradlib/dist/${{ env.gradlib_wheel_name }}
asset_name: ${{ env.gradlib_asset_name }}
asset_content_type: application/*

3 changes: 0 additions & 3 deletions .github/workflows/scripts/build.sh
@@ -18,6 +18,3 @@ export MAX_JOBS=32

# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
cd gradlib
$python_executable setup.py bdist_wheel --dist-dir=dist
cd ..
18 changes: 18 additions & 0 deletions CMakeLists.txt
@@ -508,6 +508,24 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)

#
# _gradlib_C extension
#
set(VLLM_GRADLIB_EXT_SRC
"csrc/gradlib/torch_bindings.cpp"
"csrc/gradlib/hipbsolgemm.cu"
"csrc/gradlib/rocsolgemm.cu")

define_gpu_extension_target(
_gradlib_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${VLLM_GRADLIB_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
WITH_SOABI)
endif()

# vllm-flash-attn currently only supported on CUDA
12 changes: 4 additions & 8 deletions Dockerfile.rocm
@@ -148,7 +148,7 @@ FROM scratch AS export_flash_attn_0
FROM export_flash_attn_${BUILD_FA} AS export_flash_attn

# -----------------------
# vLLM (and gradlib) fetch stages
# vLLM fetch stages
FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
@@ -160,7 +160,7 @@ ONBUILD RUN git clone ${VLLM_REPO} \
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm

# -----------------------
# vLLM (and gradlib) build stages
# vLLM build stages
FROM fetch_vllm AS build_vllm
ARG COMMON_WORKDIR
ARG USE_CYTHON
@@ -184,13 +184,9 @@ RUN cd vllm \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
# Build gradlib
RUN cd vllm/gradlib \
&& python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/gradlib/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/rocm_patch /rocm_patch
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
@@ -265,7 +261,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \
&& make && make install \
&& cd hipMarker && python setup.py install ; fi

# Install vLLM (and gradlib)
# Install vLLM
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
@@ -277,7 +273,7 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
*"rocm-6.1"*) \
cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6;; \
*) ;; esac \
&& pip uninstall -y vllm gradlib \
&& pip uninstall -y vllm \
&& pip install *.whl

# Copy over the benchmark scripts as well
45 changes: 12 additions & 33 deletions gradlib/csrc/hipbsolgemm.cu → csrc/gradlib/hipbsolgemm.cu
@@ -6,7 +6,6 @@
// __HIP_NO_HALF_CONVERSIONS__ #endif

#include <torch/torch.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/autocast_mode.h>
#include <ATen/cuda/CUDABlas.h>
@@ -119,7 +118,7 @@ std::map<at::ScalarType, hipDataType> dtype_map{
} // namespace

// find all hipblaslt solutions for given gemm problem
std::vector<int> hipblasLtMatmul_findallsols_wrapper(
std::vector<int64_t> hipblasLtMatmul_findallsols_wrapper(
hipblasLtHandle_t handle, hipblasOperation_t op_A, hipblasOperation_t op_B,
int m, int n, int k, const void* alpha, const void* a, int lda,
const void* b, int ldb, const void* beta, void* c, int ldc,
@@ -163,7 +162,7 @@ std::vector<int> hipblasLtMatmul_findallsols_wrapper(
handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, op_A, op_B, intype,
intype, outtype, outtype, HIPBLAS_COMPUTE_32F, heuristicResult));

std::vector<int> algoIndex;
std::vector<int64_t> algoIndex;
int returned_algo_count = heuristicResult.size();
// for (int i = 0; i < returnedAlgoCount; i++) {
for (int i = 0; i < returned_algo_count; i++) {
@@ -290,12 +289,12 @@ hipblasStatus_t hipblasLtMatmul_sol_wrapper(
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
const int solution_index,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<py::object> out_dtype = at::nullopt,
at::optional<torch::Tensor> scale1 = at::nullopt,
at::optional<torch::Tensor> scale2 = at::nullopt,
at::optional<torch::Tensor> scaleOut = at::nullopt) {
const int64_t solution_index,
at::optional<torch::Tensor> bias,
at::optional<c10::ScalarType> out_dtype,
at::optional<torch::Tensor> scale1,
at::optional<torch::Tensor> scale2,
at::optional<torch::Tensor> scaleOut) {
auto mat1_strides{mat1.strides()};
auto mat2_strides{mat2.strides()};
auto mat1_sizes{mat1.sizes()};
@@ -309,10 +308,7 @@ torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
"mat1 dim 1 must match mat2 dim 0");

auto inDtype{mat1.options().dtype().toScalarType()};
auto outDtype{
out_dtype.has_value()
? torch::python::detail::py_object_to_dtype(out_dtype.value())
: inDtype};
auto outDtype{out_dtype.has_value() ? out_dtype.value() : inDtype};
auto options{at::TensorOptions().dtype(outDtype).device(at::kCUDA)};
auto result{torch::empty({mat1_sizes[0], mat2_sizes[1]}, options)};

@@ -392,10 +388,10 @@ torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
}

// find all hipblas solutions and return them to python land
std::vector<int> hipb_findallsols(
std::vector<int64_t> hipb_findallsols(
const torch::Tensor& mat1, const torch::Tensor& mat2,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<py::object> out_dtype = at::nullopt) {
at::optional<c10::ScalarType> out_dtype = at::nullopt) {
auto mat1_strides{mat1.strides()};
auto mat2_strides{mat2.strides()};
auto mat1_sizes{mat1.sizes()};
@@ -408,10 +404,7 @@ std::vector<int> hipb_findallsols(
"mat1 dim 1 must match mat2 dim 0");

auto inType{mat1.options().dtype().toScalarType()};
auto outType{
out_dtype.has_value()
? torch::python::detail::py_object_to_dtype(out_dtype.value())
: inType};
auto outType{out_dtype.has_value() ? out_dtype.value() : inType};

auto options{at::TensorOptions().dtype(outType).device(at::kCUDA)};
auto result{torch::empty({mat1_sizes[0], mat2_sizes[1]}, options)};
@@ -504,17 +497,3 @@ void hipb_destroy_extension() {
// CHECK_HIP_ERROR(hipEventDestroy(start));
// CHECK_HIP_ERROR(hipEventDestroy(stop));
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hipb_create_extension", &hipb_create_extension, "create_extension");
m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension");
m.def("hipb_mm", &hipb_mm, "hipb_mm", py::arg("mat1"), py::arg("mat2"),
py::arg("solution_index"), py::arg("bias") = at::nullopt,
py::arg("out_dtype") = at::nullopt, py::arg("scale1") = at::nullopt,
py::arg("scale2") = at::nullopt, py::arg("scaleOut") = at::nullopt);
m.def("hipb_findallsols", &hipb_findallsols, "hipb_findallsols",
py::arg("mat1"), py::arg("mat2"), py::arg("bias") = at::nullopt,
py::arg("out_dtype") = at::nullopt);
}
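
Note: the deleted PYBIND11_MODULE block above is superseded by the new csrc/gradlib/torch_bindings.cpp listed in VLLM_GRADLIB_EXT_SRC, which is not shown in this diff. A minimal sketch of what such a registration could look like, assuming the plain TORCH_LIBRARY macro and the declarations from csrc/gradlib/ops.h (the real file may use project-specific registration helpers instead):

```cpp
// Hypothetical sketch only: the actual csrc/gradlib/torch_bindings.cpp is not
// part of this diff.
#include <torch/library.h>

#include "ops.h"  // hipb_* / rocb_* declarations added in this PR

TORCH_LIBRARY(_gradlib_C, ops) {
  // Stateless setup/teardown helpers; the schema is inferred from the C++ signature.
  ops.def("hipb_create_extension", &hipb_create_extension);
  ops.def("hipb_destroy_extension", &hipb_destroy_extension);

  // The GEMM entry points take optional tensors and an optional dtype, so an
  // explicit schema is declared and the HIP kernel (CUDA dispatch key under
  // ROCm) is bound separately.
  ops.def(
      "hipb_mm(Tensor mat1, Tensor mat2, int solution_index, Tensor? bias=None,"
      " ScalarType? out_dtype=None, Tensor? scale1=None, Tensor? scale2=None,"
      " Tensor? scaleOut=None) -> Tensor");
  ops.impl("hipb_mm", torch::kCUDA, &hipb_mm);

  ops.def(
      "hipb_findallsols(Tensor mat1, Tensor mat2, Tensor? bias=None,"
      " ScalarType? out_dtype=None) -> int[]");
  ops.impl("hipb_findallsols", torch::kCUDA, &hipb_findallsols);

  // The rocsolgemm ops (rocb_create_extension, RocSolIdxBlas,
  // RocFindAllSolIdxBlas) would be registered the same way.
}
```

This is also why the int/py::object parameters above become int64_t/c10::ScalarType: torch library schemas only accept dispatcher-compatible types, not pybind11 objects.
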
27 changes: 27 additions & 0 deletions csrc/gradlib/ops.h
@@ -0,0 +1,27 @@
#pragma once

#include <torch/all.h>

void hipb_create_extension();
void hipb_destroy_extension();
torch::Tensor hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
const int64_t solution_index,
at::optional<torch::Tensor> bias = at::nullopt,
at::optional<c10::ScalarType> out_dtype = at::nullopt,
at::optional<torch::Tensor> scale1 = at::nullopt,
at::optional<torch::Tensor> scale2 = at::nullopt,
at::optional<torch::Tensor> scaleOut = at::nullopt);

std::vector<int64_t> hipb_findallsols(const torch::Tensor& mat1,
const torch::Tensor& mat2,
at::optional<torch::Tensor> bias,
at::optional<c10::ScalarType> out_dtype);

void rocb_create_extension();
void rocb_destroy_extension();
torch::Tensor RocSolIdxBlas(const torch::Tensor& mat1,
const torch::Tensor& mat2,
const int64_t solution_index);

std::vector<int64_t> RocFindAllSolIdxBlas(const torch::Tensor& mat1,
const torch::Tensor& mat2);
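
With the bindings registered through the torch library mechanism rather than a pybind11 module, callers resolve the ops by qualified name through the dispatcher. A hedged caller-side illustration (the helper name call_hipb_mm and the call site are invented for this example and are not part of the PR; on the Python side the equivalent would be calling the op via torch.ops, assuming the registration sketched above):

```cpp
// Hypothetical caller-side sketch: resolves the registered op through the
// PyTorch dispatcher instead of importing a pybind11 extension module.
#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/torch.h>

torch::Tensor call_hipb_mm(const torch::Tensor& mat1, const torch::Tensor& mat2,
                           int64_t solution_index) {
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow("_gradlib_C::hipb_mm", /*overload_name=*/"")
          .typed<torch::Tensor(
              const torch::Tensor&, const torch::Tensor&, int64_t,
              at::optional<torch::Tensor>, at::optional<c10::ScalarType>,
              at::optional<torch::Tensor>, at::optional<torch::Tensor>,
              at::optional<torch::Tensor>)>();
  // Schema defaults are not applied on this path, so every optional is passed explicitly.
  return op.call(mat1, mat2, solution_index, at::nullopt, at::nullopt,
                 at::nullopt, at::nullopt, at::nullopt);
}
```
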