Skip to content

Commit

Permalink
add max/min cuda kernel and test
Browse files Browse the repository at this point in the history
  • Loading branch information
kilinchange committed Jan 30, 2024
1 parent 5e13361 commit 86a8f34
Show file tree
Hide file tree
Showing 7 changed files with 324 additions and 14 deletions.
2 changes: 2 additions & 0 deletions src/04kernel/include/kernel/collectors/select.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ namespace refactor::kernel {
Min,
};

std::string_view opName(SelectType type);

struct SelectCollector final : public InfoCollector {
SelectType selectType;

Expand Down
18 changes: 11 additions & 7 deletions src/04kernel/src/attributes/broadcaster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,17 @@ namespace refactor::kernel {
}()) {}

void Broadcaster::locate(dim_t k, dim_t ans[]) const noexcept {
long rem = k;
std::fill_n(ans, inputsCount, 0);
for (auto i : range0_(strides.size() / (inputsCount + 1))) {
auto dim = strides.data() + (inputsCount + 1) * i;
auto div = std::div(rem, dim[inputsCount]);
for (auto j : range0_(inputsCount)) { ans[j] += dim[j] * div.quot; }
rem = div.rem;
if (!needBroadcast()) {
std::fill_n(ans, inputsCount, k);
} else {
long rem = k;
std::fill_n(ans, inputsCount, 0);
for (auto i : range0_(strides.size() / (inputsCount + 1))) {
auto dim = strides.data() + (inputsCount + 1) * i;
auto div = std::div(rem, dim[inputsCount]);
for (auto j : range0_(inputsCount)) { ans[j] += dim[j] * div.quot; }
rem = div.rem;
}
}
}

Expand Down
22 changes: 22 additions & 0 deletions src/04kernel/src/collectors/select.cc
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
#include "kernel/collectors/select.h"
#include "../kernels/select/cpu_kernel.hh"
#include "../kernels/select/cuda_kernel.hh"

namespace refactor::kernel {

#define REGISTER(T) \
if (auto ptr = T::build(selectType, inputs); ptr) { \
ans.emplace_back(std::move(ptr)); \
}

#define CASE(OP) \
case SelectType::OP: \
return #OP

std::string_view opName(SelectType type) {
switch (type) {
CASE(Max);
CASE(Min);
default:
UNREACHABLE();
}
}

SelectCollector::SelectCollector(decltype(_target) target, SelectType type) noexcept
: InfoCollector(target), selectType(type) {}

Expand All @@ -10,8 +30,10 @@ namespace refactor::kernel {
std::vector<KernelBox> ans;
switch (_target) {
case decltype(_target)::Cpu:
REGISTER(SelectCpu)
break;
case decltype(_target)::Nvidia:
REGISTER(SelectCuda)
break;
default:
UNREACHABLEX(void, "Unknown target");
Expand Down
7 changes: 0 additions & 7 deletions src/04kernel/src/kernels/concat/cuda_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,6 @@ extern "C" __global__ void kernel(
}
auto segments = ss.str();

ss.str("");
for (auto i : range0_(inputCount)) {
ss << std::endl
<< " reinterpret_cast<char const *>(inputs[" << i << "]), ";
}
auto castInputs = ss.str();

ss.str("");
ss << "Concat_" << info.blockCount << ',' << unit;
for (auto seg : info.segments) {
Expand Down
162 changes: 162 additions & 0 deletions src/04kernel/src/kernels/select/cuda_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#include "cuda_kernel.hh"

#ifdef USE_CUDA
#include "../../generator/nvrtc_repo.h"
#include "kernel/cuda/threads_distributer.cuh"
#endif

namespace refactor::kernel {
using K = SelectCuda;

K::SelectCuda(decltype(dataType) dataType_,
decltype(selectType) selectType_,
decltype(broadcaster) broadcaster_,
decltype(inputsNum) inputsNum_) noexcept
: dataType(dataType_),
selectType(selectType_),
broadcaster(broadcaster_),
inputsNum(inputsNum_) {}

auto K::build(SelectType selectType_, TensorRefs inputs_) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif

return std::make_unique<K>(inputs_[0].get().dataType, selectType_, Broadcaster(inputs_), inputs_.size());
}

auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t {
return typeId();
}
auto K::description() const noexcept -> std::string_view {
return "Performing select operation on Nvidia GPU";
}

#ifdef USE_CUDA

constexpr static const char *NO_BROADCAST = R"~(
struct Inputs {{
{dt} const *const addr[{inputsNum}];
}};
__device__ __forceinline__ static {dt} fn({dt} a, {dt} b) {{
return {op};
}}
extern "C" __global__ void kernel(
{dt} *__restrict__ output,
Inputs inputs
) {{
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
step = blockDim.x * gridDim.x;
tid < {n};
tid += step) {{
output[tid] = inputs.addr[0][tid];
for (auto idx = 1; idx < {inputsNum}; ++idx) {{
output[tid] = fn(inputs.addr[idx][tid], output[tid]);
}}
}}
}}
)~";

constexpr static const char *BROADCAST = R"~(
struct Inputs {{
{dt} const *const addr[{inputsNum}];
}};
struct Strides {{
unsigned int s[({inputsNum}+1) * {rank}];
}};
__device__ __forceinline__ static {dt} fn({dt} a, {dt} b) {{
return {op};
}}
extern "C" __global__ void kernel(
{dt} *__restrict__ output,
Inputs inputs,
Strides strides
) {{
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
step = blockDim.x * gridDim.x;
tid < {n};
tid += step) {{
auto rem = tid;
size_t ans[{inputsNum}]{{}};
for (auto i = 0; i < {rank}; ++i) {{
auto dim = strides.s + ({inputsNum} + 1) * i;
auto quot = rem / dim[{inputsNum}];
for (auto j = 0; j < {inputsNum}; ++j) {{ ans[j] += dim[j] * quot; }}
rem %= dim[{inputsNum}];
}}
output[tid] = inputs.addr[0][ans[0]];
for (auto idx = 1; idx < {inputsNum}; ++idx) {{
output[tid] = fn(inputs.addr[idx][ans[idx]], output[tid]);
}}
}}
}}
)~";

constexpr static std::string_view op(SelectType op, DataType dt) {
switch (op) {
case SelectType::Max:
return "a > b ? a : b";
case SelectType::Min:
return "a < b ? a : b";
default:
UNREACHABLE();
}
}

auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
using namespace runtime;

auto postfix = fmt::format("_{}_{}", dataType.name(), opName(selectType));
auto dt_ = nvrtc::dataType(dataType);
auto op_ = op(selectType, dataType);
auto params = cuda::ThreadsDistributer()(broadcaster.outputsCount);

if (!broadcaster.needBroadcast()) {
auto name = fmt::format("select{}", postfix);
auto code = fmt::format(NO_BROADCAST,
fmt::arg("dt", dt_),
fmt::arg("op", op_),
fmt::arg("inputsNum", inputsNum),
fmt::arg("n", params.n));
return [params, h = nvrtc::Handler::compile(name.c_str(), code.c_str(), "kernel")]//
(Resources &, void *, void const *const *inputs, void *const *outputs) {
auto output = outputs[0];
void *args[]{&output, const_cast<void **>(inputs)};
h->launch(params.gridSize, 1, 1,
params.blockSize, 1, 1,
0, args);
};
} else {
auto name = fmt::format("select{}", postfix);
auto rank = broadcaster.strides.size() / (broadcaster.inputsCount + 1);
auto code = fmt::format(
BROADCAST,
fmt::arg("dt", dt_),
fmt::arg("op", op_),
fmt::arg("inputsNum", inputsNum),
fmt::arg("n", params.n),
fmt::arg("rank", rank));
return [params, h = nvrtc::Handler::compile(name.c_str(), code.c_str(), "kernel"),
strides = broadcaster.strides]//
(Resources &, void *, void const *const *inputs, void *const *outputs) {
void *args[]{const_cast<void **>(outputs), const_cast<void **>(inputs), const_cast<dim_t *>(strides.data())};
h->launch(params.gridSize, 1, 1,
params.blockSize, 1, 1,
0, args);
};
}
}

#endif

}// namespace refactor::kernel
31 changes: 31 additions & 0 deletions src/04kernel/src/kernels/select/cuda_kernel.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef KERNEL_SELECT_CUDA_KERNEL_HH
#define KERNEL_SELECT_CUDA_KERNEL_HH

#include "kernel/attributes/broadcaster.h"
#include "kernel/collectors/select.h"
#include "kernel/kernel.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

struct SelectCuda final : public Kernel {
DataType dataType;
SelectType selectType;
Broadcaster broadcaster;
size_t inputsNum;

SelectCuda(decltype(dataType), decltype(selectType), decltype(broadcaster), decltype(inputsNum)) noexcept;

static KernelBox build(SelectType, TensorRefs) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_CUDA
RoutineWorkspace lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_SELECT_CUDA_KERNEL_HH
96 changes: 96 additions & 0 deletions src/04kernel/test/kernels/select/test_cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#ifdef USE_CUDA

#include "../../../src/kernels/select/cuda_kernel.hh"
#include "hardware/device_manager.h"
#include <functional>
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

static void testSelect(const SelectType selectType, const std::vector<Shape> &shapes, const Shape &outShape, const std::vector<std::vector<float>> &data,
const std::vector<float> expectData) {
// build routine
TensorRefs dataTensors;
std::vector<Tensor> tensorsVec;
for (size_t i = 0; i < shapes.size(); ++i) {
tensorsVec.push_back(Tensor(DataType::F32, shapes[i], LayoutType::Others, nullptr));
}
for (size_t i = 0; i < shapes.size(); ++i) {
dataTensors.push_back(std::cref(tensorsVec[i]));
}
auto result = Tensor::share(DataType::F32, outShape);
auto kernel = SelectCuda::build(selectType, dataTensors);
ASSERT_TRUE(kernel);
auto res = runtime::Resources();
auto routine = kernel->lower(res).routine;
// cuda malloc
auto &dev = *device::init(Device::Type::Nvidia, 0, "");
Arc<Device::Blob>
gpuIns[]{
dev.malloc(dataTensors[0].get().bytesSize()),
dev.malloc(dataTensors[1].get().bytesSize()),
dev.malloc(dataTensors[2].get().bytesSize()),
},
gpuOut = dev.malloc(result->bytesSize());
// put input data
gpuIns[0]->copyFromHost(data[0].data(), dataTensors[0].get().bytesSize());
gpuIns[1]->copyFromHost(data[1].data(), dataTensors[1].get().bytesSize());
gpuIns[2]->copyFromHost(data[2].data(), dataTensors[2].get().bytesSize());
// inference
{
void const *inputs[]{*gpuIns[0], *gpuIns[1], *gpuIns[2]};
void *outputs[]{*gpuOut};
routine(res, nullptr, inputs, outputs);
}
// check
std::vector<float> out(result->elementsSize());
gpuOut->copyToHost(out.data(), result->bytesSize());
for (auto i : range0_(expectData.size())) {
EXPECT_FLOAT_EQ(expectData[i], out[i]);
}
}

TEST(kernel, SelectCuda) {
// no need broadcast
testSelect(SelectType::Max,
{{1, 3}, {1, 3}, {1, 3}},
{1, 3},
{{3, 2, 1}, {1, 4, 4}, {2, 5, 3}},
{3, 5, 4});

testSelect(SelectType::Min,
{{1, 3}, {1, 3}, {1, 3}},
{1, 3},
{{3, 2, 1}, {1, 4, 4}, {2, 5, 3}},
{1, 2, 1});

// need broadcast
testSelect(SelectType::Max,
{{3}, {1, 3}, {1, 3}},
{1, 3},
{{3, 3, 3}, {1, 4, 4}, {2, 5, 3}},
{3, 5, 4});

testSelect(SelectType::Min,
{{3}, {1, 3}, {1, 3}},
{1, 3},
{{3, 3, 3}, {1, 4, 4}, {2, 5, 3}},
{1, 3, 3});

testSelect(SelectType::Max,
{{1}, {1, 3}, {1, 3}},
{1, 3},
{{3}, {1, 4, 4}, {2, 5, 3}},
{3, 5, 4});

testSelect(SelectType::Min,
{{1}, {1, 3}, {1, 3}},
{1, 3},
{{3}, {1, 4, 4}, {2, 5, 3}},
{1, 3, 3});
}

#endif

0 comments on commit 86a8f34

Please sign in to comment.