-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5e13361
commit 86a8f34
Showing
7 changed files
with
324 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
#include "cuda_kernel.hh" | ||
|
||
#ifdef USE_CUDA | ||
#include "../../generator/nvrtc_repo.h" | ||
#include "kernel/cuda/threads_distributer.cuh" | ||
#endif | ||
|
||
namespace refactor::kernel { | ||
using K = SelectCuda; | ||
|
||
K::SelectCuda(decltype(dataType) dataType_, | ||
decltype(selectType) selectType_, | ||
decltype(broadcaster) broadcaster_, | ||
decltype(inputsNum) inputsNum_) noexcept | ||
: dataType(dataType_), | ||
selectType(selectType_), | ||
broadcaster(broadcaster_), | ||
inputsNum(inputsNum_) {} | ||
|
||
auto K::build(SelectType selectType_, TensorRefs inputs_) noexcept -> KernelBox { | ||
#ifndef USE_CUDA | ||
return nullptr; | ||
#endif | ||
|
||
return std::make_unique<K>(inputs_[0].get().dataType, selectType_, Broadcaster(inputs_), inputs_.size()); | ||
} | ||
|
||
auto K::typeId() noexcept -> size_t { | ||
static uint8_t ID = 1; | ||
return reinterpret_cast<size_t>(&ID); | ||
} | ||
|
||
auto K::kernelTypeId() const noexcept -> size_t { | ||
return typeId(); | ||
} | ||
auto K::description() const noexcept -> std::string_view { | ||
return "Performing select operation on Nvidia GPU"; | ||
} | ||
|
||
#ifdef USE_CUDA | ||
|
||
constexpr static const char *NO_BROADCAST = R"~( | ||
struct Inputs {{ | ||
{dt} const *const addr[{inputsNum}]; | ||
}}; | ||
__device__ __forceinline__ static {dt} fn({dt} a, {dt} b) {{ | ||
return {op}; | ||
}} | ||
extern "C" __global__ void kernel( | ||
{dt} *__restrict__ output, | ||
Inputs inputs | ||
) {{ | ||
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x, | ||
step = blockDim.x * gridDim.x; | ||
tid < {n}; | ||
tid += step) {{ | ||
output[tid] = inputs.addr[0][tid]; | ||
for (auto idx = 1; idx < {inputsNum}; ++idx) {{ | ||
output[tid] = fn(inputs.addr[idx][tid], output[tid]); | ||
}} | ||
}} | ||
}} | ||
)~"; | ||
|
||
constexpr static const char *BROADCAST = R"~( | ||
struct Inputs {{ | ||
{dt} const *const addr[{inputsNum}]; | ||
}}; | ||
struct Strides {{ | ||
unsigned int s[({inputsNum}+1) * {rank}]; | ||
}}; | ||
__device__ __forceinline__ static {dt} fn({dt} a, {dt} b) {{ | ||
return {op}; | ||
}} | ||
extern "C" __global__ void kernel( | ||
{dt} *__restrict__ output, | ||
Inputs inputs, | ||
Strides strides | ||
) {{ | ||
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x, | ||
step = blockDim.x * gridDim.x; | ||
tid < {n}; | ||
tid += step) {{ | ||
auto rem = tid; | ||
size_t ans[{inputsNum}]{{}}; | ||
for (auto i = 0; i < {rank}; ++i) {{ | ||
auto dim = strides.s + ({inputsNum} + 1) * i; | ||
auto quot = rem / dim[{inputsNum}]; | ||
for (auto j = 0; j < {inputsNum}; ++j) {{ ans[j] += dim[j] * quot; }} | ||
rem %= dim[{inputsNum}]; | ||
}} | ||
output[tid] = inputs.addr[0][ans[0]]; | ||
for (auto idx = 1; idx < {inputsNum}; ++idx) {{ | ||
output[tid] = fn(inputs.addr[idx][ans[idx]], output[tid]); | ||
}} | ||
}} | ||
}} | ||
)~"; | ||
|
||
constexpr static std::string_view op(SelectType op, DataType dt) { | ||
switch (op) { | ||
case SelectType::Max: | ||
return "a > b ? a : b"; | ||
case SelectType::Min: | ||
return "a < b ? a : b"; | ||
default: | ||
UNREACHABLE(); | ||
} | ||
} | ||
|
||
auto K::lower(Resources &) const noexcept -> RoutineWorkspace { | ||
using namespace runtime; | ||
|
||
auto postfix = fmt::format("_{}_{}", dataType.name(), opName(selectType)); | ||
auto dt_ = nvrtc::dataType(dataType); | ||
auto op_ = op(selectType, dataType); | ||
auto params = cuda::ThreadsDistributer()(broadcaster.outputsCount); | ||
|
||
if (!broadcaster.needBroadcast()) { | ||
auto name = fmt::format("select{}", postfix); | ||
auto code = fmt::format(NO_BROADCAST, | ||
fmt::arg("dt", dt_), | ||
fmt::arg("op", op_), | ||
fmt::arg("inputsNum", inputsNum), | ||
fmt::arg("n", params.n)); | ||
return [params, h = nvrtc::Handler::compile(name.c_str(), code.c_str(), "kernel")]// | ||
(Resources &, void *, void const *const *inputs, void *const *outputs) { | ||
auto output = outputs[0]; | ||
void *args[]{&output, const_cast<void **>(inputs)}; | ||
h->launch(params.gridSize, 1, 1, | ||
params.blockSize, 1, 1, | ||
0, args); | ||
}; | ||
} else { | ||
auto name = fmt::format("select{}", postfix); | ||
auto rank = broadcaster.strides.size() / (broadcaster.inputsCount + 1); | ||
auto code = fmt::format( | ||
BROADCAST, | ||
fmt::arg("dt", dt_), | ||
fmt::arg("op", op_), | ||
fmt::arg("inputsNum", inputsNum), | ||
fmt::arg("n", params.n), | ||
fmt::arg("rank", rank)); | ||
return [params, h = nvrtc::Handler::compile(name.c_str(), code.c_str(), "kernel"), | ||
strides = broadcaster.strides]// | ||
(Resources &, void *, void const *const *inputs, void *const *outputs) { | ||
void *args[]{const_cast<void **>(outputs), const_cast<void **>(inputs), const_cast<dim_t *>(strides.data())}; | ||
h->launch(params.gridSize, 1, 1, | ||
params.blockSize, 1, 1, | ||
0, args); | ||
}; | ||
} | ||
} | ||
|
||
#endif | ||
|
||
}// namespace refactor::kernel |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#ifndef KERNEL_SELECT_CUDA_KERNEL_HH | ||
#define KERNEL_SELECT_CUDA_KERNEL_HH | ||
|
||
#include "kernel/attributes/broadcaster.h" | ||
#include "kernel/collectors/select.h" | ||
#include "kernel/kernel.h" | ||
#include "kernel/tensor.h" | ||
|
||
namespace refactor::kernel { | ||
|
||
struct SelectCuda final : public Kernel { | ||
DataType dataType; | ||
SelectType selectType; | ||
Broadcaster broadcaster; | ||
size_t inputsNum; | ||
|
||
SelectCuda(decltype(dataType), decltype(selectType), decltype(broadcaster), decltype(inputsNum)) noexcept; | ||
|
||
static KernelBox build(SelectType, TensorRefs) noexcept; | ||
static size_t typeId() noexcept; | ||
|
||
size_t kernelTypeId() const noexcept final; | ||
std::string_view description() const noexcept final; | ||
#ifdef USE_CUDA | ||
RoutineWorkspace lower(Resources &) const noexcept final; | ||
#endif | ||
}; | ||
|
||
}// namespace refactor::kernel | ||
|
||
#endif// KERNEL_SELECT_CUDA_KERNEL_HH |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#ifdef USE_CUDA | ||
|
||
#include "../../../src/kernels/select/cuda_kernel.hh" | ||
#include "hardware/device_manager.h" | ||
#include <functional> | ||
#include <gtest/gtest.h> | ||
#include <numeric> | ||
|
||
using namespace refactor; | ||
using namespace kernel; | ||
using namespace hardware; | ||
|
||
static void testSelect(const SelectType selectType, const std::vector<Shape> &shapes, const Shape &outShape, const std::vector<std::vector<float>> &data, | ||
const std::vector<float> expectData) { | ||
// build routine | ||
TensorRefs dataTensors; | ||
std::vector<Tensor> tensorsVec; | ||
for (size_t i = 0; i < shapes.size(); ++i) { | ||
tensorsVec.push_back(Tensor(DataType::F32, shapes[i], LayoutType::Others, nullptr)); | ||
} | ||
for (size_t i = 0; i < shapes.size(); ++i) { | ||
dataTensors.push_back(std::cref(tensorsVec[i])); | ||
} | ||
auto result = Tensor::share(DataType::F32, outShape); | ||
auto kernel = SelectCuda::build(selectType, dataTensors); | ||
ASSERT_TRUE(kernel); | ||
auto res = runtime::Resources(); | ||
auto routine = kernel->lower(res).routine; | ||
// cuda malloc | ||
auto &dev = *device::init(Device::Type::Nvidia, 0, ""); | ||
Arc<Device::Blob> | ||
gpuIns[]{ | ||
dev.malloc(dataTensors[0].get().bytesSize()), | ||
dev.malloc(dataTensors[1].get().bytesSize()), | ||
dev.malloc(dataTensors[2].get().bytesSize()), | ||
}, | ||
gpuOut = dev.malloc(result->bytesSize()); | ||
// put input data | ||
gpuIns[0]->copyFromHost(data[0].data(), dataTensors[0].get().bytesSize()); | ||
gpuIns[1]->copyFromHost(data[1].data(), dataTensors[1].get().bytesSize()); | ||
gpuIns[2]->copyFromHost(data[2].data(), dataTensors[2].get().bytesSize()); | ||
// inference | ||
{ | ||
void const *inputs[]{*gpuIns[0], *gpuIns[1], *gpuIns[2]}; | ||
void *outputs[]{*gpuOut}; | ||
routine(res, nullptr, inputs, outputs); | ||
} | ||
// check | ||
std::vector<float> out(result->elementsSize()); | ||
gpuOut->copyToHost(out.data(), result->bytesSize()); | ||
for (auto i : range0_(expectData.size())) { | ||
EXPECT_FLOAT_EQ(expectData[i], out[i]); | ||
} | ||
} | ||
|
||
TEST(kernel, SelectCuda) { | ||
// no need broadcast | ||
testSelect(SelectType::Max, | ||
{{1, 3}, {1, 3}, {1, 3}}, | ||
{1, 3}, | ||
{{3, 2, 1}, {1, 4, 4}, {2, 5, 3}}, | ||
{3, 5, 4}); | ||
|
||
testSelect(SelectType::Min, | ||
{{1, 3}, {1, 3}, {1, 3}}, | ||
{1, 3}, | ||
{{3, 2, 1}, {1, 4, 4}, {2, 5, 3}}, | ||
{1, 2, 1}); | ||
|
||
// need broadcast | ||
testSelect(SelectType::Max, | ||
{{3}, {1, 3}, {1, 3}}, | ||
{1, 3}, | ||
{{3, 3, 3}, {1, 4, 4}, {2, 5, 3}}, | ||
{3, 5, 4}); | ||
|
||
testSelect(SelectType::Min, | ||
{{3}, {1, 3}, {1, 3}}, | ||
{1, 3}, | ||
{{3, 3, 3}, {1, 4, 4}, {2, 5, 3}}, | ||
{1, 3, 3}); | ||
|
||
testSelect(SelectType::Max, | ||
{{1}, {1, 3}, {1, 3}}, | ||
{1, 3}, | ||
{{3}, {1, 4, 4}, {2, 5, 3}}, | ||
{3, 5, 4}); | ||
|
||
testSelect(SelectType::Min, | ||
{{1}, {1, 3}, {1, 3}}, | ||
{1, 3}, | ||
{{3}, {1, 4, 4}, {2, 5, 3}}, | ||
{1, 3, 3}); | ||
} | ||
|
||
#endif |