Skip to content

Commit

Permalink
feat: mlu上跑通llama/gpt2,结果正确 (llama/GPT-2 now run end-to-end on MLU with correct results)
Browse files Browse the repository at this point in the history
  • Loading branch information
Chamberlain0w0 authored and YdrMaster committed Jan 31, 2024
1 parent 4d426b2 commit 2583eb3
Show file tree
Hide file tree
Showing 13 changed files with 181 additions and 61 deletions.
6 changes: 6 additions & 0 deletions src/04kernel/src/collectors/global_pool.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "kernel/collectors/global_pool.h"
#include "../kernels/pool/cudnn_kernel.hh"
#include "../kernels/pool/cnnl_kernel.hh"

namespace refactor::kernel {

Expand Down Expand Up @@ -28,6 +29,11 @@ namespace refactor::kernel {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Mlu:
if (auto ptr = PoolCnnl::build(type, false, kernelShape, attributes, x, y); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
Expand Down
14 changes: 8 additions & 6 deletions src/04kernel/src/kernels/gather/cnnl_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "../../utilities/bang/cnnl_context.hh"
#include "../../utilities/bang/cnnl_functions.h"
#endif
#include <iostream>

namespace refactor::kernel {
using K = GatherCnnl;
Expand All @@ -15,11 +16,11 @@ namespace refactor::kernel {
#ifndef USE_BANG
return nullptr;
#endif

return std::make_unique<K>(decltype(info){
input.dataType,
DataType::I32,
axis,
axis ? axis : 0,
std::vector<int>(input.shape.begin(), input.shape.end()),
std::vector<int>(index.shape.begin(), index.shape.end()),
std::vector<int>(output.shape.begin(), output.shape.end()),
Expand Down Expand Up @@ -70,15 +71,16 @@ namespace refactor::kernel {

res.fetchOrStore<CnnlContext>();
auto routine = [d = std::move(d),
shape = info.inDim.data(), workspaceSize,
shape = std::vector<int>(info.inDim.begin(), info.inDim.end()),
workspaceSize,
dim = info.axis](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
BANG_ASSERT(cnrtMemcpy(workspace, (void*) shape, workspaceSize, CNRT_MEM_TRANS_DIR_HOST2DEV));
res.fetchOrStore<CnnlContext>()->copyFromCPU(workspace, shape.data(), workspaceSize);
CNNL_ASSERT(cnnlGatherV2(res.fetchOrStore<CnnlContext>()->handle, dim,
d->inDesc, inputs[0], reinterpret_cast<const int *>(workspace),
d->indexDesc, reinterpret_cast<const int *>(inputs[1]),
d->indexDesc, reinterpret_cast<const int *>(inputs[1]),
d->outDesc, outputs[0]));
BANG_ASSERT(cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue));
};
};

return {std::move(routine), workspaceSize};
}
Expand Down
11 changes: 6 additions & 5 deletions src/04kernel/src/kernels/reduce/cnnl_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,15 @@ namespace refactor::kernel {

std::vector<int>
dimsI(shape.begin(), shape.end()),
dimsO(shape.begin(), shape.end());
dimsO(shape.begin(), shape.end()),
indices(axes.begin(), axes.end());
for (auto axis : axes) {
dimsO[axis] = 1;
}
// setCnnlTensor(d->x, dataType, slice(dimsI.data(), dimsI.size()));
// setCnnlTensor(d->y, dataType, slice(dimsO.data(), dimsO.size()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsI.size(), dimsI.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), dimsO.size(), dimsO.data()));

// clang-format off
auto reduceOp = reduceType == ReduceType::Mean ? CNNL_REDUCE_AVG
Expand All @@ -91,12 +92,12 @@ namespace refactor::kernel {
: UNREACHABLEX(cnnlReduceOp_t, "");
// clang-format on
CNNL_ASSERT(cnnlSetReduceDescriptor_v2(
d->reduce, (int *) (axes.data()), axes.size(), reduceOp,
d->reduce, indices.data(), indices.size(), reduceOp,
cnnlDataTypeConvert(d->f32 ? DataType::F32 : DataType::F64),
CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES, 0.0));

auto handler = res.fetchOrStore<CnnlContext>()->handle;
size_t idxWorkspaceSize = axes.size() * sizeof(int);
size_t idxWorkspaceSize = indices.size() * sizeof(int);
// idxWorkspaceSize = hardware::alignBytes(idxWorkspaceSize, 256);
size_t workspaceSize;
// get workspace
Expand Down
9 changes: 6 additions & 3 deletions src/04kernel/src/kernels/softmax/cnnl_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,11 @@ namespace refactor::kernel {
static_cast<cnnlSoftmaxAlgorithm_t>(algo),
dataType != DataType::F64);
int dims[]{pre, mid, post};
cnnlSoftmaxMode_t mode = (post == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION
: (pre == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION
: CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
// cnnlSoftmaxMode_t mode = (pre == 1) ? CNNL_SOFTMAX_MODE_HIGH_DIMENSION
// : (post == 1) ? CNNL_SOFTMAX_MODE_LOW_DIMENSION
// : CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
// FIXME(bolun): CNNL Softmax mode
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;

// cnnlSoftmaxForward_v2 is applied to a 3D input tensor only
CNNL_ASSERT(cnnlSetTensorDescriptor(d->t, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(dataType), 3, dims));
Expand All @@ -78,6 +80,7 @@ namespace refactor::kernel {
CNNL_COMPUTATION_ULTRAHIGH_PRECISION,
&a, d->t, inputs[0],
&b, d->t, outputs[0]));
res.fetchOrStore<CnnlContext>()->queueSync();
};
}

Expand Down
82 changes: 39 additions & 43 deletions src/04kernel/src/kernels/where/cnnl_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,24 @@ namespace refactor::kernel {
#ifndef USE_BANG
return nullptr;
#endif
return std::make_unique<K>(decltype(info) {
inputs[1].get().dataType,
inputs[0].get().shape,
inputs[1].get().shape,
inputs[2].get().shape,
outputs[0].get().shape,
});
std::vector<int> cDim(inputs[0].get().shape.begin(), inputs[0].get().shape.end()),
xDim(inputs[1].get().shape.begin(), inputs[1].get().shape.end()),
yDim(inputs[2].get().shape.begin(), inputs[2].get().shape.end()),
ansDim(outputs[0].get().shape.begin(), outputs[0].get().shape.end());
if (ansDim.size() == 0) {
ansDim.push_back(1);
}
if (xDim.size() == 0) {
xDim.push_back(1);
}
if (yDim.size() == 0) {
yDim.push_back(1);
}
if (cDim.size() == 0) {
cDim.push_back(1);
}
return std::make_unique<K>(decltype(info){
inputs[1].get().dataType, cDim, xDim, yDim, ansDim});
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
Expand All @@ -44,11 +55,10 @@ namespace refactor::kernel {

struct Descriptors {
cnnlTensorDescriptor_t cond, x, y, ans;
bool f32;

explicit Descriptors(decltype(f32) f32_)
explicit Descriptors()
: cond(nullptr), x(nullptr), y(nullptr),
ans(nullptr), f32(f32_) {
ans(nullptr) {
CNNL_ASSERT(cnnlCreateTensorDescriptor(&cond));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&x));
CNNL_ASSERT(cnnlCreateTensorDescriptor(&y));
Expand All @@ -64,49 +74,35 @@ namespace refactor::kernel {
Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};
auto d = std::make_shared<Descriptors>(info.dataType != DT::F64);

std::vector<int> cDim(info.condDim.begin(), info.condDim.end()),
xDim(info.thenDim.begin(), info.thenDim.end()),
yDim(info.elseDim.begin(), info.elseDim.end()),
ansDim(info.outputDim.begin(), info.outputDim.end());

auto rightAlign = [](std::vector<int> &dim, uint32_t targetLength) {
if (dim.size() < targetLength) {
dim.insert(dim.begin(), targetLength - dim.size(), 1);
}
};
if (ansDim.size() == 0) {
ansDim.push_back(1);
}
rightAlign(cDim, ansDim.size());
rightAlign(xDim, ansDim.size());
rightAlign(yDim, ansDim.size());

CNNL_ASSERT(cnnlSetTensorDescriptor(d->cond, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(DT::Bool), cDim.size(), cDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->x, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), xDim.size(), xDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->y, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), yDim.size(), yDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(d->ans, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), ansDim.size(), ansDim.data()));
auto d = std::make_shared<Descriptors>();

CNNL_ASSERT(cnnlSetTensorDescriptor(
d->cond, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(DT::Bool),
info.condDim.size(), info.condDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(
d->x, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
info.thenDim.size(), info.thenDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(
d->y, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
info.elseDim.size(), info.elseDim.data()));
CNNL_ASSERT(cnnlSetTensorDescriptor(
d->ans, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(info.dataType),
info.outputDim.size(), info.outputDim.data()));

auto handle = res.fetchOrStore<CnnlContext>()->handle;
size_t workspaceSize;
CNNL_ASSERT(cnnlGetSelectV2WorkspaceSize(handle, d->cond, d->x, d->y, &workspaceSize));

res.fetchOrStore<CnnlContext>();
auto routine = [d = std::move(d), workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
// fetch cnnl handle from resources
auto handle = res.fetchOrStore<CnnlContext>()->handle;
auto cond = inputs[0],
x = inputs[1],
y = inputs[2];
auto ans = outputs[0];

CNNL_ASSERT(cnnlSelectV2(
handle, d->cond, cond, d->x, x,
d->y, y, workspace, workspaceSize,
d->ans, ans));
res.fetchOrStore<CnnlContext>()->handle,
d->cond, inputs[0], d->x, inputs[1],
d->y, inputs[2], workspace, workspaceSize,
d->ans, outputs[0]));

cnrtQueueSync(res.fetchOrStore<CnnlContext>()->queue);
res.fetchOrStore<CnnlContext>()->queueSync();
};

return {std::move(routine), workspaceSize};
Expand Down
4 changes: 1 addition & 3 deletions src/04kernel/src/kernels/where/cnnl_kernel.hh
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@

namespace refactor::kernel {

using Shape = absl::InlinedVector<dim_t, 4>;

struct WhereCnnl final : public Kernel {
struct {
DataType dataType;
Shape condDim, thenDim, elseDim, outputDim;
std::vector<int> condDim, thenDim, elseDim, outputDim;
} info;

WhereCnnl(decltype(info)) noexcept;
Expand Down
9 changes: 9 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ namespace refactor::kernel::cnnl {
return "CnnlContext";
}

void CnnlContext::copyFromCPU(void *dst, const void *src, size_t size) {
    // Blocking host-to-device copy of `size` bytes.
    // cnrtMemcpy's signature is not const-correct, so the const on the
    // host source pointer has to be cast away before the call.
    auto hostPtr = const_cast<void *>(src);
    BANG_ASSERT(cnrtMemcpy(dst, hostPtr, size, CNRT_MEM_TRANS_DIR_HOST2DEV));
}

// Blocks the calling thread until every task previously enqueued on this
// context's cnrt queue has completed.
void CnnlContext::queueSync() {
BANG_ASSERT(cnrtQueueSync(queue));
}

}// namespace refactor::kernel::cnnl

#endif
2 changes: 2 additions & 0 deletions src/04kernel/src/utilities/bang/cnnl_context.hh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ namespace refactor::kernel::cnnl {
size_t resourceTypeId() const noexcept final;
std::string_view description() const noexcept final;

void copyFromCPU(void *dst, const void *src, size_t size);
void queueSync();
};

}// namespace refactor::kernel::cnnl
Expand Down
27 changes: 27 additions & 0 deletions src/04kernel/src/utilities/bang/cnrt_functions.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifdef USE_BANG
#include "cnrt_functions.h"
#include "cnnl_functions.h"
#include <cnrt.h>
#include <cstdio>

namespace refactor::kernel::cnnl {

int currentDevice() {
    // Ask the runtime which MLU device ordinal is bound to the calling thread.
    int ordinal = -1;
    BANG_ASSERT(cnrtGetDevice(&ordinal));
    return ordinal;
}

// Blocks the calling thread until all outstanding work on the current
// MLU device has finished.
void sync() {
BANG_ASSERT(cnrtSyncDevice());
}

// Blocking device-to-host copy of `size` bytes from `src` (device) to
// `dst` (host).
void copyOut(void *dst, const void *src, size_t size) {
// Drain the device first so the copy observes results of all queued kernels.
sync();
// cnrtMemcpy is not const-correct; drop the const on the device source.
BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), size,
CNRT_MEM_TRANS_DIR_DEV2HOST));
}

}// namespace refactor::kernel::cnnl

#endif
16 changes: 16 additions & 0 deletions src/04kernel/src/utilities/bang/cnrt_functions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef KERNEL_CNRT_FUNCTIONS_H
#define KERNEL_CNRT_FUNCTIONS_H

#include "common.h"

namespace refactor::kernel::cnnl {

// Ordinal of the MLU device currently bound to the calling thread.
int currentDevice();

// Blocks until all outstanding work on the current MLU device has finished.
void sync();

// Device-to-host copy of `size` bytes; synchronizes the device before copying.
void copyOut(void *dst, const void *src, size_t size);

}// namespace refactor::kernel::cnnl

#endif// KERNEL_CNRT_FUNCTIONS_H
45 changes: 45 additions & 0 deletions src/04kernel/test/kernels/gather/test_gather_cnnl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,51 @@ TEST(kernel, GatherCnnl) {
EXPECT_FLOAT_EQ(c[i], result[i]);
}
}

// Case axis = 0, int64 indices (the original comment claimed axis = 1 /
// int32, which did not match the code below: both GatherInfo and
// GatherCnnl::build are given axis 0, and the index tensor is I64).
{
// Create Tensor and build kernels
auto data = Tensor::share(DataType::F32, Shape{32, 16}, LayoutType::NCHW);
auto indices = Tensor::share(DataType::I64, Shape{1, 4}, LayoutType::NCHW);
auto output = Tensor::share(DataType::F32, Shape{1, 4, 16}, LayoutType::NCHW);
GatherInfo info(0, *data, *indices);
auto cnnlKernel = GatherCnnl::build(0, *data, *indices, *output);
auto cpuKernel = GatherCpu::build(info);
ASSERT_TRUE(cnnlKernel && cpuKernel);
auto res = runtime::Resources();
auto [cnnlRoutine, workspaceSize] = cnnlKernel->lower(res);
auto cpuRoutine = cpuKernel->lower(res).routine;
// Init inputs and outputs
std::vector<float> a;
for (auto i = 0; i < data->elementsSize(); i++) {
a.push_back(i + 0.1f);
}
// All indices are 0, so every gathered slice is row 0 of `data`.
std::vector<int64_t> b(indices->elementsSize(), 0);
std::vector<float> c(output->elementsSize());
auto workspace = dev.malloc(workspaceSize),
aMLU = dev.malloc(data->bytesSize()),
bMLU = dev.malloc(indices->bytesSize()),
cMLU = dev.malloc(output->bytesSize());
aMLU->copyFromHost(a.data(), data->bytesSize());
bMLU->copyFromHost(b.data(), indices->bytesSize());
// Compute
{
void const *inputs[]{*aMLU, *bMLU};
void *outputs[]{*cMLU};
cnnlRoutine(res, *workspace, inputs, outputs);
}
{
void const *inputs[]{a.data(), b.data()};
void *outputs[]{c.data()};
cpuRoutine(res, nullptr, inputs, outputs);
}
// Compare MLU result against the CPU reference element-wise.
std::vector<float> result(output->elementsSize());
cMLU->copyToHost(result.data(), output->bytesSize());
for (auto i : range0_(c.size())) {
EXPECT_FLOAT_EQ(c[i], result[i]);
}
}
}

#endif
4 changes: 4 additions & 0 deletions src/09python_ffi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ pybind11_add_module(python_ffi SHARED ${PYFFI_SRC})
target_link_libraries(python_ffi PRIVATE onnx llm communication)
target_include_directories(python_ffi PRIVATE include)

# MLU builds: expose the BANG utility headers (cnrt_functions.h, cnnl_context.hh)
# to the Python FFI module so it can call device sync/copy helpers directly.
if(USE_BANG)
target_include_directories(python_ffi PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../04kernel/src/utilities/bang)
endif()

# EXAMPLE_VERSION_INFO is defined by setup.py and passed into the C++ code as a
# define (VERSION_INFO) here.
# target_compile_definitions(python_ffi
Expand Down
Loading

0 comments on commit 2583eb3

Please sign in to comment.