diff --git a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc index e3efcd1d..1330cfa7 100644 --- a/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/batch_normalization/cnnl_kernel.cc @@ -128,8 +128,8 @@ namespace refactor::kernel { auto y = outputs[0]; void *xTrans = workspace; - void *yTrans = xTrans + xTransSize; - void *cursor = yTrans + xTransSize; + void *yTrans = reinterpret_cast(xTrans) + xTransSize; + void *cursor = reinterpret_cast(yTrans) + xTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->inDesc, x, @@ -147,7 +147,6 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlTranspose_v2(handle, d->NHWC2NCHW, d->inDescTrans, yTrans, d->inDesc, y, cursor, workspaceSize)); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), totalWorkspaceSize}; diff --git a/src/04kernel/src/kernels/cast/cnnl_kernel.cc b/src/04kernel/src/kernels/cast/cnnl_kernel.cc index 95120974..54798e67 100644 --- a/src/04kernel/src/kernels/cast/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/cast/cnnl_kernel.cc @@ -65,7 +65,6 @@ namespace refactor::kernel { return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { CNNL_ASSERT(cnnlCastDataType(res.fetchOrStore()->handle, d->inDesc, inputs[0], d->cast, d->outDesc, outputs[0])); - // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } diff --git a/src/04kernel/src/kernels/clip/cnnl_kernel.cc b/src/04kernel/src/kernels/clip/cnnl_kernel.cc index 423939bc..fe65e99b 100644 --- a/src/04kernel/src/kernels/clip/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/clip/cnnl_kernel.cc @@ -57,7 +57,6 @@ namespace refactor::kernel { CNNL_POINTER_MODE_DEVICE, d->t, inputs[0], inputs[1], hasMax ? inputs[2] : nullptr, d->t, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } diff --git a/src/04kernel/src/kernels/concat/cnnl_kernel.cc b/src/04kernel/src/kernels/concat/cnnl_kernel.cc index 6c7a0fba..c35b1c33 100644 --- a/src/04kernel/src/kernels/concat/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/concat/cnnl_kernel.cc @@ -52,7 +52,7 @@ namespace refactor::kernel { } ~Descriptors() noexcept(false) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); - for (auto i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); } } @@ -62,7 +62,7 @@ namespace refactor::kernel { }; auto d = std::make_shared(info.num, info.dataType != DT::F64); setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); - for (auto i = 0; i < info.outDims.size(); i++) { + for (size_t i = 0; i < info.outDims.size(); i++) { setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); } diff --git a/src/04kernel/src/kernels/conv/cnnl_kernel.cc b/src/04kernel/src/kernels/conv/cnnl_kernel.cc index 85dcb951..187ac4b5 100644 --- a/src/04kernel/src/kernels/conv/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/conv/cnnl_kernel.cc @@ -209,9 +209,9 @@ namespace refactor::kernel { // } void *xTrans = workspace; - void *wTrans = xTrans + xTransSize; - void *yTrans = wTrans + wTransSize; - void *opWorkspace = yTrans + yTransSize; + void *wTrans = reinterpret_cast(xTrans) + xTransSize; + void *yTrans = reinterpret_cast(wTrans) + wTransSize; + void *opWorkspace = reinterpret_cast(yTrans) + yTransSize; // transpose NCHW input to NHWC CNNL_ASSERT(cnnlTranspose_v2(handle, d->NCHW2NHWC, d->x, x, diff --git a/src/04kernel/src/kernels/expand/cnnl_kernel.cc b/src/04kernel/src/kernels/expand/cnnl_kernel.cc index 18f58642..426eac49 100644 --- a/src/04kernel/src/kernels/expand/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/expand/cnnl_kernel.cc @@ -60,7 +60,6 @@ namespace refactor::kernel { return [d = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { CNNL_ASSERT(cnnlExpand(res.fetchOrStore()->handle, d->inDesc, inputs[0], d->outDesc, outputs[0])); - // BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; } #endif diff --git a/src/04kernel/src/kernels/gather/cnnl_kernel.cc b/src/04kernel/src/kernels/gather/cnnl_kernel.cc index b4d5aa15..abdf0d5c 100644 --- a/src/04kernel/src/kernels/gather/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/gather/cnnl_kernel.cc @@ -79,7 +79,6 @@ namespace refactor::kernel { d->inDesc, inputs[0], reinterpret_cast(workspace), d->indexDesc, reinterpret_cast(inputs[1]), d->outDesc, outputs[0])); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc index 7be05b7b..3eac3572 100644 --- a/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/mat_mul/cnnl_kernel.cc @@ -141,7 +141,6 @@ namespace refactor::kernel { workspace, algoWorkspaceSize)); } - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), algoWorkspaceSize}; diff --git a/src/04kernel/src/kernels/pool/cnnl_kernel.cc b/src/04kernel/src/kernels/pool/cnnl_kernel.cc index 083125b1..929ea578 100644 --- a/src/04kernel/src/kernels/pool/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/pool/cnnl_kernel.cc @@ -130,7 +130,7 @@ namespace refactor::kernel { auto handle = res.fetchOrStore()->handle; void *extraInputDev = workspace; - void *poolWorkSpace = workspace + extraInputSize; + void *poolWorkSpace = reinterpret_cast(workspace) + extraInputSize; void *extraInputHost = malloc(extraInputSize); CNNL_ASSERT(cnnlInitPoolingExtraInput(handle, d->pooling, d->x, d->y, extraInputHost)); @@ -145,7 +145,7 @@ namespace refactor::kernel { &b, extraInputDev, d->y, outputs[0], poolWorkSpace, workspaceSize)); - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); + res.fetchOrStore()->queueSync(); free(extraInputHost); }; diff --git a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc index 11712b4f..8b4647ac 100644 --- a/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc +++ b/src/04kernel/src/kernels/simple_binary/binary_cnnl.cc @@ -180,7 +180,6 @@ namespace refactor::kernel { workspace, workspaceSize)); } - BANG_ASSERT(cnrtQueueSync(res.fetchOrStore()->queue)); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/kernels/slice/cnnl_kernel.cc b/src/04kernel/src/kernels/slice/cnnl_kernel.cc index b5f8b4c8..85bc9093 100644 --- a/src/04kernel/src/kernels/slice/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/slice/cnnl_kernel.cc @@ -64,7 +64,7 @@ namespace refactor::kernel { CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->out, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDim.size(), info.outDim.data())); std::vector begin, end, stride; - for (auto i = 0; i < info.dims.size(); i++) { + for (size_t i = 0; i < info.dims.size(); i++) { // [begin, end), end is not inclued begin.push_back(info.dims[i].start); auto sign = info.dims[i].step > 0 ? 1 : -1; diff --git a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc index 0633195d..babaf33c 100644 --- a/src/04kernel/src/kernels/softmax/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/softmax/cnnl_kernel.cc @@ -80,7 +80,6 @@ namespace refactor::kernel { CNNL_COMPUTATION_ULTRAHIGH_PRECISION, &a, d->t, inputs[0], &b, d->t, outputs[0])); - res.fetchOrStore()->queueSync(); }; } diff --git a/src/04kernel/src/kernels/split/cnnl_kernel.cc b/src/04kernel/src/kernels/split/cnnl_kernel.cc index c967bff2..8f686d59 100644 --- a/src/04kernel/src/kernels/split/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/split/cnnl_kernel.cc @@ -69,7 +69,7 @@ namespace refactor::kernel { } ~Descriptors() noexcept(false) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(in)); - for (auto i = 0; i < out.size(); i++) { + for (size_t i = 0; i < out.size(); i++) { CNNL_ASSERT(cnnlDestroyTensorDescriptor(out[i])); } } @@ -81,7 +81,7 @@ namespace refactor::kernel { // setCnnlTensor(d->in, info.dataType, slice(info.inDim.data(), info.inDim.size())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->in, CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.inDim.size(), info.inDim.data())); - for (auto i = 0; i < info.outDims.size(); i++) { + for (size_t i = 0; i < info.outDims.size(); i++) { // setCnnlTensor(d->out[i], info.dataType, slice(info.outDims[i].data(), info.outDims[i].size())); CNNL_ASSERT(cnnlSetTensorDescriptor(d->out[i], CNNL_LAYOUT_NCHW, cnnlDataTypeConvert(info.dataType), info.outDims[i].size(), info.outDims[i].data())); } diff --git a/src/04kernel/src/kernels/where/cnnl_kernel.cc b/src/04kernel/src/kernels/where/cnnl_kernel.cc index 50b7c9d8..14f8d667 100644 --- a/src/04kernel/src/kernels/where/cnnl_kernel.cc +++ b/src/04kernel/src/kernels/where/cnnl_kernel.cc @@ -102,7 +102,6 @@ namespace refactor::kernel { d->y, inputs[2], workspace, workspaceSize, d->ans, outputs[0])); - res.fetchOrStore()->queueSync(); }; return {std::move(routine), workspaceSize}; diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.cc b/src/04kernel/src/utilities/bang/cnrt_functions.cc index 2ea66194..26c1b975 100644 --- a/src/04kernel/src/utilities/bang/cnrt_functions.cc +++ b/src/04kernel/src/utilities/bang/cnrt_functions.cc @@ -4,7 +4,7 @@ #include #include -namespace refactor::kernel::cnnl { +namespace refactor::kernel::bang { int currentDevice() { int device; @@ -22,6 +22,6 @@ namespace refactor::kernel::cnnl { CNRT_MEM_TRANS_DIR_DEV2HOST)); } -}// namespace refactor::kernel::cnnl +}// namespace refactor::kernel::bang #endif diff --git a/src/04kernel/src/utilities/bang/cnrt_functions.h b/src/04kernel/src/utilities/bang/cnrt_functions.h index ef119819..3a05195c 100644 --- a/src/04kernel/src/utilities/bang/cnrt_functions.h +++ b/src/04kernel/src/utilities/bang/cnrt_functions.h @@ -3,7 +3,7 @@ #include "common.h" -namespace refactor::kernel::cnnl { +namespace refactor::kernel::bang { int currentDevice(); @@ -11,6 +11,6 @@ namespace refactor::kernel::cnnl { void copyOut(void *dst, const void *src, size_t size); -}// namespace refactor::kernel::cnnl +}// namespace refactor::kernel::bang #endif// KERNEL_CNRT_FUNCTIONS_H diff --git a/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp b/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp index 14a1a07f..d1ad9bd7 100644 --- a/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp +++ b/src/04kernel/test/kernels/batch_normalization/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/batch_normalization/cnnl_kernel.hh" #include "../../../src/kernels/batch_normalization/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -57,6 +58,7 @@ TEST(kernel, BatchNormalizationCnnl) { void const *inputs[]{*mluIn, *mluScale, *mluBias, *mluMean, *mluVar}; void *outputs[]{*mluOut}; rMlu(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/cast/test_cnnl.cpp b/src/04kernel/test/kernels/cast/test_cnnl.cpp index 72f84247..94297357 100644 --- a/src/04kernel/test/kernels/cast/test_cnnl.cpp +++ b/src/04kernel/test/kernels/cast/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/cast/cpu_kernel.hh" #include "../../../src/kernels/cast/cnnl_kernel.hh" +#include "../../../src/kernels/cast/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -34,6 +35,7 @@ TEST(kernel, CastCnnl) { void const *inputs[]{*xMlu}; void *outputs[]{*yMlu}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{x_.data()}; diff --git a/src/04kernel/test/kernels/clip/test_cnnl.cpp b/src/04kernel/test/kernels/clip/test_cnnl.cpp index ab29a1cb..ff2e7729 100644 --- a/src/04kernel/test/kernels/clip/test_cnnl.cpp +++ b/src/04kernel/test/kernels/clip/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/clip/cpu_kernel.hh" #include "../../../src/kernels/clip/cnnl_kernel.hh" +#include "../../../src/kernels/clip/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -36,6 +37,7 @@ TEST(kernel, ClipCnnl) { void const *inputs[]{*mluMem, *mluMin, *mluMax}; void *outputs[]{*mluMem}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{value.data(), &min, &max}; diff --git a/src/04kernel/test/kernels/concat/test_cnnl.cpp b/src/04kernel/test/kernels/concat/test_cnnl.cpp index 2911c9e9..ecc817ac 100644 --- a/src/04kernel/test/kernels/concat/test_cnnl.cpp +++ b/src/04kernel/test/kernels/concat/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/concat/cpu_kernel.hh" #include "../../../src/kernels/concat/cnnl_kernel.hh" +#include "../../../src/kernels/concat/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -65,6 +66,7 @@ TEST(kernel, ConcatCnnl) { void const *inputs[]{*mluIns[0], *mluIns[1], *mluIns[2], *mluIns[3]}; void *outputs[]{*mluOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()}; diff --git a/src/04kernel/test/kernels/conv/test_cnnl.cpp b/src/04kernel/test/kernels/conv/test_cnnl.cpp index bf4bff3c..74e799f5 100644 --- a/src/04kernel/test/kernels/conv/test_cnnl.cpp +++ b/src/04kernel/test/kernels/conv/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/conv/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -39,15 +40,7 @@ void testConvCnnl(int rank, const int64_t *pads, const int64_t *strides, const i void const *inputs[]{*xMlu, *wMlu}; void *outputs[]{*yMlu}; routine(res, *workspace, inputs, outputs); - - xMlu->copyToHost(xData.data(), xTensor->bytesSize()); - wMlu->copyToHost(wData.data(), wTensor->bytesSize()); - // fmt::println("{}", vec2str(xData)); - // fmt::println("{}", vec2str(wData)); - - // std::vector ws(workspaceSize); - // workspace->copyToHost(ws.data(), workspaceSize); - // fmt::println("{}", vec2str(ws)); + kernel::bang::sync(); // take output data std::vector result(yTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/expand/test_cnnl.cpp b/src/04kernel/test/kernels/expand/test_cnnl.cpp index e06e5977..43fb07e8 100644 --- a/src/04kernel/test/kernels/expand/test_cnnl.cpp +++ b/src/04kernel/test/kernels/expand/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/expand/cnnl_kernel.hh" #include "../../../src/kernels/expand/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -36,6 +37,7 @@ TEST(kernel, ExpandCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp index 020b5f91..b63682d2 100644 --- a/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp +++ b/src/04kernel/test/kernels/gather/test_gather_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/gather/cnnl_kernel.hh" #include "../src/kernels/gather/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -39,6 +40,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -81,6 +83,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -110,7 +113,7 @@ TEST(kernel, GatherCnnl) { auto cpuRoutine = cpuKernel->lower(res).routine; // Init inputs and outputs std::vector a; - for (auto i = 0; i < data->elementsSize(); i++) { + for (size_t i = 0; i < data->elementsSize(); i++) { a.push_back(i + 0.1f); } std::vector b(indices->elementsSize(), 0); @@ -126,6 +129,7 @@ TEST(kernel, GatherCnnl) { void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; cnnlRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; diff --git a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp index 251c5c89..f079b444 100644 --- a/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp +++ b/src/04kernel/test/kernels/mat_mul/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/mat_mul/cnnl_kernel.hh" #include "../src/kernels/mat_mul/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -48,6 +49,7 @@ TEST(kernel, MatMulCnnl_OnlyBias) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); // take output data std::vector result(Y->elementsSize()); my->copyToHost(result.data(), Y->bytesSize()); @@ -91,6 +93,7 @@ TEST(kernel, MatMulCnnl_Broadcast) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; @@ -135,6 +138,7 @@ TEST(kernel, MatMulCnnl_TransABNoBias) { void const *inputs[]{*ma, *mb}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data()}; @@ -189,6 +193,7 @@ TEST(kernel, MatMulCnnl_Large) { void const *inputs[]{*ma, *mb, *mc}; void *outputs[]{*my}; mluRoutine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{dataA.data(), dataB.data(), dataC.data()}; diff --git a/src/04kernel/test/kernels/pool/test_cnnl.cpp b/src/04kernel/test/kernels/pool/test_cnnl.cpp index 405bf3f8..1adf4513 100644 --- a/src/04kernel/test/kernels/pool/test_cnnl.cpp +++ b/src/04kernel/test/kernels/pool/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/pool/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -32,6 +33,7 @@ void testPoolCnnl(PoolType poolType, int rank, const int64_t *pads, const int64_ void const *inputs[]{*mluMem}; void *outputs[]{*mluMem}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); // take output data std::vector result(yTensor->elementsSize()); mluMem->copyToHost(result.data(), yTensor->bytesSize()); diff --git a/src/04kernel/test/kernels/reduce/test_cnnl.cpp b/src/04kernel/test/kernels/reduce/test_cnnl.cpp index 32952fea..113fe766 100644 --- a/src/04kernel/test/kernels/reduce/test_cnnl.cpp +++ b/src/04kernel/test/kernels/reduce/test_cnnl.cpp @@ -1,6 +1,7 @@ #ifdef USE_BANG #include "../../../src/kernels/reduce/cnnl_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -28,6 +29,7 @@ static void testReducemean(const Shape &shape, const std::vector &data, void const *inputs[]{*mluMemIn}; void *outputs[]{*mluMemOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data Shape outDimArray; diff --git a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp index 1cdf7216..cf5b9c36 100644 --- a/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp +++ b/src/04kernel/test/kernels/scatter_nd/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" #include "../../../src/kernels/scatter_nd/cnnl_kernel.hh" +#include "../../../src/kernels/scatter_nd/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -48,6 +49,7 @@ TEST(kernel, ScatterNDCnnl) { void const *inputs[]{*mluData, *mluIndices, *mluUpdates}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data_.data(), indices_.data(), updates_.data()}; diff --git a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp index 7bd18bf6..dfd3c9b8 100644 --- a/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_binary/test_binary_cnnl.cpp @@ -2,6 +2,7 @@ #include "../src/kernels/simple_binary/binary_cnnl.hh" #include "../src/kernels/simple_binary/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -38,6 +39,7 @@ void testBinaryCnnl(SimpleBinaryType binaryOPT, Shape dimA, Shape dimB, Shape di void const *inputs[]{*aMLU, *bMLU}; void *outputs[]{*cMLU}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{a.data(), b.data()}; @@ -100,27 +102,6 @@ TEST(kernel, BinaryCnnlFMod) { Shape{2, 5, 10, 20, 3, 4}); } -// TEST(kernel, BinaryCnnlMod) { -// testBinaryCnnl(SimpleBinaryType::Mod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - -// TEST(kernel, BinaryCnnlFmodI8) { -// testBinaryCnnl(SimpleBinaryType::Fmod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - -// TEST(kernel, BinaryCnnlFmodF32) { -// testBinaryCnnl(SimpleBinaryType::Fmod, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}, -// Shape{2, 5, 10, 20, 3, 4}); -// } - TEST(kernel, BinaryCnnlBroadcast) { testBinaryCnnl(SimpleBinaryType::Add, Shape{1, 2, 3, 4, 5, 6}, Shape{}, Shape{1, 2, 3, 4, 5, 6}); } diff --git a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp index e4b9428e..8e1e8a9f 100644 --- a/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp +++ b/src/04kernel/test/kernels/simple_unary/test_cnnl.cpp @@ -3,6 +3,7 @@ #include "../../../src/kernels/simple_unary/cnnl_activation_kernel.hh" #include "../../../src/kernels/simple_unary/cnnl_simple_unary_kernel.hh" #include "../../../src/kernels/simple_unary/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -32,6 +33,7 @@ static void testOp(SimpleUnaryType opType, bool activation = true) { void const *inputs[]{*mluMem}; void *outputs[]{*mluMem}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/slice/test_cnnl.cpp b/src/04kernel/test/kernels/slice/test_cnnl.cpp index 914a8fee..1685d7aa 100644 --- a/src/04kernel/test/kernels/slice/test_cnnl.cpp +++ b/src/04kernel/test/kernels/slice/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/slice/cpu_kernel.hh" #include "../../../src/kernels/slice/cnnl_kernel.hh" +#include "../../../src/kernels/slice/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -45,6 +46,7 @@ TEST(kernel, SliceCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, nullptr, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/softmax/test_cnnl.cpp b/src/04kernel/test/kernels/softmax/test_cnnl.cpp index a8c7fb28..09874d20 100644 --- a/src/04kernel/test/kernels/softmax/test_cnnl.cpp +++ b/src/04kernel/test/kernels/softmax/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/softmax/cpu_kernel.hh" #include "../../../src/kernels/softmax/cnnl_kernel.hh" +#include "../../../src/kernels/softmax/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -39,6 +40,7 @@ TEST(kernel, SoftmaxCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; rCnnl(res, nullptr, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/split/test_cnnl.cpp b/src/04kernel/test/kernels/split/test_cnnl.cpp index 5afa434e..71e69b21 100644 --- a/src/04kernel/test/kernels/split/test_cnnl.cpp +++ b/src/04kernel/test/kernels/split/test_cnnl.cpp @@ -1,7 +1,8 @@ #ifdef USE_BANG -#include "../../../src/kernels/split/cpu_kernel.hh" #include "../../../src/kernels/split/cnnl_kernel.hh" +#include "../../../src/kernels/split/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -64,6 +65,7 @@ TEST(kernel, SplitCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOuts[0], *mluOuts[1], *mluOuts[2], *mluOuts[3]}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } { void const *inputs[]{data.data()}; diff --git a/src/04kernel/test/kernels/transpose/test_cnnl.cpp b/src/04kernel/test/kernels/transpose/test_cnnl.cpp index 4f4301d8..f4689071 100644 --- a/src/04kernel/test/kernels/transpose/test_cnnl.cpp +++ b/src/04kernel/test/kernels/transpose/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/transpose/cnnl_kernel.hh" #include "../../../src/kernels/transpose/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include #include @@ -42,6 +43,7 @@ TEST(kernel, TransposeCnnl) { void const *inputs[]{*mluIn}; void *outputs[]{*mluOut}; routine(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(dataTensor->elementsSize()); diff --git a/src/04kernel/test/kernels/where/test_cnnl.cpp b/src/04kernel/test/kernels/where/test_cnnl.cpp index aaad6585..6e26ddbd 100644 --- a/src/04kernel/test/kernels/where/test_cnnl.cpp +++ b/src/04kernel/test/kernels/where/test_cnnl.cpp @@ -2,6 +2,7 @@ #include "../../../src/kernels/where/cnnl_kernel.hh" #include "../../../src/kernels/where/cpu_kernel.hh" +#include "../src/utilities/bang/cnrt_functions.h" #include "hardware/device_manager.h" #include @@ -49,6 +50,7 @@ void testWhereCnnl(Shape cDim, Shape xDim, Shape yDim, Shape outDim) { void const *inputs[]{*mluC, *mluX, *mluY}; void *outputs[]{*mluOut}; rCnnl(res, *workspace, inputs, outputs); + kernel::bang::sync(); } // take output data std::vector result(outTensor->elementsSize()); diff --git a/src/09python_ffi/src/executor.cc b/src/09python_ffi/src/executor.cc index 947410cc..1d0e543f 100644 --- a/src/09python_ffi/src/executor.cc +++ b/src/09python_ffi/src/executor.cc @@ -76,7 +76,7 @@ namespace refactor::python_ffi { auto ans = _stream.bench(sync ? kernel::cuda::sync : nullptr); #else #ifdef USE_BANG - auto ans = _stream.bench(sync ? kernel::cnnl::sync : nullptr); + auto ans = _stream.bench(sync ? kernel::bang::sync : nullptr); #else auto ans = _stream.bench(nullptr); #endif @@ -222,7 +222,7 @@ namespace refactor::python_ffi { kernel::cuda::copyOut(buffer.data(), addresses[idx], size); #endif #ifdef USE_BANG - kernel::cnnl::copyOut(buffer.data(), addresses[idx], size); + kernel::bang::copyOut(buffer.data(), addresses[idx], size); #endif auto file = path / fmt::format("data{:06}.{}", dataIdx++, format);