From 7bb52ea05d5ad65dba5e947eaa5714c8616e441c Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Sun, 29 Sep 2024 14:14:05 +0800
Subject: [PATCH 1/6] fix cmake path

---
 cmake/nncaseruntimeConfig.cmake.in | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/nncaseruntimeConfig.cmake.in b/cmake/nncaseruntimeConfig.cmake.in
index cce581029..664c43d7d 100644
--- a/cmake/nncaseruntimeConfig.cmake.in
+++ b/cmake/nncaseruntimeConfig.cmake.in
@@ -1,5 +1,4 @@
 include(${CMAKE_CURRENT_LIST_DIR}/nncaseruntimeTargets.cmake)
-if(NOT TARGET gsl-lite)
-    find_package(gsl-lite REQUIRED)
-endif()
\ No newline at end of file
+set(nncaseruntime_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/../../../include)
+set(nncaseruntime_LIBS ${CMAKE_CURRENT_LIST_DIR}/../../libNncase.Runtime.Native.a)
\ No newline at end of file

From 0e243b543ab1ab711ccb2107573f6024ee60e178 Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Tue, 15 Oct 2024 14:16:51 +0800
Subject: [PATCH 2/6] optimize yolov10

OPS:
- add reduce max|min (TODO: mean|sum|prod) RVV impl
- opt softmax (v1)
---
 .../src/kernels/stackvm/optimized/reduce.cpp  |   5 +
 .../stackvm/optimized/riscv64/CMakeLists.txt  |   1 +
 .../stackvm/optimized/riscv64/reduce.cpp      | 256 ++++++++++--------
 .../stackvm/optimized/riscv64/softmax.cpp     |  93 +++++--
 4 files changed, 208 insertions(+), 147 deletions(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
index 77c197fb2..dc7b0e038 100644
--- a/src/Native/src/kernels/stackvm/optimized/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
@@ -32,6 +32,11 @@ result<void> optimized::reduce(
     gsl::span<const size_t> in_shape, gsl::span<const size_t> axis,
     gsl::span<const size_t> in_strides, gsl::span<const size_t> out_strides,
     bool keep_dims, kernel_context &context) noexcept {
+#if __riscv_vector
+    return stackvm::optimized::reduce(typecode, op, init_value, input, output,
+                                      in_shape, axis, in_strides, out_strides,
+                                      keep_dims, context);
+#endif
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt b/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
index e69de29bb..0759b9e50 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
@@ -0,0 +1 @@
+cmake_minimum_required (VERSION 3.13)
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
index 86908a239..0b6fe3b8e 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <riscv_vector.h>
 
 using namespace nncase;
 using namespace nncase::runtime;
@@ -26,140 +27,157 @@ using namespace nncase::kernels;
 using namespace nncase::kernels::stackvm;
 using namespace nncase::kernels::stackvm::optimized;
 
-#if __riscv_vector
-static void reduce_block(int dim, int block, const float *input, float *out,
-                         int gap) {
-    __asm volatile(
-        "vsetvli t0, %[block], e32, m8;"
-        "mv a1, %[input];"
-        "mv a3, %[gap];"
-        "mv a0, %[dim];"
-        "fcvt.s.w ft0, a0;"
-        "slli a3, a3, 2;"
-        "vmv.v.x v8, x0;"
-        "reduce_block%=:;"
-        "vle32.v v16, (a1);"
-        "vfadd.vv v8, v16, v8;"
-        "add a1,a1,a3;"
-        "addi a0, a0, -1;"
-        "bnez a0, reduce_block%=;"
-        "vfdiv.vf v8, v8, ft0;"
-        "vse32.v v8, ( %[out]);" ::[dim] "r"(dim),
"r"(input), [out] "r"(out), [gap] "r"(gap) - : "t0", "a0", "a1", "a3", "ft0", "v8", "v16"); -} +// #if __riscv_vector -static void reduce_mean(const float *input, float *out, int dim, int n) { - __asm volatile( - - "mv a1, %[input];" - "mv a2, %[out]; " - "mv a3, %[n];" - "fcvt.s.w ft0, %[dim];" - "reduce_mean_n_cycle%=:;" - "mv a0, %[dim];" - "vsetvli t0, a0, e32, m8;" - "vmv.s.x v16, x0;" - "reduce_mean2%=:;" - "vsetvli t0, a0, e32, m8;" - "vle32.v v8, (a1);" - "slli t1,t0, 2;" - "sub a0,a0,t0;" - "add a1, a1, t1;" - "vfredusum.vs v16,v8,v16;" - "bnez a0, reduce_mean2%=;" - "vfmv.f.s ft1, v16;" - "fdiv.s ft1,ft1,ft0;" - "fsw ft1, (a2);" - "addi a2, a2, 4;" - "addi a3,a3, -1;" - "bnez a3, reduce_mean_n_cycle%=;" ::[dim] "r"(dim), - [n] "r"(n), [input] "r"(input), [out] "r"(out) - : "t0", "t1", "a0", "a1", "a2", "a3", "ft0", "ft1", "v8", "v16"); -} +result reduce_max_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){ + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float max_val = in[base_index]; // 初始化最大值为第一个元素 + size_t remaining = reduce_size; -static void reduce_mean_s(int32_t c, int32_t dim, const float *input, - float *out, int32_t gap) { -#define BLOCK_N 32 - while (c--) { - const float *tmp_input = input; - for (int j = 0; j < gap / BLOCK_N; ++j) { - reduce_block(dim, BLOCK_N, tmp_input, out, gap); - tmp_input += BLOCK_N; - out += BLOCK_N; - } - int left_number = gap & (BLOCK_N - 1); - if (left_number) { - reduce_block(dim, left_number, tmp_input, out, gap); - out += left_number; + // set vlen and convert scaler to vector + if(0) + { + size_t vl = vsetvl_e32m1(remaining); + vfloat32m1_t v_max = vfmv_v_f_f32m1(max_val, vl); + + // process full registers data. + while (remaining / vl > 0) { + vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); + v_max = vfmax_vv_f32m1(v_max, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_max_ = + vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m1(remaining); + v_max = vfmv_v_f_f32m1(max_val, vl); + vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); + v_max = vfmax_vv_f32m1(v_max, v_in, vl); + reduced_max_ = + vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + } + } + else + { + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_max = vfmv_v_f_f32m4(max_val, vl); + + // process full registers data. 
+                while (remaining / vl > 0) {
+                    vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                    v_max = vfmax_vv_f32m4(v_max, v_in, vl);
+
+                    remaining -= vl;
+                    base_index += vl;
+                }
+                vfloat32m1_t reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                    vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl), vl);
+                max_val = vfmv_f_s_f32m1_f32(reduced_max_);
+
+                // process the remaining elements
+
+                // handle the elements left over
+                if (remaining > 0) {
+                    vl = vsetvl_e32m4(remaining);
+                    v_max = vfmv_v_f_f32m4(max_val, vl);
+                    vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                    v_max = vfmax_vv_f32m4(v_max, v_in, vl);
+                    reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                        vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl),
+                        vl);
+                    max_val = vfmv_f_s_f32m1_f32(reduced_max_);
+                }
+            }
+            out[i * inner_size + j] = max_val; // store the result
         }
-        input += dim * gap;
     }
+    return ok();
 }
 
-static int compute_size_by_index(gsl::span<const size_t> input, int start_index,
-                                 int end_index) {
-    int init_value = 1;
-    for (int i = start_index; i < end_index; ++i) {
-        init_value *= input[i];
-    }
-    return init_value;
-}
+result<void> reduce_min_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+    for (size_t i = 0; i < outter_size; ++i) {
+        size_t outer_offset = i * reduce_size * inner_size;
+        for (size_t j = 0; j < inner_size; ++j) {
+            size_t base_index = outer_offset + j;
+            float min_val = in[base_index]; // initialize min with the first element
+            size_t remaining = reduce_size;
 
-static int get_parameter(gsl::span<const size_t> in_shape,
-                         gsl::span<const size_t> axis, gsl::span<int> out) {
-    int min_index = axis[0];
-    int max_index = axis[0];
-    for (int i = 1; i < (int)axis.size(); ++i) {
-        int value = axis[i];
-        if (value < min_index)
-            min_index = value;
-        else if (value > max_index)
-            max_index = value;
-    }
-    int _sum1 = (max_index + min_index) * (max_index - min_index + 1) >> 1;
-    int _sum2 = axis[0];
-    for (int i = 1; i < (int)axis.size(); ++i) {
-        _sum2 += axis[i];
-    }
-    if (_sum2 != _sum1) {
-        return 1;
+            // set vlen and convert scalar to vector
+            size_t vl = vsetvl_e32m1(remaining);
+            vfloat32m1_t v_min = vfmv_v_f_f32m1(min_val, vl);
+
+            // process full registers data.
+            while (remaining / vl > 0) {
+                vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl);
+                v_min = vfmin_vv_f32m1(v_min, v_in, vl);
+
+                remaining -= vl;
+                base_index += vl;
+            }
+            vfloat32m1_t reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl);
+            min_val = vfmv_f_s_f32m1_f32(reduced_min_);
+
+            // process the remaining elements
+            vl = vsetvl_e32m1(remaining);
+            v_min = vfmv_v_f_f32m1(min_val, vl);
+            vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl);
+            v_min = vfmin_vv_f32m1(v_min, v_in, vl);
+            reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl);
+            min_val = vfmv_f_s_f32m1_f32(reduced_min_);
+
+            out[i * inner_size + j] = min_val; // store the result
+        }
     }
-    out[0] = compute_size_by_index(in_shape, min_index, max_index + 1);
-    out[1] = compute_size_by_index(in_shape, 0, min_index);
-    out[2] = compute_size_by_index(in_shape, max_index + 1, in_shape.size());
-    return 0;
+    return ok();
 }
-#endif
+// #endif
 
 result<void> optimized::reduce(
-    typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
-    const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
+    NNCASE_UNUSED typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
+    NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
     gsl::span<const size_t> in_shape, gsl::span<const size_t> axis,
-    gsl::span<const size_t> in_strides, gsl::span<const size_t> out_strides,
-    bool keep_dims, kernel_context &context) noexcept {
+    NNCASE_UNUSED gsl::span<const size_t> in_strides, NNCASE_UNUSED gsl::span<const size_t> out_strides,
+    NNCASE_UNUSED bool keep_dims, NNCASE_UNUSED kernel_context &context) noexcept {
 #if __riscv_vector
-    do {
-        if (op == reduce_op_t::mean && typecode == dt_float32) {
-            int parameters[3];
-            int ret = get_parameter(in_shape, axis, parameters);
-            if (ret) {
-                break;
-            }
-            auto input_data = IN_CAST(float, input);
-            auto out_data = OUT_CAST(float, output);
-            int gap = parameters[2];
-            if (gap == 1) {
-                reduce_mean(input_data, out_data, parameters[0], parameters[1]);
-            } else {
-                reduce_mean_s(parameters[1], parameters[0], input_data,
-                              out_data, gap);
-            }
-            return ok();
+    // The type of axis is 'size_t'. It is the real axis.
+    // compute inner_size and outter_size
+    size_t inner_size = 1, outter_size = 1;
+    size_t reduce_size = in_shape[axis[0]];
+
+    for (size_t i = 0; i < axis[0]; i++) { outter_size *= in_shape[i]; }
+
+    for (size_t i = axis[0]+1; i < in_shape.size(); i++) { inner_size *= in_shape[i]; }
+
+    const float* in = reinterpret_cast<const float *>(input);
+    float* out = reinterpret_cast<float *>(output);
+    if (axis.size() == 1)
+    {
+        switch(op)
+        {
+        case reduce_op_t::max:
+            return reduce_max_impl(in, out, outter_size, inner_size, reduce_size);
+        case reduce_op_t::min:
+            return reduce_min_impl(in, out, outter_size, inner_size, reduce_size);
+            break;
+        case reduce_op_t::sum:
+        case reduce_op_t::mean:
+        case reduce_op_t::prod:
+        default:
+            break;
         }
-    } while (0);
-#endif
+    }
+#endif
 
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
index 58442a40c..9656c18b0 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
@@ -249,51 +249,88 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             float *ptr_output_vl = ptr_output;
 
             // max
-            float max = std::numeric_limits<float>::lowest();
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v = vle32_v_f32m8(ptr_input_vl, vl);
-                auto s = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl);
-
-                s = vfredmax_vs_f32m8_f32m1(s, v, s, vl);
-                max = vfmv_f_s_f32m1_f32(s);
-                ptr_input_vl += vl;
-                n -= vl;
+            float max = *ptr_input_vl;
+            {
+                size_t vl = vsetvl_e32m4(n);
+                vfloat32m4_t s = vfmv_v_f_f32m4(max, vl);
+                while(n / vl > 0){
+                    vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
+                    s = vfmax_vv_f32m4(s, v, vl);
+
+                    n -= vl;
+                    ptr_input_vl += vl;
+                }
+
+                vfloat32m1_t reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                    vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
+                max = vfmv_f_s_f32m1_f32(reduced_max_);
+
+                if(n > 0){
+                    vl = vsetvl_e32m4(n);
+                    s = vfmv_v_f_f32m4(max, vl);
+                    vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
+                    s = vfmax_vv_f32m4(s, v, vl);
+                    reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                        vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
+                    max = vfmv_f_s_f32m1_f32(reduced_max_);
+                }
             }
 
             // exp((x - max) * beta) and sum(exp)
             float sum = 0.f;
             ptr_input_vl = ptr_input;
             n = axis_dim;
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+            {
+                auto vl = vsetvl_e32m4(n);
                 auto s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
 
-                auto v_out = exp_ps(
-                    vfmul_vf_f32m8(vfsub_vf_f32m8(v_in, max, vl), beta, vl),
-                    vl);
-                s = vfredosum_vs_f32m8_f32m1(s, v_out, s, vl);
+                while (n / vl > 0) {
+                    auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
+                    auto v_out = exp_ps(
+                        vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
+                        vl);
+                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
 
-                vse32_v_f32m8(ptr_output_vl, v_out, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+
+                    ptr_input_vl += vl;
+                    ptr_output_vl += vl;
+                    n -= vl;
+                }
                 sum = vfmv_f_s_f32m1_f32(s);
 
-                ptr_input_vl += vl;
-                ptr_output_vl += vl;
-                n -= vl;
+                if (n > 0) {
+                    vl = vsetvl_e32m4(n);
+                    s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
+                    auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
+                    auto v_out = exp_ps(
+                        vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
+                        vl);
+                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
+
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                    sum = vfmv_f_s_f32m1_f32(s);
+                }
             }
-
             // div
             ptr_input_vl = ptr_input;
             ptr_output_vl = ptr_output;
             n = axis_dim;
             sum = 1.0f / sum;
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
-                v_out = vfmul_vf_f32m8(v_out, sum, vl);
-                vse32_v_f32m8(ptr_output_vl, v_out, vl);
-                ptr_output_vl += vl;
-                n -= vl;
+            {
+                auto vl = vsetvl_e32m4(n);
+                while (n/vl>0) {
+                    auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
+                    v_out = vfmul_vf_f32m4(v_out, sum, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                    ptr_output_vl += vl;
+                    n -= vl;
+                }
+                if (n > 0){
+                    vl = vsetvl_e32m4(n);
+                    auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
+                    v_out = vfmul_vf_f32m4(v_out, sum, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                }
             }
 
             ptr_input += axis_dim;

From 9a824a91a51344f57697b129ff8f6e4ab6fe63f3 Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Wed, 16 Oct 2024 17:06:15 +0800
Subject: [PATCH 3/6] - add reduce prod
 - fix reduce sum | mean
 - opt softmax sum

---
 .../stackvm/optimized/riscv64/reduce.cpp      | 143 +++++++++++++++---
 .../stackvm/optimized/riscv64/softmax.cpp     |  16 +-
 2 files changed, 129 insertions(+), 30 deletions(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
index 0b6fe3b8e..53fa35742 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
@@ -27,18 +27,18 @@ using namespace nncase::kernels;
 using namespace nncase::kernels::stackvm;
 using namespace nncase::kernels::stackvm::optimized;
 
-// #if __riscv_vector
+#if __riscv_vector
 
 result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
             size_t base_index = outer_offset + j;
-            float max_val = in[base_index]; // initialize max with the first element
+            float max_val = in[base_index];
             size_t remaining = reduce_size;
 
             // set vlen and convert scalar to vector
-            if(0)
+            if (0) //m1
             {
                 size_t vl = vsetvl_e32m1(remaining);
                 vfloat32m1_t v_max = vfmv_v_f_f32m1(max_val, vl);
@@ -62,7 +62,7 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, s
                     max_val = vfmv_f_s_f32m1_f32(reduced_max_);
                 }
             }
-            else
+            else // m4
             {
                 // set vlen and convert scalar to vector
                 size_t vl = vsetvl_e32m4(remaining);
@@ -84,76 +84,176 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, s
                 max_val = vfmv_f_s_f32m1_f32(reduced_max_);
 
                 // process the remaining elements
-
-                // handle the elements left over
                 if (remaining > 0) {
                     vl = vsetvl_e32m4(remaining);
                     v_max = vfmv_v_f_f32m4(max_val, vl);
                     vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
                     v_max = vfmax_vv_f32m4(v_max, v_in, vl);
                     reduced_max_ = vfredmax_vs_f32m4_f32m1(
                         vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl),
                         vl);
                     max_val = vfmv_f_s_f32m1_f32(reduced_max_);
                 }
             }
-            out[i * inner_size + j] = max_val; // store the result
+            out[i * inner_size + j] = max_val;
         }
     }
     return ok();
 }
 
 result<void> reduce_min_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
             size_t base_index = outer_offset + j;
-            float min_val = in[base_index]; // initialize min with the first element
+            float min_val = in[base_index];
             size_t remaining = reduce_size;
 
             // set vlen and convert scalar to vector
-            size_t vl = vsetvl_e32m1(remaining);
-            vfloat32m1_t v_min = vfmv_v_f_f32m1(min_val, vl);
+            size_t vl = vsetvl_e32m4(remaining);
+            vfloat32m4_t v_min = vfmv_v_f_f32m4(min_val, vl);
 
-            // process full registers data.
-            while (remaining / vl > 0) {
-                vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl);
-                v_min = vfmin_vv_f32m1(v_min, v_in, vl);
+            // process full registers data.
while (remaining / vl > 0) { - vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); - v_min = vfmin_vv_f32m1(v_min, v_in, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_min = vfmin_vv_f32m4(v_min, v_in, vl); remaining -= vl; base_index += vl; } - vfloat32m1_t reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl); + vfloat32m1_t reduced_min_ = vfredmin_vs_f32m4_f32m1( + vundefined_f32m1(), v_min, vfmv_v_f_f32m1(min_val, vl), vl); min_val = vfmv_f_s_f32m1_f32(reduced_min_); // process the remaining elements - vl = vsetvl_e32m1(remaining); - v_min = vfmv_v_f_f32m1(min_val, vl); - vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); - v_min = vfmin_vv_f32m1(v_min, v_in, vl); - reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl); - min_val = vfmv_f_s_f32m1_f32(reduced_min_); + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_min = vfmv_v_f_f32m4(min_val, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_min = vfmin_vv_f32m4(v_min, v_in, vl); + reduced_min_ = vfredmin_vs_f32m4_f32m1( + vundefined_f32m1(), v_min, vfmv_v_f_f32m1(min_val, vl), vl); + min_val = vfmv_f_s_f32m1_f32(reduced_min_); + } - out[i * inner_size + j] = min_val; // 存储结果 + out[i * inner_size + j] = min_val; } } return ok(); } -// #endif + +result reduce_sum_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){ + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float sum = 0.0f; + size_t remaining = reduce_size; + + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_sum = vfmv_v_f_f32m4(0.0f, vl); + + // process full registers data. + while (remaining / vl > 0) { + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_sum = vfadd_vv_f32m4(v_sum, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_sum_ = vfredosum_vs_f32m4_f32m1( + vundefined_f32m1(), v_sum, vfmv_v_f_f32m1(0.0f, vl), vl); + sum += vfmv_f_s_f32m1_f32(reduced_sum_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_sum = vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_sum = vfadd_vv_f32m4(v_sum, v_in, vl); + reduced_sum_ = vfredosum_vs_f32m4_f32m1( + vundefined_f32m1(), v_sum, vfmv_v_f_f32m1(0.0f, vl), vl); + sum += vfmv_f_s_f32m1_f32(reduced_sum_); + } + + out[i * inner_size + j] = sum; + } + } + return ok(); +} + +result reduce_prod_impl(const float *in, float *out, size_t outter_size, + size_t inner_size, size_t reduce_size) { + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float acc = 1.0f; + size_t remaining = reduce_size; + + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_acc = vfmv_v_f_f32m4(1.0f, vl); + + // process full registers data. 
+            while (remaining / vl > 0) {
+                vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                v_acc = vfmul_vv_f32m4(v_acc, v_in, vl);
+
+                remaining -= vl;
+                base_index += vl;
+            }
+            for (size_t i = 0; i < vl; i++) {
+                acc *= vfmv_f_s_f32m4_f32(
+                    vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
+            }
+
+            // process the remaining elements
+            if (remaining > 0) {
+                vl = vsetvl_e32m4(remaining);
+                v_acc = vfmv_v_f_f32m4(1.0f, vl);
+                vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                v_acc = vfmul_vv_f32m4(v_acc, v_in, vl);
+                for (size_t i = 0; i < vl; i++) {
+                acc *= vfmv_f_s_f32m4_f32(
+                    vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
+                }
+            }
+
+            out[i * inner_size + j] = acc;
+        }
+    }
+    return ok();
+}
+
+#endif
 
 result<void> optimized::reduce(
     NNCASE_UNUSED typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
     NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
@@ -160,20 +260,29 @@ result<void> optimized::reduce(
     const float* in = reinterpret_cast<const float *>(input);
     float* out = reinterpret_cast<float *>(output);
-    if (axis.size() == 1)
+    if (axis.size() == 1 && axis[0] == in_shape.size() - 1)
     {
         switch(op)
         {
         case reduce_op_t::max:
             return reduce_max_impl(in, out, outter_size, inner_size, reduce_size);
         case reduce_op_t::min:
             return reduce_min_impl(in, out, outter_size, inner_size, reduce_size);
-            break;
         case reduce_op_t::sum:
+            return reduce_sum_impl(in, out, outter_size, inner_size, reduce_size);
         case reduce_op_t::mean:
+            reduce_sum_impl(in, out, outter_size, inner_size, reduce_size).unwrap();
+            for(size_t i = 0; i < outter_size; i++)
+            {
+                out[i] *= 1.0f / reduce_size;
+            }
+            return ok();
         case reduce_op_t::prod:
+            return reduce_prod_impl(in, out, outter_size, inner_size, reduce_size);
         default:
             break;
         }
     }
+    // TODO: implement non-last axis reduce
+    // TODO: implement multi-axis reduce
 #endif
 
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
index 9656c18b0..38302ef58 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
@@ -282,33 +282,35 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             n = axis_dim;
             {
                 auto vl = vsetvl_e32m4(n);
-                auto s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
+                auto s = vfmv_v_f_f32m4(0.0f, vl);
 
                 while (n / vl > 0) {
                     auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
                     auto v_out = exp_ps(
                         vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
                         vl);
-                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
-
+                    s = vfadd_vv_f32m4(s, v_out, vl);
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
 
                     ptr_input_vl += vl;
                     ptr_output_vl += vl;
                     n -= vl;
                 }
-                sum = vfmv_f_s_f32m1_f32(s);
+                vfloat32m1_t reduce_sum_ = vfredosum_vs_f32m4_f32m1(
+                    vundefined_f32m1(), s, vfmv_v_f_f32m1(0.0f, vl), vl);
+                sum += vfmv_f_s_f32m1_f32(reduce_sum_);
+
                 if (n > 0) {
                     vl = vsetvl_e32m4(n);
-                    s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
                     auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
                     auto v_out = exp_ps(
                         vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
                         vl);
-                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
+                    reduce_sum_ = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v_out,
+                                                           vfmv_v_f_f32m1(0.0f, vl), vl);
 
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
-                    sum = vfmv_f_s_f32m1_f32(s);
+                    sum += vfmv_f_s_f32m1_f32(reduce_sum_);
                 }
             }
             // div

From 4458a11d9f396241dcb261ead1d5b33c6ff703e7 Mon Sep 17 00:00:00 2001
From: curioyang
Date: Wed, 16 Oct 2024 09:10:36 +0000
Subject: [PATCH 4/6] Apply code-format changes

---
 .../stackvm/optimized/riscv64/reduce.cpp      | 67 +++++++++++--------
 .../stackvm/optimized/riscv64/softmax.cpp     | 11 +--
 2 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
index 53fa35742..62326326d 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
@@ -29,7 +29,8 @@ using namespace nncase::kernels::stackvm::optimized;
 
 #if __riscv_vector
 
-result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
+                             size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -38,7 +39,7 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
             size_t remaining = reduce_size;
 
             // set vlen and convert scalar to vector
-            if (0) //m1
+            if (0) // m1
             {
                 size_t vl = vsetvl_e32m1(remaining);
                 vfloat32m1_t v_max = vfmv_v_f_f32m1(max_val, vl);
@@ -65,8 +66,7 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
                     vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl);
                 max_val = vfmv_f_s_f32m1_f32(reduced_max_);
             }
-            }
-            else // m4
+            } else // m4
             {
                 // set vlen and convert scalar to vector
                 size_t vl = vsetvl_e32m4(remaining);
@@ -102,7 +102,8 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
     return ok();
 }
 
-result<void> reduce_min_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+result<void> reduce_min_impl(const float *in, float *out, size_t outter_size,
+                             size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -143,7 +144,8 @@ result<void> reduce_min_impl(const float *in, float *out, size_t outter_size,
     return ok();
 }
 
-result<void> reduce_sum_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+result<void> reduce_sum_impl(const float *in, float *out, size_t outter_size,
+                             size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -185,7 +187,7 @@ result<void> reduce_sum_impl(const float *in, float *out, size_t outter_size,
 }
 
 result<void> reduce_prod_impl(const float *in, float *out, size_t outter_size,
-                       size_t inner_size, size_t reduce_size) {
+                              size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -217,8 +219,8 @@ result<void> reduce_prod_impl(const float *in, float *out, size_t outter_size,
                 vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
                 v_acc = vfmul_vv_f32m4(v_acc, v_in, vl);
                 for (size_t i = 0; i < vl; i++) {
-                acc *= vfmv_f_s_f32m4_f32(
-                    vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
+                    acc *= vfmv_f_s_f32m4_f32(
+                        vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
                 }
             }
 
@@ -234,41 +236,50 @@ result<void> reduce_prod_impl(const float *in, float *out, size_t outter_size,
 
 result<void> optimized::reduce(
     NNCASE_UNUSED typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
-    NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
-    gsl::span<const size_t> in_shape, gsl::span<const size_t> axis,
-    NNCASE_UNUSED gsl::span<const size_t> in_strides, NNCASE_UNUSED gsl::span<const size_t> out_strides,
-    NNCASE_UNUSED bool keep_dims, NNCASE_UNUSED kernel_context &context) noexcept {
+    NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input,
+    gsl::byte *output, gsl::span<const size_t> in_shape,
+    gsl::span<const size_t> axis,
+    NNCASE_UNUSED gsl::span<const size_t> in_strides,
+    NNCASE_UNUSED gsl::span<const size_t> out_strides,
+    NNCASE_UNUSED bool keep_dims,
+    NNCASE_UNUSED kernel_context &context) noexcept {
 #if __riscv_vector
     // The type of axis is 'size_t'. It is the real axis.
     // compute inner_size and outter_size
     size_t inner_size = 1, outter_size = 1;
     size_t reduce_size = in_shape[axis[0]];
 
-    for (size_t i = 0; i < axis[0]; i++) { outter_size *= in_shape[i]; }
+    for (size_t i = 0; i < axis[0]; i++) {
+        outter_size *= in_shape[i];
+    }
 
-    for (size_t i = axis[0]+1; i < in_shape.size(); i++) { inner_size *= in_shape[i]; }
+    for (size_t i = axis[0] + 1; i < in_shape.size(); i++) {
+        inner_size *= in_shape[i];
+    }
 
-    const float* in = reinterpret_cast<const float *>(input);
-    float* out = reinterpret_cast<float *>(output);
-    if (axis.size() == 1 && axis[0] == in_shape.size() - 1)
-    {
-        switch(op)
-        {
+    const float *in = reinterpret_cast<const float *>(input);
+    float *out = reinterpret_cast<float *>(output);
+    if (axis.size() == 1 && axis[0] == in_shape.size() - 1) {
+        switch (op) {
         case reduce_op_t::max:
-            return reduce_max_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_max_impl(in, out, outter_size, inner_size,
+                                   reduce_size);
         case reduce_op_t::min:
-            return reduce_min_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_min_impl(in, out, outter_size, inner_size,
+                                   reduce_size);
         case reduce_op_t::sum:
-            return reduce_sum_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_sum_impl(in, out, outter_size, inner_size,
+                                   reduce_size);
         case reduce_op_t::mean:
-            reduce_sum_impl(in, out, outter_size, inner_size, reduce_size).unwrap();
-            for(size_t i = 0; i < outter_size; i++)
-            {
+            reduce_sum_impl(in, out, outter_size, inner_size, reduce_size)
+                .unwrap();
+            for (size_t i = 0; i < outter_size; i++) {
                 out[i] *= 1.0f / reduce_size;
             }
             return ok();
         case reduce_op_t::prod:
-            return reduce_prod_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_prod_impl(in, out, outter_size, inner_size,
+                                    reduce_size);
         default:
             break;
         }
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
index 38302ef58..4539baee7 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
@@ -253,7 +253,7 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             {
                 size_t vl = vsetvl_e32m4(n);
                 vfloat32m4_t s = vfmv_v_f_f32m4(max, vl);
-                while(n / vl > 0){
+                while (n / vl > 0) {
                     vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
                     s = vfmax_vv_f32m4(s, v, vl);
 
@@ -265,7 +265,7 @@ result<void> optimized_softmax_impl(const T *input, T *output,
                     vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
                 max = vfmv_f_s_f32m1_f32(reduced_max_);
 
-                if(n > 0){
+                if (n > 0) {
                     vl = vsetvl_e32m4(n);
                     s = vfmv_v_f_f32m4(max, vl);
                     vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
@@ -306,7 +306,8 @@ result<void> optimized_softmax_impl(const T *input, T *output,
                     auto v_out = exp_ps(
                         vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
                         vl);
-                    reduce_sum_ = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v_out,
-                                                           vfmv_v_f_f32m1(0.0f, vl), vl);
+                    reduce_sum_ =
+                        vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v_out,
+                                                 vfmv_v_f_f32m1(0.0f, vl), vl);
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
 
@@ -320,14 +321,14 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             sum = 1.0f / sum;
             {
                 auto vl = vsetvl_e32m4(n);
-                while (n/vl>0) {
+                while (n / vl > 0) {
                     auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
                     v_out = vfmul_vf_f32m4(v_out, sum, vl);
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
                     ptr_output_vl += vl;
                     n -= vl;
                 }
-                if (n > 0){
+                if (n > 0) {
                     vl = vsetvl_e32m4(n);
                     auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
                     v_out = vfmul_vf_f32m4(v_out, sum, vl);

From a2bc1b8283411535306b09953b58de750258b5ea Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Fri, 18 Oct 2024 09:48:44 +0800
Subject: [PATCH 5/6] fix review

---
 src/Native/src/kernels/stackvm/optimized/reduce.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
index dc7b0e038..4cbe6d669 100644
--- a/src/Native/src/kernels/stackvm/optimized/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
@@ -36,8 +36,9 @@ result<void> optimized::reduce(
     return stackvm::optimized::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
-#endif
+#else
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
+#endif
 }

From 1bc4c934f1334b195accf4169816ac7ea7007d54 Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Sun, 29 Sep 2024 14:14:05 +0800
Subject: [PATCH 6/6] fix cmake path

---
 cmake/nncaseruntimeConfig.cmake.in | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/nncaseruntimeConfig.cmake.in b/cmake/nncaseruntimeConfig.cmake.in
index cce581029..664c43d7d 100644
--- a/cmake/nncaseruntimeConfig.cmake.in
+++ b/cmake/nncaseruntimeConfig.cmake.in
@@ -1,5 +1,4 @@
 include(${CMAKE_CURRENT_LIST_DIR}/nncaseruntimeTargets.cmake)
-if(NOT TARGET gsl-lite)
-    find_package(gsl-lite REQUIRED)
-endif()
\ No newline at end of file
+set(nncaseruntime_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/../../../include)
+set(nncaseruntime_LIBS ${CMAKE_CURRENT_LIST_DIR}/../../libNncase.Runtime.Native.a)
\ No newline at end of file
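
Usage note for the nncaseruntimeConfig.cmake.in change (patches 1/6 and 6/6): the config file now exports plain variables instead of requiring gsl-lite. Below is a minimal consumer sketch, assuming an installed runtime tree; the `demo` project and target names are hypothetical, only the two exported variables come from the patch:

    cmake_minimum_required(VERSION 3.13)
    project(demo LANGUAGES CXX)

    # nncaseruntime_DIR must point at the directory that contains
    # nncaseruntimeConfig.cmake inside the installed runtime package.
    find_package(nncaseruntime REQUIRED)

    add_executable(demo main.cpp)
    # Consume the variables exported by nncaseruntimeConfig.cmake.
    target_include_directories(demo PRIVATE ${nncaseruntime_INCLUDE_DIRS})
    target_link_libraries(demo PRIVATE ${nncaseruntime_LIBS})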