From 7bb52ea05d5ad65dba5e947eaa5714c8616e441c Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Sun, 29 Sep 2024 14:14:05 +0800
Subject: [PATCH 1/6] fix cmake path

---
 cmake/nncaseruntimeConfig.cmake.in | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/nncaseruntimeConfig.cmake.in b/cmake/nncaseruntimeConfig.cmake.in
index cce581029..664c43d7d 100644
--- a/cmake/nncaseruntimeConfig.cmake.in
+++ b/cmake/nncaseruntimeConfig.cmake.in
@@ -1,5 +1,4 @@
 include(${CMAKE_CURRENT_LIST_DIR}/nncaseruntimeTargets.cmake)
-if(NOT TARGET gsl-lite)
-    find_package(gsl-lite REQUIRED)
-endif()
\ No newline at end of file
+set(nncaseruntime_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/../../../include)
+set(nncaseruntime_LIBS ${CMAKE_CURRENT_LIST_DIR}/../../libNncase.Runtime.Native.a)
\ No newline at end of file

From 0e243b543ab1ab711ccb2107573f6024ee60e178 Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Tue, 15 Oct 2024 14:16:51 +0800
Subject: [PATCH 2/6] optimize yolov10

OPS:
- add reduce max|min (TODO: mean|sum|prod) RVV impl
- opt softmax (v1)
---
 .../src/kernels/stackvm/optimized/reduce.cpp  |   5 +
 .../stackvm/optimized/riscv64/CMakeLists.txt  |   1 +
 .../stackvm/optimized/riscv64/reduce.cpp      | 256 ++++++++++--------
 .../stackvm/optimized/riscv64/softmax.cpp     |  93 +++++--
 4 files changed, 208 insertions(+), 147 deletions(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
index 77c197fb2..dc7b0e038 100644
--- a/src/Native/src/kernels/stackvm/optimized/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
@@ -32,6 +32,11 @@ result<void> optimized::reduce(
     gsl::span<const size_t> in_shape, gsl::span<const size_t> axis,
     gsl::span<const size_t> in_strides, gsl::span<const size_t> out_strides,
     bool keep_dims, kernel_context &context) noexcept {
+#if __riscv_vector
+    return stackvm::optimized::reduce(typecode, op, init_value, input, output,
+                                      in_shape, axis, in_strides, out_strides,
+                                      keep_dims, context);
+#endif
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt b/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
index e69de29bb..0759b9e50 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/CMakeLists.txt
@@ -0,0 +1 @@
+cmake_minimum_required (VERSION 3.13)
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
index 86908a239..0b6fe3b8e 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <riscv_vector.h>
 
 using namespace nncase;
 using namespace nncase::runtime;
@@ -26,140 +27,157 @@ using namespace nncase::kernels;
 using namespace nncase::kernels::stackvm;
 using namespace nncase::kernels::stackvm::optimized;
 
-#if __riscv_vector
-static void reduce_block(int dim, int block, const float *input, float *out,
-                         int gap) {
-    __asm volatile(
-        "vsetvli t0, %[block], e32, m8;"
-        "mv a1, %[input];"
-        "mv a3, %[gap];"
-        "mv a0, %[dim];"
-        "fcvt.s.w ft0, a0;"
-        "slli a3, a3, 2;"
-        "vmv.v.x v8, x0;"
-        "reduce_block%=:;"
-        "vle32.v v16, (a1);"
-        "vfadd.vv v8, v16, v8;"
-        "add a1,a1,a3;"
-        "addi a0, a0, -1;"
-        "bnez a0, reduce_block%=;"
-        "vfdiv.vf v8, v8, ft0;"
-        "vse32.v v8, ( %[out]);" ::[dim] "r"(dim),
"r"(input), [out] "r"(out), [gap] "r"(gap) - : "t0", "a0", "a1", "a3", "ft0", "v8", "v16"); -} +// #if __riscv_vector -static void reduce_mean(const float *input, float *out, int dim, int n) { - __asm volatile( - - "mv a1, %[input];" - "mv a2, %[out]; " - "mv a3, %[n];" - "fcvt.s.w ft0, %[dim];" - "reduce_mean_n_cycle%=:;" - "mv a0, %[dim];" - "vsetvli t0, a0, e32, m8;" - "vmv.s.x v16, x0;" - "reduce_mean2%=:;" - "vsetvli t0, a0, e32, m8;" - "vle32.v v8, (a1);" - "slli t1,t0, 2;" - "sub a0,a0,t0;" - "add a1, a1, t1;" - "vfredusum.vs v16,v8,v16;" - "bnez a0, reduce_mean2%=;" - "vfmv.f.s ft1, v16;" - "fdiv.s ft1,ft1,ft0;" - "fsw ft1, (a2);" - "addi a2, a2, 4;" - "addi a3,a3, -1;" - "bnez a3, reduce_mean_n_cycle%=;" ::[dim] "r"(dim), - [n] "r"(n), [input] "r"(input), [out] "r"(out) - : "t0", "t1", "a0", "a1", "a2", "a3", "ft0", "ft1", "v8", "v16"); -} +result reduce_max_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){ + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float max_val = in[base_index]; // 初始化最大值为第一个元素 + size_t remaining = reduce_size; -static void reduce_mean_s(int32_t c, int32_t dim, const float *input, - float *out, int32_t gap) { -#define BLOCK_N 32 - while (c--) { - const float *tmp_input = input; - for (int j = 0; j < gap / BLOCK_N; ++j) { - reduce_block(dim, BLOCK_N, tmp_input, out, gap); - tmp_input += BLOCK_N; - out += BLOCK_N; - } - int left_number = gap & (BLOCK_N - 1); - if (left_number) { - reduce_block(dim, left_number, tmp_input, out, gap); - out += left_number; + // set vlen and convert scaler to vector + if(0) + { + size_t vl = vsetvl_e32m1(remaining); + vfloat32m1_t v_max = vfmv_v_f_f32m1(max_val, vl); + + // process full registers data. + while (remaining / vl > 0) { + vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); + v_max = vfmax_vv_f32m1(v_max, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_max_ = + vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m1(remaining); + v_max = vfmv_v_f_f32m1(max_val, vl); + vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); + v_max = vfmax_vv_f32m1(v_max, v_in, vl); + reduced_max_ = + vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl); + max_val = vfmv_f_s_f32m1_f32(reduced_max_); + } + } + else + { + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_max = vfmv_v_f_f32m4(max_val, vl); + + // process full registers data. 
+                while (remaining / vl > 0) {
+                    vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                    v_max = vfmax_vv_f32m4(v_max, v_in, vl);
+
+                    remaining -= vl;
+                    base_index += vl;
+                }
+                vfloat32m1_t reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                    vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl), vl);
+                max_val = vfmv_f_s_f32m1_f32(reduced_max_);
+
+                // process the remaining elements
+
+                // handle the elements left over
+                if (remaining > 0) {
+                    vl = vsetvl_e32m4(remaining);
+                    v_max = vfmv_v_f_f32m4(max_val, vl);
+                    vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                    v_max = vfmax_vv_f32m4(v_max, v_in, vl);
+                    reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                        vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl),
+                        vl);
+                    max_val = vfmv_f_s_f32m1_f32(reduced_max_);
+                }
+            }
+            out[i * inner_size + j] = max_val; // store the result
         }
-        input += dim * gap;
     }
+    return ok();
 }
 
-static int compute_size_by_index(gsl::span<const size_t> input, int start_index,
-                                 int end_index) {
-    int init_value = 1;
-    for (int i = start_index; i < end_index; ++i) {
-        init_value *= input[i];
-    }
-    return init_value;
-}
+result<void> reduce_min_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+    for (size_t i = 0; i < outter_size; ++i) {
+        size_t outer_offset = i * reduce_size * inner_size;
+        for (size_t j = 0; j < inner_size; ++j) {
+            size_t base_index = outer_offset + j;
+            float min_val = in[base_index]; // initialize min with the first element
+            size_t remaining = reduce_size;
 
-static int get_parameter(gsl::span<const size_t> in_shape,
-                         gsl::span<const size_t> axis, gsl::span<int> out) {
-    int min_index = axis[0];
-    int max_index = axis[0];
-    for (int i = 1; i < (int)axis.size(); ++i) {
-        int value = axis[i];
-        if (value < min_index)
-            min_index = value;
-        else if (value > max_index)
-            max_index = value;
-    }
-    int _sum1 = (max_index + min_index) * (max_index - min_index + 1) >> 1;
-    int _sum2 = axis[0];
-    for (int i = 1; i < (int)axis.size(); ++i) {
-        _sum2 += axis[i];
-    }
-    if (_sum2 != _sum1) {
-        return 1;
+            // set vlen and convert scalar to vector
+            size_t vl = vsetvl_e32m1(remaining);
+            vfloat32m1_t v_min = vfmv_v_f_f32m1(min_val, vl);
+
+            // process full registers data.
+            while (remaining / vl > 0) {
+                vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl);
+                v_min = vfmin_vv_f32m1(v_min, v_in, vl);
+
+                remaining -= vl;
+                base_index += vl;
+            }
+            vfloat32m1_t reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl);
+            min_val = vfmv_f_s_f32m1_f32(reduced_min_);
+
+            // process the remaining elements
+            vl = vsetvl_e32m1(remaining);
+            v_min = vfmv_v_f_f32m1(min_val, vl);
+            vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl);
+            v_min = vfmin_vv_f32m1(v_min, v_in, vl);
+            reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl);
+            min_val = vfmv_f_s_f32m1_f32(reduced_min_);
+
+            out[i * inner_size + j] = min_val; // store the result
+        }
     }
-    out[0] = compute_size_by_index(in_shape, min_index, max_index + 1);
-    out[1] = compute_size_by_index(in_shape, 0, min_index);
-    out[2] = compute_size_by_index(in_shape, max_index + 1, in_shape.size());
-    return 0;
+    return ok();
 }
-#endif
+// #endif
 
 result<void> optimized::reduce(
-    typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
-    const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
+    NNCASE_UNUSED typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
+    NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
     gsl::span<const size_t> in_shape, gsl::span<const size_t> axis,
-    gsl::span<const size_t> in_strides, gsl::span<const size_t> out_strides,
-    bool keep_dims, kernel_context &context) noexcept {
+    NNCASE_UNUSED gsl::span<const size_t> in_strides, NNCASE_UNUSED gsl::span<const size_t> out_strides,
+    NNCASE_UNUSED bool keep_dims, NNCASE_UNUSED kernel_context &context) noexcept {
 #if __riscv_vector
-    do {
-        if (op == reduce_op_t::mean && typecode == dt_float32) {
-            int parameters[3];
-            int ret = get_parameter(in_shape, axis, parameters);
-            if (ret) {
-                break;
-            }
-            auto input_data = IN_CAST(float, input);
-            auto out_data = OUT_CAST(float, output);
-            int gap = parameters[2];
-            if (gap == 1) {
-                reduce_mean(input_data, out_data, parameters[0], parameters[1]);
-            } else {
-                reduce_mean_s(parameters[1], parameters[0], input_data,
-                              out_data, gap);
-            }
-            return ok();
+    // The type of axis is 'size_t'. It is the real axis.
+    // compute inner_size and outter_size
+    size_t inner_size = 1, outter_size = 1;
+    size_t reduce_size = in_shape[axis[0]];
+
+    for (size_t i = 0; i < axis[0]; i++) { outter_size *= in_shape[i]; }
+
+    for (size_t i = axis[0]+1; i < in_shape.size(); i++) { inner_size *= in_shape[i]; }
+
+    const float* in = reinterpret_cast<const float *>(input);
+    float* out = reinterpret_cast<float *>(output);
+    if (axis.size() == 1)
+    {
+        switch(op)
+        {
+        case reduce_op_t::max:
+            return reduce_max_impl(in, out, outter_size, inner_size, reduce_size);
+        case reduce_op_t::min:
+            return reduce_min_impl(in, out, outter_size, inner_size, reduce_size);
+            break;
+        case reduce_op_t::sum:
+        case reduce_op_t::mean:
+        case reduce_op_t::prod:
+        default:
+            break;
         }
-    } while (0);
-#endif
+    }
+#endif
 
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
index 58442a40c..9656c18b0 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
@@ -249,51 +249,88 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             float *ptr_output_vl = ptr_output;
 
             // max
-            float max = std::numeric_limits<float>::lowest();
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v = vle32_v_f32m8(ptr_input_vl, vl);
-                auto s = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl);
-
-                s = vfredmax_vs_f32m8_f32m1(s, v, s, vl);
-                max = vfmv_f_s_f32m1_f32(s);
-                ptr_input_vl += vl;
-                n -= vl;
+            float max = *ptr_input_vl;
+            {
+                size_t vl = vsetvl_e32m4(n);
+                vfloat32m4_t s = vfmv_v_f_f32m4(max, vl);
+                while(n / vl > 0){
+                    vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
+                    s = vfmax_vv_f32m4(s, v, vl);
+
+                    n -= vl;
+                    ptr_input_vl += vl;
+                }
+
+                vfloat32m1_t reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                    vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
+                max = vfmv_f_s_f32m1_f32(reduced_max_);
+
+                if(n > 0){
+                    vl = vsetvl_e32m4(n);
+                    s = vfmv_v_f_f32m4(max, vl);
+                    vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
+                    s = vfmax_vv_f32m4(s, v, vl);
+                    reduced_max_ = vfredmax_vs_f32m4_f32m1(
+                        vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
+                    max = vfmv_f_s_f32m1_f32(reduced_max_);
+                }
             }
 
             // exp((x - max) * beta) and sum(exp)
             float sum = 0.f;
             ptr_input_vl = ptr_input;
             n = axis_dim;
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v_in = vle32_v_f32m8(ptr_input_vl, vl);
+            {
+                auto vl = vsetvl_e32m4(n);
                 auto s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
 
-                auto v_out = exp_ps(
-                    vfmul_vf_f32m8(vfsub_vf_f32m8(v_in, max, vl), beta, vl),
-                    vl);
-                s = vfredosum_vs_f32m8_f32m1(s, v_out, s, vl);
+                while (n / vl > 0) {
+                    auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
+                    auto v_out = exp_ps(
+                        vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
+                        vl);
+                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
 
-                vse32_v_f32m8(ptr_output_vl, v_out, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+
+                    ptr_input_vl += vl;
+                    ptr_output_vl += vl;
+                    n -= vl;
+                }
                 sum = vfmv_f_s_f32m1_f32(s);
 
-                ptr_input_vl += vl;
-                ptr_output_vl += vl;
-                n -= vl;
+                if (n > 0) {
+                    vl = vsetvl_e32m4(n);
+                    s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
+                    auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
+                    auto v_out = exp_ps(
+                        vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
+                        vl);
+                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
+
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                    sum = vfmv_f_s_f32m1_f32(s);
+                }
             }
-
             // div
             ptr_input_vl = ptr_input;
             ptr_output_vl = ptr_output;
             n = axis_dim;
             sum = 1.0f / sum;
-            while (n) {
-                auto vl = vsetvl_e32m8(n);
-                auto v_out = vle32_v_f32m8(ptr_output_vl, vl);
-                v_out = vfmul_vf_f32m8(v_out, sum, vl);
-                vse32_v_f32m8(ptr_output_vl, v_out, vl);
-                ptr_output_vl += vl;
-                n -= vl;
+            {
+                auto vl = vsetvl_e32m4(n);
+                while (n/vl>0) {
+                    auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
+                    v_out = vfmul_vf_f32m4(v_out, sum, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                    ptr_output_vl += vl;
+                    n -= vl;
+                }
+                if (n > 0){
+                    vl = vsetvl_e32m4(n);
+                    auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
+                    v_out = vfmul_vf_f32m4(v_out, sum, vl);
+                    vse32_v_f32m4(ptr_output_vl, v_out, vl);
+                }
             }
 
             ptr_input += axis_dim;

From 9a824a91a51344f57697b129ff8f6e4ab6fe63f3 Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Wed, 16 Oct 2024 17:06:15 +0800
Subject: [PATCH 3/6] - add reduce prod
 - fix reduce sum | mean
 - opt softmax sum

---
 .../stackvm/optimized/riscv64/reduce.cpp      | 143 +++++++++++++++---
 .../stackvm/optimized/riscv64/softmax.cpp     |  16 +-
 2 files changed, 129 insertions(+), 30 deletions(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
index 0b6fe3b8e..53fa35742 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
@@ -27,18 +27,18 @@ using namespace nncase::kernels;
 using namespace nncase::kernels::stackvm;
 using namespace nncase::kernels::stackvm::optimized;
 
-// #if __riscv_vector
+#if __riscv_vector
 
 result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
             size_t base_index = outer_offset + j;
-            float max_val = in[base_index]; // initialize max with the first element
+            float max_val = in[base_index];
             size_t remaining = reduce_size;
 
             // set vlen and convert scalar to vector
-            if(0)
+            if (0) //m1
             {
                 size_t vl = vsetvl_e32m1(remaining);
                 vfloat32m1_t v_max = vfmv_v_f_f32m1(max_val, vl);
@@ -62,7 +62,7 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, s
                     max_val = vfmv_f_s_f32m1_f32(reduced_max_);
                 }
             }
-            else
+            else // m4
             {
                 // set vlen and convert scalar to vector
                 size_t vl = vsetvl_e32m4(remaining);
@@ -84,76 +84,176 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, s
                 max_val = vfmv_f_s_f32m1_f32(reduced_max_);
 
                 // process the remaining elements
-
-                // handle the elements left over
                 if (remaining > 0) {
                     vl = vsetvl_e32m4(remaining);
                     v_max = vfmv_v_f_f32m4(max_val, vl);
                     vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
                     v_max = vfmax_vv_f32m4(v_max, v_in, vl);
                     reduced_max_ = vfredmax_vs_f32m4_f32m1(
                         vundefined_f32m1(), v_max, vfmv_v_f_f32m1(max_val, vl),
                         vl);
                     max_val = vfmv_f_s_f32m1_f32(reduced_max_);
                 }
             }
-            out[i * inner_size + j] = max_val; // store the result
+            out[i * inner_size + j] = max_val;
         }
     }
     return ok();
 }
 
 result<void> reduce_min_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
             size_t base_index = outer_offset + j;
-            float min_val = in[base_index]; // initialize min with the first element
+            float min_val = in[base_index];
             size_t remaining = reduce_size;
 
             // set vlen and convert scalar to vector
-            size_t vl = vsetvl_e32m1(remaining);
-            vfloat32m1_t v_min = vfmv_v_f_f32m1(min_val, vl);
+            size_t vl = vsetvl_e32m4(remaining);
+            vfloat32m4_t v_min = vfmv_v_f_f32m4(min_val, vl);
 
-            // process full registers data.
-            while (remaining / vl > 0) {
-                vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl);
-                v_min = vfmin_vv_f32m1(v_min, v_in, vl);
+            // process full registers data.
while (remaining / vl > 0) { - vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); - v_min = vfmin_vv_f32m1(v_min, v_in, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_min = vfmin_vv_f32m4(v_min, v_in, vl); remaining -= vl; base_index += vl; } - vfloat32m1_t reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl); + vfloat32m1_t reduced_min_ = vfredmin_vs_f32m4_f32m1( + vundefined_f32m1(), v_min, vfmv_v_f_f32m1(min_val, vl), vl); min_val = vfmv_f_s_f32m1_f32(reduced_min_); // process the remaining elements - vl = vsetvl_e32m1(remaining); - v_min = vfmv_v_f_f32m1(min_val, vl); - vfloat32m1_t v_in = vle32_v_f32m1(&in[base_index], vl); - v_min = vfmin_vv_f32m1(v_min, v_in, vl); - reduced_min_ = vfredmin_vs_f32m1_f32m1(v_min, v_min, v_min, vl); - min_val = vfmv_f_s_f32m1_f32(reduced_min_); + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_min = vfmv_v_f_f32m4(min_val, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_min = vfmin_vv_f32m4(v_min, v_in, vl); + reduced_min_ = vfredmin_vs_f32m4_f32m1( + vundefined_f32m1(), v_min, vfmv_v_f_f32m1(min_val, vl), vl); + min_val = vfmv_f_s_f32m1_f32(reduced_min_); + } - out[i * inner_size + j] = min_val; // 存储结果 + out[i * inner_size + j] = min_val; } } return ok(); } -// #endif + +result reduce_sum_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){ + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float sum = 0.0f; + size_t remaining = reduce_size; + + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_sum = vfmv_v_f_f32m4(0.0f, vl); + + // process full registers data. + while (remaining / vl > 0) { + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_sum = vfadd_vv_f32m4(v_sum, v_in, vl); + + remaining -= vl; + base_index += vl; + } + vfloat32m1_t reduced_sum_ = vfredosum_vs_f32m4_f32m1( + vundefined_f32m1(), v_sum, vfmv_v_f_f32m1(0.0f, vl), vl); + sum += vfmv_f_s_f32m1_f32(reduced_sum_); + + // process the remaining elements + if (remaining > 0) { + vl = vsetvl_e32m4(remaining); + v_sum = vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl); + v_sum = vfadd_vv_f32m4(v_sum, v_in, vl); + reduced_sum_ = vfredosum_vs_f32m4_f32m1( + vundefined_f32m1(), v_sum, vfmv_v_f_f32m1(0.0f, vl), vl); + sum += vfmv_f_s_f32m1_f32(reduced_sum_); + } + + out[i * inner_size + j] = sum; + } + } + return ok(); +} + +result reduce_prod_impl(const float *in, float *out, size_t outter_size, + size_t inner_size, size_t reduce_size) { + for (size_t i = 0; i < outter_size; ++i) { + size_t outer_offset = i * reduce_size * inner_size; + for (size_t j = 0; j < inner_size; ++j) { + size_t base_index = outer_offset + j; + float acc = 1.0f; + size_t remaining = reduce_size; + + // set vlen and convert scaler to vector + size_t vl = vsetvl_e32m4(remaining); + vfloat32m4_t v_acc = vfmv_v_f_f32m4(1.0f, vl); + + // process full registers data. 
+            while (remaining / vl > 0) {
+                vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                v_acc = vfmul_vv_f32m4(v_acc, v_in, vl);
+
+                remaining -= vl;
+                base_index += vl;
+            }
+            for (size_t i = 0; i < vl; i++) {
+                acc *= vfmv_f_s_f32m4_f32(
+                    vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
+            }
+
+            // process the remaining elements
+            if (remaining > 0) {
+                vl = vsetvl_e32m4(remaining);
+                v_acc = vfmv_v_f_f32m4(1.0f, vl);
+                vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
+                v_acc = vfmul_vv_f32m4(v_acc, v_in, vl);
+                for (size_t i = 0; i < vl; i++) {
+                acc *= vfmv_f_s_f32m4_f32(
+                    vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
+                }
+            }
+
+            out[i * inner_size + j] = acc;
+        }
+    }
+    return ok();
+}
+
+#endif
 
 result<void> optimized::reduce(
     NNCASE_UNUSED typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
     NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
@@ -160,20 +260,29 @@ result<void> optimized::reduce(
     const float* in = reinterpret_cast<const float *>(input);
     float* out = reinterpret_cast<float *>(output);
-    if (axis.size() == 1)
+    if (axis.size() == 1 && axis[0] == in_shape.size() - 1)
     {
         switch(op)
         {
         case reduce_op_t::max:
             return reduce_max_impl(in, out, outter_size, inner_size, reduce_size);
         case reduce_op_t::min:
             return reduce_min_impl(in, out, outter_size, inner_size, reduce_size);
-            break;
         case reduce_op_t::sum:
+            return reduce_sum_impl(in, out, outter_size, inner_size, reduce_size);
         case reduce_op_t::mean:
+            reduce_sum_impl(in, out, outter_size, inner_size, reduce_size).unwrap();
+            for(size_t i = 0; i < outter_size; i++)
+            {
+                out[i] *= 1.0f / reduce_size;
+            }
+            return ok();
         case reduce_op_t::prod:
+            return reduce_prod_impl(in, out, outter_size, inner_size, reduce_size);
         default:
             break;
         }
     }
+    // TODO: implement non-last axis reduce
+    // TODO: implement multi-axis reduce
 #endif
 
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
index 9656c18b0..38302ef58 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
@@ -282,33 +282,35 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             n = axis_dim;
             {
                 auto vl = vsetvl_e32m4(n);
-                auto s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
+                auto s = vfmv_v_f_f32m4(0.0f, vl);
 
                 while (n / vl > 0) {
                     auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
                     auto v_out = exp_ps(
                         vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
                         vl);
-                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
-
+                    s = vfadd_vv_f32m4(s, v_out, vl);
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
 
                     ptr_input_vl += vl;
                     ptr_output_vl += vl;
                     n -= vl;
                 }
-                sum = vfmv_f_s_f32m1_f32(s);
+                vfloat32m1_t reduce_sum_ = vfredosum_vs_f32m4_f32m1(
+                    vundefined_f32m1(), s, vfmv_v_f_f32m1(0.0f, vl), vl);
+                sum += vfmv_f_s_f32m1_f32(reduce_sum_);
+
                 if (n > 0) {
                     vl = vsetvl_e32m4(n);
-                    s = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl);
                     auto v_in = vle32_v_f32m4(ptr_input_vl, vl);
                     auto v_out = exp_ps(
                         vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
                         vl);
-                    s = vfredosum_vs_f32m4_f32m1(s, v_out, s, vl);
+                    reduce_sum_ = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v_out,
+                                                           vfmv_v_f_f32m1(0.0f, vl), vl);
 
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
-                    sum = vfmv_f_s_f32m1_f32(s);
+                    sum += vfmv_f_s_f32m1_f32(reduce_sum_);
                 }
             }
             // div

From 4458a11d9f396241dcb261ead1d5b33c6ff703e7 Mon Sep 17 00:00:00 2001
From: curioyang
Date: Wed, 16 Oct 2024 09:10:36 +0000
Subject: [PATCH 4/6] Apply code-format changes

---
 .../stackvm/optimized/riscv64/reduce.cpp      | 67 +++++++++++--------
 .../stackvm/optimized/riscv64/softmax.cpp     | 11 +--
 2 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
index 53fa35742..62326326d 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/reduce.cpp
@@ -29,7 +29,8 @@ using namespace nncase::kernels::stackvm::optimized;
 
 #if __riscv_vector
 
-result<void> reduce_max_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
+                             size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -38,7 +39,7 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
             size_t remaining = reduce_size;
 
             // set vlen and convert scalar to vector
-            if (0) //m1
+            if (0) // m1
             {
                 size_t vl = vsetvl_e32m1(remaining);
                 vfloat32m1_t v_max = vfmv_v_f_f32m1(max_val, vl);
@@ -65,8 +66,7 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
                     vfredmax_vs_f32m1_f32m1(v_max, v_max, v_max, vl);
                 max_val = vfmv_f_s_f32m1_f32(reduced_max_);
             }
-            }
-            else // m4
+            } else // m4
             {
                 // set vlen and convert scalar to vector
                 size_t vl = vsetvl_e32m4(remaining);
@@ -102,7 +102,8 @@ result<void> reduce_max_impl(const float *in, float *out, size_t outter_size,
     return ok();
 }
 
-result<void> reduce_min_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+result<void> reduce_min_impl(const float *in, float *out, size_t outter_size,
+                             size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -143,7 +144,8 @@ result<void> reduce_min_impl(const float *in, float *out, size_t outter_size,
     return ok();
 }
 
-result<void> reduce_sum_impl(const float *in, float *out, size_t outter_size, size_t inner_size, size_t reduce_size){
+result<void> reduce_sum_impl(const float *in, float *out, size_t outter_size,
+                             size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -185,7 +187,7 @@ result<void> reduce_sum_impl(const float *in, float *out, size_t outter_size,
 }
 
 result<void> reduce_prod_impl(const float *in, float *out, size_t outter_size,
-                       size_t inner_size, size_t reduce_size) {
+                              size_t inner_size, size_t reduce_size) {
     for (size_t i = 0; i < outter_size; ++i) {
         size_t outer_offset = i * reduce_size * inner_size;
         for (size_t j = 0; j < inner_size; ++j) {
@@ -217,8 +219,8 @@ result<void> reduce_prod_impl(const float *in, float *out, size_t outter_size,
                 vfloat32m4_t v_in = vle32_v_f32m4(&in[base_index], vl);
                 v_acc = vfmul_vv_f32m4(v_acc, v_in, vl);
                 for (size_t i = 0; i < vl; i++) {
-                acc *= vfmv_f_s_f32m4_f32(
-                    vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
+                    acc *= vfmv_f_s_f32m4_f32(
+                        vslidedown_vx_f32m4(vundefined_f32m4(), v_acc, i, vl));
                 }
             }
 
@@ -234,41 +236,50 @@ result<void> reduce_prod_impl(const float *in, float *out, size_t outter_size,
 
 result<void> optimized::reduce(
     NNCASE_UNUSED typecode_t typecode, nncase::runtime::stackvm::reduce_op_t op,
-    NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input, gsl::byte *output,
-    gsl::span<const size_t> in_shape, gsl::span<const size_t> axis,
-    NNCASE_UNUSED gsl::span<const size_t> in_strides, NNCASE_UNUSED gsl::span<const size_t> out_strides,
-    NNCASE_UNUSED bool keep_dims, NNCASE_UNUSED kernel_context &context) noexcept {
+    NNCASE_UNUSED const gsl::byte *init_value, const gsl::byte *input,
+    gsl::byte *output, gsl::span<const size_t> in_shape,
+    gsl::span<const size_t> axis,
+    NNCASE_UNUSED gsl::span<const size_t> in_strides,
+    NNCASE_UNUSED gsl::span<const size_t> out_strides,
+    NNCASE_UNUSED bool keep_dims,
+    NNCASE_UNUSED kernel_context &context) noexcept {
 #if __riscv_vector
     // The type of axis is 'size_t'. It is the real axis.
     // compute inner_size and outter_size
     size_t inner_size = 1, outter_size = 1;
     size_t reduce_size = in_shape[axis[0]];
 
-    for (size_t i = 0; i < axis[0]; i++) { outter_size *= in_shape[i]; }
+    for (size_t i = 0; i < axis[0]; i++) {
+        outter_size *= in_shape[i];
+    }
 
-    for (size_t i = axis[0]+1; i < in_shape.size(); i++) { inner_size *= in_shape[i]; }
+    for (size_t i = axis[0] + 1; i < in_shape.size(); i++) {
+        inner_size *= in_shape[i];
+    }
 
-    const float* in = reinterpret_cast<const float *>(input);
-    float* out = reinterpret_cast<float *>(output);
-    if (axis.size() == 1 && axis[0] == in_shape.size() - 1)
-    {
-        switch(op)
-        {
+    const float *in = reinterpret_cast<const float *>(input);
+    float *out = reinterpret_cast<float *>(output);
+    if (axis.size() == 1 && axis[0] == in_shape.size() - 1) {
+        switch (op) {
         case reduce_op_t::max:
-            return reduce_max_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_max_impl(in, out, outter_size, inner_size,
+                                   reduce_size);
         case reduce_op_t::min:
-            return reduce_min_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_min_impl(in, out, outter_size, inner_size,
+                                   reduce_size);
         case reduce_op_t::sum:
-            return reduce_sum_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_sum_impl(in, out, outter_size, inner_size,
+                                   reduce_size);
         case reduce_op_t::mean:
-            reduce_sum_impl(in, out, outter_size, inner_size, reduce_size).unwrap();
-            for(size_t i = 0; i < outter_size; i++)
-            {
+            reduce_sum_impl(in, out, outter_size, inner_size, reduce_size)
+                .unwrap();
+            for (size_t i = 0; i < outter_size; i++) {
                 out[i] *= 1.0f / reduce_size;
             }
             return ok();
         case reduce_op_t::prod:
-            return reduce_prod_impl(in, out, outter_size, inner_size, reduce_size);
+            return reduce_prod_impl(in, out, outter_size, inner_size,
+                                    reduce_size);
         default:
             break;
         }
diff --git a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
index 38302ef58..4539baee7 100644
--- a/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/riscv64/softmax.cpp
@@ -253,7 +253,7 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             {
                 size_t vl = vsetvl_e32m4(n);
                 vfloat32m4_t s = vfmv_v_f_f32m4(max, vl);
-                while(n / vl > 0){
+                while (n / vl > 0) {
                     vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
                     s = vfmax_vv_f32m4(s, v, vl);
 
@@ -265,7 +265,7 @@ result<void> optimized_softmax_impl(const T *input, T *output,
                     vundefined_f32m1(), s, vfmv_v_f_f32m1(max, vl), vl);
                 max = vfmv_f_s_f32m1_f32(reduced_max_);
 
-                if(n > 0){
+                if (n > 0) {
                     vl = vsetvl_e32m4(n);
                     s = vfmv_v_f_f32m4(max, vl);
                     vfloat32m4_t v = vle32_v_f32m4(ptr_input_vl, vl);
@@ -306,7 +306,8 @@ result<void> optimized_softmax_impl(const T *input, T *output,
                     auto v_out = exp_ps(
                         vfmul_vf_f32m4(vfsub_vf_f32m4(v_in, max, vl), beta, vl),
                         vl);
-                    reduce_sum_ = vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v_out,
-                                                           vfmv_v_f_f32m1(0.0f, vl), vl);
+                    reduce_sum_ =
+                        vfredosum_vs_f32m4_f32m1(vundefined_f32m1(), v_out,
+                                                 vfmv_v_f_f32m1(0.0f, vl), vl);
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
 
@@ -320,14 +321,14 @@ result<void> optimized_softmax_impl(const T *input, T *output,
             sum = 1.0f / sum;
             {
                 auto vl = vsetvl_e32m4(n);
-                while (n/vl>0) {
+                while (n / vl > 0) {
                     auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
                     v_out = vfmul_vf_f32m4(v_out, sum, vl);
                     vse32_v_f32m4(ptr_output_vl, v_out, vl);
                     ptr_output_vl += vl;
                     n -= vl;
                 }
-                if (n > 0){
+                if (n > 0) {
                     vl = vsetvl_e32m4(n);
                     auto v_out = vle32_v_f32m4(ptr_output_vl, vl);
                     v_out = vfmul_vf_f32m4(v_out, sum, vl);

From a2bc1b8283411535306b09953b58de750258b5ea Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Fri, 18 Oct 2024 09:48:44 +0800
Subject: [PATCH 5/6] fix review

---
 src/Native/src/kernels/stackvm/optimized/reduce.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Native/src/kernels/stackvm/optimized/reduce.cpp b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
index dc7b0e038..4cbe6d669 100644
--- a/src/Native/src/kernels/stackvm/optimized/reduce.cpp
+++ b/src/Native/src/kernels/stackvm/optimized/reduce.cpp
@@ -36,8 +36,9 @@ result<void> optimized::reduce(
     return stackvm::optimized::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
-#endif
+#else
     return stackvm::reference::reduce(typecode, op, init_value, input, output,
                                       in_shape, axis, in_strides, out_strides,
                                       keep_dims, context);
+#endif
 }

From 1bc4c934f1334b195accf4169816ac7ea7007d54 Mon Sep 17 00:00:00 2001
From: Curio Yang
Date: Sun, 29 Sep 2024 14:14:05 +0800
Subject: [PATCH 6/6] fix cmake path

---
 cmake/nncaseruntimeConfig.cmake.in | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmake/nncaseruntimeConfig.cmake.in b/cmake/nncaseruntimeConfig.cmake.in
index cce581029..664c43d7d 100644
--- a/cmake/nncaseruntimeConfig.cmake.in
+++ b/cmake/nncaseruntimeConfig.cmake.in
@@ -1,5 +1,4 @@
 include(${CMAKE_CURRENT_LIST_DIR}/nncaseruntimeTargets.cmake)
-if(NOT TARGET gsl-lite)
-    find_package(gsl-lite REQUIRED)
-endif()
\ No newline at end of file
+set(nncaseruntime_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/../../../include)
+set(nncaseruntime_LIBS ${CMAKE_CURRENT_LIST_DIR}/../../libNncase.Runtime.Native.a)
\ No newline at end of file
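
Usage note for the nncaseruntimeConfig.cmake.in change (patches 1/6 and 6/6): the config file now exports plain variables instead of requiring gsl-lite. Below is a minimal consumer sketch, assuming an installed runtime tree; the `demo` project and target names are hypothetical, only the two exported variables come from the patch:

    cmake_minimum_required(VERSION 3.13)
    project(demo LANGUAGES CXX)

    # nncaseruntime_DIR must point at the directory that contains
    # nncaseruntimeConfig.cmake inside the installed runtime package.
    find_package(nncaseruntime REQUIRED)

    add_executable(demo main.cpp)
    # Consume the variables exported by nncaseruntimeConfig.cmake.
    target_include_directories(demo PRIVATE ${nncaseruntime_INCLUDE_DIRS})
    target_link_libraries(demo PRIVATE ${nncaseruntime_LIBS})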