From 6fe7c42f9a4b4a1455563a68c18d49a1d00204cc Mon Sep 17 00:00:00 2001 From: tahaelbayad Date: Fri, 13 Dec 2024 20:21:21 +0100 Subject: [PATCH] add victor's comment --- .gitmodules | 4 - TargetLibraries/Snitch/CMakeLists.txt | 1 - .../Snitch/src/pulp_nn_add_i8_i8_i8.c | 202 ++++++++++++++++++ .../Snitch/third_party/pulp-nn-mixed | 1 - 4 files changed, 202 insertions(+), 6 deletions(-) create mode 100644 TargetLibraries/Snitch/src/pulp_nn_add_i8_i8_i8.c delete mode 160000 TargetLibraries/Snitch/third_party/pulp-nn-mixed diff --git a/.gitmodules b/.gitmodules index 336972c3..def05e2a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,7 +7,3 @@ [submodule "CMSIS-NN"] path = TargetLibraries/CMSIS/third_party/CMSIS-NN url = https://github.com/ARM-software/CMSIS-NN.git -[submodule "TargetLibraries/Snitch/third_party/pulp-nn-mixed"] - path = TargetLibraries/Snitch/third_party/pulp-nn-mixed - url = https://github.com/Victor-Jung/pulp-nn-mixed.git - branch = deeploySnitchTarget diff --git a/TargetLibraries/Snitch/CMakeLists.txt b/TargetLibraries/Snitch/CMakeLists.txt index 49130ee7..78a214fe 100644 --- a/TargetLibraries/Snitch/CMakeLists.txt +++ b/TargetLibraries/Snitch/CMakeLists.txt @@ -1,6 +1,5 @@ file(GLOB_RECURSE SOURCES "src/**" - "third_party/pulp-nn-mixed/DeeploySnitch/src/**" ) include(cmake/snitch-runtime-precompiled.cmake) diff --git a/TargetLibraries/Snitch/src/pulp_nn_add_i8_i8_i8.c b/TargetLibraries/Snitch/src/pulp_nn_add_i8_i8_i8.c new file mode 100644 index 00000000..61a40441 --- /dev/null +++ b/TargetLibraries/Snitch/src/pulp_nn_add_i8_i8_i8.c @@ -0,0 +1,202 @@ +/* + * pulp_nn_add_i8_i8_i8.c + * Georg Rutishauser + * Victor Jung + * + * Copyright (C) 2018-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DeeploySnitchMath.h" + + + +void __attribute__ ((noinline)) pulp_nn_add_i8_i8_i8( + int8_t * pIn1, + int8_t * pIn2, + int8_t * pOut, + int32_t in1_mul, + int32_t in1_add, + uint16_t in1_shift, + int32_t in2_mul, + int32_t in2_add, + uint16_t in2_shift, + int32_t out_mul, + int32_t out_add, + uint16_t out_shift, + uint16_t dim_im_in_x, + uint16_t dim_im_in_y, + uint16_t ch_im_in, + int out_requant_flag) +{ + int core_id = snrt_global_compute_core_idx(); + int n_cores = snrt_global_compute_core_num(); + + if (dim_im_in_y < n_cores){ + n_cores = dim_im_in_y; + } + + int Log2Core = INT_LOG2(n_cores); + int chunck = (dim_im_in_y >> Log2Core) + ((dim_im_in_y & (n_cores - 1)) != 0); + + int32_t in1_rq1, in1_rq2, in1_rq3, in1_rq4, + in2_rq1, in2_rq2, in2_rq3, in2_rq4; + int32_t sum1, sum2, sum3, sum4; + int32_t sum_out1, sum_out2, sum_out3, sum_out4; + int32_t out1, out2, out3, out4, + sum_int1, sum_int2, sum_int3, sum_int4; + + + + int ch_im_in1_r = ch_im_in >> 0; + int ch_im_in2_r = ch_im_in >> 0; + int ch_im_out_r = ch_im_in >> 0; + + int start = MIN(chunck * core_id, dim_im_in_y); + int stop = MIN(start + chunck, dim_im_in_y); + + int8_t *target1 = pIn1 + start * ch_im_in1_r * dim_im_in_x; + int8_t *target2 = pIn2 + start * ch_im_in2_r * dim_im_in_x; + int8_t *pOutBuffer = pOut + start * ch_im_out_r * dim_im_in_x; + + int a = 0; + int b = 0; + + int8_t *target1_ext = &a; + int8_t *target2_ext = &b; + + for (int i=0; i<(((stop-start) * ch_im_out_r * dim_im_in_x) >> 2); i++) + { + target1_ext = target1; + target1+=4; + + target2_ext = target2; + target2+=4; +#ifdef ADD_VERBOSE + printf("core %d - in1 it0 before requant: %d\n", core_id, *(target1_ext)); + printf("core %d - in2 it0 before requant: %d\n", core_id, *(target2_ext)); +#endif + in1_rq1 = ((*(target1_ext)) * in1_mul + in1_add) >> in1_shift; + in2_rq1 = ((*(target2_ext)) * in2_mul + in2_add) >> in2_shift; + sum1 = clips8(in1_rq1) + clips8(in2_rq1); +#ifdef ADD_VERBOSE + printf("core %d - in1_rq1 it0 after requant: %d\nclipped in1_rq1: %d\n", core_id, in1_rq1, clips8(in1_rq1)); + printf("core %d - in2_rq1 it0 after requant: %d\nclipped in2_rq1: %d\n", core_id, in2_rq1), clips8(in2_rq1); + printf("core %d - sum1: %d\n", core_id, sum1); +#endif +#ifdef ADD_VERBOSE + printf("core %d - in1 it1 before requant: %d\n", core_id, *(target1_ext + 1 )); + printf("core %d - in2 it1 before requant: %d\n", core_id, *(target2_ext + 1 )); +#endif + in1_rq2 = ((*(target1_ext + 1 )) * in1_mul + in1_add) >> in1_shift; + in2_rq2 = ((*(target2_ext + 1 )) * in2_mul + in2_add) >> in2_shift; + sum2 = clips8(in1_rq2) + clips8(in2_rq2); +#ifdef ADD_VERBOSE + printf("core %d - in1_rq2 it1 after requant: %d\nclipped in1_rq2: %d\n", core_id, in1_rq2, clips8(in1_rq2)); + printf("core %d - in2_rq2 it1 after requant: %d\nclipped in2_rq2: %d\n", core_id, in2_rq2), clips8(in2_rq2); + printf("core %d - sum2: %d\n", core_id, sum2); +#endif +#ifdef ADD_VERBOSE + printf("core %d - in1 it2 before requant: %d\n", core_id, *(target1_ext + 2 )); + printf("core %d - in2 it2 before requant: %d\n", core_id, *(target2_ext + 2 )); +#endif + in1_rq3 = ((*(target1_ext + 2 )) * in1_mul + in1_add) >> in1_shift; + in2_rq3 = ((*(target2_ext + 2 )) * in2_mul + in2_add) >> in2_shift; + sum3 = clips8(in1_rq3) + clips8(in2_rq3); +#ifdef ADD_VERBOSE + printf("core %d - in1_rq3 it2 after requant: %d\nclipped in1_rq3: %d\n", core_id, in1_rq3, clips8(in1_rq3)); + printf("core %d - in2_rq3 it2 after requant: %d\nclipped in2_rq3: %d\n", core_id, in2_rq3), clips8(in2_rq3); + printf("core %d - sum3: %d\n", core_id, sum3); +#endif +#ifdef ADD_VERBOSE + printf("core %d - in1 it3 before requant: %d\n", core_id, *(target1_ext + 3 )); + printf("core %d - in2 it3 before requant: %d\n", core_id, *(target2_ext + 3 )); +#endif + in1_rq4 = ((*(target1_ext + 3 )) * in1_mul + in1_add) >> in1_shift; + in2_rq4 = ((*(target2_ext + 3 )) * in2_mul + in2_add) >> in2_shift; + sum4 = clips8(in1_rq4) + clips8(in2_rq4); +#ifdef ADD_VERBOSE + printf("core %d - in1_rq4 it3 after requant: %d\nclipped in1_rq4: %d\n", core_id, in1_rq4, clips8(in1_rq4)); + printf("core %d - in2_rq4 it3 after requant: %d\nclipped in2_rq4: %d\n", core_id, in2_rq4), clips8(in2_rq4); + printf("core %d - sum4: %d\n", core_id, sum4); +#endif + + if (out_requant_flag) { + sum1 = (sum1 * out_mul + out_add) >> out_shift; +#ifdef ADD_VERBOSE + printf("core %d - requantized sum1: %d\n", core_id, sum1); +#endif + sum2 = (sum2 * out_mul + out_add) >> out_shift; +#ifdef ADD_VERBOSE + printf("core %d - requantized sum2: %d\n", core_id, sum2); +#endif + sum3 = (sum3 * out_mul + out_add) >> out_shift; +#ifdef ADD_VERBOSE + printf("core %d - requantized sum3: %d\n", core_id, sum3); +#endif + sum4 = (sum4 * out_mul + out_add) >> out_shift; +#ifdef ADD_VERBOSE + printf("core %d - requantized sum4: %d\n", core_id, sum4); +#endif + } + out1 = clips8(sum1); +#ifdef ADD_VERBOSE + printf("core %d - out1 clipped: %d\n", core_id, out1); +#endif + out2 = clips8(sum2); +#ifdef ADD_VERBOSE + printf("core %d - out2 clipped: %d\n", core_id, out2); +#endif + out3 = clips8(sum3); +#ifdef ADD_VERBOSE + printf("core %d - out3 clipped: %d\n", core_id, out3); +#endif + out4 = clips8(sum4); +#ifdef ADD_VERBOSE + printf("core %d - out4 clipped: %d\n", core_id, out4); +#endif + + + *pOutBuffer = (int8_t) out1; + pOutBuffer++; + *pOutBuffer = (int8_t) out2; + pOutBuffer++; + *pOutBuffer = (int8_t) out3; + pOutBuffer++; + *pOutBuffer = (int8_t) out4; + pOutBuffer++; + } + // SCHEREMO: Cleanup leftovers, not doing it with this codebase for sub-byte formats + for (int i=0; i<(((stop-start) * ch_im_out_r * dim_im_in_x) % 4); i++){ + in1_rq1 = ((*(target1)) * in1_mul + in1_add) >> in1_shift; + in2_rq1 = ((*(target2)) * in2_mul + in2_add) >> in2_shift; + + // SCHEREMO: Maybe it's just LLVM, but unless I hack 3 non-unrolled nops in here, stuff fails + #pragma nounroll + for (int j = 0; j < 3; j++) { + asm volatile("nop" ::); + } + + target1++; + target2++; + sum1 = clips8(in1_rq1) + clips8(in2_rq1); + if (out_requant_flag) { + sum1 = (sum1 * out_mul + out_add) >> out_shift; + } + + out1 = clips8(sum1); + *pOutBuffer = (int8_t)out1; + pOutBuffer++; + } +} diff --git a/TargetLibraries/Snitch/third_party/pulp-nn-mixed b/TargetLibraries/Snitch/third_party/pulp-nn-mixed deleted file mode 160000 index 4f0902bb..00000000 --- a/TargetLibraries/Snitch/third_party/pulp-nn-mixed +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4f0902bbdab3265b4f56e156cb4a98f52ac6e067