/*
 * pulp_nn_add_i8_i8_i8.c
 * Georg Rutishauser <georgr@iis.ee.ethz.ch>
 * Victor Jung <jungvi@iis.ee.ethz.ch>
 *
 * Copyright (C) 2018-2020 University of Bologna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DeeploySnitchMath.h"

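/*
 * Element-wise addition of two int8 tensors with per-input requantization.
 * Each input is rescaled as (x * mul + add) >> shift and clipped to int8,
 * the two rescaled values are summed, and the sum is optionally requantized
 * with (out_mul, out_add, out_shift) before the final clip to int8.
 * The rows (dim_im_in_y) are split across the Snitch compute cores.
 */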
void __attribute__((noinline)) pulp_nn_add_i8_i8_i8(
    int8_t *pIn1,
    int8_t *pIn2,
    int8_t *pOut,
    int32_t in1_mul,
    int32_t in1_add,
    uint16_t in1_shift,
    int32_t in2_mul,
    int32_t in2_add,
    uint16_t in2_shift,
    int32_t out_mul,
    int32_t out_add,
    uint16_t out_shift,
    uint16_t dim_im_in_x,
    uint16_t dim_im_in_y,
    uint16_t ch_im_in,
    int out_requant_flag)
{
    int core_id = snrt_global_compute_core_idx();
    int n_cores = snrt_global_compute_core_num();

    if (dim_im_in_y < n_cores) {
        n_cores = dim_im_in_y;
    }

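    /* Split the rows evenly across the participating cores. The remainder
     * test uses a bit mask, which assumes n_cores is a power of two. */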
    int Log2Core = INT_LOG2(n_cores);
    int chunk = (dim_im_in_y >> Log2Core) + ((dim_im_in_y & (n_cores - 1)) != 0);

    int32_t in1_rq1, in1_rq2, in1_rq3, in1_rq4,
            in2_rq1, in2_rq2, in2_rq3, in2_rq4;
    int32_t sum1, sum2, sum3, sum4;
    int32_t out1, out2, out3, out4;

    /* 8-bit data is not packed, so the channel count maps 1:1 to bytes
     * (the sub-byte variants shift here to obtain the packed width). */
    int ch_im_in1_r = ch_im_in;
    int ch_im_in2_r = ch_im_in;
    int ch_im_out_r = ch_im_in;

    int start = MIN(chunk * core_id, dim_im_in_y);
    int stop = MIN(start + chunk, dim_im_in_y);

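    /* Per-core pointers into the row range [start, stop) of both inputs
     * and the output. */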
    int8_t *target1 = pIn1 + start * ch_im_in1_r * dim_im_in_x;
    int8_t *target2 = pIn2 + start * ch_im_in2_r * dim_im_in_x;
    int8_t *pOutBuffer = pOut + start * ch_im_out_r * dim_im_in_x;

    int8_t *target1_ext = target1;
    int8_t *target2_ext = target2;

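    /* Main loop: process 4 int8 elements per iteration, requantizing each
     * input independently before the addition. */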
    for (int i = 0; i < (((stop - start) * ch_im_out_r * dim_im_in_x) >> 2); i++)
    {
        target1_ext = target1;
        target1 += 4;

        target2_ext = target2;
        target2 += 4;
#ifdef ADD_VERBOSE
        printf("core %d - in1 it0 before requant: %d\n", core_id, *(target1_ext));
        printf("core %d - in2 it0 before requant: %d\n", core_id, *(target2_ext));
#endif
        in1_rq1 = ((*(target1_ext)) * in1_mul + in1_add) >> in1_shift;
        in2_rq1 = ((*(target2_ext)) * in2_mul + in2_add) >> in2_shift;
        sum1 = clips8(in1_rq1) + clips8(in2_rq1);
#ifdef ADD_VERBOSE
        printf("core %d - in1_rq1 it0 after requant: %d\nclipped in1_rq1: %d\n", core_id, in1_rq1, clips8(in1_rq1));
        printf("core %d - in2_rq1 it0 after requant: %d\nclipped in2_rq1: %d\n", core_id, in2_rq1, clips8(in2_rq1));
        printf("core %d - sum1: %d\n", core_id, sum1);
#endif
#ifdef ADD_VERBOSE
        printf("core %d - in1 it1 before requant: %d\n", core_id, *(target1_ext + 1));
        printf("core %d - in2 it1 before requant: %d\n", core_id, *(target2_ext + 1));
#endif
        in1_rq2 = ((*(target1_ext + 1)) * in1_mul + in1_add) >> in1_shift;
        in2_rq2 = ((*(target2_ext + 1)) * in2_mul + in2_add) >> in2_shift;
        sum2 = clips8(in1_rq2) + clips8(in2_rq2);
#ifdef ADD_VERBOSE
        printf("core %d - in1_rq2 it1 after requant: %d\nclipped in1_rq2: %d\n", core_id, in1_rq2, clips8(in1_rq2));
        printf("core %d - in2_rq2 it1 after requant: %d\nclipped in2_rq2: %d\n", core_id, in2_rq2, clips8(in2_rq2));
        printf("core %d - sum2: %d\n", core_id, sum2);
#endif
#ifdef ADD_VERBOSE
        printf("core %d - in1 it2 before requant: %d\n", core_id, *(target1_ext + 2));
        printf("core %d - in2 it2 before requant: %d\n", core_id, *(target2_ext + 2));
#endif
        in1_rq3 = ((*(target1_ext + 2)) * in1_mul + in1_add) >> in1_shift;
        in2_rq3 = ((*(target2_ext + 2)) * in2_mul + in2_add) >> in2_shift;
        sum3 = clips8(in1_rq3) + clips8(in2_rq3);
#ifdef ADD_VERBOSE
        printf("core %d - in1_rq3 it2 after requant: %d\nclipped in1_rq3: %d\n", core_id, in1_rq3, clips8(in1_rq3));
        printf("core %d - in2_rq3 it2 after requant: %d\nclipped in2_rq3: %d\n", core_id, in2_rq3, clips8(in2_rq3));
        printf("core %d - sum3: %d\n", core_id, sum3);
#endif
#ifdef ADD_VERBOSE
        printf("core %d - in1 it3 before requant: %d\n", core_id, *(target1_ext + 3));
        printf("core %d - in2 it3 before requant: %d\n", core_id, *(target2_ext + 3));
#endif
        in1_rq4 = ((*(target1_ext + 3)) * in1_mul + in1_add) >> in1_shift;
        in2_rq4 = ((*(target2_ext + 3)) * in2_mul + in2_add) >> in2_shift;
        sum4 = clips8(in1_rq4) + clips8(in2_rq4);
#ifdef ADD_VERBOSE
        printf("core %d - in1_rq4 it3 after requant: %d\nclipped in1_rq4: %d\n", core_id, in1_rq4, clips8(in1_rq4));
        printf("core %d - in2_rq4 it3 after requant: %d\nclipped in2_rq4: %d\n", core_id, in2_rq4, clips8(in2_rq4));
        printf("core %d - sum4: %d\n", core_id, sum4);
#endif
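        /* Optionally requantize the sums with the output scaling parameters
         * before the final clip to int8. */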
        if (out_requant_flag) {
            sum1 = (sum1 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
            printf("core %d - requantized sum1: %d\n", core_id, sum1);
#endif
            sum2 = (sum2 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
            printf("core %d - requantized sum2: %d\n", core_id, sum2);
#endif
            sum3 = (sum3 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
            printf("core %d - requantized sum3: %d\n", core_id, sum3);
#endif
            sum4 = (sum4 * out_mul + out_add) >> out_shift;
#ifdef ADD_VERBOSE
            printf("core %d - requantized sum4: %d\n", core_id, sum4);
#endif
        }
        out1 = clips8(sum1);
#ifdef ADD_VERBOSE
        printf("core %d - out1 clipped: %d\n", core_id, out1);
#endif
        out2 = clips8(sum2);
#ifdef ADD_VERBOSE
        printf("core %d - out2 clipped: %d\n", core_id, out2);
#endif
        out3 = clips8(sum3);
#ifdef ADD_VERBOSE
        printf("core %d - out3 clipped: %d\n", core_id, out3);
#endif
        out4 = clips8(sum4);
#ifdef ADD_VERBOSE
        printf("core %d - out4 clipped: %d\n", core_id, out4);
#endif
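        /* Store the four clipped results and advance the output pointer. */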
        *pOutBuffer = (int8_t) out1;
        pOutBuffer++;
        *pOutBuffer = (int8_t) out2;
        pOutBuffer++;
        *pOutBuffer = (int8_t) out3;
        pOutBuffer++;
        *pOutBuffer = (int8_t) out4;
        pOutBuffer++;
    }
    // SCHEREMO: Clean up the leftover elements; this codebase does not do so for sub-byte formats
    for (int i = 0; i < (((stop - start) * ch_im_out_r * dim_im_in_x) % 4); i++) {
        in1_rq1 = ((*(target1)) * in1_mul + in1_add) >> in1_shift;
        in2_rq1 = ((*(target2)) * in2_mul + in2_add) >> in2_shift;

        // SCHEREMO: Maybe it's just LLVM, but unless I hack 3 non-unrolled nops in here, stuff fails
        #pragma nounroll
        for (int j = 0; j < 3; j++) {
            asm volatile("nop" ::);
        }

        target1++;
        target2++;
        sum1 = clips8(in1_rq1) + clips8(in2_rq1);
        if (out_requant_flag) {
            sum1 = (sum1 * out_mul + out_add) >> out_shift;
        }

        out1 = clips8(sum1);
        *pOutBuffer = (int8_t) out1;
        pOutBuffer++;
    }
}