Skip to content

Commit 6fe7c42

Browse files
committed
add victor's comment
1 parent 0d4baed commit 6fe7c42

File tree

4 files changed

+202
-6
lines changed

4 files changed

+202
-6
lines changed

.gitmodules

-4
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,3 @@
77
[submodule "CMSIS-NN"]
88
path = TargetLibraries/CMSIS/third_party/CMSIS-NN
99
url = https://github.com/ARM-software/CMSIS-NN.git
10-
[submodule "TargetLibraries/Snitch/third_party/pulp-nn-mixed"]
11-
path = TargetLibraries/Snitch/third_party/pulp-nn-mixed
12-
url = https://github.com/Victor-Jung/pulp-nn-mixed.git
13-
branch = deeploySnitchTarget

TargetLibraries/Snitch/CMakeLists.txt

-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
file(GLOB_RECURSE SOURCES
22
"src/**"
3-
"third_party/pulp-nn-mixed/DeeploySnitch/src/**"
43
)
54

65
include(cmake/snitch-runtime-precompiled.cmake)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
/*
2+
* pulp_nn_add_i8_i8_i8.c
3+
* Georg Rutishauser <georgr@iis.ee.ethz.ch>
4+
* Victor Jung <jungvi@iis.ee.ethz.ch>
5+
*
6+
* Copyright (C) 2018-2020 University of Bologna
7+
*
8+
* Licensed under the Apache License, Version 2.0 (the "License");
9+
* you may not use this file except in compliance with the License.
10+
* You may obtain a copy of the License at
11+
*
12+
* http://www.apache.org/licenses/LICENSE-2.0
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*/
20+
21+
#include "DeeploySnitchMath.h"
22+
23+
24+
25+
void __attribute__ ((noinline)) pulp_nn_add_i8_i8_i8(
26+
int8_t * pIn1,
27+
int8_t * pIn2,
28+
int8_t * pOut,
29+
int32_t in1_mul,
30+
int32_t in1_add,
31+
uint16_t in1_shift,
32+
int32_t in2_mul,
33+
int32_t in2_add,
34+
uint16_t in2_shift,
35+
int32_t out_mul,
36+
int32_t out_add,
37+
uint16_t out_shift,
38+
uint16_t dim_im_in_x,
39+
uint16_t dim_im_in_y,
40+
uint16_t ch_im_in,
41+
int out_requant_flag)
42+
{
43+
int core_id = snrt_global_compute_core_idx();
44+
int n_cores = snrt_global_compute_core_num();
45+
46+
if (dim_im_in_y < n_cores){
47+
n_cores = dim_im_in_y;
48+
}
49+
50+
int Log2Core = INT_LOG2(n_cores);
51+
int chunck = (dim_im_in_y >> Log2Core) + ((dim_im_in_y & (n_cores - 1)) != 0);
52+
53+
int32_t in1_rq1, in1_rq2, in1_rq3, in1_rq4,
54+
in2_rq1, in2_rq2, in2_rq3, in2_rq4;
55+
int32_t sum1, sum2, sum3, sum4;
56+
int32_t sum_out1, sum_out2, sum_out3, sum_out4;
57+
int32_t out1, out2, out3, out4,
58+
sum_int1, sum_int2, sum_int3, sum_int4;
59+
60+
61+
62+
int ch_im_in1_r = ch_im_in >> 0;
63+
int ch_im_in2_r = ch_im_in >> 0;
64+
int ch_im_out_r = ch_im_in >> 0;
65+
66+
int start = MIN(chunck * core_id, dim_im_in_y);
67+
int stop = MIN(start + chunck, dim_im_in_y);
68+
69+
int8_t *target1 = pIn1 + start * ch_im_in1_r * dim_im_in_x;
70+
int8_t *target2 = pIn2 + start * ch_im_in2_r * dim_im_in_x;
71+
int8_t *pOutBuffer = pOut + start * ch_im_out_r * dim_im_in_x;
72+
73+
int a = 0;
74+
int b = 0;
75+
76+
int8_t *target1_ext = &a;
77+
int8_t *target2_ext = &b;
78+
79+
for (int i=0; i<(((stop-start) * ch_im_out_r * dim_im_in_x) >> 2); i++)
80+
{
81+
target1_ext = target1;
82+
target1+=4;
83+
84+
target2_ext = target2;
85+
target2+=4;
86+
#ifdef ADD_VERBOSE
87+
printf("core %d - in1 it0 before requant: %d\n", core_id, *(target1_ext));
88+
printf("core %d - in2 it0 before requant: %d\n", core_id, *(target2_ext));
89+
#endif
90+
in1_rq1 = ((*(target1_ext)) * in1_mul + in1_add) >> in1_shift;
91+
in2_rq1 = ((*(target2_ext)) * in2_mul + in2_add) >> in2_shift;
92+
sum1 = clips8(in1_rq1) + clips8(in2_rq1);
93+
#ifdef ADD_VERBOSE
94+
printf("core %d - in1_rq1 it0 after requant: %d\nclipped in1_rq1: %d\n", core_id, in1_rq1, clips8(in1_rq1));
95+
printf("core %d - in2_rq1 it0 after requant: %d\nclipped in2_rq1: %d\n", core_id, in2_rq1), clips8(in2_rq1);
96+
printf("core %d - sum1: %d\n", core_id, sum1);
97+
#endif
98+
#ifdef ADD_VERBOSE
99+
printf("core %d - in1 it1 before requant: %d\n", core_id, *(target1_ext + 1 ));
100+
printf("core %d - in2 it1 before requant: %d\n", core_id, *(target2_ext + 1 ));
101+
#endif
102+
in1_rq2 = ((*(target1_ext + 1 )) * in1_mul + in1_add) >> in1_shift;
103+
in2_rq2 = ((*(target2_ext + 1 )) * in2_mul + in2_add) >> in2_shift;
104+
sum2 = clips8(in1_rq2) + clips8(in2_rq2);
105+
#ifdef ADD_VERBOSE
106+
printf("core %d - in1_rq2 it1 after requant: %d\nclipped in1_rq2: %d\n", core_id, in1_rq2, clips8(in1_rq2));
107+
printf("core %d - in2_rq2 it1 after requant: %d\nclipped in2_rq2: %d\n", core_id, in2_rq2), clips8(in2_rq2);
108+
printf("core %d - sum2: %d\n", core_id, sum2);
109+
#endif
110+
#ifdef ADD_VERBOSE
111+
printf("core %d - in1 it2 before requant: %d\n", core_id, *(target1_ext + 2 ));
112+
printf("core %d - in2 it2 before requant: %d\n", core_id, *(target2_ext + 2 ));
113+
#endif
114+
in1_rq3 = ((*(target1_ext + 2 )) * in1_mul + in1_add) >> in1_shift;
115+
in2_rq3 = ((*(target2_ext + 2 )) * in2_mul + in2_add) >> in2_shift;
116+
sum3 = clips8(in1_rq3) + clips8(in2_rq3);
117+
#ifdef ADD_VERBOSE
118+
printf("core %d - in1_rq3 it2 after requant: %d\nclipped in1_rq3: %d\n", core_id, in1_rq3, clips8(in1_rq3));
119+
printf("core %d - in2_rq3 it2 after requant: %d\nclipped in2_rq3: %d\n", core_id, in2_rq3), clips8(in2_rq3);
120+
printf("core %d - sum3: %d\n", core_id, sum3);
121+
#endif
122+
#ifdef ADD_VERBOSE
123+
printf("core %d - in1 it3 before requant: %d\n", core_id, *(target1_ext + 3 ));
124+
printf("core %d - in2 it3 before requant: %d\n", core_id, *(target2_ext + 3 ));
125+
#endif
126+
in1_rq4 = ((*(target1_ext + 3 )) * in1_mul + in1_add) >> in1_shift;
127+
in2_rq4 = ((*(target2_ext + 3 )) * in2_mul + in2_add) >> in2_shift;
128+
sum4 = clips8(in1_rq4) + clips8(in2_rq4);
129+
#ifdef ADD_VERBOSE
130+
printf("core %d - in1_rq4 it3 after requant: %d\nclipped in1_rq4: %d\n", core_id, in1_rq4, clips8(in1_rq4));
131+
printf("core %d - in2_rq4 it3 after requant: %d\nclipped in2_rq4: %d\n", core_id, in2_rq4), clips8(in2_rq4);
132+
printf("core %d - sum4: %d\n", core_id, sum4);
133+
#endif
134+
135+
if (out_requant_flag) {
136+
sum1 = (sum1 * out_mul + out_add) >> out_shift;
137+
#ifdef ADD_VERBOSE
138+
printf("core %d - requantized sum1: %d\n", core_id, sum1);
139+
#endif
140+
sum2 = (sum2 * out_mul + out_add) >> out_shift;
141+
#ifdef ADD_VERBOSE
142+
printf("core %d - requantized sum2: %d\n", core_id, sum2);
143+
#endif
144+
sum3 = (sum3 * out_mul + out_add) >> out_shift;
145+
#ifdef ADD_VERBOSE
146+
printf("core %d - requantized sum3: %d\n", core_id, sum3);
147+
#endif
148+
sum4 = (sum4 * out_mul + out_add) >> out_shift;
149+
#ifdef ADD_VERBOSE
150+
printf("core %d - requantized sum4: %d\n", core_id, sum4);
151+
#endif
152+
}
153+
out1 = clips8(sum1);
154+
#ifdef ADD_VERBOSE
155+
printf("core %d - out1 clipped: %d\n", core_id, out1);
156+
#endif
157+
out2 = clips8(sum2);
158+
#ifdef ADD_VERBOSE
159+
printf("core %d - out2 clipped: %d\n", core_id, out2);
160+
#endif
161+
out3 = clips8(sum3);
162+
#ifdef ADD_VERBOSE
163+
printf("core %d - out3 clipped: %d\n", core_id, out3);
164+
#endif
165+
out4 = clips8(sum4);
166+
#ifdef ADD_VERBOSE
167+
printf("core %d - out4 clipped: %d\n", core_id, out4);
168+
#endif
169+
170+
171+
*pOutBuffer = (int8_t) out1;
172+
pOutBuffer++;
173+
*pOutBuffer = (int8_t) out2;
174+
pOutBuffer++;
175+
*pOutBuffer = (int8_t) out3;
176+
pOutBuffer++;
177+
*pOutBuffer = (int8_t) out4;
178+
pOutBuffer++;
179+
}
180+
// SCHEREMO: Cleanup leftovers, not doing it with this codebase for sub-byte formats
181+
for (int i=0; i<(((stop-start) * ch_im_out_r * dim_im_in_x) % 4); i++){
182+
in1_rq1 = ((*(target1)) * in1_mul + in1_add) >> in1_shift;
183+
in2_rq1 = ((*(target2)) * in2_mul + in2_add) >> in2_shift;
184+
185+
// SCHEREMO: Maybe it's just LLVM, but unless I hack 3 non-unrolled nops in here, stuff fails
186+
#pragma nounroll
187+
for (int j = 0; j < 3; j++) {
188+
asm volatile("nop" ::);
189+
}
190+
191+
target1++;
192+
target2++;
193+
sum1 = clips8(in1_rq1) + clips8(in2_rq1);
194+
if (out_requant_flag) {
195+
sum1 = (sum1 * out_mul + out_add) >> out_shift;
196+
}
197+
198+
out1 = clips8(sum1);
199+
*pOutBuffer = (int8_t)out1;
200+
pOutBuffer++;
201+
}
202+
}
This file was deleted.

0 commit comments

Comments
 (0)