@@ -35,14 +35,12 @@ namespace lsp
     {
         LSP_DSP_VEC4(0x007fffff),               // MM = frac
         LSP_DSP_VEC4(0x0000007f),               // ME = 127
-        LSP_DSP_VEC4(0x3d888889),               // C0 = 1/15 = 0.0666666701436043
-        LSP_DSP_VEC4(0x3d9d89d9),               // C1 = 1/13 = 0.0769230797886848
-        LSP_DSP_VEC4(0x3dba2e8c),               // C2 = 1/11 = 0.0909090936183929
-        LSP_DSP_VEC4(0x3de38e39),               // C3 = 1/9 = 0.1111111119389534
-        LSP_DSP_VEC4(0x3e124925),               // C4 = 1/7 = 0.1428571492433548
-        LSP_DSP_VEC4(0x3e4ccccd),               // C5 = 1/5 = 0.2000000029802322
-        LSP_DSP_VEC4(0x3eaaaaab),               // C6 = 1/3 = 0.3333333432674408
-        LSP_DSP_VEC4(0x3f800000),               // C7 = 1.0f
+        LSP_DSP_VEC4(0x3dba2e8c),               // C0 = 1/11 = 0.0909090936183929
+        LSP_DSP_VEC4(0x3de38e39),               // C1 = 1/9 = 0.1111111119389534
+        LSP_DSP_VEC4(0x3e124925),               // C2 = 1/7 = 0.1428571492433548
+        LSP_DSP_VEC4(0x3e4ccccd),               // C3 = 1/5 = 0.2000000029802322
+        LSP_DSP_VEC4(0x3eaaaaab),               // C4 = 1/3 = 0.3333333432674408
+        LSP_DSP_VEC4(0x3f800000),               // C5 = 1.0f
     };

     static const float LOGB_C[] __lsp_aligned16 =
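For context: C0..C5 are the odd reciprocals of the atanh series ln(X) = 2*atanh(y) with y = (X-1)/(X+1), so the change drops the two highest-order terms (1/15 and 1/13) and renumbers the remaining coefficients, which removes two fmul/fadd stages from each core below. A minimal scalar sketch of the shortened series (plain C, hypothetical helper name, illustration only -- not part of the library):

    // ~ln(X) for X in [1, 2): 2*y*(C5 + Y*(C4 + Y*(C3 + Y*(C2 + Y*(C1 + Y*C0)))))
    static float logn_poly(float X)
    {
        const float C0 = 1.0f/11, C1 = 1.0f/9, C2 = 1.0f/7,
                    C3 = 1.0f/5,  C4 = 1.0f/3, C5 = 1.0f;
        const float y = (X - 1.0f) / (X + 1.0f);    // |y| < 1/3 on [1, 2)
        const float Y = y * y;
        float p = C0;
        p = p*Y + C1;
        p = p*Y + C2;
        p = p*Y + C3;
        p = p*Y + C4;
        p = p*Y + C5;
        return 2.0f * y * p;
    }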
@@ -64,22 +62,22 @@ namespace lsp
         };
     )

-    #define LOGN_CORE_X8(MM, ME, C0, C1, C2, C3, C4, C5, C6, C7) \
+    #define LOGN_CORE_X8(MM, ME, C0, C1, C2, C3, C4, C5) \
         /* v0 = x */ \
         __ASM_EMIT("ushr        v2.4s, v0.4s, #23")             /* v2 = ilog2(x) + 127 */ \
         __ASM_EMIT("ushr        v3.4s, v1.4s, #23") \
         __ASM_EMIT("and         v0.16b, v0.16b, " MM ".16b")    /* v0 = x & MM */ \
         __ASM_EMIT("and         v1.16b, v1.16b, " MM ".16b") \
         __ASM_EMIT("sub         v2.4s, v2.4s, " ME ".4s")       /* v2 = r - ME = ilog2(x) */ \
         __ASM_EMIT("sub         v3.4s, v3.4s, " ME ".4s") \
-        __ASM_EMIT("orr         v0.16b, v0.16b, " C7 ".16b")    /* v0 = X = (x & MM) | (C7 = 1.0f) */ \
-        __ASM_EMIT("orr         v1.16b, v1.16b, " C7 ".16b") \
+        __ASM_EMIT("orr         v0.16b, v0.16b, " C5 ".16b")    /* v0 = X = (x & MM) | (C5 = 1.0f) */ \
+        __ASM_EMIT("orr         v1.16b, v1.16b, " C5 ".16b") \
         __ASM_EMIT("scvtf       v2.4s, v2.4s")                  /* v2 = R = float(r) */ \
         __ASM_EMIT("scvtf       v3.4s, v3.4s") \
-        __ASM_EMIT("fadd        v4.4s, v0.4s, " C7 ".4s")       /* v4 = XB = X + (C7 = 1) */ \
-        __ASM_EMIT("fadd        v5.4s, v1.4s, " C7 ".4s") \
-        __ASM_EMIT("fsub        v0.4s, v0.4s, " C7 ".4s")       /* v0 = XT = X - (C7 = 1) */ \
-        __ASM_EMIT("fsub        v1.4s, v1.4s, " C7 ".4s") \
+        __ASM_EMIT("fadd        v4.4s, v0.4s, " C5 ".4s")       /* v4 = XB = X + (C5 = 1) */ \
+        __ASM_EMIT("fadd        v5.4s, v1.4s, " C5 ".4s") \
+        __ASM_EMIT("fsub        v0.4s, v0.4s, " C5 ".4s")       /* v0 = XT = X - (C5 = 1) */ \
+        __ASM_EMIT("fsub        v1.4s, v1.4s, " C5 ".4s") \
         __ASM_EMIT("frecpe      v6.4s, v4.4s")                  /* v6 = xb */ \
         __ASM_EMIT("frecpe      v7.4s, v5.4s") \
         __ASM_EMIT("frecps      v8.4s, v6.4s, v4.4s")           /* v8 = (2 - XB*xb) */ \
@@ -115,27 +113,19 @@ namespace lsp
         __ASM_EMIT("fmul        v7.4s, v7.4s, v5.4s") \
         __ASM_EMIT("fadd        v6.4s, v6.4s, " C5 ".4s")   /* v6 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \
         __ASM_EMIT("fadd        v7.4s, v7.4s, " C5 ".4s") \
-        __ASM_EMIT("fmul        v6.4s, v6.4s, v4.4s")       /* v6 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
-        __ASM_EMIT("fmul        v7.4s, v7.4s, v5.4s") \
-        __ASM_EMIT("fadd        v6.4s, v6.4s, " C6 ".4s")   /* v6 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
-        __ASM_EMIT("fadd        v7.4s, v7.4s, " C6 ".4s") \
-        __ASM_EMIT("fmul        v6.4s, v6.4s, v4.4s")       /* v6 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
-        __ASM_EMIT("fmul        v7.4s, v7.4s, v5.4s") \
-        __ASM_EMIT("fadd        v6.4s, v6.4s, " C7 ".4s")   /* v6 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
-        __ASM_EMIT("fadd        v7.4s, v7.4s, " C7 ".4s") \
-        __ASM_EMIT("fmul        v0.4s, v0.4s, v6.4s")       /* v0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \
+        __ASM_EMIT("fmul        v0.4s, v0.4s, v6.4s")       /* v0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
         __ASM_EMIT("fmul        v1.4s, v1.4s, v7.4s") \
         /* v0 = y*L, v2 = R */

-    #define LOGN_CORE_X4(MM, ME, C0, C1, C2, C3, C4, C5, C6, C7) \
+    #define LOGN_CORE_X4(MM, ME, C0, C1, C2, C3, C4, C5) \
         /* v0 = x */ \
         __ASM_EMIT("ushr        v2.4s, v0.4s, #23")             /* v2 = ilog2(x) + 127 */ \
         __ASM_EMIT("and         v0.16b, v0.16b, " MM ".16b")    /* v0 = x & MM */ \
         __ASM_EMIT("sub         v2.4s, v2.4s, " ME ".4s")       /* v2 = r - ME = ilog2(x) */ \
-        __ASM_EMIT("orr         v0.16b, v0.16b, " C7 ".16b")    /* v0 = X = (x & MM) | (C7 = 1.0f) */ \
+        __ASM_EMIT("orr         v0.16b, v0.16b, " C5 ".16b")    /* v0 = X = (x & MM) | (C5 = 1.0f) */ \
         __ASM_EMIT("scvtf       v2.4s, v2.4s")                  /* v2 = R = float(r) */ \
-        __ASM_EMIT("fadd        v4.4s, v0.4s, " C7 ".4s")       /* v4 = XB = X + (C7 = 1) */ \
-        __ASM_EMIT("fsub        v0.4s, v0.4s, " C7 ".4s")       /* v0 = XT = X - (C7 = 1) */ \
+        __ASM_EMIT("fadd        v4.4s, v0.4s, " C5 ".4s")       /* v4 = XB = X + (C5 = 1) */ \
+        __ASM_EMIT("fsub        v0.4s, v0.4s, " C5 ".4s")       /* v0 = XT = X - (C5 = 1) */ \
         __ASM_EMIT("frecpe      v6.4s, v4.4s")                  /* v6 = xb */ \
         __ASM_EMIT("frecps      v8.4s, v6.4s, v4.4s")           /* v8 = (2 - XB*xb) */ \
         __ASM_EMIT("fmul        v6.4s, v8.4s, v6.4s")           /* v6 = xb' = xb * (2 - XB*xb) */ \
@@ -154,31 +144,26 @@ namespace lsp
         __ASM_EMIT("fadd        v6.4s, v6.4s, " C4 ".4s")   /* v6 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \
         __ASM_EMIT("fmul        v6.4s, v6.4s, v4.4s")       /* v6 = Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \
         __ASM_EMIT("fadd        v6.4s, v6.4s, " C5 ".4s")   /* v6 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \
-        __ASM_EMIT("fmul        v6.4s, v6.4s, v4.4s")       /* v6 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
-        __ASM_EMIT("fadd        v6.4s, v6.4s, " C6 ".4s")   /* v6 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
-        __ASM_EMIT("fmul        v6.4s, v6.4s, v4.4s")       /* v6 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
-        __ASM_EMIT("fadd        v6.4s, v6.4s, " C7 ".4s")   /* v6 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
-        __ASM_EMIT("fmul        v0.4s, v0.4s, v6.4s")       /* v0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \
+        __ASM_EMIT("fmul        v0.4s, v0.4s, v6.4s")       /* v0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
         /* v0 = y*L, v2 = R */

     #define LOGN_CORE_LOAD \
         __ASM_EMIT("ldp         q26, q27, [%[LOGC]]") \
         __ASM_EMIT("ldp         q16, q17, [%[L2C], #0x00]")     /* v16 = MM, v17 = ME */ \
         __ASM_EMIT("ldp         q18, q19, [%[L2C], #0x20]")     /* v18 = C0, v19 = C1 */ \
         __ASM_EMIT("ldp         q20, q21, [%[L2C], #0x40]")     /* v20 = C2, v21 = C3 */ \
-        __ASM_EMIT("ldp         q22, q23, [%[L2C], #0x60]")     /* v22 = C4, v23 = C5 */ \
-        __ASM_EMIT("ldp         q24, q25, [%[L2C], #0x80]")     /* v24 = C6, v25 = C7 */
+        __ASM_EMIT("ldp         q22, q23, [%[L2C], #0x60]")     /* v22 = C4, v23 = C5 */

     #define LOGB_CORE_X8_NOLOAD \
         /* in: v0 = x1, v1 = x2 */ \
-        LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
+        LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
         __ASM_EMIT("fmla        v2.4s, v0.4s, v26.4s")      /* v2 = R + 2*y*L*M_LOG2E */ \
         __ASM_EMIT("fmla        v3.4s, v1.4s, v27.4s") \
         /* out: v0 = logb(x0), v1 = logb(x1) */

     #define LOGB_CORE_X4_NOLOAD \
         /* in: v0 = x1 */ \
-        LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
+        LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
         __ASM_EMIT("fmla        v2.4s, v0.4s, v26.4s")      /* v2 = R + 2*y*L*M_LOG2E */ \
         /* out: v0 = logb(x0) */
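As a cross-check of the MM/ME/C5 bit manipulation shared by both cores: x is reduced to X * 2^R with X in [1, 2) using only integer operations on the float's representation. A hedged scalar model of that range reduction (hypothetical helper name; assumes IEEE-754 binary32):

    #include <stdint.h>
    #include <string.h>

    // ushr #23, sub ME, scvtf  ->  R;   and MM, orr C5 (1.0f)  ->  X
    static void logn_split(float x, float *X, float *R)
    {
        uint32_t bits;
        memcpy(&bits, &x, sizeof(bits));
        *R = (float)((int32_t)(bits >> 23) - 127);      // unbiased exponent
        bits = (bits & 0x007fffffu) | 0x3f800000u;      // mantissa | 1.0f
        memcpy(X, &bits, sizeof(*X));
    }

The caller then forms ln(x) = 2*atanh((X-1)/(X+1)) + R*ln(2), which is what the base-specific wrappers assemble from v0 = y*L and v2 = R.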
@@ -196,7 +181,7 @@ namespace lsp

     #define LOGE_CORE_X8_NOLOAD \
         /* in: v0 = x1, v1 = x2 */ \
-        LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
+        LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
         __ASM_EMIT("fadd        v0.4s, v0.4s, v0.4s")       /* v0 = 2*y*L */ \
         __ASM_EMIT("fadd        v1.4s, v1.4s, v1.4s") \
         __ASM_EMIT("fmla        v0.4s, v2.4s, v26.4s")      /* v0 = 2*y*L + R/log2(E) */ \
@@ -205,7 +190,7 @@ namespace lsp

     #define LOGE_CORE_X4_NOLOAD \
         /* in: v0 = x1 */ \
-        LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
+        LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
         __ASM_EMIT("fadd        v0.4s, v0.4s, v0.4s")       /* v0 = 2*y*L */ \
         __ASM_EMIT("fmla        v0.4s, v2.4s, v26.4s")      /* v0 = 2*y*L + R/log2(E) */ \
         /* out: v0 = loge(x0) */
@@ -224,7 +209,7 @@ namespace lsp

     #define LOGD_CORE_X8_NOLOAD \
         /* in: v0 = x1, v1 = x2 */ \
-        LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
+        LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
         __ASM_EMIT("fmul        v0.4s, v0.4s, v26.4s")      /* v0 = 2*y*L*log10(E) */ \
         __ASM_EMIT("fmul        v1.4s, v1.4s, v26.4s") \
         __ASM_EMIT("fmla        v0.4s, v2.4s, v27.4s")      /* v0 = 2*y*L*log10(E) + R/log2(10) */ \
@@ -233,7 +218,7 @@ namespace lsp

     #define LOGD_CORE_X4_NOLOAD \
         /* in: v0 = x1 */ \
-        LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
+        LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
         __ASM_EMIT("fmul        v0.4s, v0.4s, v26.4s")      /* v0 = 2*y*L*log10(E) */ \
         __ASM_EMIT("fmla        v0.4s, v2.4s, v27.4s")      /* v0 = 2*y*L*log10(E) + R/log2(10) */ \
         /* out: v0 = logd(x0) */
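All six NOLOAD wrappers consume the same core outputs (v0 = y*L, v2 = R) and differ only in the constants held in v26/v27. Going by the macro comments alone (the LOGC constant tables are not shown in this diff), the per-base combinations reduce to the following scalar sketch (hypothetical helpers; constants from <math.h>):

    #include <math.h>

    // yl = y*L from the core, R = float(ilog2(x)):
    static float logb_combine(float yl, float R) { return R + 2.0f*yl*(float)M_LOG2E; }     // log2(x)
    static float loge_combine(float yl, float R) { return 2.0f*yl + R*(float)M_LN2; }       // ln(x):  R/log2(e) = R*ln(2)
    static float logd_combine(float yl, float R) { return 2.0f*yl*(float)M_LOG10E
                                                        + R*(float)(M_LN2/M_LN10); }        // log10(x): R/log2(10) = R*log10(2)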