Skip to content

Commit a07907d

Browse files
committed
Improved performance of logarithm values calculations
1 parent 6d416ad commit a07907d

File tree

3 files changed

+103
-144
lines changed

3 files changed

+103
-144
lines changed

CHANGELOG

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
=== 1.0.20 ===
66
* Optimization of compressor and gate functions using AVX-512 instruction set.
77
* Introduced SIMD-optimized expander curve and gain functions.
8+
* Improved performance of logarithm values calculations.
89
* Updated build scripts.
910

1011
=== 1.0.19 ===

include/private/dsp/arch/aarch64/asimd/pmath/log.h

Lines changed: 26 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,12 @@ namespace lsp
3535
{
3636
LSP_DSP_VEC4(0x007fffff), // MM = frac
3737
LSP_DSP_VEC4(0x0000007f), // ME = 127
38-
LSP_DSP_VEC4(0x3d888889), // C0 = 1/15 = 0.0666666701436043
39-
LSP_DSP_VEC4(0x3d9d89d9), // C1 = 1/13 = 0.0769230797886848
40-
LSP_DSP_VEC4(0x3dba2e8c), // C2 = 1/11 = 0.0909090936183929
41-
LSP_DSP_VEC4(0x3de38e39), // C3 = 1/9 = 0.1111111119389534
42-
LSP_DSP_VEC4(0x3e124925), // C4 = 1/7 = 0.1428571492433548
43-
LSP_DSP_VEC4(0x3e4ccccd), // C5 = 1/5 = 0.2000000029802322
44-
LSP_DSP_VEC4(0x3eaaaaab), // C6 = 1/3 = 0.3333333432674408
45-
LSP_DSP_VEC4(0x3f800000), // C7 = 1.0f
38+
LSP_DSP_VEC4(0x3dba2e8c), // C0 = 1/11 = 0.0909090936183929
39+
LSP_DSP_VEC4(0x3de38e39), // C1 = 1/9 = 0.1111111119389534
40+
LSP_DSP_VEC4(0x3e124925), // C2 = 1/7 = 0.1428571492433548
41+
LSP_DSP_VEC4(0x3e4ccccd), // C3 = 1/5 = 0.2000000029802322
42+
LSP_DSP_VEC4(0x3eaaaaab), // C4 = 1/3 = 0.3333333432674408
43+
LSP_DSP_VEC4(0x3f800000), // C5 = 1.0f
4644
};
4745

4846
static const float LOGB_C[] __lsp_aligned16 =
@@ -64,22 +62,22 @@ namespace lsp
6462
};
6563
)
6664

67-
#define LOGN_CORE_X8(MM, ME, C0, C1, C2, C3, C4, C5, C6, C7) \
65+
#define LOGN_CORE_X8(MM, ME, C0, C1, C2, C3, C4, C5) \
6866
/* v0 = x */ \
6967
__ASM_EMIT("ushr v2.4s, v0.4s, #23") /* v2 = ilog2(x) + 127 */ \
7068
__ASM_EMIT("ushr v3.4s, v1.4s, #23") \
7169
__ASM_EMIT("and v0.16b, v0.16b, " MM ".16b") /* v0 = x & MM */ \
7270
__ASM_EMIT("and v1.16b, v1.16b, " MM ".16b") \
7371
__ASM_EMIT("sub v2.4s, v2.4s, " ME ".4s") /* v2 = r - ME = ilog2(x) */ \
7472
__ASM_EMIT("sub v3.4s, v3.4s, " ME ".4s") \
75-
__ASM_EMIT("orr v0.16b, v0.16b, " C7 ".16b") /* v0 = X = (x & MM) | (C7 = 1.0f) */ \
76-
__ASM_EMIT("orr v1.16b, v1.16b, " C7 ".16b") \
73+
__ASM_EMIT("orr v0.16b, v0.16b, " C5 ".16b") /* v0 = X = (x & MM) | (C5 = 1.0f) */ \
74+
__ASM_EMIT("orr v1.16b, v1.16b, " C5 ".16b") \
7775
__ASM_EMIT("scvtf v2.4s, v2.4s") /* v2 = R = float(r) */ \
7876
__ASM_EMIT("scvtf v3.4s, v3.4s") \
79-
__ASM_EMIT("fadd v4.4s, v0.4s, " C7 ".4s") /* v4 = XB = X + (C7 = 1) */ \
80-
__ASM_EMIT("fadd v5.4s, v1.4s, " C7 ".4s") \
81-
__ASM_EMIT("fsub v0.4s, v0.4s, " C7 ".4s") /* v0 = XT = X - (C7 = 1) */ \
82-
__ASM_EMIT("fsub v1.4s, v1.4s, " C7 ".4s") \
77+
__ASM_EMIT("fadd v4.4s, v0.4s, " C5 ".4s") /* v4 = XB = X + (C5 = 1) */ \
78+
__ASM_EMIT("fadd v5.4s, v1.4s, " C5 ".4s") \
79+
__ASM_EMIT("fsub v0.4s, v0.4s, " C5 ".4s") /* v0 = XT = X - (C5 = 1) */ \
80+
__ASM_EMIT("fsub v1.4s, v1.4s, " C5 ".4s") \
8381
__ASM_EMIT("frecpe v6.4s, v4.4s") /* v6 = xb */ \
8482
__ASM_EMIT("frecpe v7.4s, v5.4s") \
8583
__ASM_EMIT("frecps v8.4s, v6.4s, v4.4s") /* v8 = (2 - XB*xb) */ \
@@ -115,27 +113,19 @@ namespace lsp
115113
__ASM_EMIT("fmul v7.4s, v7.4s, v5.4s") \
116114
__ASM_EMIT("fadd v6.4s, v6.4s, " C5 ".4s") /* v6 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \
117115
__ASM_EMIT("fadd v7.4s, v7.4s, " C5 ".4s") \
118-
__ASM_EMIT("fmul v6.4s, v6.4s, v4.4s") /* v6 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
119-
__ASM_EMIT("fmul v7.4s, v7.4s, v5.4s") \
120-
__ASM_EMIT("fadd v6.4s, v6.4s, " C6 ".4s") /* v6 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
121-
__ASM_EMIT("fadd v7.4s, v7.4s, " C6 ".4s") \
122-
__ASM_EMIT("fmul v6.4s, v6.4s, v4.4s") /* v6 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
123-
__ASM_EMIT("fmul v7.4s, v7.4s, v5.4s") \
124-
__ASM_EMIT("fadd v6.4s, v6.4s, " C7 ".4s") /* v6 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
125-
__ASM_EMIT("fadd v7.4s, v7.4s, " C7 ".4s") \
126-
__ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") /* v0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \
116+
__ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") /* v0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
127117
__ASM_EMIT("fmul v1.4s, v1.4s, v7.4s") \
128118
/* v0 = y*L, v2 = R */
129119

130-
#define LOGN_CORE_X4(MM, ME, C0, C1, C2, C3, C4, C5, C6, C7) \
120+
#define LOGN_CORE_X4(MM, ME, C0, C1, C2, C3, C4, C5) \
131121
/* v0 = x */ \
132122
__ASM_EMIT("ushr v2.4s, v0.4s, #23") /* v2 = ilog2(x) + 127 */ \
133123
__ASM_EMIT("and v0.16b, v0.16b, " MM ".16b") /* v0 = x & MM */ \
134124
__ASM_EMIT("sub v2.4s, v2.4s, " ME ".4s") /* v2 = r - ME = ilog2(x) */ \
135-
__ASM_EMIT("orr v0.16b, v0.16b, " C7 ".16b") /* v0 = X = (x & MM) | (C7 = 1.0f) */ \
125+
__ASM_EMIT("orr v0.16b, v0.16b, " C5 ".16b") /* v0 = X = (x & MM) | (C5 = 1.0f) */ \
136126
__ASM_EMIT("scvtf v2.4s, v2.4s") /* v2 = R = float(r) */ \
137-
__ASM_EMIT("fadd v4.4s, v0.4s, " C7 ".4s") /* v4 = XB = X + (C7 = 1) */ \
138-
__ASM_EMIT("fsub v0.4s, v0.4s, " C7 ".4s") /* v0 = XT = X - (C7 = 1) */ \
127+
__ASM_EMIT("fadd v4.4s, v0.4s, " C5 ".4s") /* v4 = XB = X + (C5 = 1) */ \
128+
__ASM_EMIT("fsub v0.4s, v0.4s, " C5 ".4s") /* v0 = XT = X - (C5 = 1) */ \
139129
__ASM_EMIT("frecpe v6.4s, v4.4s") /* v6 = xb */ \
140130
__ASM_EMIT("frecps v8.4s, v6.4s, v4.4s") /* v8 = (2 - XB*xb) */ \
141131
__ASM_EMIT("fmul v6.4s, v8.4s, v6.4s") /* v6 = xb' = xb * (2 - XB*xb) */ \
@@ -154,31 +144,26 @@ namespace lsp
154144
__ASM_EMIT("fadd v6.4s, v6.4s, " C4 ".4s") /* v6 = C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))) */ \
155145
__ASM_EMIT("fmul v6.4s, v6.4s, v4.4s") /* v6 = Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \
156146
__ASM_EMIT("fadd v6.4s, v6.4s, " C5 ".4s") /* v6 = C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))) */ \
157-
__ASM_EMIT("fmul v6.4s, v6.4s, v4.4s") /* v6 = Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
158-
__ASM_EMIT("fadd v6.4s, v6.4s, " C6 ".4s") /* v6 = C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
159-
__ASM_EMIT("fmul v6.4s, v6.4s, v4.4s") /* v6 = Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
160-
__ASM_EMIT("fadd v6.4s, v6.4s, " C7 ".4s") /* v6 = C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y)))))) */ \
161-
__ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") /* v0 = y*(C7+Y*(C6+Y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))))) */ \
147+
__ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") /* v0 = y*(C5+Y*(C4+Y*(C3+Y*(C2+Y*(C1+C0*Y))))) */ \
162148
/* v0 = y*L, v2 = R */
163149

164150
#define LOGN_CORE_LOAD \
165151
__ASM_EMIT("ldp q26, q27, [%[LOGC]]") \
166152
__ASM_EMIT("ldp q16, q17, [%[L2C], #0x00]") /* v16 = MM, v17 = ME */ \
167153
__ASM_EMIT("ldp q18, q19, [%[L2C], #0x20]") /* v18 = C0, v19 = C1 */ \
168154
__ASM_EMIT("ldp q20, q21, [%[L2C], #0x40]") /* v20 = C2, v21 = C3 */ \
169-
__ASM_EMIT("ldp q22, q23, [%[L2C], #0x60]") /* v22 = C4, v23 = C5 */ \
170-
__ASM_EMIT("ldp q24, q25, [%[L2C], #0x80]") /* v24 = C6, v25 = C7 */
155+
__ASM_EMIT("ldp q22, q23, [%[L2C], #0x60]") /* v22 = C4, v23 = C5 */
171156

172157
#define LOGB_CORE_X8_NOLOAD \
173158
/* in: v0 = x1, v1 = x2 */ \
174-
LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
159+
LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
175160
__ASM_EMIT("fmla v2.4s, v0.4s, v26.4s") /* v2 = R + 2*y*L*M_LOG2E */ \
176161
__ASM_EMIT("fmla v3.4s, v1.4s, v27.4s") \
177162
/* out: v0 = logb(x0), v1 = logb(x1) */
178163

179164
#define LOGB_CORE_X4_NOLOAD \
180165
/* in: v0 = x1 */ \
181-
LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
166+
LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
182167
__ASM_EMIT("fmla v2.4s, v0.4s, v26.4s") /* v2 = R + 2*y*L*M_LOG2E */ \
183168
/* out: v0 = logb(x0) */
184169

@@ -196,7 +181,7 @@ namespace lsp
196181

197182
#define LOGE_CORE_X8_NOLOAD \
198183
/* in: v0 = x1, v1 = x2 */ \
199-
LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
184+
LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
200185
__ASM_EMIT("fadd v0.4s, v0.4s, v0.4s") /* v0 = 2*y*L */ \
201186
__ASM_EMIT("fadd v1.4s, v1.4s, v1.4s") \
202187
__ASM_EMIT("fmla v0.4s, v2.4s, v26.4s") /* v0 = 2*y*L + R/log2(E) */ \
@@ -205,7 +190,7 @@ namespace lsp
205190

206191
#define LOGE_CORE_X4_NOLOAD \
207192
/* in: v0 = x1 */ \
208-
LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
193+
LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
209194
__ASM_EMIT("fadd v0.4s, v0.4s, v0.4s") /* v0 = 2*y*L */ \
210195
__ASM_EMIT("fmla v0.4s, v2.4s, v26.4s") /* v0 = 2*y*L + R/log2(E) */ \
211196
/* out: v0 = loge(x0) */
@@ -224,7 +209,7 @@ namespace lsp
224209

225210
#define LOGD_CORE_X8_NOLOAD \
226211
/* in: v0 = x1, v1 = x2 */ \
227-
LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
212+
LOGN_CORE_X8("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
228213
__ASM_EMIT("fmul v0.4s, v0.4s, v26.4s") /* v0 = 2*y*L*log10(E) */ \
229214
__ASM_EMIT("fmul v1.4s, v1.4s, v26.4s") \
230215
__ASM_EMIT("fmla v0.4s, v2.4s, v27.4s") /* v0 = 2*y*L*log10(E) + R/log2(10) */ \
@@ -233,7 +218,7 @@ namespace lsp
233218

234219
#define LOGD_CORE_X4_NOLOAD \
235220
/* in: v0 = x1 */ \
236-
LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25") \
221+
LOGN_CORE_X4("v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23") \
237222
__ASM_EMIT("fmul v0.4s, v0.4s, v26.4s") /* v0 = 2*y*L*log10(E) */ \
238223
__ASM_EMIT("fmla v0.4s, v2.4s, v27.4s") /* v0 = 2*y*L*log10(E) + R/log2(10) */ \
239224
/* out: v0 = logd(x0) */

0 commit comments

Comments
 (0)