Skip to content

Commit f26e06b

Browse files
committed
ARM NEON optimizations for depan_lin and depan_eqpow
1 parent 60f69af commit f26e06b

File tree

6 files changed

+266
-8
lines changed

6 files changed

+266
-8
lines changed
Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
/*
2+
* Copyright (C) 2024 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2024 Vladimir Sadovnikov <sadko4u@gmail.com>
4+
*
5+
* This file is part of lsp-dsp-lib
6+
* Created on: 29 нояб. 2024 г.
7+
*
8+
* lsp-dsp-lib is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU Lesser General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* any later version.
12+
*
13+
* lsp-dsp-lib is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
* GNU Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public License
19+
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
20+
*/
21+
22+
#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_PAN_H_
23+
#define PRIVATE_DSP_ARCH_ARM_NEON_D32_PAN_H_
24+
25+
#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
26+
#error "This header should not be included directly"
27+
#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */
28+
29+
namespace lsp
30+
{
31+
namespace neon_d32
32+
{
33+
IF_ARCH_ARM(
34+
static const float depan_lin_const_f[] __lsp_aligned32 =
35+
{
36+
LSP_DSP_VEC8(1e-18f)
37+
};
38+
);
39+
40+
void depan_lin(float *dst, const float *l, const float *r, float dfl, size_t count)
41+
{
42+
/*
43+
const float sl = fabsf(l[i]);
44+
const float sr = fabsf(r[i]);
45+
const float den = sl + sr;
46+
dst[i] = (den >= 1e-18f) ? sr / den : dfl;
47+
*/
48+
ARCH_ARM_ASM
49+
(
50+
__ASM_EMIT("vdup.32 q15, %y[dfl]") /* q15 = dfl */
51+
__ASM_EMIT("vldm %[CC], {q12-q13}") /* q12-q13 = thresh */
52+
__ASM_EMIT("subs %[count], #8")
53+
__ASM_EMIT("blo 2f")
54+
/* 8x blocks */
55+
__ASM_EMIT("1:")
56+
__ASM_EMIT("vldm %[a]!, {q0-q1}") /* q0-q1 = l */
57+
__ASM_EMIT("vldm %[b]!, {q2-q3}") /* q2-q3 = r */
58+
__ASM_EMIT("vabs.f32 q0, q0") /* q0 = fabsf(l) */
59+
__ASM_EMIT("vabs.f32 q1, q1")
60+
__ASM_EMIT("vabs.f32 q2, q2") /* q2 = fabsf(r) */
61+
__ASM_EMIT("vabs.f32 q3, q3")
62+
__ASM_EMIT("vadd.f32 q0, q0, q2") /* q0 = den = fabsf(l) + fabsf(r) */
63+
__ASM_EMIT("vadd.f32 q1, q1, q3")
64+
__ASM_EMIT("vcge.f32 q4, q0, q12") /* q4 = [den >= thresh] */
65+
__ASM_EMIT("vcge.f32 q5, q1, q13")
66+
__ASM_EMIT("vrecpe.f32 q6, q0") /* q6 = s2 */
67+
__ASM_EMIT("vrecpe.f32 q7, q1")
68+
__ASM_EMIT("vrecps.f32 q8, q6, q0") /* q8 = (2 - R*s2) */
69+
__ASM_EMIT("vrecps.f32 q9, q7, q1")
70+
__ASM_EMIT("vmul.f32 q6, q8, q6") /* q6 = s2' = s2 * (2 - R*s2) */
71+
__ASM_EMIT("vmul.f32 q7, q9, q7")
72+
__ASM_EMIT("vrecps.f32 q8, q6, q0") /* q8 = (2 - R*s2') */
73+
__ASM_EMIT("vrecps.f32 q9, q7, q1")
74+
__ASM_EMIT("vmul.f32 q0, q8, q6") /* q0 = s2" = s2' * (2 - R*s2) = 1/s2 */
75+
__ASM_EMIT("vmul.f32 q1, q9, q7")
76+
__ASM_EMIT("vmul.f32 q0, q0, q2") /* q0 = pan = fabsf(r) / den */
77+
__ASM_EMIT("vmul.f32 q1, q1, q3")
78+
__ASM_EMIT("vbif q0, q15, q4") /* q0 = (den >= thresh) ? fabsf(r)/den : dfl */
79+
__ASM_EMIT("vbif q1, q15, q5")
80+
__ASM_EMIT("subs %[count], #8")
81+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
82+
__ASM_EMIT("bhs 1b")
83+
/* 4x block */
84+
__ASM_EMIT("2:")
85+
__ASM_EMIT("adds %[count], #4")
86+
__ASM_EMIT("blt 4f")
87+
__ASM_EMIT("vldm %[a]!, {q0}") /* q0 = l */
88+
__ASM_EMIT("vldm %[b]!, {q2}") /* q2 = r */
89+
__ASM_EMIT("vabs.f32 q0, q0") /* q0 = fabsf(l) */
90+
__ASM_EMIT("vabs.f32 q2, q2") /* q2 = fabsf(r) */
91+
__ASM_EMIT("vadd.f32 q0, q0, q2") /* q0 = den = fabsf(l) + fabsf(r) */
92+
__ASM_EMIT("vcge.f32 q4, q0, q12") /* q4 = [den >= thresh] */
93+
__ASM_EMIT("vrecpe.f32 q6, q0") /* q6 = s2 */
94+
__ASM_EMIT("vrecps.f32 q8, q6, q0") /* q8 = (2 - R*s2) */
95+
__ASM_EMIT("vmul.f32 q6, q8, q6") /* q6 = s2' = s2 * (2 - R*s2) */
96+
__ASM_EMIT("vrecps.f32 q8, q6, q0") /* q8 = (2 - R*s2') */
97+
__ASM_EMIT("vmul.f32 q0, q8, q6") /* q0 = s2" = s2' * (2 - R*s2) = 1/s2 */
98+
__ASM_EMIT("vmul.f32 q0, q0, q2") /* q0 = pan = fabsf(r) / den */
99+
__ASM_EMIT("vbif q0, q15, q4") /* q0 = (den >= thresh) ? fabsf(r)/den : dfl */
100+
__ASM_EMIT("sub %[count], #4")
101+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
102+
/* 1x blocks */
103+
__ASM_EMIT("4:")
104+
__ASM_EMIT("adds %[count], #3")
105+
__ASM_EMIT("blt 6f")
106+
__ASM_EMIT("5:")
107+
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[a]]!")
108+
__ASM_EMIT("vld1.32 {d4[], d5[]}, [%[b]]!")
109+
__ASM_EMIT("vabs.f32 q0, q0") /* q0 = fabsf(l) */
110+
__ASM_EMIT("vabs.f32 q2, q2") /* q2 = fabsf(r) */
111+
__ASM_EMIT("vadd.f32 q0, q0, q2") /* q0 = den = fabsf(l) + fabsf(r) */
112+
__ASM_EMIT("vcge.f32 q4, q0, q12") /* q4 = [den >= thresh] */
113+
__ASM_EMIT("vrecpe.f32 q6, q0") /* q6 = s2 */
114+
__ASM_EMIT("vrecps.f32 q8, q6, q0") /* q8 = (2 - R*s2) */
115+
__ASM_EMIT("vmul.f32 q6, q8, q6") /* q6 = s2' = s2 * (2 - R*s2) */
116+
__ASM_EMIT("vrecps.f32 q8, q6, q0") /* q8 = (2 - R*s2') */
117+
__ASM_EMIT("vmul.f32 q0, q8, q6") /* q0 = s2" = s2' * (2 - R*s2) = 1/s2 */
118+
__ASM_EMIT("vmul.f32 q0, q0, q2") /* q0 = pan = fabsf(r) / den */
119+
__ASM_EMIT("vbif q0, q15, q4") /* q0 = (den >= thresh) ? fabsf(r)/den : dfl */
120+
__ASM_EMIT("subs %[count], #1")
121+
__ASM_EMIT("vst1.32 {d0[0]}, [%[dst]]!")
122+
__ASM_EMIT("bge 5b")
123+
/* end */
124+
__ASM_EMIT("6:")
125+
126+
: [dst] "+r" (dst), [l] "+r" (l), [r] "+r" (r),
127+
[count] "+r" (count),
128+
[dfl] "+t" (dfl)
129+
: [CC] "r" (&depan_lin_const_f[0])
130+
: "cc", "memory",
131+
/* "q0" */, "q1", "q2", "q3",
132+
"q4", "q5", "q6", "q7",
133+
"q8", "q9",
134+
"q12", "q13", "q14", "q15"
135+
);
136+
}
137+
138+
IF_ARCH_X86(
139+
static const float depan_eqpow_const_f[] __lsp_aligned32 =
140+
{
141+
LSP_DSP_VEC8(1e-36f)
142+
};
143+
);
144+
145+
void depan_eqpow(float *dst, const float *l, const float *r, float dfl, size_t count)
146+
{
147+
/*
148+
const float sl = l[i] * l[i];
149+
const float sr = r[i] * r[i];
150+
const float den = sl + sr;
151+
dst[i] = (den >= 1e-36f) ? sr / den : dfl;
152+
*/
153+
ARCH_ARM_ASM
154+
(
155+
__ASM_EMIT("vdup.32 q15, %y[dfl]") /* q15 = dfl */
156+
__ASM_EMIT("vldm %[CC], {q12-q13}") /* q12-q13 = thresh */
157+
__ASM_EMIT("subs %[count], #8")
158+
__ASM_EMIT("blo 2f")
159+
/* 8x blocks */
160+
__ASM_EMIT("1:")
161+
__ASM_EMIT("vldm %[a]!, {q0-q1}") /* q0-q1 = l */
162+
__ASM_EMIT("vldm %[b]!, {q2-q3}") /* q2-q3 = r */
163+
__ASM_EMIT("vmul.f32 q0, q0, q0") /* q0 = l*l */
164+
__ASM_EMIT("vmul.f32 q1, q1, q1")
165+
__ASM_EMIT("vmul.f32 q2, q2, q2") /* q2 = r*r */
166+
__ASM_EMIT("vmul.f32 q3, q3, q3")
167+
__ASM_EMIT("vadd.f32 q0, q0, q2") /* q0 = den = l*l + r*r */
168+
__ASM_EMIT("vadd.f32 q1, q1, q3")
169+
__ASM_EMIT("vcge.f32 q4, q0, q12") /* q4 = [den >= thresh] */
170+
__ASM_EMIT("vcge.f32 q5, q1, q13")
171+
__ASM_EMIT("vrsqrte.f32 q6, q0") /* q6 = x0 */
172+
__ASM_EMIT("vrsqrte.f32 q7, q1")
173+
__ASM_EMIT("vmul.f32 q8, q6, q0") /* q8 = R * x0 */
174+
__ASM_EMIT("vmul.f32 q9, q7, q1")
175+
__ASM_EMIT("vrsqrts.f32 q10, q8, q6") /* q10 = (3 - R * x0 * x0) / 2 */
176+
__ASM_EMIT("vrsqrts.f32 q11, q9, q7")
177+
__ASM_EMIT("vmul.f32 q6, q6, q10") /* q6 = x1 = x0 * (3 - R * x0 * x0) / 2 */
178+
__ASM_EMIT("vmul.f32 q7, q7, q11")
179+
__ASM_EMIT("vmul.f32 q8, q6, q0") /* q8 = R * x1 */
180+
__ASM_EMIT("vmul.f32 q9, q7, q1")
181+
__ASM_EMIT("vrsqrts.f32 q10, q8, q6") /* q10 = (3 - R * x1 * x1) / 2 */
182+
__ASM_EMIT("vrsqrts.f32 q11, q9, q7")
183+
__ASM_EMIT("vmul.f32 q0, q6, q10") /* q0 = 1/sqrt(den) = x2 = x1 * (3 - R * x1 * x1) / 2 */
184+
__ASM_EMIT("vmul.f32 q1, q7, q11")
185+
__ASM_EMIT("vmul.f32 q0, q2, q6") /* q0 = pan = r*r/sqrt(den) */
186+
__ASM_EMIT("vmul.f32 q1, q2, q7")
187+
__ASM_EMIT("vbif q0, q15, q4") /* q0 = (den >= thresh) ? fabsf(r)/den : dfl */
188+
__ASM_EMIT("vbif q1, q15, q5")
189+
__ASM_EMIT("subs %[count], #8")
190+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
191+
__ASM_EMIT("bhs 1b")
192+
/* 4x block */
193+
__ASM_EMIT("2:")
194+
__ASM_EMIT("adds %[count], #4")
195+
__ASM_EMIT("blt 4f")
196+
__ASM_EMIT("vldm %[a]!, {q0}") /* q0 = l */
197+
__ASM_EMIT("vldm %[b]!, {q2}") /* q2 = r */
198+
__ASM_EMIT("vmul.f32 q0, q0, q0") /* q0 = l*l */
199+
__ASM_EMIT("vmul.f32 q2, q2, q2") /* q2 = r*r */
200+
__ASM_EMIT("vadd.f32 q0, q0, q2") /* q0 = den = l*l + r*r */
201+
__ASM_EMIT("vcge.f32 q4, q0, q12") /* q4 = [den >= thresh] */
202+
__ASM_EMIT("vrsqrte.f32 q6, q0") /* q6 = x0 */
203+
__ASM_EMIT("vmul.f32 q8, q6, q0") /* q8 = R * x0 */
204+
__ASM_EMIT("vrsqrts.f32 q10, q8, q6") /* q10 = (3 - R * x0 * x0) / 2 */
205+
__ASM_EMIT("vmul.f32 q6, q6, q10") /* q6 = x1 = x0 * (3 - R * x0 * x0) / 2 */
206+
__ASM_EMIT("vmul.f32 q8, q6, q0") /* q8 = R * x1 */
207+
__ASM_EMIT("vrsqrts.f32 q10, q8, q6") /* q10 = (3 - R * x1 * x1) / 2 */
208+
__ASM_EMIT("vmul.f32 q0, q6, q10") /* q0 = 1/sqrt(den) = x2 = x1 * (3 - R * x1 * x1) / 2 */
209+
__ASM_EMIT("vmul.f32 q0, q2, q6") /* q0 = pan = r*r/sqrt(den) */
210+
__ASM_EMIT("vbif q0, q15, q4") /* q0 = (den >= thresh) ? fabsf(r)/den : dfl */
211+
__ASM_EMIT("sub %[count], #4")
212+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
213+
/* 1x blocks */
214+
__ASM_EMIT("4:")
215+
__ASM_EMIT("adds %[count], #3")
216+
__ASM_EMIT("blt 6f")
217+
__ASM_EMIT("5:")
218+
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[a]]!")
219+
__ASM_EMIT("vld1.32 {d4[], d5[]}, [%[b]]!")
220+
__ASM_EMIT("vmul.f32 q0, q0, q0") /* q0 = l*l */
221+
__ASM_EMIT("vmul.f32 q2, q2, q2") /* q2 = r*r */
222+
__ASM_EMIT("vadd.f32 q0, q0, q2") /* q0 = den = l*l + r*r */
223+
__ASM_EMIT("vcge.f32 q4, q0, q12") /* q4 = [den >= thresh] */
224+
__ASM_EMIT("vrsqrte.f32 q6, q0") /* q6 = x0 */
225+
__ASM_EMIT("vmul.f32 q8, q6, q0") /* q8 = R * x0 */
226+
__ASM_EMIT("vrsqrts.f32 q10, q8, q6") /* q10 = (3 - R * x0 * x0) / 2 */
227+
__ASM_EMIT("vmul.f32 q6, q6, q10") /* q6 = x1 = x0 * (3 - R * x0 * x0) / 2 */
228+
__ASM_EMIT("vmul.f32 q8, q6, q0") /* q8 = R * x1 */
229+
__ASM_EMIT("vrsqrts.f32 q10, q8, q6") /* q10 = (3 - R * x1 * x1) / 2 */
230+
__ASM_EMIT("vmul.f32 q0, q6, q10") /* q0 = 1/sqrt(den) = x2 = x1 * (3 - R * x1 * x1) / 2 */
231+
__ASM_EMIT("vmul.f32 q0, q2, q6") /* q0 = pan = r*r/sqrt(den) */
232+
__ASM_EMIT("vbif q0, q15, q4") /* q0 = (den >= thresh) ? fabsf(r)/den : dfl */
233+
__ASM_EMIT("subs %[count], #1")
234+
__ASM_EMIT("vst1.32 {d0[0]}, [%[dst]]!")
235+
__ASM_EMIT("bge 5b")
236+
/* end */
237+
__ASM_EMIT("6:")
238+
239+
: [dst] "+r" (dst), [l] "+r" (l), [r] "+r" (r),
240+
[count] "+r" (count),
241+
[dfl] "+t" (dfl)
242+
: [CC] "r" (&depan_eqpow_const_f[0])
243+
: "cc", "memory",
244+
/* "q0" */, "q1", "q2", "q3",
245+
"q4", "q5", "q6", "q7",
246+
"q8", "q9", "q10", "q11",
247+
"q12", "q13", "q14", "q15"
248+
);
249+
}
250+
} /* namespace neon_d32 */
251+
} /* namespace lsp */
252+
253+
254+
#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_PAN_H_ */

src/main/arm/neon-d32.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
#include <private/dsp/arch/arm/neon-d32/hmath/hsum.h>
6969
#include <private/dsp/arch/arm/neon-d32/interpolation/linear.h>
7070
#include <private/dsp/arch/arm/neon-d32/mix.h>
71+
#include <private/dsp/arch/arm/neon-d32/pan.h>
7172
#include <private/dsp/arch/arm/neon-d32/msmatrix.h>
7273
#include <private/dsp/arch/arm/neon-d32/pcomplex.h>
7374
#include <private/dsp/arch/arm/neon-d32/pmath/abs_vv.h>
@@ -418,6 +419,9 @@
418419
EXPORT1(mix_add3);
419420
EXPORT1(mix_add4);
420421

422+
EXPORT1(depan_lin);
423+
EXPORT1(depan_eqpow);
424+
421425
EXPORT1(lin_inter_set);
422426
EXPORT1(lin_inter_mul2);
423427
EXPORT1(lin_inter_mul3);

src/test/ptest/pan/depan_eqpow.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ namespace lsp
5656
IF_ARCH_ARM(
5757
namespace neon_d32
5858
{
59-
// void depan_eqpow(float *dst, const float *l, const float *r, float dfl, size_t count);
59+
void depan_eqpow(float *dst, const float *l, const float *r, float dfl, size_t count);
6060
}
6161
)
6262

@@ -111,7 +111,7 @@ PTEST_BEGIN("dsp.pan", depan_eqpow, 5, 1000)
111111
IF_ARCH_X86(CALL(avx::depan_eqpow));
112112
IF_ARCH_X86(CALL(avx::depan_eqpow_fma3));
113113
IF_ARCH_X86(CALL(avx512::depan_eqpow));
114-
// IF_ARCH_ARM(CALL(neon_d32::depan_eqpow));
114+
IF_ARCH_ARM(CALL(neon_d32::depan_eqpow));
115115
// IF_ARCH_AARCH64(CALL(asimd::depan_eqpow));
116116
PTEST_SEPARATOR;
117117
}

src/test/ptest/pan/depan_lin.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ namespace lsp
5555
IF_ARCH_ARM(
5656
namespace neon_d32
5757
{
58-
// void depan_lin(float *dst, const float *l, const float *r, float dfl, size_t count);
58+
void depan_lin(float *dst, const float *l, const float *r, float dfl, size_t count);
5959
}
6060
)
6161

@@ -109,7 +109,7 @@ PTEST_BEGIN("dsp.pan", depan_lin, 5, 1000)
109109
IF_ARCH_X86(CALL(sse::depan_lin));
110110
IF_ARCH_X86(CALL(avx::depan_lin));
111111
IF_ARCH_X86(CALL(avx512::depan_lin));
112-
// IF_ARCH_ARM(CALL(neon_d32::depan_lin));
112+
IF_ARCH_ARM(CALL(neon_d32::depan_lin));
113113
// IF_ARCH_AARCH64(CALL(asimd::depan_lin));
114114
PTEST_SEPARATOR;
115115
}

src/test/utest/pan/depan_eqpow.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ namespace lsp
5252
IF_ARCH_ARM(
5353
namespace neon_d32
5454
{
55-
// void depan_eqpow(float *dst, const float *l, const float *r, float dfl, size_t count);
55+
void depan_eqpow(float *dst, const float *l, const float *r, float dfl, size_t count);
5656
}
5757
)
5858

@@ -118,7 +118,7 @@ UTEST_BEGIN("dsp.pan", depan_eqpow)
118118
IF_ARCH_X86(CALL(avx::depan_eqpow, 32));
119119
IF_ARCH_X86(CALL(avx::depan_eqpow_fma3, 32));
120120
IF_ARCH_X86(CALL(avx512::depan_eqpow, 64));
121-
// IF_ARCH_ARM(CALL(neon_d32::depan_eqpow, 16));
121+
IF_ARCH_ARM(CALL(neon_d32::depan_eqpow, 16));
122122
// IF_ARCH_AARCH64(CALL(asimd::depan_eqpow, 16));
123123
}
124124

src/test/utest/pan/depan_lin.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ namespace lsp
5151
IF_ARCH_ARM(
5252
namespace neon_d32
5353
{
54-
// void depan_lin(float *dst, const float *l, const float *r, float dfl, size_t count);
54+
void depan_lin(float *dst, const float *l, const float *r, float dfl, size_t count);
5555
}
5656
)
5757

@@ -116,7 +116,7 @@ UTEST_BEGIN("dsp.pan", depan_lin)
116116
IF_ARCH_X86(CALL(sse::depan_lin, 16));
117117
IF_ARCH_X86(CALL(avx::depan_lin, 32));
118118
IF_ARCH_X86(CALL(avx512::depan_lin, 64));
119-
// IF_ARCH_ARM(CALL(neon_d32::depan_lin, 16));
119+
IF_ARCH_ARM(CALL(neon_d32::depan_lin, 16));
120120
// IF_ARCH_AARCH64(CALL(asimd::depan_lin, 16));
121121
}
122122

0 commit comments

Comments
 (0)