Skip to content

Commit 6d84379

Browse files
committed
AVX-512 implementation of h_sum
1 parent 7dd57a1 commit 6d84379

File tree

4 files changed

+173
-4
lines changed

4 files changed

+173
-4
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright (C) 2024 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2024 Vladimir Sadovnikov <sadko4u@gmail.com>
4+
*
5+
* This file is part of lsp-dsp-lib
6+
* Created on: 11 дек. 2024 г.
7+
*
8+
* lsp-dsp-lib is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU Lesser General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* any later version.
12+
*
13+
* lsp-dsp-lib is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
* GNU Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public License
19+
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
20+
*/
21+
22+
#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_
23+
#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_
24+
25+
#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
26+
#error "This header should not be included directly"
27+
#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
28+
29+
30+
#include <private/dsp/arch/x86/avx512/hmath/hsum.h>
31+
32+
33+
#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_ */
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/*
2+
* Copyright (C) 2024 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2024 Vladimir Sadovnikov <sadko4u@gmail.com>
4+
*
5+
* This file is part of lsp-dsp-lib
6+
* Created on: 11 дек. 2024 г.
7+
*
8+
* lsp-dsp-lib is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU Lesser General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* any later version.
12+
*
13+
* lsp-dsp-lib is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
* GNU Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public License
19+
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
20+
*/
21+
22+
#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_
23+
#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_
24+
25+
#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
26+
#error "This header should not be included directly"
27+
#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
28+
29+
namespace lsp
30+
{
31+
namespace avx512
32+
{
33+
34+
float h_sum(const float *src, size_t count)
35+
{
36+
IF_ARCH_X86(float result);
37+
ARCH_X86_ASM
38+
(
39+
__ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
40+
__ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
41+
/* x128 blocks */
42+
__ASM_EMIT("sub $128, %[count]")
43+
__ASM_EMIT("jb 2f")
44+
__ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2")
45+
__ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3")
46+
__ASM_EMIT("1:")
47+
__ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
48+
__ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
49+
__ASM_EMIT("vaddps 0x080(%[src]), %%zmm2, %%zmm2")
50+
__ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm3, %%zmm3")
51+
__ASM_EMIT("vaddps 0x100(%[src]), %%zmm0, %%zmm0")
52+
__ASM_EMIT("vaddps 0x140(%[src]), %%zmm1, %%zmm1")
53+
__ASM_EMIT("vaddps 0x180(%[src]), %%zmm2, %%zmm2")
54+
__ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm3, %%zmm3")
55+
__ASM_EMIT("add $0x200, %[src]")
56+
__ASM_EMIT("sub $128, %[count]")
57+
__ASM_EMIT("jae 1b")
58+
__ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
59+
__ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
60+
__ASM_EMIT("2:")
61+
/* x32 blocks */
62+
__ASM_EMIT("add $64, %[count]")
63+
__ASM_EMIT("jl 4f")
64+
__ASM_EMIT("3:")
65+
__ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
66+
__ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
67+
__ASM_EMIT("add $0x80, %[src]")
68+
__ASM_EMIT("sub $32, %[count]")
69+
__ASM_EMIT("jge 3b")
70+
__ASM_EMIT("4:")
71+
__ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
72+
__ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
73+
__ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
74+
__ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
75+
/* x16 block */
76+
__ASM_EMIT("add $16, %[count]")
77+
__ASM_EMIT("jl 6f")
78+
__ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0")
79+
__ASM_EMIT("vaddps 0x020(%[src]), %%ymm1, %%ymm1")
80+
__ASM_EMIT("add $0x40, %[src]")
81+
__ASM_EMIT("sub $16, %[count]")
82+
__ASM_EMIT("6:")
83+
__ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
84+
__ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
85+
__ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
86+
__ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
87+
/* x8 block */
88+
__ASM_EMIT("add $8, %[count]")
89+
__ASM_EMIT("jl 8f")
90+
__ASM_EMIT("vaddps 0x000(%[src]), %%xmm0, %%xmm0")
91+
__ASM_EMIT("vaddps 0x010(%[src]), %%xmm1, %%xmm1")
92+
__ASM_EMIT("add $0x20, %[src]")
93+
__ASM_EMIT("sub $8, %[count]")
94+
__ASM_EMIT("8:")
95+
__ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
96+
/* x4 block */
97+
__ASM_EMIT("add $4, %[count]")
98+
__ASM_EMIT("jl 10f")
99+
__ASM_EMIT("vaddps 0x00(%[src]), %%xmm0, %%xmm0")
100+
__ASM_EMIT("add $0x10, %[src]")
101+
__ASM_EMIT("sub $4, %[count]")
102+
__ASM_EMIT("10:")
103+
/* x1 blocks */
104+
__ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
105+
__ASM_EMIT("add $3, %[count]")
106+
__ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
107+
__ASM_EMIT("jl 12f")
108+
__ASM_EMIT("11:")
109+
__ASM_EMIT("vaddss 0x00(%[src]), %%xmm0, %%xmm0")
110+
__ASM_EMIT("add $0x04, %[src]")
111+
__ASM_EMIT("dec %[count]")
112+
__ASM_EMIT("jge 11b")
113+
__ASM_EMIT("12:")
114+
/* end */
115+
: [src] "+r" (src), [count] "+r" (count),
116+
[res] "=Yz" (result)
117+
:
118+
: "cc", "memory",
119+
"%xmm1", "%xmm2", "%xmm3"
120+
);
121+
122+
return result;
123+
}
124+
125+
} /* namespace avx512 */
126+
} /* namespace lsp */
127+
128+
129+
#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_ */

src/main/x86/avx512.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include <private/dsp/arch/x86/avx512/dynamics.h>
4848
#include <private/dsp/arch/x86/avx512/float.h>
4949
#include <private/dsp/arch/x86/avx512/graphics/axis.h>
50+
#include <private/dsp/arch/x86/avx512/hmath.h>
5051
#include <private/dsp/arch/x86/avx512/msmatrix.h>
5152
#include <private/dsp/arch/x86/avx512/pcomplex.h>
5253
#include <private/dsp/arch/x86/avx512/pmath.h>
@@ -315,14 +316,14 @@
315316
CEXPORT1(vl, mix2);
316317
CEXPORT1(vl, mix_copy2);
317318
CEXPORT1(vl, mix_add2);
318-
319319
CEXPORT1(vl, mix3);
320320
CEXPORT1(vl, mix_copy3);
321321
CEXPORT1(vl, mix_add3);
322-
323322
CEXPORT1(vl, mix4);
324323
CEXPORT1(vl, mix_copy4);
325324
CEXPORT1(vl, mix_add4);
325+
326+
CEXPORT1(vl, h_sum);
326327
}
327328
} /* namespace avx512 */
328329
} /* namespace lsp */

src/test/utest/hmath/h_sum.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*
2-
* Copyright (C) 2023 Linux Studio Plugins Project <https://lsp-plug.in/>
3-
* (C) 2023 Vladimir Sadovnikov <sadko4u@gmail.com>
2+
* Copyright (C) 2024 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2024 Vladimir Sadovnikov <sadko4u@gmail.com>
44
*
55
* This file is part of lsp-dsp-lib
66
* Created on: 31 мар. 2020 г.
@@ -49,6 +49,11 @@ namespace lsp
4949
{
5050
float h_sum(const float *src, size_t count);
5151
}
52+
53+
namespace avx512
54+
{
55+
float h_sum(const float *src, size_t count);
56+
}
5257
)
5358

5459
IF_ARCH_ARM(
@@ -110,6 +115,7 @@ UTEST_BEGIN("dsp.hmath", h_sum)
110115

111116
IF_ARCH_X86(CALL(generic::h_sum, sse::h_sum, 16));
112117
IF_ARCH_X86(CALL(generic::h_sum, avx::h_sum, 32));
118+
IF_ARCH_X86(CALL(generic::h_sum, avx512::h_sum, 64));
113119
IF_ARCH_ARM(CALL(generic::h_sum, neon_d32::h_sum, 16));
114120
IF_ARCH_AARCH64(CALL(generic::h_sum, asimd::h_sum, 16));
115121
}

0 commit comments

Comments
 (0)