diff --git a/include/private/dsp/arch/x86/avx512/hmath.h b/include/private/dsp/arch/x86/avx512/hmath.h
new file mode 100644
index 00000000..ced53654
--- /dev/null
+++ b/include/private/dsp/arch/x86/avx512/hmath.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 11 дек. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_
+#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
+
+
+#include <private/dsp/arch/x86/avx512/hmath/hsum.h>
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_ */
diff --git a/include/private/dsp/arch/x86/avx512/hmath/hsum.h b/include/private/dsp/arch/x86/avx512/hmath/hsum.h
new file mode 100644
index 00000000..51f2574e
--- /dev/null
+++ b/include/private/dsp/arch/x86/avx512/hmath/hsum.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 11 дек. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_
+#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
+
+namespace lsp
+{
+ namespace avx512
+ {
+ /* Returns the sum of count floats read from src; partial sums are kept in zmm0-zmm3 and folded down to the low lane of xmm0. */
+ float h_sum(const float *src, size_t count)
+ {
+ IF_ARCH_X86(float result);
+ ARCH_X86_ASM
+ (
+ __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") /* accumulator 0 = 0 */
+ __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") /* accumulator 1 = 0 */
+ /* x128 blocks: 8 loads of 16 floats per iteration, spread over four accumulators */
+ __ASM_EMIT("sub $128, %[count]") /* count = remaining - 128 */
+ __ASM_EMIT("jb 2f") /* borrow: fewer than 128 floats, skip the wide loop */
+ __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") /* accumulators 2 and 3 are only needed by this loop */
+ __ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
+ __ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
+ __ASM_EMIT("vaddps 0x080(%[src]), %%zmm2, %%zmm2")
+ __ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm3, %%zmm3")
+ __ASM_EMIT("vaddps 0x100(%[src]), %%zmm0, %%zmm0")
+ __ASM_EMIT("vaddps 0x140(%[src]), %%zmm1, %%zmm1")
+ __ASM_EMIT("vaddps 0x180(%[src]), %%zmm2, %%zmm2")
+ __ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm3, %%zmm3")
+ __ASM_EMIT("add $0x200, %[src]") /* 128 floats = 0x200 bytes */
+ __ASM_EMIT("sub $128, %[count]")
+ __ASM_EMIT("jae 1b") /* no borrow: at least 128 more floats remain */
+ __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0") /* merge four accumulators back into two */
+ __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
+ __ASM_EMIT("2:")
+ /* x32 blocks: 2 loads of 16 floats per iteration */
+ __ASM_EMIT("add $96, %[count]") /* rebias: count = remaining - 32 */
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("3:")
+ __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
+ __ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
+ __ASM_EMIT("add $0x80, %[src]")
+ __ASM_EMIT("sub $32, %[count]")
+ __ASM_EMIT("jge 3b")
+ __ASM_EMIT("4:")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2") /* fold the upper 256 bits of each accumulator onto the lower 256 bits */
+ __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+ __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ /* x16 block: 2 loads of 8 floats, executed at most once */
+ __ASM_EMIT("add $16, %[count]") /* rebias: count = remaining - 16 */
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0")
+ __ASM_EMIT("vaddps 0x020(%[src]), %%ymm1, %%ymm1")
+ __ASM_EMIT("add $0x40, %[src]")
+ __ASM_EMIT("sub $16, %[count]")
+ __ASM_EMIT("6:")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2") /* fold the upper 128 bits of each accumulator onto the lower 128 bits */
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ /* x8 block: 2 loads of 4 floats, executed at most once */
+ __ASM_EMIT("add $8, %[count]") /* rebias: count = remaining - 8 */
+ __ASM_EMIT("jl 8f")
+ __ASM_EMIT("vaddps 0x000(%[src]), %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddps 0x010(%[src]), %%xmm1, %%xmm1")
+ __ASM_EMIT("add $0x20, %[src]")
+ __ASM_EMIT("sub $8, %[count]")
+ __ASM_EMIT("8:")
+ __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0") /* merge the two accumulators into xmm0 */
+ /* x4 block: 1 load of 4 floats, executed at most once */
+ __ASM_EMIT("add $4, %[count]") /* rebias: count = remaining - 4 */
+ __ASM_EMIT("jl 10f")
+ __ASM_EMIT("vaddps 0x00(%[src]), %%xmm0, %%xmm0")
+ __ASM_EMIT("add $0x10, %[src]")
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("10:")
+ /* x1 blocks: reduce the four lanes to one value, then add the last 0..3 floats one by one */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+ __ASM_EMIT("add $3, %[count]") /* rebias: count = remaining - 1; the flags set here are tested by the jl below */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* after two horizontal adds the low lane holds the sum of all four lanes */
+ __ASM_EMIT("jl 12f")
+ __ASM_EMIT("11:")
+ __ASM_EMIT("vaddss 0x00(%[src]), %%xmm0, %%xmm0")
+ __ASM_EMIT("add $0x04, %[src]")
+ __ASM_EMIT("dec %[count]")
+ __ASM_EMIT("jge 11b")
+ __ASM_EMIT("12:")
+ /* end: src and count are read-write operands; the "Yz" constraint binds result to xmm0, which holds the total */
+ : [src] "+r" (src), [count] "+r" (count),
+ [res] "=Yz" (result)
+ :
+ : "cc", "memory",
+ "%xmm1", "%xmm2", "%xmm3"
+ );
+
+ return result;
+ }
+ /* Returns the sum of squares of count floats read from src; squares are accumulated with fused multiply-add in zmm0-zmm3 and folded down to the low lane of xmm0. */
+ float h_sqr_sum(const float *src, size_t count)
+ {
+ IF_ARCH_X86(float result);
+ ARCH_X86_ASM
+ (
+ __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") /* accumulator 0 = 0 */
+ __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") /* accumulator 1 = 0 */
+ /* x128 blocks: 8 loads of 16 floats per iteration, spread over four accumulators */
+ __ASM_EMIT("sub $128, %[count]") /* count = remaining - 128 */
+ __ASM_EMIT("jb 2f") /* borrow: fewer than 128 floats, skip the wide loop */
+ __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") /* accumulators 2 and 3 are only needed by this loop */
+ __ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vmovups 0x000(%[src]), %%zmm4")
+ __ASM_EMIT("vmovups 0x040(%[src]), %%zmm5")
+ __ASM_EMIT("vmovups 0x080(%[src]), %%zmm6")
+ __ASM_EMIT("vmovups 0x0c0(%[src]), %%zmm7")
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0") /* zmm0 += zmm4 * zmm4 */
+ __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm1")
+ __ASM_EMIT("vfmadd231ps %%zmm6, %%zmm6, %%zmm2")
+ __ASM_EMIT("vfmadd231ps %%zmm7, %%zmm7, %%zmm3")
+ __ASM_EMIT("vmovups 0x100(%[src]), %%zmm4")
+ __ASM_EMIT("vmovups 0x140(%[src]), %%zmm5")
+ __ASM_EMIT("vmovups 0x180(%[src]), %%zmm6")
+ __ASM_EMIT("vmovups 0x1c0(%[src]), %%zmm7")
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0")
+ __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm1")
+ __ASM_EMIT("vfmadd231ps %%zmm6, %%zmm6, %%zmm2")
+ __ASM_EMIT("vfmadd231ps %%zmm7, %%zmm7, %%zmm3")
+ __ASM_EMIT("add $0x200, %[src]") /* 128 floats = 0x200 bytes */
+ __ASM_EMIT("sub $128, %[count]")
+ __ASM_EMIT("jae 1b") /* no borrow: at least 128 more floats remain */
+ __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0") /* merge four accumulators back into two */
+ __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
+ __ASM_EMIT("2:")
+ /* x32 blocks: 2 loads of 16 floats per iteration */
+ __ASM_EMIT("add $96, %[count]") /* rebias: count = remaining - 32 */
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("3:")
+ __ASM_EMIT("vmovups 0x000(%[src]), %%zmm4")
+ __ASM_EMIT("vmovups 0x040(%[src]), %%zmm5")
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0")
+ __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm1")
+ __ASM_EMIT("add $0x80, %[src]")
+ __ASM_EMIT("sub $32, %[count]")
+ __ASM_EMIT("jge 3b")
+ __ASM_EMIT("4:")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2") /* fold the upper 256 bits of each accumulator onto the lower 256 bits */
+ __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+ __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ /* x16 block: 2 loads of 8 floats, executed at most once */
+ __ASM_EMIT("add $16, %[count]") /* rebias: count = remaining - 16 */
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("vmovups 0x000(%[src]), %%ymm4")
+ __ASM_EMIT("vmovups 0x020(%[src]), %%ymm5")
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm0")
+ __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm1")
+ __ASM_EMIT("add $0x40, %[src]")
+ __ASM_EMIT("sub $16, %[count]")
+ __ASM_EMIT("6:")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2") /* fold the upper 128 bits of each accumulator onto the lower 128 bits */
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ /* x8 block: 2 loads of 4 floats, executed at most once */
+ __ASM_EMIT("add $8, %[count]") /* rebias: count = remaining - 8 */
+ __ASM_EMIT("jl 8f")
+ __ASM_EMIT("vmovups 0x000(%[src]), %%xmm4") /* floats 0..3 of the block */
+ __ASM_EMIT("vmovups 0x010(%[src]), %%xmm5") /* floats 4..7: an xmm load covers 16 bytes, so the second half starts at 0x10 */
+ __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm0")
+ __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm1")
+ __ASM_EMIT("add $0x20, %[src]")
+ __ASM_EMIT("sub $8, %[count]")
+ __ASM_EMIT("8:")
+ __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0") /* merge the two accumulators into xmm0 */
+ /* x4 block: 1 load of 4 floats, executed at most once */
+ __ASM_EMIT("add $4, %[count]") /* rebias: count = remaining - 4 */
+ __ASM_EMIT("jl 10f")
+ __ASM_EMIT("vmovups 0x000(%[src]), %%xmm4")
+ __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm0")
+ __ASM_EMIT("add $0x10, %[src]")
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("10:")
+ /* x1 blocks: reduce the four lanes to one value, then add the last 0..3 squares one by one */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+ __ASM_EMIT("add $3, %[count]") /* rebias: count = remaining - 1; the flags set here are tested by the jl below */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* after two horizontal adds the low lane holds the sum of all four lanes */
+ __ASM_EMIT("jl 12f")
+ __ASM_EMIT("11:")
+ __ASM_EMIT("vmovss 0x000(%[src]), %%xmm4")
+ __ASM_EMIT("vfmadd231ss %%xmm4, %%xmm4, %%xmm0") /* low lane of xmm0 += x * x */
+ __ASM_EMIT("add $0x04, %[src]")
+ __ASM_EMIT("dec %[count]")
+ __ASM_EMIT("jge 11b")
+ __ASM_EMIT("12:")
+ /* end: src and count are read-write operands; the "Yz" constraint binds result to xmm0, which holds the total */
+ : [src] "+r" (src), [count] "+r" (count),
+ [res] "=Yz" (result)
+ :
+ : "cc", "memory",
+ "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+
+ return result;
+ }
+
+ } /* namespace avx512 */
+} /* namespace lsp */
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_ */
diff --git a/src/main/x86/avx512.cpp b/src/main/x86/avx512.cpp
index 830b7727..c9163f36 100644
--- a/src/main/x86/avx512.cpp
+++ b/src/main/x86/avx512.cpp
@@ -47,6 +47,7 @@
#include
#include
#include
+ #include <private/dsp/arch/x86/avx512/hmath.h>
#include
#include
#include
@@ -315,14 +316,14 @@
CEXPORT1(vl, mix2);
CEXPORT1(vl, mix_copy2);
CEXPORT1(vl, mix_add2);
-
CEXPORT1(vl, mix3);
CEXPORT1(vl, mix_copy3);
CEXPORT1(vl, mix_add3);
-
CEXPORT1(vl, mix4);
CEXPORT1(vl, mix_copy4);
CEXPORT1(vl, mix_add4);
+
+ CEXPORT1(vl, h_sum);
}
} /* namespace avx2 */
} /* namespace lsp */
diff --git a/src/test/ptest/hmath/h_sqr_sum.cpp b/src/test/ptest/hmath/h_sqr_sum.cpp
index deaa953b..11517a20 100644
--- a/src/test/ptest/hmath/h_sqr_sum.cpp
+++ b/src/test/ptest/hmath/h_sqr_sum.cpp
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
@@ -45,6 +45,11 @@ namespace lsp
float h_sqr_sum(const float *src, size_t count);
float h_sqr_sum_fma3(const float *src, size_t count);
}
+
+ namespace avx512
+ {
+ float h_sqr_sum(const float *src, size_t count);
+ }
)
IF_ARCH_ARM(
@@ -100,6 +105,7 @@ PTEST_BEGIN("dsp.hmath", h_sqr_sum, 5, 10000)
IF_ARCH_X86(CALL(sse::h_sqr_sum));
IF_ARCH_X86(CALL(avx::h_sqr_sum));
IF_ARCH_X86(CALL(avx::h_sqr_sum_fma3));
+ IF_ARCH_X86(CALL(avx512::h_sqr_sum));
IF_ARCH_ARM(CALL(neon_d32::h_sqr_sum));
IF_ARCH_AARCH64(CALL(asimd::h_sqr_sum));
PTEST_SEPARATOR;
diff --git a/src/test/ptest/hmath/h_sum.cpp b/src/test/ptest/hmath/h_sum.cpp
index d2af7149..1ced0f0f 100644
--- a/src/test/ptest/hmath/h_sum.cpp
+++ b/src/test/ptest/hmath/h_sum.cpp
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
@@ -44,6 +44,11 @@ namespace lsp
{
float h_sum(const float *src, size_t count);
}
+
+ namespace avx512
+ {
+ float h_sum(const float *src, size_t count);
+ }
)
IF_ARCH_ARM(
@@ -98,6 +103,7 @@ PTEST_BEGIN("dsp.hmath", h_sum, 5, 10000)
CALL(generic::h_sum);
IF_ARCH_X86(CALL(sse::h_sum));
IF_ARCH_X86(CALL(avx::h_sum));
+ IF_ARCH_X86(CALL(avx512::h_sum));
IF_ARCH_ARM(CALL(neon_d32::h_sum));
IF_ARCH_AARCH64(CALL(asimd::h_sum));
PTEST_SEPARATOR;
diff --git a/src/test/utest/hmath/h_sqr_sum.cpp b/src/test/utest/hmath/h_sqr_sum.cpp
index 7979bd21..a6a76b6e 100644
--- a/src/test/utest/hmath/h_sqr_sum.cpp
+++ b/src/test/utest/hmath/h_sqr_sum.cpp
@@ -50,6 +50,11 @@ namespace lsp
float h_sqr_sum(const float *src, size_t count);
float h_sqr_sum_fma3(const float *src, size_t count);
}
+
+ namespace avx512
+ {
+ float h_sqr_sum(const float *src, size_t count);
+ }
)
IF_ARCH_ARM(
@@ -114,6 +119,7 @@ UTEST_BEGIN("dsp.hmath", h_sqr_sum)
IF_ARCH_X86(CALL(generic::h_sqr_sum, sse::h_sqr_sum, 16));
IF_ARCH_X86(CALL(generic::h_sqr_sum, avx::h_sqr_sum, 32));
IF_ARCH_X86(CALL(generic::h_sqr_sum, avx::h_sqr_sum_fma3, 32));
+ IF_ARCH_X86(CALL(generic::h_sqr_sum, avx512::h_sqr_sum, 64));
IF_ARCH_ARM(CALL(generic::h_sqr_sum, neon_d32::h_sqr_sum, 16));
IF_ARCH_AARCH64(CALL(generic::h_sqr_sum, asimd::h_sqr_sum, 16));
}
diff --git a/src/test/utest/hmath/h_sum.cpp b/src/test/utest/hmath/h_sum.cpp
index 1d157aab..b5cb3a5d 100644
--- a/src/test/utest/hmath/h_sum.cpp
+++ b/src/test/utest/hmath/h_sum.cpp
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
@@ -49,6 +49,11 @@ namespace lsp
{
float h_sum(const float *src, size_t count);
}
+
+ namespace avx512
+ {
+ float h_sum(const float *src, size_t count);
+ }
)
IF_ARCH_ARM(
@@ -110,6 +115,7 @@ UTEST_BEGIN("dsp.hmath", h_sum)
IF_ARCH_X86(CALL(generic::h_sum, sse::h_sum, 16));
IF_ARCH_X86(CALL(generic::h_sum, avx::h_sum, 32));
+ IF_ARCH_X86(CALL(generic::h_sum, avx512::h_sum, 64));
IF_ARCH_ARM(CALL(generic::h_sum, neon_d32::h_sum, 16));
IF_ARCH_AARCH64(CALL(generic::h_sum, asimd::h_sum, 16));
}