diff --git a/include/private/dsp/arch/x86/avx512/hmath.h b/include/private/dsp/arch/x86/avx512/hmath.h
new file mode 100644
index 00000000..ced53654
--- /dev/null
+++ b/include/private/dsp/arch/x86/avx512/hmath.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 11 дек. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_
+#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
+    #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
+
+
+#include <private/dsp/arch/x86/avx512/hmath/hsum.h>
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_H_ */
diff --git a/include/private/dsp/arch/x86/avx512/hmath/hsum.h b/include/private/dsp/arch/x86/avx512/hmath/hsum.h
new file mode 100644
index 00000000..51f2574e
--- /dev/null
+++ b/include/private/dsp/arch/x86/avx512/hmath/hsum.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 11 дек. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_
+#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
+    #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
+
+namespace lsp
+{
+    namespace avx512
+    {
+
+        /**
+         * Calculate the sum of all elements of the array.
+         * Data is consumed in blocks of 128, 32, 16, 8 and 4 floats, the tail is
+         * processed element by element. Several independent accumulators are used,
+         * so the order of summation differs from a plain sequential loop.
+         *
+         * @param src pointer to the source array
+         * @param count number of elements in the source array
+         * @return sum of all elements, zero for empty array
+         */
+        float h_sum(const float *src, size_t count)
+        {
+            IF_ARCH_X86(float result);
+            ARCH_X86_ASM
+            (
+                __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
+                __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
+                /* x128 blocks */
+                __ASM_EMIT("sub $128, %[count]")
+                __ASM_EMIT("jb 2f")
+                __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2")
+                __ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
+                __ASM_EMIT("vaddps 0x080(%[src]), %%zmm2, %%zmm2")
+                __ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm3, %%zmm3")
+                __ASM_EMIT("vaddps 0x100(%[src]), %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps 0x140(%[src]), %%zmm1, %%zmm1")
+                __ASM_EMIT("vaddps 0x180(%[src]), %%zmm2, %%zmm2")
+                __ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm3, %%zmm3")
+                __ASM_EMIT("add $0x200, %[src]")
+                __ASM_EMIT("sub $128, %[count]")
+                __ASM_EMIT("jae 1b")
+                __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
+                __ASM_EMIT("2:")
+                /* x32 blocks */
+                __ASM_EMIT("add $96, %[count]")
+                __ASM_EMIT("jl 4f")
+                __ASM_EMIT("3:")
+                __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
+                __ASM_EMIT("add $0x80, %[src]")
+                __ASM_EMIT("sub $32, %[count]")
+                __ASM_EMIT("jge 3b")
+                __ASM_EMIT("4:")
+                __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+                __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
+                __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+                __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+                /* x16 block */
+                __ASM_EMIT("add $16, %[count]")
+                __ASM_EMIT("jl 6f")
+                __ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0")
+                __ASM_EMIT("vaddps 0x020(%[src]), %%ymm1, %%ymm1")
+                __ASM_EMIT("add $0x40, %[src]")
+                __ASM_EMIT("sub $16, %[count]")
+                __ASM_EMIT("6:")
+                __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+                __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
+                __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
+                __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+                /* x8 block */
+                __ASM_EMIT("add $8, %[count]")
+                __ASM_EMIT("jl 8f")
+                __ASM_EMIT("vaddps 0x000(%[src]), %%xmm0, %%xmm0")
+                __ASM_EMIT("vaddps 0x010(%[src]), %%xmm1, %%xmm1")
+                __ASM_EMIT("add $0x20, %[src]")
+                __ASM_EMIT("sub $8, %[count]")
+                __ASM_EMIT("8:")
+                __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+                /* x4 block */
+                __ASM_EMIT("add $4, %[count]")
+                __ASM_EMIT("jl 10f")
+                __ASM_EMIT("vaddps 0x00(%[src]), %%xmm0, %%xmm0")
+                __ASM_EMIT("add $0x10, %[src]")
+                __ASM_EMIT("sub $4, %[count]")
+                __ASM_EMIT("10:")
+                /* x1 blocks */
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("add $3, %[count]")
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("jl 12f")
+                __ASM_EMIT("11:")
+                __ASM_EMIT("vaddss 0x00(%[src]), %%xmm0, %%xmm0")
+                __ASM_EMIT("add $0x04, %[src]")
+                __ASM_EMIT("dec %[count]")
+                __ASM_EMIT("jge 11b")
+                __ASM_EMIT("12:")
+                /* end */
+                : [src] "+r" (src), [count] "+r" (count),
+                  [res] "=Yz" (result)
+                :
+                : "cc", "memory",
+                  "%xmm1", "%xmm2", "%xmm3"
+            );
+
+            return result;
+        }
+
+        /**
+         * Calculate the sum of squares of all elements of the array.
+         * The block layout is the same as for h_sum, each loaded vector is squared
+         * and accumulated by a fused multiply-add instruction.
+         *
+         * @param src pointer to the source array
+         * @param count number of elements in the source array
+         * @return sum of squared elements, zero for empty array
+         */
+        float h_sqr_sum(const float *src, size_t count)
+        {
+            IF_ARCH_X86(float result);
+            ARCH_X86_ASM
+            (
+                __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
+                __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
+                /* x128 blocks */
+                __ASM_EMIT("sub $128, %[count]")
+                __ASM_EMIT("jb 2f")
+                __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2")
+                __ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vmovups 0x000(%[src]), %%zmm4")
+                __ASM_EMIT("vmovups 0x040(%[src]), %%zmm5")
+                __ASM_EMIT("vmovups 0x080(%[src]), %%zmm6")
+                __ASM_EMIT("vmovups 0x0c0(%[src]), %%zmm7")
+                __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0")
+                __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm1")
+                __ASM_EMIT("vfmadd231ps %%zmm6, %%zmm6, %%zmm2")
+                __ASM_EMIT("vfmadd231ps %%zmm7, %%zmm7, %%zmm3")
+                __ASM_EMIT("vmovups 0x100(%[src]), %%zmm4")
+                __ASM_EMIT("vmovups 0x140(%[src]), %%zmm5")
+                __ASM_EMIT("vmovups 0x180(%[src]), %%zmm6")
+                __ASM_EMIT("vmovups 0x1c0(%[src]), %%zmm7")
+                __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0")
+                __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm1")
+                __ASM_EMIT("vfmadd231ps %%zmm6, %%zmm6, %%zmm2")
+                __ASM_EMIT("vfmadd231ps %%zmm7, %%zmm7, %%zmm3")
+                __ASM_EMIT("add $0x200, %[src]")
+                __ASM_EMIT("sub $128, %[count]")
+                __ASM_EMIT("jae 1b")
+                __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
+                __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
+                __ASM_EMIT("2:")
+                /* x32 blocks */
+                __ASM_EMIT("add $96, %[count]")
+                __ASM_EMIT("jl 4f")
+                __ASM_EMIT("3:")
+                __ASM_EMIT("vmovups 0x000(%[src]), %%zmm4")
+                __ASM_EMIT("vmovups 0x040(%[src]), %%zmm5")
+                __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0")
+                __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm1")
+                __ASM_EMIT("add $0x80, %[src]")
+                __ASM_EMIT("sub $32, %[count]")
+                __ASM_EMIT("jge 3b")
+                __ASM_EMIT("4:")
+                __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+                __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
+                __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
+                __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+                /* x16 block */
+                __ASM_EMIT("add $16, %[count]")
+                __ASM_EMIT("jl 6f")
+                __ASM_EMIT("vmovups 0x000(%[src]), %%ymm4")
+                __ASM_EMIT("vmovups 0x020(%[src]), %%ymm5")
+                __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm0")
+                __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm1")
+                __ASM_EMIT("add $0x40, %[src]")
+                __ASM_EMIT("sub $16, %[count]")
+                __ASM_EMIT("6:")
+                __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+                __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
+                __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
+                __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+                /* x8 block: two xmm loads of 4 floats each, 16 bytes apart */
+                __ASM_EMIT("add $8, %[count]")
+                __ASM_EMIT("jl 8f")
+                __ASM_EMIT("vmovups 0x000(%[src]), %%xmm4")
+                __ASM_EMIT("vmovups 0x010(%[src]), %%xmm5")
+                __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm0")
+                __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm1")
+                __ASM_EMIT("add $0x20, %[src]")
+                __ASM_EMIT("sub $8, %[count]")
+                __ASM_EMIT("8:")
+                __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+                /* x4 block */
+                __ASM_EMIT("add $4, %[count]")
+                __ASM_EMIT("jl 10f")
+                __ASM_EMIT("vmovups 0x000(%[src]), %%xmm4")
+                __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm0")
+                __ASM_EMIT("add $0x10, %[src]")
+                __ASM_EMIT("sub $4, %[count]")
+                __ASM_EMIT("10:")
+                /* x1 blocks */
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("add $3, %[count]")
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0")
+                __ASM_EMIT("jl 12f")
+                __ASM_EMIT("11:")
+                __ASM_EMIT("vmovss 0x000(%[src]), %%xmm4")
+                __ASM_EMIT("vfmadd231ss %%xmm4, %%xmm4, %%xmm0")
+                __ASM_EMIT("add $0x04, %[src]")
+                __ASM_EMIT("dec %[count]")
+                __ASM_EMIT("jge 11b")
+                __ASM_EMIT("12:")
+                /* end */
+                : [src] "+r" (src), [count] "+r" (count),
+                  [res] "=Yz" (result)
+                :
+                : "cc", "memory",
+                  "%xmm1", "%xmm2", "%xmm3",
+                  "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+            );
+
+            return result;
+        }
+
+    } /* namespace avx512 */
+} /* namespace lsp */
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_ */
diff --git a/src/main/x86/avx512.cpp b/src/main/x86/avx512.cpp
index 830b7727..c9163f36 100644
--- a/src/main/x86/avx512.cpp
+++ b/src/main/x86/avx512.cpp
@@ -47,6 +47,7 @@
     #include
     #include
     #include
+    #include <private/dsp/arch/x86/avx512/hmath.h>
     #include
     #include
     #include
@@ -315,14 +316,15 @@
             CEXPORT1(vl, mix2);
             CEXPORT1(vl, mix_copy2);
             CEXPORT1(vl, mix_add2);
-
             CEXPORT1(vl, mix3);
             CEXPORT1(vl, mix_copy3);
             CEXPORT1(vl, mix_add3);
-
             CEXPORT1(vl, mix4);
             CEXPORT1(vl, mix_copy4);
             CEXPORT1(vl, mix_add4);
+
+            CEXPORT1(vl, h_sum);
+            CEXPORT1(vl, h_sqr_sum);
         }
     } /* namespace avx2 */
 } /* namespace lsp */
diff --git a/src/test/ptest/hmath/h_sqr_sum.cpp b/src/test/ptest/hmath/h_sqr_sum.cpp
index deaa953b..11517a20 100644
--- a/src/test/ptest/hmath/h_sqr_sum.cpp
+++ b/src/test/ptest/hmath/h_sqr_sum.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2023 Linux Studio Plugins Project
- *           (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
  *
  * This file is part of lsp-dsp-lib
  * Created on: 31 мар. 2020 г.
@@ -45,6 +45,11 @@ namespace lsp
             float h_sqr_sum(const float *src, size_t count);
             float h_sqr_sum_fma3(const float *src, size_t count);
         }
+
+        namespace avx512
+        {
+            float h_sqr_sum(const float *src, size_t count);
+        }
     )
 
     IF_ARCH_ARM(
@@ -100,6 +105,7 @@ PTEST_BEGIN("dsp.hmath", h_sqr_sum, 5, 10000)
         IF_ARCH_X86(CALL(sse::h_sqr_sum));
         IF_ARCH_X86(CALL(avx::h_sqr_sum));
         IF_ARCH_X86(CALL(avx::h_sqr_sum_fma3));
+        IF_ARCH_X86(CALL(avx512::h_sqr_sum));
         IF_ARCH_ARM(CALL(neon_d32::h_sqr_sum));
         IF_ARCH_AARCH64(CALL(asimd::h_sqr_sum));
         PTEST_SEPARATOR;
diff --git a/src/test/ptest/hmath/h_sum.cpp b/src/test/ptest/hmath/h_sum.cpp
index d2af7149..1ced0f0f 100644
--- a/src/test/ptest/hmath/h_sum.cpp
+++ b/src/test/ptest/hmath/h_sum.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2023 Linux Studio Plugins Project
- *           (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
  *
  * This file is part of lsp-dsp-lib
  * Created on: 31 мар. 2020 г.
@@ -44,6 +44,11 @@ namespace lsp
         {
             float h_sum(const float *src, size_t count);
         }
+
+        namespace avx512
+        {
+            float h_sum(const float *src, size_t count);
+        }
     )
 
     IF_ARCH_ARM(
@@ -98,6 +103,7 @@ PTEST_BEGIN("dsp.hmath", h_sum, 5, 10000)
         CALL(generic::h_sum);
         IF_ARCH_X86(CALL(sse::h_sum));
         IF_ARCH_X86(CALL(avx::h_sum));
+        IF_ARCH_X86(CALL(avx512::h_sum));
         IF_ARCH_ARM(CALL(neon_d32::h_sum));
         IF_ARCH_AARCH64(CALL(asimd::h_sum));
         PTEST_SEPARATOR;
diff --git a/src/test/utest/hmath/h_sqr_sum.cpp b/src/test/utest/hmath/h_sqr_sum.cpp
index 7979bd21..a6a76b6e 100644
--- a/src/test/utest/hmath/h_sqr_sum.cpp
+++ b/src/test/utest/hmath/h_sqr_sum.cpp
@@ -50,6 +50,11 @@ namespace lsp
             float h_sqr_sum(const float *src, size_t count);
             float h_sqr_sum_fma3(const float *src, size_t count);
         }
+
+        namespace avx512
+        {
+            float h_sqr_sum(const float *src, size_t count);
+        }
     )
 
     IF_ARCH_ARM(
@@ -114,6 +119,7 @@ UTEST_BEGIN("dsp.hmath", h_sqr_sum)
         IF_ARCH_X86(CALL(generic::h_sqr_sum, sse::h_sqr_sum, 16));
         IF_ARCH_X86(CALL(generic::h_sqr_sum, avx::h_sqr_sum, 32));
         IF_ARCH_X86(CALL(generic::h_sqr_sum, avx::h_sqr_sum_fma3, 32));
+        IF_ARCH_X86(CALL(generic::h_sqr_sum, avx512::h_sqr_sum, 64));
         IF_ARCH_ARM(CALL(generic::h_sqr_sum, neon_d32::h_sqr_sum, 16));
         IF_ARCH_AARCH64(CALL(generic::h_sqr_sum, asimd::h_sqr_sum, 16));
     }
diff --git a/src/test/utest/hmath/h_sum.cpp b/src/test/utest/hmath/h_sum.cpp
index 1d157aab..b5cb3a5d 100644
--- a/src/test/utest/hmath/h_sum.cpp
+++ b/src/test/utest/hmath/h_sum.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2023 Linux Studio Plugins Project
- *           (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
  *
  * This file is part of lsp-dsp-lib
  * Created on: 31 мар. 2020 г.
@@ -49,6 +49,11 @@ namespace lsp
         {
             float h_sum(const float *src, size_t count);
         }
+
+        namespace avx512
+        {
+            float h_sum(const float *src, size_t count);
+        }
     )
 
     IF_ARCH_ARM(
@@ -110,6 +115,7 @@ UTEST_BEGIN("dsp.hmath", h_sum)
 
         IF_ARCH_X86(CALL(generic::h_sum, sse::h_sum, 16));
         IF_ARCH_X86(CALL(generic::h_sum, avx::h_sum, 32));
+        IF_ARCH_X86(CALL(generic::h_sum, avx512::h_sum, 64));
         IF_ARCH_ARM(CALL(generic::h_sum, neon_d32::h_sum, 16));
         IF_ARCH_AARCH64(CALL(generic::h_sum, asimd::h_sum, 16));
     }