|
| 1 | +/* |
| 2 | + * Copyright (C) 2024 Linux Studio Plugins Project <https://lsp-plug.in/> |
| 3 | + * (C) 2024 Vladimir Sadovnikov <sadko4u@gmail.com> |
| 4 | + * |
| 5 | + * This file is part of lsp-dsp-lib |
| 6 | + * Created on: 11 Dec 2024 |
| 7 | + * |
| 8 | + * lsp-dsp-lib is free software: you can redistribute it and/or modify |
| 9 | + * it under the terms of the GNU Lesser General Public License as published by |
| 10 | + * the Free Software Foundation, either version 3 of the License, or |
| 11 | + * any later version. |
| 12 | + * |
| 13 | + * lsp-dsp-lib is distributed in the hope that it will be useful, |
| 14 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 16 | + * GNU Lesser General Public License for more details. |
| 17 | + * |
| 18 | + * You should have received a copy of the GNU Lesser General Public License |
| 19 | + * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>. |
| 20 | + */ |
| 21 | + |
| 22 | +#ifndef PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_ |
| 23 | +#define PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_ |
| 24 | + |
| 25 | +#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL |
| 26 | + #error "This header should not be included directly" |
| 27 | +#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */ |
| 28 | + |
| 29 | +namespace lsp |
| 30 | +{ |
| 31 | + namespace avx512 |
| 32 | + { |
| 33 | + |
| 34 | + float h_sum(const float *src, size_t count) |
| 35 | + { |
| 36 | + IF_ARCH_X86(float result); |
| 37 | + ARCH_X86_ASM |
| 38 | + ( |
| 39 | + __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") |
| 40 | + __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") |
| 41 | + /* x128 blocks */ |
| 42 | + __ASM_EMIT("sub $128, %[count]") |
| 43 | + __ASM_EMIT("jb 2f") |
| 44 | + __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") |
| 45 | + __ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3") |
| 46 | + __ASM_EMIT("1:") |
| 47 | + __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0") |
| 48 | + __ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1") |
| 49 | + __ASM_EMIT("vaddps 0x080(%[src]), %%zmm2, %%zmm2") |
| 50 | + __ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm3, %%zmm3") |
| 51 | + __ASM_EMIT("vaddps 0x100(%[src]), %%zmm0, %%zmm0") |
| 52 | + __ASM_EMIT("vaddps 0x140(%[src]), %%zmm1, %%zmm1") |
| 53 | + __ASM_EMIT("vaddps 0x180(%[src]), %%zmm2, %%zmm2") |
| 54 | + __ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm3, %%zmm3") |
| 55 | + __ASM_EMIT("add $0x200, %[src]") |
| 56 | + __ASM_EMIT("sub $128, %[count]") |
| 57 | + __ASM_EMIT("jae 1b") |
| 58 | + __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0") |
| 59 | + __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1") |
| 60 | + __ASM_EMIT("2:") |
| 61 | + /* x32 blocks */ |
| 62 | + __ASM_EMIT("add $64, %[count]") |
| 63 | + __ASM_EMIT("jl 4f") |
| 64 | + __ASM_EMIT("3:") |
| 65 | + __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0") |
| 66 | + __ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1") |
| 67 | + __ASM_EMIT("add $0x80, %[src]") |
| 68 | + __ASM_EMIT("sub $32, %[count]") |
| 69 | + __ASM_EMIT("jge 3b") |
| 70 | + __ASM_EMIT("4:") |
| 71 | + __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2") |
| 72 | + __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3") |
| 73 | + __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0") |
| 74 | + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") |
| 75 | + /* x16 block */ |
| 76 | + __ASM_EMIT("add $16, %[count]") |
| 77 | + __ASM_EMIT("jl 6f") |
| 78 | + __ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0") |
| 79 | + __ASM_EMIT("vaddps 0x020(%[src]), %%ymm1, %%ymm1") |
| 80 | + __ASM_EMIT("add $0x40, %[src]") |
| 81 | + __ASM_EMIT("sub $16, %[count]") |
| 82 | + __ASM_EMIT("6:") |
| 83 | + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2") |
| 84 | + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3") |
| 85 | + __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") |
| 86 | + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") |
| 87 | + /* x8 block */ |
| 88 | + __ASM_EMIT("add $8, %[count]") |
| 89 | + __ASM_EMIT("jl 8f") |
| 90 | + __ASM_EMIT("vaddps 0x000(%[src]), %%xmm0, %%xmm0") |
| 91 | + __ASM_EMIT("vaddps 0x010(%[src]), %%xmm1, %%xmm1") |
| 92 | + __ASM_EMIT("add $0x20, %[src]") |
| 93 | + __ASM_EMIT("sub $8, %[count]") |
| 94 | + __ASM_EMIT("8:") |
| 95 | + __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0") |
| 96 | + /* x4 block */ |
| 97 | + __ASM_EMIT("add $4, %[count]") |
| 98 | + __ASM_EMIT("jl 10f") |
| 99 | + __ASM_EMIT("vaddps 0x00(%[src]), %%xmm0, %%xmm0") |
| 100 | + __ASM_EMIT("add $0x10, %[src]") |
| 101 | + __ASM_EMIT("sub $4, %[count]") |
| 102 | + __ASM_EMIT("10:") |
| 103 | + /* x1 blocks */ |
| 104 | + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") |
| 105 | + __ASM_EMIT("add $3, %[count]") |
| 106 | + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") |
| 107 | + __ASM_EMIT("jl 12f") |
| 108 | + __ASM_EMIT("11:") |
| 109 | + __ASM_EMIT("vaddss 0x00(%[src]), %%xmm0, %%xmm0") |
| 110 | + __ASM_EMIT("add $0x04, %[src]") |
| 111 | + __ASM_EMIT("dec %[count]") |
| 112 | + __ASM_EMIT("jge 11b") |
| 113 | + __ASM_EMIT("12:") |
| 114 | + /* end */ |
| 115 | + : [src] "+r" (src), [count] "+r" (count), |
| 116 | + [res] "=Yz" (result) |
| 117 | + : |
| 118 | + : "cc", "memory", |
| 119 | + "%xmm1", "%xmm2", "%xmm3" |
| 120 | + ); |
| 121 | + |
| 122 | + return result; |
| 123 | + } |
| 124 | + |
| 125 | + } /* namespace avx512 */ |
| 126 | +} /* namespace lsp */ |
| 127 | + |
| 128 | + |
| 129 | +#endif /* PRIVATE_DSP_ARCH_X86_AVX512_HMATH_HSUM_H_ */ |
0 commit comments