@@ -73,22 +73,18 @@ namespace lsp
__ASM_EMIT("add $0x80, %[off]")
__ASM_EMIT("sub $32, %[count]")
__ASM_EMIT("4:")
- __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
- __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
/* x16 block */
__ASM_EMIT("add $16, %[count]")
__ASM_EMIT("jl 6f")
- __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm2")
- __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm3")
- __ASM_EMIT("vmulps 0x00(%[b], %[off]), %%ymm2, %%ymm2")
- __ASM_EMIT("vmulps 0x20(%[b], %[off]), %%ymm3, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm2")
+ __ASM_EMIT("vmulps 0x00(%[b], %[off]), %%zmm2, %%zmm2")
+ __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
__ASM_EMIT("add $0x40, %[off]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("6:")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("vaddps %%ymm1, %%ymm0, %%ymm0")
@@ -99,10 +95,10 @@ namespace lsp
__ASM_EMIT("add $0x20, %[off]")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("8:")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
/* x4 block */
- __ASM_EMIT("vextractf128 $0x01, %%ymm0, %%xmm1")
__ASM_EMIT("add $4, %[count]")
- __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
__ASM_EMIT("jl 10f")
__ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm2")
__ASM_EMIT("vmulps 0x00(%[b], %[off]), %%xmm2, %%xmm2")
@@ -164,47 +160,33 @@ namespace lsp
__ASM_EMIT("sub $32, %[count]")
__ASM_EMIT("jae 1b")
__ASM_EMIT("2:")
- __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
- __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
/* x16 block */
__ASM_EMIT("add $16, %[count]")
__ASM_EMIT("jl 4f")
- __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm2")
- __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm3")
- __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm4")
- __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm5")
- __ASM_EMIT("vmulps %%ymm2, %%ymm2, %%ymm2")
- __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3")
- __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4")
- __ASM_EMIT("vmulps %%ymm5, %%ymm5, %%ymm5")
- __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm2, %%ymm0")
- __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm3, %%ymm1")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm2")
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%zmm4")
+ __ASM_EMIT("vmulps %%zmm2, %%zmm2, %%zmm2")
+ __ASM_EMIT("vmulps %%zmm4, %%zmm4, %%zmm4")
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm2, %%zmm0")
__ASM_EMIT("add $0x40, %[off]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("4:")
- __ASM_EMIT("vextractf128 $0x01, %%ymm0, %%xmm2")
- __ASM_EMIT("vextractf128 $0x01, %%ymm1, %%xmm3")
- __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
- __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("jl 6f")
- __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm2")
- __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm3")
- __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm4")
- __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm5")
- __ASM_EMIT("vmulps %%xmm2, %%xmm2, %%xmm2")
- __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3")
- __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4")
- __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5")
- __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm2, %%xmm0")
- __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm3, %%xmm1")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm2")
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm4")
+ __ASM_EMIT("vmulps %%ymm2, %%ymm2, %%ymm2")
+ __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4")
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm2, %%ymm0")
__ASM_EMIT("add $0x20, %[off]")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("6:")
- __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
/* x4 block */
__ASM_EMIT("add $4, %[count]")
__ASM_EMIT("jl 8f")
@@ -277,39 +259,29 @@ namespace lsp
__ASM_EMIT("sub $32, %[count]")
__ASM_EMIT("jae 1b")
__ASM_EMIT("2:")
- __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
- __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
/* x16 block */
__ASM_EMIT("add $16, %[count]")
__ASM_EMIT("jl 4f")
- __ASM_EMIT("vandps 0x00(%[a], %[off]), %%ymm6, %%ymm2")
- __ASM_EMIT("vandps 0x20(%[a], %[off]), %%ymm7, %%ymm3")
- __ASM_EMIT("vandps 0x00(%[b], %[off]), %%ymm6, %%ymm4")
- __ASM_EMIT("vandps 0x20(%[b], %[off]), %%ymm7, %%ymm5")
- __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm2, %%ymm0")
- __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm3, %%ymm1")
+ __ASM_EMIT("vandps 0x00(%[a], %[off]), %%zmm6, %%zmm2")
+ __ASM_EMIT("vandps 0x00(%[b], %[off]), %%zmm6, %%zmm4")
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm2, %%zmm0")
__ASM_EMIT("add $0x40, %[off]")
__ASM_EMIT("sub $16, %[count]")
__ASM_EMIT("4:")
- __ASM_EMIT("vextractf128 $0x01, %%ymm0, %%xmm2")
- __ASM_EMIT("vextractf128 $0x01, %%ymm1, %%xmm3")
- __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
- __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
/* x8 block */
__ASM_EMIT("add $8, %[count]")
__ASM_EMIT("jl 6f")
- __ASM_EMIT("vandps 0x00(%[a], %[off]), %%xmm6, %%xmm2")
- __ASM_EMIT("vandps 0x10(%[a], %[off]), %%xmm7, %%xmm3")
- __ASM_EMIT("vandps 0x00(%[b], %[off]), %%xmm6, %%xmm4")
- __ASM_EMIT("vandps 0x10(%[b], %[off]), %%xmm7, %%xmm5")
- __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm2, %%xmm0")
- __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm3, %%xmm1")
+ __ASM_EMIT("vandps 0x00(%[a], %[off]), %%ymm6, %%ymm2")
+ __ASM_EMIT("vandps 0x00(%[b], %[off]), %%ymm6, %%ymm4")
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm2, %%ymm0")
__ASM_EMIT("add $0x20, %[off]")
__ASM_EMIT("sub $8, %[count]")
__ASM_EMIT("6:")
- __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
/* x4 block */
__ASM_EMIT("add $4, %[count]")
__ASM_EMIT("jl 8f")