@@ -41,22 +41,18 @@ namespace lsp
  /* x128 blocks */
  __ASM_EMIT("sub $128, %[count]")
  __ASM_EMIT("jb 2f")
- __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2")
- __ASM_EMIT("vxorps %%zmm3, %%zmm3, %%zmm3")
  __ASM_EMIT("1:")
  __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
  __ASM_EMIT("vaddps 0x040(%[src]), %%zmm1, %%zmm1")
- __ASM_EMIT("vaddps 0x080(%[src]), %%zmm2, %%zmm2")
- __ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm3, %%zmm3")
+ __ASM_EMIT("vaddps 0x080(%[src]), %%zmm0, %%zmm0")
+ __ASM_EMIT("vaddps 0x0c0(%[src]), %%zmm1, %%zmm1")
  __ASM_EMIT("vaddps 0x100(%[src]), %%zmm0, %%zmm0")
  __ASM_EMIT("vaddps 0x140(%[src]), %%zmm1, %%zmm1")
- __ASM_EMIT("vaddps 0x180(%[src]), %%zmm2, %%zmm2")
- __ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm3, %%zmm3")
+ __ASM_EMIT("vaddps 0x180(%[src]), %%zmm0, %%zmm0")
+ __ASM_EMIT("vaddps 0x1c0(%[src]), %%zmm1, %%zmm1")
  __ASM_EMIT("add $0x200, %[src]")
  __ASM_EMIT("sub $128, %[count]")
  __ASM_EMIT("jae 1b")
- __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
- __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1")
  __ASM_EMIT("2:")
  /* x32 blocks */
  __ASM_EMIT("add $96, %[count]")
@@ -68,31 +64,25 @@ namespace lsp
  __ASM_EMIT("sub $32, %[count]")
  __ASM_EMIT("jge 3b")
  __ASM_EMIT("4:")
- __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
- __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
  /* x16 block */
  __ASM_EMIT("add $16, %[count]")
  __ASM_EMIT("jl 6f")
- __ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps 0x020(%[src]), %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps 0x000(%[src]), %%zmm0, %%zmm0")
  __ASM_EMIT("add $0x40, %[src]")
  __ASM_EMIT("sub $16, %[count]")
  __ASM_EMIT("6:")
- __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
- __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
- __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
- __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
  /* x8 block */
  __ASM_EMIT("add $8, %[count]")
  __ASM_EMIT("jl 8f")
- __ASM_EMIT("vaddps 0x000(%[src]), %%xmm0, %%xmm0")
- __ASM_EMIT("vaddps 0x010(%[src]), %%xmm1, %%xmm1")
+ __ASM_EMIT("vaddps 0x000(%[src]), %%ymm0, %%ymm0")
  __ASM_EMIT("add $0x20, %[src]")
  __ASM_EMIT("sub $8, %[count]")
  __ASM_EMIT("8:")
- __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
  /* x4 block */
  __ASM_EMIT("add $4, %[count]")
  __ASM_EMIT("jl 10f")
@@ -116,7 +106,7 @@ namespace lsp
  [res] "=Yz" (result)
  :
  : "cc", "memory",
- "%xmm1", "%xmm2", "%xmm3"
+ "%xmm1"
  );

  return result;
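For reference, the hunks above reshape the plain horizontal-sum kernel: the main loops keep only the two accumulators zmm0/zmm1, and the tail folds zmm into ymm, then into xmm, before the final scalar steps. Below is a rough C++ intrinsics sketch of that flow, not the project's code; the name h_sum_sketch, the simplified scalar tail, and the use of _mm512_reduce_add_ps for the final fold are illustrative assumptions.

#include <immintrin.h>
#include <cstddef>

// Illustrative sketch only (not lsp-plugins code): the accumulation and
// reduction pattern the patched assembly follows for the plain sum.
static float h_sum_sketch(const float *src, size_t count)
{
    __m512 acc0 = _mm512_setzero_ps();
    __m512 acc1 = _mm512_setzero_ps();

    // x32 blocks: two independent zmm accumulators, as in the asm loop
    for (; count >= 32; count -= 32, src += 32)
    {
        acc0 = _mm512_add_ps(acc0, _mm512_loadu_ps(src + 0));
        acc1 = _mm512_add_ps(acc1, _mm512_loadu_ps(src + 16));
    }
    acc0 = _mm512_add_ps(acc0, acc1);   // "vaddps %zmm1, %zmm0, %zmm0"

    // x16 tail stays at full zmm width after the patch
    if (count >= 16)
    {
        acc0   = _mm512_add_ps(acc0, _mm512_loadu_ps(src));
        src   += 16;
        count -= 16;
    }

    // zmm -> scalar fold; the asm does this via vextractf64x4/vextractf128
    float result = _mm512_reduce_add_ps(acc0);
    for (; count > 0; --count)          // remaining x8/x4/x1 tails, simplified
        result += *(src++);
    return result;
}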
@@ -169,35 +159,27 @@ namespace lsp
  __ASM_EMIT("sub $32, %[count]")
  __ASM_EMIT("jge 3b")
  __ASM_EMIT("4:")
- __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
- __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
  /* x16 block */
  __ASM_EMIT("add $16, %[count]")
  __ASM_EMIT("jl 6f")
- __ASM_EMIT("vmovups 0x000(%[src]), %%ymm4")
- __ASM_EMIT("vmovups 0x020(%[src]), %%ymm5")
- __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm0")
- __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm1")
+ __ASM_EMIT("vmovups 0x000(%[src]), %%zmm4")
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm0")
  __ASM_EMIT("add $0x40, %[src]")
  __ASM_EMIT("sub $16, %[count]")
  __ASM_EMIT("6:")
- __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
- __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
- __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
- __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
  /* x8 block */
  __ASM_EMIT("add $8, %[count]")
  __ASM_EMIT("jl 8f")
- __ASM_EMIT("vmovups 0x000(%[src]), %%xmm4")
- __ASM_EMIT("vmovups 0x010(%[src]), %%xmm5")
- __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm0")
- __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm1")
+ __ASM_EMIT("vmovups 0x000(%[src]), %%ymm4")
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm0")
  __ASM_EMIT("add $0x20, %[src]")
  __ASM_EMIT("sub $8, %[count]")
  __ASM_EMIT("8:")
- __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
  /* x4 block */
  __ASM_EMIT("add $4, %[count]")
  __ASM_EMIT("jl 10f")
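The same restructuring is applied to the sum-of-squares kernel: the x16 tail now issues a single full-width vmovups/vfmadd231ps pair instead of two ymm halves, and the accumulator fold happens once at zmm width. A hedged intrinsics sketch of that shape follows; the function name, the simplified scalar tail, and the _mm512_reduce_add_ps fold are assumptions for illustration, not the patch itself.

#include <immintrin.h>
#include <cstddef>

// Illustrative sketch only: sum of squares with FMA accumulation,
// mirroring the x32 loop, the single x16 zmm tail, and the final fold.
static float h_sqr_sum_sketch(const float *src, size_t count)
{
    __m512 acc0 = _mm512_setzero_ps();
    __m512 acc1 = _mm512_setzero_ps();

    for (; count >= 32; count -= 32, src += 32)
    {
        __m512 a = _mm512_loadu_ps(src + 0);
        __m512 b = _mm512_loadu_ps(src + 16);
        acc0 = _mm512_fmadd_ps(a, a, acc0);     // acc0 += a * a  (vfmadd231ps)
        acc1 = _mm512_fmadd_ps(b, b, acc1);
    }
    acc0 = _mm512_add_ps(acc0, acc1);           // fold the two accumulators once

    if (count >= 16)                            // x16 tail, full zmm width
    {
        __m512 a = _mm512_loadu_ps(src);
        acc0   = _mm512_fmadd_ps(a, a, acc0);
        src   += 16;
        count -= 16;
    }

    float result = _mm512_reduce_add_ps(acc0);  // zmm -> ymm -> xmm -> scalar
    for (; count > 0; --count, ++src)           // scalar leftovers, simplified
        result += *src * *src;
    return result;
}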
@@ -244,8 +226,8 @@ namespace lsp
  (
  __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0")
  __ASM_EMIT("vmovaps %[CC], %%zmm6")
- __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
  __ASM_EMIT("vmovaps %%zmm6, %%zmm7")
+ __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1")
  /* x128 blocks */
  __ASM_EMIT("sub $128, %[count]")
  __ASM_EMIT("jb 2f")
@@ -282,35 +264,27 @@ namespace lsp
  __ASM_EMIT("sub $32, %[count]")
  __ASM_EMIT("jge 3b")
  __ASM_EMIT("4:")
- __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
- __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps %%zmm1, %%zmm0, %%zmm0")
  /* x16 block */
  __ASM_EMIT("add $16, %[count]")
  __ASM_EMIT("jl 6f")
- __ASM_EMIT("vandps 0x000(%[src]), %%ymm6, %%ymm2")
- __ASM_EMIT("vandps 0x020(%[src]), %%ymm7, %%ymm3")
- __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
- __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1")
+ __ASM_EMIT("vandps 0x000(%[src]), %%zmm6, %%zmm2")
+ __ASM_EMIT("vaddps %%zmm2, %%zmm0, %%zmm0")
  __ASM_EMIT("add $0x40, %[src]")
  __ASM_EMIT("sub $16, %[count]")
  __ASM_EMIT("6:")
- __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
- __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm3")
- __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
- __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm2")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
  /* x8 block */
  __ASM_EMIT("add $8, %[count]")
  __ASM_EMIT("jl 8f")
- __ASM_EMIT("vandps 0x000(%[src]), %%xmm6, %%xmm2")
- __ASM_EMIT("vandps 0x010(%[src]), %%xmm7, %%xmm3")
- __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
- __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1")
+ __ASM_EMIT("vandps 0x000(%[src]), %%ymm6, %%ymm2")
+ __ASM_EMIT("vaddps %%ymm2, %%ymm0, %%ymm0")
  __ASM_EMIT("add $0x20, %[src]")
  __ASM_EMIT("sub $8, %[count]")
  __ASM_EMIT("8:")
- __ASM_EMIT("vaddps %%xmm1, %%xmm0, %%xmm0")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm2")
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0")
  /* x4 block */
  __ASM_EMIT("add $4, %[count]")
  __ASM_EMIT("jl 10f")
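For the absolute-value sum, the [CC] constant loaded into zmm6/zmm7 acts as a sign-bit mask, and the patch again widens the x16 and x8 tails to a single vandps + vaddps per step. A hedged intrinsics sketch of the same idea, with _mm512_abs_ps standing in for the explicit mask and all names chosen only for illustration:

#include <immintrin.h>
#include <cstddef>
#include <cmath>

// Illustrative sketch only: absolute-value sum; _mm512_abs_ps stands in for
// the "vandps mask, src" that the assembly uses to clear the sign bit.
static float h_abs_sum_sketch(const float *src, size_t count)
{
    __m512 acc0 = _mm512_setzero_ps();
    __m512 acc1 = _mm512_setzero_ps();

    for (; count >= 32; count -= 32, src += 32)
    {
        acc0 = _mm512_add_ps(acc0, _mm512_abs_ps(_mm512_loadu_ps(src + 0)));
        acc1 = _mm512_add_ps(acc1, _mm512_abs_ps(_mm512_loadu_ps(src + 16)));
    }
    acc0 = _mm512_add_ps(acc0, acc1);           // single accumulator fold

    if (count >= 16)                            // x16 tail at zmm width
    {
        acc0   = _mm512_add_ps(acc0, _mm512_abs_ps(_mm512_loadu_ps(src)));
        src   += 16;
        count -= 16;
    }

    float result = _mm512_reduce_add_ps(acc0);  // horizontal fold to scalar
    for (; count > 0; --count, ++src)           // scalar leftovers, simplified
        result += std::fabs(*src);
    return result;
}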