-
Notifications
You must be signed in to change notification settings - Fork 12
/
final_fold.S
227 lines (183 loc) · 5.57 KB
/
final_fold.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/*
* Calculate the checksum of 128 bits of data.
*
* We add 32 bits of 0s to make 192 bits of data - this matches what a
* CRC does. We reduce the 192 bits in two steps, first reducing the top 64
* bits to produce 96 bits, then reducing the top 32 bits of that to produce 64
* bits.
*
* We then use fixed point Barrett reduction to compute a mod n over GF(2)
* for n = 0x104c11db7 using POWER8 instructions. We use x = 32.
*
* http://en.wikipedia.org/wiki/Barrett_reduction
*
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of either:
*
* a) the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version, or
* b) the Apache License, Version 2.0
*/
#if defined (__clang__)
#ifndef __ALTIVEC__
#define __ALTIVEC__
#endif
#include "ppc-asm.h"
#else
#include <ppc-asm.h>
#endif
#include "ppc-opcode.h"
.section .data
.balign 16
.constants:
/* x^96 mod p(x) */
.octa 0x000000000000000000000000f200aa66
/* x^64 mod p(x) */
.octa 0x000000000000000000000000490d678d
/* Barrett constant m - (4^32)/n */
.octa 0x00000000000000000000000104d101df
/* Barrett constant n */
.octa 0x00000000000000000000000104c11db7
/* byte reverse permute constant */
.octa 0x0F0E0D0C0B0A09080706050403020100
.bit_reflected_constants:
/* x^96 mod p(x)` << 1 */
.octa 0x000000000000000000000000ccaa009e
/* x^64 mod p(x)` << 1 */
.octa 0x00000000000000000000000163cd6124
/* 33 bit reflected Barrett constant m - (4^32)/n */
.octa 0x000000000000000000000001f7011641
/* 33 bit reflected Barrett constant n */
.octa 0x000000000000000000000001db710641
/* byte reverse permute constant */
.octa 0x0F0E0D0C0B0A09080706050403020100
.text
#define const1 v10
#define const2 v11
#define const3 v12
#define const4 v13
#define mask_32bit v28
#define mask_64bit v29
#define zeroes v30
#define ones v31
/* unsigned int final_fold(void *data) */
FUNC_START(final_fold)
lis r4,.constants@ha
la r4,.constants@l(r4)
li r5,16
li r6,32
li r7,48
li r8,64
vxor zeroes,zeroes,zeroes
vspltisw ones,-1
vsldoi mask_32bit,zeroes,ones,4
vsldoi mask_64bit,zeroes,ones,8
lvx v0,0,r3 /* load data */
#ifdef __LITTLE_ENDIAN__
lvx const1,r8,r4
vperm v0,v0,v0,const1
#endif
lvx const1,0,r4
lvx const2,r5,r4
lvx const3,r6,r4
lvx const4,r7,r4
/*
* We append 32 bits of zeroes to our 128 bit value. This gives us 192
* bits that we reduce in two steps.
*/
/* Reduce the top 64 bits */
vsldoi v1,zeroes,v0,8 /* Grab the top 64 bits */
VPMSUMD(v1,v1,const1)
/* Add 32 bits of zeroes and xor with the reduced top 64 bits */
vsldoi v0,v0,zeroes,4
vxor v0,v1,v0
/* We have a 96 bit value, now reduce the top 32 bits */
vsldoi v1,zeroes,v0,8 /* Grab the top 64 bits */
vand v1,v1,mask_32bit
VPMSUMD(v1,v1,const2)
vxor v0,v1,v0
vand v0,v0,mask_64bit
/*
* Now for Barrett reduction. The idea is to calculate q,
* the multiple of our polynomial that we need to subtract. By
* doing the computation 2x bits higher (ie 64 bits) and shifting the
* result back down 2x bits, we round down to the nearest multiple.
*/
VPMSUMD(v1,v0,const3) /* ma */
vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
VPMSUMD(v1,v1,const4) /* qn */
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
/*
* Get the result into r3. We need to shift it left 8 bytes:
* V0 [ 0 1 2 X ]
* V0 [ 0 X 2 3 ]
*/
vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
MFVRD(r3, v0)
blr
FUNC_END(final_fold)
/* unsigned int final_fold_reflected(void *data) */
FUNC_START(final_fold_reflected)
lis r4,.bit_reflected_constants@ha
la r4,.bit_reflected_constants@l(r4)
li r5,16
li r6,32
li r7,48
li r8,64
vxor zeroes,zeroes,zeroes
vspltisw ones,-1
vsldoi mask_32bit,zeroes,ones,4
vsldoi mask_64bit,zeroes,ones,8
lvx v0,0,r3 /* load data */
#ifndef __LITTLE_ENDIAN__
lvx const1,r8,r4
vperm v0,v0,v0,const1
#endif
lvx const1,0,r4
lvx const2,r5,r4
lvx const3,r6,r4
lvx const4,r7,r4
/*
* We append 32 bits of zeroes to our 128 bit value. This gives us 192
* bits that we reduce in two steps. This time we are reducing the
* bits on the right side (ie the lower bits) and xor'ing them
* on the left side.
*/
/* Reduce the top 64 bits */
VPMSUMD(v1,v0,const1)
vsldoi v1,v1,zeroes,4 /* align 96bit result to the left */
/* Add 32 bits of zeroes and xor with the reduced top 64 bits */
vsldoi v0,zeroes,v0,12 /* zeroes on the left */
vxor v0,v1,v0
/* We have a 96 bit value, now reduce the top 32 bits */
vsldoi v1,zeroes,v0,12 /* Grab the right 64 bits */
vand v1,v1,mask_32bit
VPMSUMD(v1,v1,const2)
vsldoi v0,zeroes,v0,8
vxor v0,v1,v0
vand v0,v0,mask_64bit
/*
* Now for the Barrett reduction algorithm. Instead of bit reflecting
* our data (which is expensive to do), we bit reflect our constants
* and our algorithm, which means the intermediate data in our vector
* registers goes from 0-63 instead of 63-0. We can reflect the
* algorithm because we don't carry in mod 2 arithmetic.
*/
vand v1,v0,mask_32bit /* bottom 32 bits of a */
VPMSUMD(v1,v1,const3) /* ma */
vand v1,v1,mask_32bit /* bottom 32bits of ma */
VPMSUMD(v1,v1,const4) /* qn */
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
/*
* Since we are bit reflected, the result (ie the low 32 bits) is in the
* high 32 bits. We just need to shift it left 4 bytes
* V0 [ 0 1 X 3 ]
* V0 [ 0 X 2 3 ]
*/
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits */
MFVRD(r3, v0)
blr
FUNC_END(final_fold_reflected)