Skip to content

Commit

Permalink
New f1600x4
Browse files Browse the repository at this point in the history
Instead of using intrinsics and full unrolling, this uses a
four-round unrolled version adapted from the one I wrote for
Cloudflare's CIRCL library:

    github.com/cloudflare/circl/simd/keccakf1600
  • Loading branch information
bwesterb committed Jul 31, 2020
1 parent c510ae2 commit 40122bc
Show file tree
Hide file tree
Showing 16 changed files with 1,032 additions and 1,784 deletions.
13 changes: 2 additions & 11 deletions shake256-avx2/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ CFLAGS = -Wall -Wextra -Wpedantic -O3 -std=c99 -march=native -fomit-frame-pointe

THASH = robust

SOURCES = hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c fips202x4.c keccak4x/KeccakP-1600-times4-SIMD256.o
HEADERS = params.h hash.h hashx4.h thash.h thashx4.h address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h fips202x4.h
SOURCES = hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c f1600x4.c f1600x4.s
HEADERS = params.h hash.h hashx4.h thash.h thashx4.h address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h f1600x4.h

DET_SOURCES = $(SOURCES:randombytes.%=rng.%)
DET_HEADERS = $(HEADERS:randombytes.%=rng.%)
Expand Down Expand Up @@ -39,16 +39,7 @@ test/%: test/%.c $(SOURCES) $(HEADERS)
test/%.exec: test/%
@$<

keccak4x/KeccakP-1600-times4-SIMD256.o: keccak4x/align.h \
keccak4x/brg_endian.h \
keccak4x/KeccakP-1600-times4-SIMD256.c \
keccak4x/KeccakP-1600-times4-SnP.h \
keccak4x/KeccakP-1600-unrolling.macros \
keccak4x/SIMD256-config.h
$(CC) $(CFLAGS) -c keccak4x/KeccakP-1600-times4-SIMD256.c -o $@

clean:
-$(RM) keccak4x/KeccakP-1600-times4-SIMD256.o
-$(RM) $(TESTS)
-$(RM) $(BENCHMARK)
-$(RM) PQCgenKAT_sign
Expand Down
28 changes: 28 additions & 0 deletions shake256-avx2/f1600x4.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <stdint.h>

/*
 * Round-constant table handed to f1600x4AVX2 as its `rc` argument.
 *
 * There is one 64-bit constant per round, 24 in total.  They are laid out
 * four per row here because the generated routine (see f1600x4.py) reads
 * four consecutive entries per loop iteration and then advances its
 * constant pointer by 32 bytes.
 */
uint64_t keccak_rc[24] = {
    UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
    UINT64_C(0x800000000000808A), UINT64_C(0x8000000080008000),

    UINT64_C(0x000000000000808B), UINT64_C(0x0000000080000001),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),

    UINT64_C(0x000000000000008A), UINT64_C(0x0000000000000088),
    UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000A),

    UINT64_C(0x000000008000808B), UINT64_C(0x800000000000008B),
    UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),

    UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
    UINT64_C(0x000000000000800A), UINT64_C(0x800000008000000A),

    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
    UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};
4 changes: 4 additions & 0 deletions shake256-avx2/f1600x4.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#pragma once

/* The declarations below use uint64_t; include its definition here so this
 * header compiles regardless of what the including file pulled in first. */
#include <stdint.h>

/*
 * Four-way permutation routine implemented in assembly generated from
 * f1600x4.py (built as f1600x4.s).
 *
 *   s   State buffer, updated in place.  The generator addresses it as 25
 *       slots of 32 bytes using aligned 256-bit loads/stores (VMOVDQA), so
 *       it must be 32-byte aligned and at least 25 * 32 bytes long.
 *   rc  Round constants, read as 24 consecutive 64-bit words; pass
 *       keccak_rc.
 */
extern void f1600x4AVX2(uint64_t *s, uint64_t *rc);

/* The 24 round constants consumed by f1600x4AVX2 (defined in f1600x4.c). */
extern uint64_t keccak_rc[24];
79 changes: 79 additions & 0 deletions shake256-avx2/f1600x4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import peachpy.x86_64

# PeachPy generator for f1600x4AVX2(state, rc).
#
# The state is addressed as 25 slots of 32 bytes (four 64-bit words per
# slot), and every instruction below operates on whole 256-bit YMM
# registers.  The emitted loop body contains four unrolled rounds and is
# executed six times, for 24 rounds in total; `rc` supplies one 64-bit
# constant per round.
#
# NOTE(review): PeachPy may derive the generated argument names from the
# variable names on the next two lines -- keep them as they are unless that
# has been checked.
stateArg = Argument(ptr(uint64_t))
rcArg = Argument(ptr(uint64_t))

# rho_index[g][i] is k such that the lane written to position i of output
# group g is rotated left by the k-th triangular number, modulo 64.
rho_index = (
    (0, 24, 18, 6, 12),
    (7, 23, 2, 9, 22),
    (1, 3, 17, 16, 20),
    (13, 8, 4, 5, 15),
    (19, 10, 21, 14, 11),
)
rho_bits = tuple(
    tuple((k * (k + 1) // 2) % 64 for k in steps) for steps in rho_index
)

# Per unrolled round r: multipliers (for i, for g) that map a lane to its
# slot in the state buffer.  The lanes are stored in a different order after
# each round instead of being permuted in registers; the last pair, (1, 5),
# is the plain i + 5*g layout, so the buffer is back in its initial order
# after every group of four rounds.
slot_strides = ((6, 10), (16, 20), (11, 15), (1, 5))


def theta_column(i, g):
    """Return the column whose theta value is applied to input i of group g."""
    return (3*g + i) % 5


def lane_slot(i, g, r):
    """Return the state-buffer slot index for (i, g) in unrolled round r."""
    stride_i, stride_g = slot_strides[r]
    return (i*stride_i + stride_g*g) % 25


with Function("f1600x4AVX2", (stateArg, rcArg), target=uarch.haswell) as function:
    state_ptr = GeneralPurposeRegister64()
    rc_ptr = GeneralPurposeRegister64()
    rounds_left = GeneralPurposeRegister64()

    LOAD.ARGUMENT(state_ptr, stateArg)
    LOAD.ARGUMENT(rc_ptr, rcArg)

    # Six passes over the four-round loop body.
    MOV(rounds_left, 6)

    def lane_mem(slot):
        """Memory operand for the 32-byte state slot with the given index."""
        return [state_ptr + 32*slot]

    with Loop() as loop:
        for r in range(4):
            # Column parities: XOR of the five slots congruent to x mod 5.
            # VMOVDQA is the aligned move, so the buffer must be 32-byte
            # aligned.
            parity = [YMMRegister() for _ in range(5)]
            for x in range(5):
                VMOVDQA(parity[x], lane_mem(x))
            for row in range(1, 5):
                for x in range(5):
                    VPXOR(parity[x], parity[x], lane_mem(5*row + x))

            tmp = [YMMRegister() for _ in range(5)]
            theta = [YMMRegister() for _ in range(5)]

            # theta[x] = parity[x-1] ^ rotl64(parity[x+1], 1); the rotate is
            # built from a shift pair plus OR.
            for x in range(5):
                VPSLLQ(tmp[x], parity[(x+1) % 5], 1)
            for x in range(5):
                VPSRLQ(theta[x], parity[(x+1) % 5], 63)
            for x in range(5):
                VPOR(theta[x], theta[x], tmp[x])
            for x in range(5):
                VPXOR(theta[x], parity[(x+4) % 5], theta[x])

            for g in range(5):
                shifts = rho_bits[g]
                # Positions that need an actual rotate (a zero rotate is
                # skipped entirely).
                rotated = [i for i in range(5) if shifts[i] != 0]

                # Load the five lanes feeding this output group and apply
                # theta in the same instruction.
                lanes = [YMMRegister() for _ in range(5)]
                for i in range(5):
                    col = theta_column(i, g)
                    VPXOR(lanes[i], theta[col], lane_mem(lane_slot(col, g, r)))

                # Rotate left by shifts[i], again as shift/shift/OR.
                for i in rotated:
                    VPSLLQ(tmp[i], lanes[i], shifts[i])
                for i in rotated:
                    VPSRLQ(lanes[i], lanes[i], 64 - shifts[i])
                for i in rotated:
                    VPOR(lanes[i], lanes[i], tmp[i])

                # tmp[i] = lanes[i] ^ (~lanes[i+1] & lanes[i+2])
                for i in range(5):
                    VPANDN(tmp[i], lanes[(i+1) % 5], lanes[(i+2) % 5])
                for i in range(5):
                    VPXOR(tmp[i], tmp[i], lanes[i])

                if g == 0:
                    # Round constant r of this pass, broadcast to all four
                    # 64-bit elements and mixed into position 0 of group 0.
                    round_const = YMMRegister()
                    VPBROADCASTQ(round_const, [rc_ptr + r*8])
                    VPXOR(tmp[0], tmp[0], round_const)

                for i in range(5):
                    VMOVDQA(lane_mem(lane_slot(i, g, r)), tmp[i])

        # Advance past the four 8-byte constants just consumed and loop
        # until all six passes are done.
        ADD(rc_ptr, 8*4)
        SUB(rounds_left, 1)
        JNZ(loop.begin)

    RETURN()



Loading

0 comments on commit 40122bc

Please sign in to comment.