New f1600x4

Instead of using intrinsics and full unrolling, this uses a four-round unrolled version adapted from the one I wrote for Cloudflare's CIRCL library: github.com/cloudflare/circl/simd/keccakf1600
sphincs · Jul 31, 2020 · 40122bc · 40122bc
1 parent c510ae2
commit 40122bc
Show file tree

Hide file tree

Showing 16 changed files with 1,032 additions and 1,784 deletions.
diff --git a/shake256-avx2/Makefile b/shake256-avx2/Makefile
@@ -3,8 +3,8 @@ CFLAGS = -Wall -Wextra -Wpedantic -O3 -std=c99 -march=native -fomit-frame-pointe
 
 THASH = robust
 
-SOURCES =          hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c fips202x4.c keccak4x/KeccakP-1600-times4-SIMD256.o
-HEADERS = params.h hash.h          hashx4.h          thash.h                 thashx4.h                 address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h fips202x4.h
+SOURCES =          hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c f1600x4.c f1600x4.s
+HEADERS = params.h hash.h          hashx4.h          thash.h                 thashx4.h                 address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h f1600x4.h
 
 DET_SOURCES = $(SOURCES:randombytes.%=rng.%)
 DET_HEADERS = $(HEADERS:randombytes.%=rng.%)
@@ -39,16 +39,7 @@ test/%: test/%.c $(SOURCES) $(HEADERS)
 test/%.exec: test/%
 	@$<
 
-keccak4x/KeccakP-1600-times4-SIMD256.o: keccak4x/align.h \
-										keccak4x/brg_endian.h \
-										keccak4x/KeccakP-1600-times4-SIMD256.c \
-										keccak4x/KeccakP-1600-times4-SnP.h \
-										keccak4x/KeccakP-1600-unrolling.macros \
-										keccak4x/SIMD256-config.h
-	$(CC) $(CFLAGS) -c keccak4x/KeccakP-1600-times4-SIMD256.c -o $@
-
 clean:
-	-$(RM) keccak4x/KeccakP-1600-times4-SIMD256.o
 	-$(RM) $(TESTS)
 	-$(RM) $(BENCHMARK)
 	-$(RM) PQCgenKAT_sign

diff --git a/shake256-avx2/f1600x4.c b/shake256-avx2/f1600x4.c
@@ -0,0 +1,28 @@
+#include <stdint.h>
+
+uint64_t keccak_rc[24] = { /* Keccak-f[1600] round constants, one 64-bit word per round; declared extern in f1600x4.h */
+    0x0000000000000001, /* round  0 */
+    0x0000000000008082, /* round  1 */
+    0x800000000000808A, /* round  2 */
+    0x8000000080008000, /* round  3 */
+    0x000000000000808B, /* round  4 */
+    0x0000000080000001, /* round  5 */
+    0x8000000080008081, /* round  6 */
+    0x8000000000008009, /* round  7 */
+    0x000000000000008A, /* round  8 */
+    0x0000000000000088, /* round  9 */
+    0x0000000080008009, /* round 10 */
+    0x000000008000000A, /* round 11 */
+    0x000000008000808B, /* round 12 */
+    0x800000000000008B, /* round 13 */
+    0x8000000000008089, /* round 14 */
+    0x8000000000008003, /* round 15 */
+    0x8000000000008002, /* round 16 */
+    0x8000000000000080, /* round 17 */
+    0x000000000000800A, /* round 18 */
+    0x800000008000000A, /* round 19 */
+    0x8000000080008081, /* round 20 */
+    0x8000000000008080, /* round 21 */
+    0x0000000080000001, /* round 22 */
+    0x8000000080008008  /* round 23 */
+}; /* NOTE(review): left non-const to match the extern declaration and the uint64_t *rc parameter in f1600x4.h */
diff --git a/shake256-avx2/f1600x4.h b/shake256-avx2/f1600x4.h
@@ -0,0 +1,4 @@
+#pragma once
+#include <stdint.h> /* uint64_t is used below; included here so this header compiles regardless of include order */
+extern void f1600x4AVX2(uint64_t *s, uint64_t *rc); /* defined by the PeachPy script f1600x4.py: permutes s in place (25 slots of 32 bytes, accessed with aligned AVX2 moves, so s must be 32-byte aligned) and reads 24 constants from rc */
+extern uint64_t keccak_rc[24]; /* round-constant table defined in f1600x4.c; presumably what callers pass as rc -- confirm against the call sites */
diff --git a/shake256-avx2/f1600x4.py b/shake256-avx2/f1600x4.py
@@ -0,0 +1,79 @@
+import peachpy.x86_64  # NOTE(review): only the module is imported, yet Argument, Function, YMMRegister, VPXOR, ... are used unqualified; presumably this script is meant to be run through PeachPy's own driver -- confirm
+# Generator for f1600x4AVX2(state, rc): emits a loop of 6 iterations, each containing 4 unrolled rounds, over a state addressed as 25 slots of 32 bytes.
+stateArg = Argument(ptr(uint64_t))  # first argument: pointer to the state
+rcArg = Argument(ptr(uint64_t))  # second argument: pointer to the round constants
+with Function("f1600x4AVX2", (stateArg, rcArg), target=uarch.haswell) as function:
+    statePtr = GeneralPurposeRegister64()
+    rcPtr = GeneralPurposeRegister64()
+    superRound = GeneralPurposeRegister64()  # remaining groups of four rounds
+
+    LOAD.ARGUMENT(statePtr, stateArg)
+    LOAD.ARGUMENT(rcPtr, rcArg)
+
+    MOV(superRound, 6)  # 6 iterations x 4 unrolled rounds = 24 rounds in total
+
+    def state(offset):  # memory operand for the offset-th 32-byte slot of the state
+        return [statePtr + 32*offset]
+
+    with Loop() as loop:
+        for r in range(4):  # Python-level loop: the body below is emitted four times, once per r
+            p = [YMMRegister() for i in range(5)]  # p[i] accumulates the XOR of slots i, i+5, i+10, i+15, i+20
+            for i in range(5): VMOVDQA(p[i], state(i))  # VMOVDQA is the aligned move, so the state pointer needs 32-byte alignment
+            for j in range(1, 5):
+                for i in range(5): VPXOR(p[i], p[i], state(5*j+i))
+
+            t = [YMMRegister() for i in range(5)]  # scratch registers, reused further down for the rotations and the and-not/xor step
+            d = [YMMRegister() for i in range(5)]
+            # Next four lines: d[i] = p[(i+4)%5] ^ rotl64(p[(i+1)%5], 1), applied to each 64-bit lane (matches Keccak's theta step).
+            for i in range(5): VPSLLQ(t[i], p[(i+1)%5], 1)
+            for i in range(5): VPSRLQ(d[i], p[(i+1)%5], 63)
+            for i in range(5): VPOR(d[i], d[i], t[i])
+            for i in range(5): VPXOR(d[i], p[(i+4)%5], d[i])
+
+            def rot(i, g):  # left-rotation amount for element i of group g
+                table = [[0, 24, 18, 6, 12],
+                         [7, 23, 2, 9, 22],
+                         [1, 3, 17, 16, 20],
+                         [13, 8, 4, 5, 15],
+                         [19, 10, 21, 14, 11]]
+                t = table[g][i]  # local t; shadows the register list t only inside rot()
+                return ((t + 1) * t // 2) % 64  # triangular number of the table entry, reduced mod 64
+
+            def di(i, g):  # index into d (and source column) used for element i of group g
+                return (3*g + i) % 5
+            def si(i, g, r):  # state slot index for column i of group g in unrolled round r
+                n = [6, 16, 11, 1][r]  # per-round multipliers; presumably they track where each lane lives as rounds permute the state in place -- confirm
+                m = [10, 20, 15, 5][r]
+                return (i*n + m*g) % 25
+            # Each group g loads five slots, XORs in d, rotates, combines them, and stores five slots back.
+            for g in range(5):
+                s = [YMMRegister() for i in range(5)]
+                for i in range(5):
+                    VPXOR(s[i], d[di(i, g)], state(si(di(i, g), g, r)))  # s[i] = d[...] ^ loaded slot
+                for i in range(5):
+                    if rot(i, g) != 0:  # a rotation by 0 is skipped entirely
+                        VPSLLQ(t[i], s[i], rot(i, g))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPSRLQ(s[i], s[i], 64-rot(i, g))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPOR(s[i], s[i], t[i])  # s[i] = rotl64(s[i], rot(i, g))
+                for i in range(5): VPANDN(t[i], s[(i+1)%5], s[(i+2)%5])  # t[i] = ~s[i+1] & s[i+2]
+                for i in range(5): VPXOR(t[i], t[i], s[i])  # t[i] ^= s[i] (matches Keccak's chi step)
+
+                if g == 0:  # the round constant is XORed into element 0 of group 0 only
+                    rc = YMMRegister()
+                    VPBROADCASTQ(rc, [rcPtr + r*8])  # copy the r-th 64-bit constant of this iteration into all four lanes
+                    VPXOR(t[0], t[0], rc)
+                for i in range(5):
+                    VMOVDQA(state(si(i, g, r)), t[i])  # aligned store back into the state
+
+        ADD(rcPtr, 8*4)  # step past the four 8-byte constants consumed by this iteration
+        SUB(superRound, 1)
+        JNZ(loop.begin)  # repeat until superRound reaches zero (flags come from the SUB)
+
+    RETURN ()
+
+
+