diff --git a/shake256-avx2/.gitignore b/shake256-avx2/.gitignore index 20d83ac7..d5c31c75 100644 --- a/shake256-avx2/.gitignore +++ b/shake256-avx2/.gitignore @@ -3,4 +3,5 @@ test/* PQCsignKAT_*.rsp PQCsignKAT_*.req PQCgenKAT_sign -keccak4x/KeccakP-1600-times4-SIMD256.o \ No newline at end of file +keccak4x/KeccakP-1600-times4-SIMD256.o +*.s diff --git a/shake256-avx2/Makefile b/shake256-avx2/Makefile index cb2e7c5d..1c42e182 100644 --- a/shake256-avx2/Makefile +++ b/shake256-avx2/Makefile @@ -3,8 +3,8 @@ CFLAGS = -Wall -Wextra -Wpedantic -O3 -std=c99 -march=native -fomit-frame-pointe THASH = robust -SOURCES = hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c fips202x4.c keccak4x/KeccakP-1600-times4-SIMD256.o -HEADERS = params.h hash.h hashx4.h thash.h thashx4.h address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h fips202x4.h +SOURCES = hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c f1600x4.c f1600x4.s +HEADERS = params.h hash.h hashx4.h thash.h thashx4.h address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h f1600x4.h DET_SOURCES = $(SOURCES:randombytes.%=rng.%) DET_HEADERS = $(HEADERS:randombytes.%=rng.%) @@ -39,16 +39,7 @@ test/%: test/%.c $(SOURCES) $(HEADERS) test/%.exec: test/% @$< -keccak4x/KeccakP-1600-times4-SIMD256.o: keccak4x/align.h \ - keccak4x/brg_endian.h \ - keccak4x/KeccakP-1600-times4-SIMD256.c \ - keccak4x/KeccakP-1600-times4-SnP.h \ - keccak4x/KeccakP-1600-unrolling.macros \ - keccak4x/SIMD256-config.h - $(CC) $(CFLAGS) -c keccak4x/KeccakP-1600-times4-SIMD256.c -o $@ - clean: - -$(RM) keccak4x/KeccakP-1600-times4-SIMD256.o -$(RM) $(TESTS) -$(RM) $(BENCHMARK) -$(RM) PQCgenKAT_sign diff --git a/shake256-avx2/f1600x4.S b/shake256-avx2/f1600x4.S new file mode 100644 index 00000000..c3d6a61a --- /dev/null +++ b/shake256-avx2/f1600x4.S @@ -0,0 +1,901 @@ +# Generated by PeachPy 0.2.0 from test.py + + +#ifdef __APPLE__ +.section __TEXT,__text,regular,pure_instructions +.globl _f1600x4AVX2 +.p2align 4, 0x90 +_f1600x4AVX2: +#else /* !__APPLE__ */ +.text +.p2align 4,,15 +.globl f1600x4AVX2 +.type f1600x4AVX2, @function +f1600x4AVX2: +#endif /* !__APPLE */ + movq $6, %rax +17: # loop.begin: + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + 
vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 192(%rdi), %ymm4, %ymm9 + vpxor 384(%rdi), %ymm3, %ymm10 + vpxor 576(%rdi), %ymm2, %ymm11 + vpxor 768(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 0(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 768(%rdi) + vpxor 96(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 320(%rdi), %ymm5, %ymm10 + vpxor 512(%rdi), %ymm4, %ymm11 + vpxor 704(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 32(%rdi), %ymm4, %ymm8 + vpxor 224(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 608(%rdi), %ymm1, %ymm11 + vpxor 640(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 32(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 608(%rdi) + vpxor 128(%rdi), %ymm1, %ymm8 + vpxor 160(%rdi), %ymm5, %ymm9 + vpxor 352(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 736(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + 
vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 128(%rdi) + vpxor 64(%rdi), %ymm3, %ymm8 + vpxor 256(%rdi), %ymm2, %ymm9 + vpxor 448(%rdi), %ymm1, %ymm10 + vpxor 480(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 448(%rdi) + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 512(%rdi), %ymm4, %ymm9 + vpxor 224(%rdi), %ymm3, %ymm10 + vpxor 736(%rdi), %ymm2, %ymm11 + vpxor 448(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor 
%ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 8(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 448(%rdi) + vpxor 576(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 640(%rdi), %ymm5, %ymm10 + vpxor 352(%rdi), %ymm4, %ymm11 + vpxor 64(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 192(%rdi), %ymm4, %ymm8 + vpxor 704(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 128(%rdi), %ymm1, %ymm11 + vpxor 480(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 128(%rdi) + vpxor 768(%rdi), %ymm1, %ymm8 + vpxor 320(%rdi), %ymm5, %ymm9 + vpxor 32(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 256(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 32(%rdi) + 
vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 768(%rdi) + vpxor 384(%rdi), %ymm3, %ymm8 + vpxor 96(%rdi), %ymm2, %ymm9 + vpxor 608(%rdi), %ymm1, %ymm10 + vpxor 160(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 608(%rdi) + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 352(%rdi), %ymm4, %ymm9 + vpxor 704(%rdi), %ymm3, %ymm10 + vpxor 256(%rdi), %ymm2, %ymm11 + vpxor 608(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 16(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 608(%rdi) + vpxor 736(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 480(%rdi), %ymm5, 
%ymm10 + vpxor 32(%rdi), %ymm4, %ymm11 + vpxor 384(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 32(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 512(%rdi), %ymm4, %ymm8 + vpxor 64(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 768(%rdi), %ymm1, %ymm11 + vpxor 160(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 768(%rdi) + vpxor 448(%rdi), %ymm1, %ymm8 + vpxor 640(%rdi), %ymm5, %ymm9 + vpxor 192(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 96(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 448(%rdi) + vpxor 224(%rdi), %ymm3, %ymm8 + vpxor 576(%rdi), %ymm2, %ymm9 + vpxor 128(%rdi), %ymm1, %ymm10 + vpxor 320(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor 
%ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 128(%rdi) + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 32(%rdi), %ymm4, %ymm9 + vpxor 64(%rdi), %ymm3, %ymm10 + vpxor 96(%rdi), %ymm2, %ymm11 + vpxor 128(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 24(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 32(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 128(%rdi) + vpxor 256(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 160(%rdi), %ymm5, %ymm10 + vpxor 192(%rdi), %ymm4, %ymm11 + vpxor 224(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 
+ vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 352(%rdi), %ymm4, %ymm8 + vpxor 384(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 448(%rdi), %ymm1, %ymm11 + vpxor 320(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 448(%rdi) + vpxor 608(%rdi), %ymm1, %ymm8 + vpxor 480(%rdi), %ymm5, %ymm9 + vpxor 512(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 576(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 608(%rdi) + vpxor 704(%rdi), %ymm3, %ymm8 + vpxor 736(%rdi), %ymm2, %ymm9 + vpxor 768(%rdi), %ymm1, %ymm10 + vpxor 640(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 768(%rdi) + addq $32, %rsi + subq $1, %rax + jnz 17b # loop.begin + vzeroupper + ret +#ifndef __APPLE__ +.size f1600x4AVX2, .-f1600x4AVX2 +#endif /* !__APPLE__ */ diff --git 
a/shake256-avx2/f1600x4.c b/shake256-avx2/f1600x4.c
new file mode 100644
index 00000000..5b24408b
--- /dev/null
+++ b/shake256-avx2/f1600x4.c
@@ -0,0 +1,28 @@
+#include <stdint.h>
+
+uint64_t keccak_rc[24] = {
+    0x0000000000000001,
+    0x0000000000008082,
+    0x800000000000808A,
+    0x8000000080008000,
+    0x000000000000808B,
+    0x0000000080000001,
+    0x8000000080008081,
+    0x8000000000008009,
+    0x000000000000008A,
+    0x0000000000000088,
+    0x0000000080008009,
+    0x000000008000000A,
+    0x000000008000808B,
+    0x800000000000008B,
+    0x8000000000008089,
+    0x8000000000008003,
+    0x8000000000008002,
+    0x8000000000000080,
+    0x000000000000800A,
+    0x800000008000000A,
+    0x8000000080008081,
+    0x8000000000008080,
+    0x0000000080000001,
+    0x8000000080008008
+};
diff --git a/shake256-avx2/f1600x4.h b/shake256-avx2/f1600x4.h
new file mode 100644
index 00000000..ff4785e6
--- /dev/null
+++ b/shake256-avx2/f1600x4.h
@@ -0,0 +1,4 @@
+#pragma once
+
+extern void f1600x4AVX2(uint64_t *s, uint64_t *rc);
+extern uint64_t keccak_rc[24];
diff --git a/shake256-avx2/f1600x4.py b/shake256-avx2/f1600x4.py
new file mode 100644
index 00000000..98a4a68f
--- /dev/null
+++ b/shake256-avx2/f1600x4.py
@@ -0,0 +1,79 @@
+from peachpy import *
+from peachpy.x86_64 import *
+
+stateArg = Argument(ptr(uint64_t))
+rcArg = Argument(ptr(uint64_t))
+with Function("f1600x4AVX2", (stateArg, rcArg), target=uarch.haswell) as function:
+    statePtr = GeneralPurposeRegister64()
+    rcPtr = GeneralPurposeRegister64()
+    superRound = GeneralPurposeRegister64()
+
+    LOAD.ARGUMENT(statePtr, stateArg)
+    LOAD.ARGUMENT(rcPtr, rcArg)
+
+    MOV(superRound, 6)
+
+    def state(offset):
+        return [statePtr + 32*offset]
+
+    with Loop() as loop:
+        for r in range(4):
+            p = [YMMRegister() for i in range(5)]
+            for i in range(5): VMOVDQA(p[i], state(i))
+            for j in range(1, 5):
+                for i in range(5): VPXOR(p[i], p[i], state(5*j+i))
+
+            t = [YMMRegister() for i in range(5)]
+            d = [YMMRegister() for i in range(5)]
+
+            for i in range(5): VPSLLQ(t[i], p[(i+1)%5], 1)
+            for i in range(5): VPSRLQ(d[i], p[(i+1)%5], 63)
+            for i in range(5): VPOR(d[i], d[i], t[i])
+            for i in range(5): VPXOR(d[i], p[(i+4)%5], d[i])
+
+            def rot(i, g):
+                table = [[0, 24, 18, 6, 12],
+                         [7, 23, 2, 9, 22],
+                         [1, 3, 17, 16, 20],
+                         [13, 8, 4, 5, 15],
+                         [19, 10, 21, 14, 11]]
+                t = table[g][i]
+                return ((t + 1) * t // 2) % 64
+
+            def di(i, g):
+                return (3*g + i) % 5
+            def si(i, g, r):
+                n = [6, 16, 11, 1][r]
+                m = [10, 20, 15, 5][r]
+                return (i*n + m*g) % 25
+
+            for g in range(5):
+                s = [YMMRegister() for i in range(5)]
+                for i in range(5):
+                    VPXOR(s[i], d[di(i, g)], state(si(di(i, g), g, r)))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPSLLQ(t[i], s[i], rot(i, g))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPSRLQ(s[i], s[i], 64-rot(i, g))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPOR(s[i], s[i], t[i])
+                for i in range(5): VPANDN(t[i], s[(i+1)%5], s[(i+2)%5])
+                for i in range(5): VPXOR(t[i], t[i], s[i])
+
+                if g == 0:
+                    rc = YMMRegister()
+                    VPBROADCASTQ(rc, [rcPtr + r*8])
+                    VPXOR(t[0], t[0], rc)
+                for i in range(5):
+                    VMOVDQA(state(si(i, g, r)), t[i])
+
+        ADD(rcPtr, 8*4)
+        SUB(superRound, 1)
+        JNZ(loop.begin)
+
+    RETURN ()
+
+
+
diff --git a/shake256-avx2/fips202x4.c b/shake256-avx2/fips202x4.c
deleted file mode 100644
index d3a22769..00000000
--- a/shake256-avx2/fips202x4.c
+++ /dev/null
@@ -1,223 +0,0 @@
-#include 
-#include 
-#include 
-#include "fips202.h"
-
-#define NROUNDS 24
-#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset)))
-
-static uint64_t load64(const unsigned char *x)
-{
-  unsigned long long r = 0, i;
-
-  for (i = 0; i < 8; ++i) {
-    r |=
(unsigned long long)x[i] << 8 * i; - } - return r; -} - -static void store64(uint8_t *x, uint64_t u) -{ - unsigned int i; - - for(i=0; i<8; ++i) { - x[i] = u; - u >>= 8; - } -} - -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, - unsigned int r, - const unsigned char *m0, - const unsigned char *m1, - const unsigned char *m2, - const unsigned char *m3, - unsigned long long int mlen, - unsigned char p) -{ - unsigned long long i; - unsigned char t0[200]; - unsigned char t1[200]; - unsigned char t2[200]; - unsigned char t3[200]; - - unsigned long long *ss = (unsigned long long *)s; - - - while (mlen >= r) - { - for (i = 0; i < r / 8; ++i) - { - ss[4*i+0] ^= load64(m0 + 8 * i); - ss[4*i+1] ^= load64(m1 + 8 * i); - ss[4*i+2] ^= load64(m2 + 8 * i); - ss[4*i+3] ^= load64(m3 + 8 * i); - } - - KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; - } - - for (i = 0; i < r; ++i) - { - t0[i] = 0; - t1[i] = 0; - t2[i] = 0; - t3[i] = 0; - } - for (i = 0; i < mlen; ++i) - { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; - } - - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) - { - ss[4*i+0] ^= load64(t0 + 8 * i); - ss[4*i+1] ^= load64(t1 + 8 * i); - ss[4*i+2] ^= load64(t2 + 8 * i); - ss[4*i+3] ^= load64(t3 + 8 * i); - } -} - - -static void keccak_squeezeblocks4x(unsigned char *h0, - unsigned char *h1, - unsigned char *h2, - unsigned char *h3, - unsigned long long int nblocks, - __m256i *s, - unsigned int r) -{ - unsigned int i; - - unsigned long long *ss = (unsigned long long *)s; - - while(nblocks > 0) - { - KeccakF1600_StatePermute4x(s); - for(i=0;i<(r>>3);i++) - { - store64(h0+8*i, ss[4*i+0]); - store64(h1+8*i, ss[4*i+1]); - store64(h2+8*i, ss[4*i+2]); - store64(h3+8*i, ss[4*i+3]); - } - h0 += r; - h1 += r; - h2 += r; - h3 += r; - nblocks--; - } -} - - - -void shake128x4(unsigned char *out0, - unsigned char *out1, - unsigned char *out2, - unsigned char *out3, unsigned long long outlen, - unsigned char *in0, - unsigned char *in1, - unsigned char *in2, - unsigned char *in3, unsigned long long inlen) -{ - __m256i s[25]; - unsigned char t0[SHAKE128_RATE]; - unsigned char t1[SHAKE128_RATE]; - unsigned char t2[SHAKE128_RATE]; - unsigned char t3[SHAKE128_RATE]; - unsigned int i; - - /* zero state */ - for(i=0;i<25;i++) - s[i] = _mm256_xor_si256(s[i], s[i]); - - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); - - /* Squeeze output */ - keccak_squeezeblocks4x(out0, out1, out2, out3, outlen/SHAKE128_RATE, s, SHAKE128_RATE); - - out0 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - out1 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - out2 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - out3 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - - if(outlen%SHAKE128_RATE) - { - keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE128_RATE); - for(i=0;i - -void shake128x4(unsigned char *out0, - unsigned char *out1, - unsigned char *out2, - unsigned char *out3, unsigned long long outlen, - unsigned char *in0, - unsigned char *in1, - unsigned char *in2, - unsigned char *in3, unsigned long long inlen); - -void shake256x4(unsigned char *out0, - unsigned char *out1, - unsigned char *out2, - unsigned char *out3, unsigned 
long long outlen, - unsigned char *in0, - unsigned char *in1, - unsigned char *in2, - unsigned char *in3, unsigned long long inlen); - -#endif diff --git a/shake256-avx2/hash_shake256x4.c b/shake256-avx2/hash_shake256x4.c index 105d1ed8..fcf0df47 100644 --- a/shake256-avx2/hash_shake256x4.c +++ b/shake256-avx2/hash_shake256x4.c @@ -1,12 +1,10 @@ #include #include +#include #include "address.h" #include "params.h" -#include "fips202x4.h" -#include "hashx4.h" - -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); +#include "f1600x4.h" /* Swap endianess */ static uint32_t swap32(uint32_t val) { @@ -55,7 +53,7 @@ void prf_addrx4(unsigned char *out0, state[i] = _mm256_set1_epi64x(0); } - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); for (int i = 0; i < SPX_N/8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state[i], 0); diff --git a/shake256-avx2/keccak4x/KeccakP-1600-times4-SIMD256.c b/shake256-avx2/keccak4x/KeccakP-1600-times4-SIMD256.c deleted file mode 100644 index 7a0428fb..00000000 --- a/shake256-avx2/keccak4x/KeccakP-1600-times4-SIMD256.c +++ /dev/null @@ -1,1030 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#include -#include -#include -#include -#include -#include -#include -#include "align.h" -#include "KeccakP-1600-times4-SnP.h" -#include "SIMD256-config.h" - -#include "brg_endian.h" -#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) -#error Expecting a little-endian platform -#endif - -typedef unsigned char UINT8; -typedef unsigned long long int UINT64; -typedef __m128i V128; -typedef __m256i V256; - -#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex) - -#if defined(KeccakP1600times4_useAVX2) - #define ANDnu256(a, b) _mm256_andnot_si256(a, b) - #define CONST256(a) _mm256_load_si256((const V256 *)&(a)) - #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a)) - #define LOAD256(a) _mm256_load_si256((const V256 *)&(a)) - #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a)) - #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d)) - #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) - #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) - #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) -static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; -static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; - #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) - #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) - #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v) - #define XOR256(a, b) _mm256_xor_si256(a, b) - #define XOReq256(a, b) a = _mm256_xor_si256(a, b) - #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) - #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) - #define 
PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) - #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) - - #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \ - lanesH01 = UNPACKH( lanes0, lanes1 ), \ - lanesL23 = UNPACKL( lanes2, lanes3 ), \ - lanesH23 = UNPACKH( lanes2, lanes3 ), \ - lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \ - lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \ - lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \ - lanes3 = PERM128( lanesH01, lanesH23, 0x31 ) - - #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \ - lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \ - lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \ - lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \ - lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \ - lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \ - lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \ - lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F ) - -#endif - -#define SnP_laneLengthInBytes 8 - -void KeccakP1600times4_InitializeAll(void *states) -{ - memset(states, 0, KeccakP1600times4_statesSizeInBytes); -} - -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) -{ - unsigned int sizeLeft = length; - unsigned int lanePosition = offset/SnP_laneLengthInBytes; - unsigned int offsetInLane = offset%SnP_laneLengthInBytes; - const unsigned char *curData = data; - UINT64 *statesAsLanes = (UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = 0; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64*)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - UINT64 lane = 0; - memcpy(&lane, curData, sizeLeft); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - } -} - -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) -{ - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - - #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - - #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - - if ( laneCount >= 16 ) { - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - 
if ( laneCount >= 20 ) { - Xor_In4( 16 ); - for(i=20; i 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64*)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft); - } -} - -void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) -{ - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - - #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - - #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - STORE256( stateAsLanes[argIndex+0], lanes0 ),\ - STORE256( stateAsLanes[argIndex+1], lanes1 ),\ - STORE256( stateAsLanes[argIndex+2], lanes2 ),\ - STORE256( stateAsLanes[argIndex+3], lanes3 ) - - if ( laneCount >= 16 ) { - OverWr4( 0 ); - OverWr4( 4 ); - OverWr4( 8 ); - OverWr4( 12 ); - if ( laneCount >= 20 ) { - OverWr4( 16 ); - for(i=20; i= SnP_laneLengthInBytes) { - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - } - - if (sizeLeft > 0) { - memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft); - } -} - -void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length) -{ - unsigned int sizeLeft = length; - unsigned int lanePosition = offset/SnP_laneLengthInBytes; - unsigned int offsetInLane = offset%SnP_laneLengthInBytes; - unsigned char *curData = data; - const UINT64 *statesAsLanes = (const UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft); - } -} - -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int 
laneOffset) -{ - UINT64 *curData0 = (UINT64 *)data; - UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes); - UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64*)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - - #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \ - curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \ - curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \ - curData3[argIndex] = stateAsLanes64[4*(argIndex)+3] - - #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \ - lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \ - lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \ - lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \ - UNINTLEAVE(), \ - STORE256u( curData0[argIndex], lanes0 ), \ - STORE256u( curData1[argIndex], lanes1 ), \ - STORE256u( curData2[argIndex], lanes2 ), \ - STORE256u( curData3[argIndex], lanes3 ) - - if ( laneCount >= 16 ) { - Extr4( 0 ); - Extr4( 4 ); - Extr4( 8 ); - Extr4( 12 ); - if ( laneCount >= 20 ) { - Extr4( 16 ); - for(i=20; i 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane); - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - sizeLeft -= bytesInLane; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while ( --bytesInLane != 0); - lanePosition++; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curInput += SnP_laneLengthInBytes; - curOutput += SnP_laneLengthInBytes; - } - - if (sizeLeft != 0) { - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while ( --sizeLeft != 0); - } -} - -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset) -{ - const UINT64 *curInput0 = (UINT64 *)input; - const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes); - const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes); - UINT64 *curOutput0 = (UINT64 *)output; - UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes); - UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes); - UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64*)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - - #define ExtrXor( argIndex ) \ - curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\ - curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\ - curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\ - curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3] - - #define ExtrXor4( argIndex ) \ - lanes0 = LOAD256( 
stateAsLanes[argIndex+0] ),\ - lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\ - lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\ - lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\ - UNINTLEAVE(),\ - lanesL01 = LOAD256u( curInput0[argIndex]),\ - lanesH01 = LOAD256u( curInput1[argIndex]),\ - lanesL23 = LOAD256u( curInput2[argIndex]),\ - lanesH23 = LOAD256u( curInput3[argIndex]),\ - XOReq256( lanes0, lanesL01 ),\ - XOReq256( lanes1, lanesH01 ),\ - XOReq256( lanes2, lanesL23 ),\ - XOReq256( lanes3, lanesH23 ),\ - STORE256u( curOutput0[argIndex], lanes0 ),\ - STORE256u( curOutput1[argIndex], lanes1 ),\ - STORE256u( curOutput2[argIndex], lanes2 ),\ - STORE256u( curOutput3[argIndex], lanes3 ) - - if ( laneCount >= 16 ) { - ExtrXor4( 0 ); - ExtrXor4( 4 ); - ExtrXor4( 8 ); - ExtrXor4( 12 ); - if ( laneCount >= 20 ) { - ExtrXor4( 16 ); - for(i=20; i= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = (V256 *)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - #define Xor_In( argIndex ) \ - XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - #define Xor_In4( argIndex ) \ - lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); - #undef Xor_In - #undef Xor_In4 - KeccakP1600times4_PermuteAll_24rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else -// unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - V256 *statesAsLanes = (V256 *)states; - declareABCDE - - copyFromState(A, statesAsLanes) - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - #define XOR_In( Xxx, argIndex ) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - XOR_In( Aba, 0 ); - XOR_In( Abe, 1 ); - XOR_In( Abi, 2 ); - XOR_In( Abo, 3 ); - XOR_In( Abu, 4 ); - XOR_In( Aga, 5 ); - XOR_In( Age, 6 ); - XOR_In( Agi, 7 ); - XOR_In( Ago, 8 ); - XOR_In( Agu, 9 ); - XOR_In( Aka, 10 ); - XOR_In( Ake, 11 ); - XOR_In( Aki, 12 ); - XOR_In( Ako, 13 ); - XOR_In( Aku, 14 ); - XOR_In( Ama, 15 ); - XOR_In( Ame, 16 ); - XOR_In( Ami, 17 ); - XOR_In( Amo, 18 ); - XOR_In( Amu, 19 ); - XOR_In( Asa, 20 ); - #undef XOR_In - rounds24 - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - copyToState(statesAsLanes, A) - return (const unsigned char *)curData0 - dataStart; -#endif - } - else { -// unsigned int i; - const unsigned char *dataStart = data; - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - 
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); - KeccakP1600times4_PermuteAll_24rounds(states); - data += laneOffsetSerial*8; - dataByteLen -= laneOffsetSerial*8; - } - return data - dataStart; - } -} - -size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen) -{ - if (laneCount == 21) { -#if 0 - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - #define Xor_In( argIndex ) \ - XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - #define Xor_In4( argIndex ) \ - lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); - #undef Xor_In - #undef Xor_In4 - KeccakP1600times4_PermuteAll_12rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else -// unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - V256 *statesAsLanes = states; - declareABCDE - - copyFromState(A, statesAsLanes) - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - #define XOR_In( Xxx, argIndex ) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - XOR_In( Aba, 0 ); - XOR_In( Abe, 1 ); - XOR_In( Abi, 2 ); - XOR_In( Abo, 3 ); - XOR_In( Abu, 4 ); - XOR_In( Aga, 5 ); - XOR_In( Age, 6 ); - XOR_In( Agi, 7 ); - XOR_In( Ago, 8 ); - XOR_In( Agu, 9 ); - XOR_In( Aka, 10 ); - XOR_In( Ake, 11 ); - XOR_In( Aki, 12 ); - XOR_In( Ako, 13 ); - XOR_In( Aku, 14 ); - XOR_In( Ama, 15 ); - XOR_In( Ame, 16 ); - XOR_In( Ami, 17 ); - XOR_In( Amo, 18 ); - XOR_In( Amu, 19 ); - XOR_In( Asa, 20 ); - #undef XOR_In - rounds12 - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - copyToState(statesAsLanes, A) - return (const unsigned char *)curData0 - dataStart; -#endif - } - else { -// unsigned int i; - const unsigned char *dataStart = data; - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - 
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); - KeccakP1600times4_PermuteAll_12rounds(states); - data += laneOffsetSerial*8; - dataByteLen -= laneOffsetSerial*8; - } - return data - dataStart; - } -} diff --git a/shake256-avx2/keccak4x/KeccakP-1600-times4-SnP.h b/shake256-avx2/keccak4x/KeccakP-1600-times4-SnP.h deleted file mode 100644 index 60338488..00000000 --- a/shake256-avx2/keccak4x/KeccakP-1600-times4-SnP.h +++ /dev/null @@ -1,50 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _KeccakP_1600_times4_SnP_h_ -#define _KeccakP_1600_times4_SnP_h_ - -/** For the documentation, see PlSnP-documentation.h. - */ - -#include "SIMD256-config.h" - -#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" -#define KeccakP1600times4_statesSizeInBytes 800 -#define KeccakP1600times4_statesAlignment 32 -#define KeccakF1600times4_FastLoop_supported -#define KeccakP1600times4_12rounds_FastLoop_supported - -#include - -#define KeccakP1600times4_StaticInitialize() -void KeccakP1600times4_InitializeAll(void *states); -#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ - ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); -void KeccakP1600times4_PermuteAll_12rounds(void *states); -void KeccakP1600times4_PermuteAll_24rounds(void *states); -void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); -size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); -size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, 
const unsigned char *data, size_t dataByteLen); - -#endif diff --git a/shake256-avx2/keccak4x/KeccakP-1600-unrolling.macros b/shake256-avx2/keccak4x/KeccakP-1600-unrolling.macros deleted file mode 100644 index 3180bb06..00000000 --- a/shake256-avx2/keccak4x/KeccakP-1600-unrolling.macros +++ /dev/null @@ -1,198 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#if (defined(FullUnrolling)) -#define rounds24 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(11, E, A) \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 12) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=12) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - 
thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 6) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (Unrolling == 4) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (Unrolling == 3) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#elif (Unrolling == 2) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#elif (Unrolling == 1) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#else -#error "Unrolling is not correctly specified!" 
-#endif - -#define roundsN(__nrounds) \ - prepareTheta \ - i = 24 - (__nrounds); \ - if ((i&1) != 0) { \ - thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - copyStateVariables(A, E) \ - ++i; \ - } \ - for( /* empty */; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } diff --git a/shake256-avx2/keccak4x/SIMD256-config.h b/shake256-avx2/keccak4x/SIMD256-config.h deleted file mode 100644 index 1c65fe29..00000000 --- a/shake256-avx2/keccak4x/SIMD256-config.h +++ /dev/null @@ -1,3 +0,0 @@ -#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled" -#define KeccakP1600times4_fullUnrolling -#define KeccakP1600times4_useAVX2 diff --git a/shake256-avx2/keccak4x/align.h b/shake256-avx2/keccak4x/align.h deleted file mode 100644 index e29771ed..00000000 --- a/shake256-avx2/keccak4x/align.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _align_h_ -#define _align_h_ - -/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */ -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -#endif diff --git a/shake256-avx2/keccak4x/brg_endian.h b/shake256-avx2/keccak4x/brg_endian.h deleted file mode 100644 index 7226eb3b..00000000 --- a/shake256-avx2/keccak4x/brg_endian.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The redistribution and use of this software (with or without changes) - is allowed without the payment of fees or royalties provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. 
- --------------------------------------------------------------------------- - Issue Date: 20/12/2007 - Changes for ARM 9/9/2010 -*/ - -#ifndef _BRG_ENDIAN_H -#define _BRG_ENDIAN_H - -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ - -#if 0 -/* Include files where endian defines and byteswap functions may reside */ -#if defined( __sun ) -# include -#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) -# include -#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ - defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) -# include -#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) -# if !defined( __MINGW32__ ) && !defined( _AIX ) -# include -# if !defined( __BEOS__ ) -# include -# endif -# endif -#endif -#endif - -/* Now attempt to set the define for platform byte order using any */ -/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ -/* seem to encompass most endian symbol definitions */ - -#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) -# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) -# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( _BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( _LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) -# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) -# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -/* if the platform byte order could not be determined, then try to */ -/* set this define using common machine defines */ -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || 
defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ - defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif defined(__arm__) -# ifdef __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# else -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif 1 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order -#endif - -#endif - -#endif diff --git a/shake256-avx2/thash_shake256_robustx4.c b/shake256-avx2/thash_shake256_robustx4.c index 483987b9..4a26b756 100644 --- a/shake256-avx2/thash_shake256_robustx4.c +++ b/shake256-avx2/thash_shake256_robustx4.c @@ -1,13 +1,14 @@ #include +#include #include +#include #include "thashx4.h" #include "address.h" #include "params.h" -#include "fips202x4.h" +#include "f1600x4.h" -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); static uint32_t swap32(uint32_t val) { val = ((val << 8) & 0xFF00FF00 ) | ((val >> 8) & 0xFF00FF ); @@ -64,7 +65,7 @@ void thashx4(unsigned char *out0, __m256i state2[25]; memcpy(state2, state, 800); - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); /* By copying from state, state2 already contains the pub_seed * and addres. We just need to copy in the input blocks xorred with @@ -91,7 +92,7 @@ void thashx4(unsigned char *out0, _mm256_set1_epi64x(0x1f) ); - KeccakP1600times4_PermuteAll_24rounds(&state2[0]); + f1600x4AVX2((uint64_t*)&state2[0], &keccak_rc[0]); for (int i = 0; i < SPX_N/8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state2[i], 0); @@ -136,7 +137,7 @@ void thashx4(unsigned char *out0, __m256i state2[25]; memcpy(state2, state, 800); - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); /* We will won't be able to fit all input in on go. * By copying from state, state2 already contains the pub_seed @@ -154,7 +155,7 @@ void thashx4(unsigned char *out0, ); } - KeccakP1600times4_PermuteAll_24rounds(&state2[0]); + f1600x4AVX2((uint64_t*)&state2[0], &keccak_rc[0]); /* Final input. 
*/ for (unsigned int i = 0; i < 3+8*(inblocks-1); i++) { @@ -177,7 +178,7 @@ void thashx4(unsigned char *out0, _mm256_set1_epi64x(0x1f)); state2[16] = _mm256_xor_si256(state2[16], _mm256_set1_epi64x(0x80ll << 56)); - KeccakP1600times4_PermuteAll_24rounds(&state2[0]); + f1600x4AVX2((uint64_t*)&state2[0], &keccak_rc[0]); for (int i = 0; i < 8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state2[i], 0); @@ -186,36 +187,6 @@ void thashx4(unsigned char *out0, ((int64_t*)out3)[i] = _mm256_extract_epi64(state2[i], 3); } } else { - unsigned char buf0[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf1[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf2[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf3[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char bitmask0[inblocks * SPX_N]; - unsigned char bitmask1[inblocks * SPX_N]; - unsigned char bitmask2[inblocks * SPX_N]; - unsigned char bitmask3[inblocks * SPX_N]; - unsigned int i; - - memcpy(buf0, pub_seed, SPX_N); - memcpy(buf1, pub_seed, SPX_N); - memcpy(buf2, pub_seed, SPX_N); - memcpy(buf3, pub_seed, SPX_N); - addr_to_bytes(buf0 + SPX_N, addrx4 + 0*8); - addr_to_bytes(buf1 + SPX_N, addrx4 + 1*8); - addr_to_bytes(buf2 + SPX_N, addrx4 + 2*8); - addr_to_bytes(buf3 + SPX_N, addrx4 + 3*8); - - shake256x4(bitmask0, bitmask1, bitmask2, bitmask3, inblocks * SPX_N, - buf0, buf1, buf2, buf3, SPX_N + SPX_ADDR_BYTES); - - for (i = 0; i < inblocks * SPX_N; i++) { - buf0[SPX_N + SPX_ADDR_BYTES + i] = in0[i] ^ bitmask0[i]; - buf1[SPX_N + SPX_ADDR_BYTES + i] = in1[i] ^ bitmask1[i]; - buf2[SPX_N + SPX_ADDR_BYTES + i] = in2[i] ^ bitmask2[i]; - buf3[SPX_N + SPX_ADDR_BYTES + i] = in3[i] ^ bitmask3[i]; - } - - shake256x4(out0, out1, out2, out3, SPX_N, - buf0, buf1, buf2, buf3, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + assert(0); } } diff --git a/shake256-avx2/thash_shake256_simplex4.c b/shake256-avx2/thash_shake256_simplex4.c index ef0b0cb2..257eef56 100644 --- a/shake256-avx2/thash_shake256_simplex4.c +++ b/shake256-avx2/thash_shake256_simplex4.c @@ -1,13 +1,12 @@ #include #include +#include #include "thashx4.h" +#include "f1600x4.h" #include "address.h" #include "params.h" -#include "fips202x4.h" - -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); static uint32_t swap32(uint32_t val) { val = ((val << 8) & 0xFF00FF00 ) | ((val >> 8) & 0xFF00FF ); @@ -70,7 +69,7 @@ void thashx4(unsigned char *out0, state[i] = _mm256_set1_epi64x(0); } - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); for (int i = 0; i < SPX_N/8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state[i], 0); @@ -112,7 +111,7 @@ void thashx4(unsigned char *out0, ); } - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); /* Final input.
*/ for (unsigned int i = 0; i < 3+8*(inblocks-1); i++) { @@ -132,7 +131,7 @@ void thashx4(unsigned char *out0, _mm256_set1_epi64x(0x1f)); state[16] = _mm256_xor_si256(state[16], _mm256_set1_epi64x(0x80ll << 56)); - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); for (int i = 0; i < 8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state[i], 0); @@ -141,25 +140,6 @@ void thashx4(unsigned char *out0, ((int64_t*)out3)[i] = _mm256_extract_epi64(state[i], 3); } } else { - unsigned char buf0[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf1[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf2[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf3[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - - memcpy(buf0, pub_seed, SPX_N); - memcpy(buf1, pub_seed, SPX_N); - memcpy(buf2, pub_seed, SPX_N); - memcpy(buf3, pub_seed, SPX_N); - addr_to_bytes(buf0 + SPX_N, addrx4 + 0*8); - addr_to_bytes(buf1 + SPX_N, addrx4 + 1*8); - addr_to_bytes(buf2 + SPX_N, addrx4 + 2*8); - addr_to_bytes(buf3 + SPX_N, addrx4 + 3*8); - memcpy(buf0 + SPX_N + SPX_ADDR_BYTES, in0, inblocks * SPX_N); - memcpy(buf1 + SPX_N + SPX_ADDR_BYTES, in1, inblocks * SPX_N); - memcpy(buf2 + SPX_N + SPX_ADDR_BYTES, in2, inblocks * SPX_N); - memcpy(buf3 + SPX_N + SPX_ADDR_BYTES, in3, inblocks * SPX_N); - - shake256x4(out0, out1, out2, out3, SPX_N, - buf0, buf1, buf2, buf3, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + assert(0); } }
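Note on the call sites changed above: the patch replaces the Keccak-team routine KeccakP1600times4_PermuteAll_24rounds with the PeachPy-generated f1600x4AVX2, which takes the interleaved state as a uint64_t pointer plus a pointer to the round-constant table keccak_rc. The exact prototypes live in the new f1600x4.h, which is not reproduced in this diff, so the extern declarations in the minimal sketch below are assumptions that merely mirror the call sites; it is an illustration, not code from the patch.

#include <stdint.h>
#include <string.h>
#include <immintrin.h>

/* Assumed to match f1600x4.h from this patch (not shown in the diff). */
extern uint64_t keccak_rc[24];                        /* Keccak round constants */
extern void f1600x4AVX2(uint64_t *state, const uint64_t *rc);

int main(void) {
    /* Four Keccak-f[1600] states interleaved lane by lane: state[i] holds
     * lane i of all four instances, the same layout as the __m256i state[25]
     * buffers in thash_shake256_robustx4.c and thash_shake256_simplex4.c. */
    __m256i state[25];
    memset(state, 0, sizeof state);

    /* Permute all four states at once (24 rounds of Keccak-f[1600]). */
    f1600x4AVX2((uint64_t *)&state[0], &keccak_rc[0]);

    return 0;
}

Building such a test requires linking against the f1600x4.s object produced by the new Makefile rules. The cast to uint64_t* works because each __m256i entry is four contiguous 64-bit lanes, one per instance, and the array is 32-byte aligned, which the vmovdqa loads in f1600x4.S require.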