diff --git a/shake256-avx2/.gitignore b/shake256-avx2/.gitignore index 20d83ac7..d5c31c75 100644 --- a/shake256-avx2/.gitignore +++ b/shake256-avx2/.gitignore @@ -3,4 +3,5 @@ test/* PQCsignKAT_*.rsp PQCsignKAT_*.req PQCgenKAT_sign -keccak4x/KeccakP-1600-times4-SIMD256.o \ No newline at end of file +keccak4x/KeccakP-1600-times4-SIMD256.o +*.s diff --git a/shake256-avx2/Makefile b/shake256-avx2/Makefile index cb2e7c5d..1c42e182 100644 --- a/shake256-avx2/Makefile +++ b/shake256-avx2/Makefile @@ -3,8 +3,8 @@ CFLAGS = -Wall -Wextra -Wpedantic -O3 -std=c99 -march=native -fomit-frame-pointe THASH = robust -SOURCES = hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c fips202x4.c keccak4x/KeccakP-1600-times4-SIMD256.o -HEADERS = params.h hash.h hashx4.h thash.h thashx4.h address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h fips202x4.h +SOURCES = hash_shake256.c hash_shake256x4.c thash_shake256_$(THASH).c thash_shake256_$(THASH)x4.c address.c randombytes.c wots.c utils.c utilsx4.c fors.c sign.c fips202.c f1600x4.c f1600x4.s +HEADERS = params.h hash.h hashx4.h thash.h thashx4.h address.h randombytes.h wots.h utils.h utilsx4.h fors.h api.h fips202.h f1600x4.h DET_SOURCES = $(SOURCES:randombytes.%=rng.%) DET_HEADERS = $(HEADERS:randombytes.%=rng.%) @@ -39,16 +39,7 @@ test/%: test/%.c $(SOURCES) $(HEADERS) test/%.exec: test/% @$< -keccak4x/KeccakP-1600-times4-SIMD256.o: keccak4x/align.h \ - keccak4x/brg_endian.h \ - keccak4x/KeccakP-1600-times4-SIMD256.c \ - keccak4x/KeccakP-1600-times4-SnP.h \ - keccak4x/KeccakP-1600-unrolling.macros \ - keccak4x/SIMD256-config.h - $(CC) $(CFLAGS) -c keccak4x/KeccakP-1600-times4-SIMD256.c -o $@ - clean: - -$(RM) keccak4x/KeccakP-1600-times4-SIMD256.o -$(RM) $(TESTS) -$(RM) $(BENCHMARK) -$(RM) PQCgenKAT_sign diff --git a/shake256-avx2/f1600x4.S b/shake256-avx2/f1600x4.S new file mode 100644 index 00000000..c3d6a61a --- /dev/null +++ b/shake256-avx2/f1600x4.S @@ -0,0 +1,901 @@ +# Generated by PeachPy 0.2.0 from test.py + + +#ifdef __APPLE__ +.section __TEXT,__text,regular,pure_instructions +.globl _f1600x4AVX2 +.p2align 4, 0x90 +_f1600x4AVX2: +#else /* !__APPLE__ */ +.text +.p2align 4,,15 +.globl f1600x4AVX2 +.type f1600x4AVX2, @function +f1600x4AVX2: +#endif /* !__APPLE */ + movq $6, %rax +17: # loop.begin: + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + 
vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 192(%rdi), %ymm4, %ymm9 + vpxor 384(%rdi), %ymm3, %ymm10 + vpxor 576(%rdi), %ymm2, %ymm11 + vpxor 768(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 0(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 768(%rdi) + vpxor 96(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 320(%rdi), %ymm5, %ymm10 + vpxor 512(%rdi), %ymm4, %ymm11 + vpxor 704(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 32(%rdi), %ymm4, %ymm8 + vpxor 224(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 608(%rdi), %ymm1, %ymm11 + vpxor 640(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 32(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 608(%rdi) + vpxor 128(%rdi), %ymm1, %ymm8 + vpxor 160(%rdi), %ymm5, %ymm9 + vpxor 352(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 736(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + 
vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 128(%rdi) + vpxor 64(%rdi), %ymm3, %ymm8 + vpxor 256(%rdi), %ymm2, %ymm9 + vpxor 448(%rdi), %ymm1, %ymm10 + vpxor 480(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 448(%rdi) + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 512(%rdi), %ymm4, %ymm9 + vpxor 224(%rdi), %ymm3, %ymm10 + vpxor 736(%rdi), %ymm2, %ymm11 + vpxor 448(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor 
%ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 8(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 448(%rdi) + vpxor 576(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 640(%rdi), %ymm5, %ymm10 + vpxor 352(%rdi), %ymm4, %ymm11 + vpxor 64(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 192(%rdi), %ymm4, %ymm8 + vpxor 704(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 128(%rdi), %ymm1, %ymm11 + vpxor 480(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 128(%rdi) + vpxor 768(%rdi), %ymm1, %ymm8 + vpxor 320(%rdi), %ymm5, %ymm9 + vpxor 32(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 256(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 32(%rdi) + 
vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 768(%rdi) + vpxor 384(%rdi), %ymm3, %ymm8 + vpxor 96(%rdi), %ymm2, %ymm9 + vpxor 608(%rdi), %ymm1, %ymm10 + vpxor 160(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 608(%rdi) + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 352(%rdi), %ymm4, %ymm9 + vpxor 704(%rdi), %ymm3, %ymm10 + vpxor 256(%rdi), %ymm2, %ymm11 + vpxor 608(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 16(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 608(%rdi) + vpxor 736(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 480(%rdi), %ymm5, 
%ymm10 + vpxor 32(%rdi), %ymm4, %ymm11 + vpxor 384(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 32(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 512(%rdi), %ymm4, %ymm8 + vpxor 64(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 768(%rdi), %ymm1, %ymm11 + vpxor 160(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 768(%rdi) + vpxor 448(%rdi), %ymm1, %ymm8 + vpxor 640(%rdi), %ymm5, %ymm9 + vpxor 192(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 96(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 448(%rdi) + vpxor 224(%rdi), %ymm3, %ymm8 + vpxor 576(%rdi), %ymm2, %ymm9 + vpxor 128(%rdi), %ymm1, %ymm10 + vpxor 320(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor 
%ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 128(%rdi) + vmovdqa 0(%rdi), %ymm8 + vmovdqa 32(%rdi), %ymm9 + vmovdqa 64(%rdi), %ymm10 + vmovdqa 96(%rdi), %ymm11 + vmovdqa 128(%rdi), %ymm12 + vpxor 160(%rdi), %ymm8, %ymm8 + vpxor 192(%rdi), %ymm9, %ymm9 + vpxor 224(%rdi), %ymm10, %ymm10 + vpxor 256(%rdi), %ymm11, %ymm11 + vpxor 288(%rdi), %ymm12, %ymm12 + vpxor 320(%rdi), %ymm8, %ymm8 + vpxor 352(%rdi), %ymm9, %ymm9 + vpxor 384(%rdi), %ymm10, %ymm10 + vpxor 416(%rdi), %ymm11, %ymm11 + vpxor 448(%rdi), %ymm12, %ymm12 + vpxor 480(%rdi), %ymm8, %ymm8 + vpxor 512(%rdi), %ymm9, %ymm9 + vpxor 544(%rdi), %ymm10, %ymm10 + vpxor 576(%rdi), %ymm11, %ymm11 + vpxor 608(%rdi), %ymm12, %ymm12 + vpxor 640(%rdi), %ymm8, %ymm8 + vpxor 672(%rdi), %ymm9, %ymm9 + vpxor 704(%rdi), %ymm10, %ymm10 + vpxor 736(%rdi), %ymm11, %ymm11 + vpxor 768(%rdi), %ymm12, %ymm12 + vpsllq $1, %ymm9, %ymm13 + vpsllq $1, %ymm10, %ymm14 + vpsllq $1, %ymm11, %ymm15 + vpsllq $1, %ymm12, %ymm7 + vpsllq $1, %ymm8, %ymm6 + vpsrlq $63, %ymm9, %ymm5 + vpsrlq $63, %ymm10, %ymm4 + vpsrlq $63, %ymm11, %ymm3 + vpsrlq $63, %ymm12, %ymm2 + vpsrlq $63, %ymm8, %ymm1 + vpor %ymm13, %ymm5, %ymm5 + vpor %ymm14, %ymm4, %ymm4 + vpor %ymm15, %ymm3, %ymm3 + vpor %ymm7, %ymm2, %ymm2 + vpor %ymm6, %ymm1, %ymm1 + vpxor %ymm5, %ymm12, %ymm5 + vpxor %ymm4, %ymm8, %ymm4 + vpxor %ymm3, %ymm9, %ymm3 + vpxor %ymm2, %ymm10, %ymm2 + vpxor %ymm1, %ymm11, %ymm1 + vpxor 0(%rdi), %ymm5, %ymm8 + vpxor 32(%rdi), %ymm4, %ymm9 + vpxor 64(%rdi), %ymm3, %ymm10 + vpxor 96(%rdi), %ymm2, %ymm11 + vpxor 128(%rdi), %ymm1, %ymm12 + vpsllq $44, %ymm9, %ymm14 + vpsllq $43, %ymm10, %ymm15 + vpsllq $21, %ymm11, %ymm7 + vpsllq $14, %ymm12, %ymm6 + vpsrlq $20, %ymm9, %ymm9 + vpsrlq $21, %ymm10, %ymm10 + vpsrlq $43, %ymm11, %ymm11 + vpsrlq $50, %ymm12, %ymm12 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vpbroadcastq 24(%rsi), %ymm8 + vpxor %ymm8, %ymm13, %ymm13 + vmovdqa %ymm13, 0(%rdi) + vmovdqa %ymm14, 32(%rdi) + vmovdqa %ymm15, 64(%rdi) + vmovdqa %ymm7, 96(%rdi) + vmovdqa %ymm6, 128(%rdi) + vpxor 256(%rdi), %ymm2, %ymm8 + vpxor 288(%rdi), %ymm1, %ymm9 + vpxor 160(%rdi), %ymm5, %ymm10 + vpxor 192(%rdi), %ymm4, %ymm11 + vpxor 224(%rdi), %ymm3, %ymm12 + vpsllq $28, %ymm8, %ymm13 + vpsllq $20, %ymm9, %ymm14 + vpsllq $3, %ymm10, %ymm15 + vpsllq $45, %ymm11, %ymm7 + vpsllq $61, %ymm12, %ymm6 + vpsrlq $36, %ymm8, %ymm8 + vpsrlq $44, %ymm9, %ymm9 + vpsrlq $61, %ymm10, %ymm10 + vpsrlq $19, %ymm11, %ymm11 + vpsrlq $3, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 
+ vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 160(%rdi) + vmovdqa %ymm14, 192(%rdi) + vmovdqa %ymm15, 224(%rdi) + vmovdqa %ymm7, 256(%rdi) + vmovdqa %ymm6, 288(%rdi) + vpxor 352(%rdi), %ymm4, %ymm8 + vpxor 384(%rdi), %ymm3, %ymm9 + vpxor 416(%rdi), %ymm2, %ymm10 + vpxor 448(%rdi), %ymm1, %ymm11 + vpxor 320(%rdi), %ymm5, %ymm12 + vpsllq $1, %ymm8, %ymm13 + vpsllq $6, %ymm9, %ymm14 + vpsllq $25, %ymm10, %ymm15 + vpsllq $8, %ymm11, %ymm7 + vpsllq $18, %ymm12, %ymm6 + vpsrlq $63, %ymm8, %ymm8 + vpsrlq $58, %ymm9, %ymm9 + vpsrlq $39, %ymm10, %ymm10 + vpsrlq $56, %ymm11, %ymm11 + vpsrlq $46, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 320(%rdi) + vmovdqa %ymm14, 352(%rdi) + vmovdqa %ymm15, 384(%rdi) + vmovdqa %ymm7, 416(%rdi) + vmovdqa %ymm6, 448(%rdi) + vpxor 608(%rdi), %ymm1, %ymm8 + vpxor 480(%rdi), %ymm5, %ymm9 + vpxor 512(%rdi), %ymm4, %ymm10 + vpxor 544(%rdi), %ymm3, %ymm11 + vpxor 576(%rdi), %ymm2, %ymm12 + vpsllq $27, %ymm8, %ymm13 + vpsllq $36, %ymm9, %ymm14 + vpsllq $10, %ymm10, %ymm15 + vpsllq $15, %ymm11, %ymm7 + vpsllq $56, %ymm12, %ymm6 + vpsrlq $37, %ymm8, %ymm8 + vpsrlq $28, %ymm9, %ymm9 + vpsrlq $54, %ymm10, %ymm10 + vpsrlq $49, %ymm11, %ymm11 + vpsrlq $8, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 480(%rdi) + vmovdqa %ymm14, 512(%rdi) + vmovdqa %ymm15, 544(%rdi) + vmovdqa %ymm7, 576(%rdi) + vmovdqa %ymm6, 608(%rdi) + vpxor 704(%rdi), %ymm3, %ymm8 + vpxor 736(%rdi), %ymm2, %ymm9 + vpxor 768(%rdi), %ymm1, %ymm10 + vpxor 640(%rdi), %ymm5, %ymm11 + vpxor 672(%rdi), %ymm4, %ymm12 + vpsllq $62, %ymm8, %ymm13 + vpsllq $55, %ymm9, %ymm14 + vpsllq $39, %ymm10, %ymm15 + vpsllq $41, %ymm11, %ymm7 + vpsllq $2, %ymm12, %ymm6 + vpsrlq $2, %ymm8, %ymm8 + vpsrlq $9, %ymm9, %ymm9 + vpsrlq $25, %ymm10, %ymm10 + vpsrlq $23, %ymm11, %ymm11 + vpsrlq $62, %ymm12, %ymm12 + vpor %ymm13, %ymm8, %ymm8 + vpor %ymm14, %ymm9, %ymm9 + vpor %ymm15, %ymm10, %ymm10 + vpor %ymm7, %ymm11, %ymm11 + vpor %ymm6, %ymm12, %ymm12 + vpandn %ymm10, %ymm9, %ymm13 + vpandn %ymm11, %ymm10, %ymm14 + vpandn %ymm12, %ymm11, %ymm15 + vpandn %ymm8, %ymm12, %ymm7 + vpandn %ymm9, %ymm8, %ymm6 + vpxor %ymm8, %ymm13, %ymm13 + vpxor %ymm9, %ymm14, %ymm14 + vpxor %ymm10, %ymm15, %ymm15 + vpxor %ymm11, %ymm7, %ymm7 + vpxor %ymm12, %ymm6, %ymm6 + vmovdqa %ymm13, 640(%rdi) + vmovdqa %ymm14, 672(%rdi) + vmovdqa %ymm15, 704(%rdi) + vmovdqa %ymm7, 736(%rdi) + vmovdqa %ymm6, 768(%rdi) + addq $32, %rsi + subq $1, %rax + jnz 17b # loop.begin + vzeroupper + ret +#ifndef __APPLE__ +.size f1600x4AVX2, .-f1600x4AVX2 +#endif /* !__APPLE__ */ diff --git 
a/shake256-avx2/f1600x4.c b/shake256-avx2/f1600x4.c
new file mode 100644
index 00000000..5b24408b
--- /dev/null
+++ b/shake256-avx2/f1600x4.c
@@ -0,0 +1,28 @@
+#include <stdint.h>
+
+uint64_t keccak_rc[24] = {
+    0x0000000000000001,
+    0x0000000000008082,
+    0x800000000000808A,
+    0x8000000080008000,
+    0x000000000000808B,
+    0x0000000080000001,
+    0x8000000080008081,
+    0x8000000000008009,
+    0x000000000000008A,
+    0x0000000000000088,
+    0x0000000080008009,
+    0x000000008000000A,
+    0x000000008000808B,
+    0x800000000000008B,
+    0x8000000000008089,
+    0x8000000000008003,
+    0x8000000000008002,
+    0x8000000000000080,
+    0x000000000000800A,
+    0x800000008000000A,
+    0x8000000080008081,
+    0x8000000000008080,
+    0x0000000080000001,
+    0x8000000080008008
+};
diff --git a/shake256-avx2/f1600x4.h b/shake256-avx2/f1600x4.h
new file mode 100644
index 00000000..ff4785e6
--- /dev/null
+++ b/shake256-avx2/f1600x4.h
@@ -0,0 +1,4 @@
+#pragma once
+
+extern void f1600x4AVX2(uint64_t *s, uint64_t *rc);
+extern uint64_t keccak_rc[24];
diff --git a/shake256-avx2/f1600x4.py b/shake256-avx2/f1600x4.py
new file mode 100644
index 00000000..98a4a68f
--- /dev/null
+++ b/shake256-avx2/f1600x4.py
@@ -0,0 +1,79 @@
+from peachpy import *
+from peachpy.x86_64 import *
+
+stateArg = Argument(ptr(uint64_t))
+rcArg = Argument(ptr(uint64_t))
+with Function("f1600x4AVX2", (stateArg, rcArg), target=uarch.haswell) as function:
+    statePtr = GeneralPurposeRegister64()
+    rcPtr = GeneralPurposeRegister64()
+    superRound = GeneralPurposeRegister64()
+
+    LOAD.ARGUMENT(statePtr, stateArg)
+    LOAD.ARGUMENT(rcPtr, rcArg)
+
+    MOV(superRound, 6)
+
+    def state(offset):
+        return [statePtr + 32*offset]
+
+    with Loop() as loop:
+        for r in range(4):
+            p = [YMMRegister() for i in range(5)]
+            for i in range(5): VMOVDQA(p[i], state(i))
+            for j in range(1, 5):
+                for i in range(5): VPXOR(p[i], p[i], state(5*j+i))
+
+            t = [YMMRegister() for i in range(5)]
+            d = [YMMRegister() for i in range(5)]
+
+            for i in range(5): VPSLLQ(t[i], p[(i+1)%5], 1)
+            for i in range(5): VPSRLQ(d[i], p[(i+1)%5], 63)
+            for i in range(5): VPOR(d[i], d[i], t[i])
+            for i in range(5): VPXOR(d[i], p[(i+4)%5], d[i])
+
+            def rot(i, g):
+                table = [[0, 24, 18, 6, 12],
+                         [7, 23, 2, 9, 22],
+                         [1, 3, 17, 16, 20],
+                         [13, 8, 4, 5, 15],
+                         [19, 10, 21, 14, 11]]
+                t = table[g][i]
+                return ((t + 1) * t // 2) % 64
+
+            def di(i, g):
+                return (3*g + i) % 5
+            def si(i, g, r):
+                n = [6, 16, 11, 1][r]
+                m = [10, 20, 15, 5][r]
+                return (i*n + m*g) % 25
+
+            for g in range(5):
+                s = [YMMRegister() for i in range(5)]
+                for i in range(5):
+                    VPXOR(s[i], d[di(i, g)], state(si(di(i, g), g, r)))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPSLLQ(t[i], s[i], rot(i, g))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPSRLQ(s[i], s[i], 64-rot(i, g))
+                for i in range(5):
+                    if rot(i, g) != 0:
+                        VPOR(s[i], s[i], t[i])
+                for i in range(5): VPANDN(t[i], s[(i+1)%5], s[(i+2)%5])
+                for i in range(5): VPXOR(t[i], t[i], s[i])
+
+                if g == 0:
+                    rc = YMMRegister()
+                    VPBROADCASTQ(rc, [rcPtr + r*8])
+                    VPXOR(t[0], t[0], rc)
+                for i in range(5):
+                    VMOVDQA(state(si(i, g, r)), t[i])
+
+        ADD(rcPtr, 8*4)
+        SUB(superRound, 1)
+        JNZ(loop.begin)
+
+    RETURN ()
+
+
+
diff --git a/shake256-avx2/fips202x4.c b/shake256-avx2/fips202x4.c
deleted file mode 100644
index d3a22769..00000000
--- a/shake256-avx2/fips202x4.c
+++ /dev/null
@@ -1,223 +0,0 @@
-#include 
-#include 
-#include 
-#include "fips202.h"
-
-#define NROUNDS 24
-#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset)))
-
-static uint64_t load64(const unsigned char *x)
-{
-  unsigned long long r = 0, i;
-
-  for (i = 0; i < 8; ++i) {
-    r |=
(unsigned long long)x[i] << 8 * i; - } - return r; -} - -static void store64(uint8_t *x, uint64_t u) -{ - unsigned int i; - - for(i=0; i<8; ++i) { - x[i] = u; - u >>= 8; - } -} - -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, - unsigned int r, - const unsigned char *m0, - const unsigned char *m1, - const unsigned char *m2, - const unsigned char *m3, - unsigned long long int mlen, - unsigned char p) -{ - unsigned long long i; - unsigned char t0[200]; - unsigned char t1[200]; - unsigned char t2[200]; - unsigned char t3[200]; - - unsigned long long *ss = (unsigned long long *)s; - - - while (mlen >= r) - { - for (i = 0; i < r / 8; ++i) - { - ss[4*i+0] ^= load64(m0 + 8 * i); - ss[4*i+1] ^= load64(m1 + 8 * i); - ss[4*i+2] ^= load64(m2 + 8 * i); - ss[4*i+3] ^= load64(m3 + 8 * i); - } - - KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; - } - - for (i = 0; i < r; ++i) - { - t0[i] = 0; - t1[i] = 0; - t2[i] = 0; - t3[i] = 0; - } - for (i = 0; i < mlen; ++i) - { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; - } - - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) - { - ss[4*i+0] ^= load64(t0 + 8 * i); - ss[4*i+1] ^= load64(t1 + 8 * i); - ss[4*i+2] ^= load64(t2 + 8 * i); - ss[4*i+3] ^= load64(t3 + 8 * i); - } -} - - -static void keccak_squeezeblocks4x(unsigned char *h0, - unsigned char *h1, - unsigned char *h2, - unsigned char *h3, - unsigned long long int nblocks, - __m256i *s, - unsigned int r) -{ - unsigned int i; - - unsigned long long *ss = (unsigned long long *)s; - - while(nblocks > 0) - { - KeccakF1600_StatePermute4x(s); - for(i=0;i<(r>>3);i++) - { - store64(h0+8*i, ss[4*i+0]); - store64(h1+8*i, ss[4*i+1]); - store64(h2+8*i, ss[4*i+2]); - store64(h3+8*i, ss[4*i+3]); - } - h0 += r; - h1 += r; - h2 += r; - h3 += r; - nblocks--; - } -} - - - -void shake128x4(unsigned char *out0, - unsigned char *out1, - unsigned char *out2, - unsigned char *out3, unsigned long long outlen, - unsigned char *in0, - unsigned char *in1, - unsigned char *in2, - unsigned char *in3, unsigned long long inlen) -{ - __m256i s[25]; - unsigned char t0[SHAKE128_RATE]; - unsigned char t1[SHAKE128_RATE]; - unsigned char t2[SHAKE128_RATE]; - unsigned char t3[SHAKE128_RATE]; - unsigned int i; - - /* zero state */ - for(i=0;i<25;i++) - s[i] = _mm256_xor_si256(s[i], s[i]); - - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); - - /* Squeeze output */ - keccak_squeezeblocks4x(out0, out1, out2, out3, outlen/SHAKE128_RATE, s, SHAKE128_RATE); - - out0 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - out1 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - out2 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - out3 += (outlen/SHAKE128_RATE)*SHAKE128_RATE; - - if(outlen%SHAKE128_RATE) - { - keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE128_RATE); - for(i=0;i - -void shake128x4(unsigned char *out0, - unsigned char *out1, - unsigned char *out2, - unsigned char *out3, unsigned long long outlen, - unsigned char *in0, - unsigned char *in1, - unsigned char *in2, - unsigned char *in3, unsigned long long inlen); - -void shake256x4(unsigned char *out0, - unsigned char *out1, - unsigned char *out2, - unsigned char *out3, unsigned 
long long outlen, - unsigned char *in0, - unsigned char *in1, - unsigned char *in2, - unsigned char *in3, unsigned long long inlen); - -#endif diff --git a/shake256-avx2/hash_shake256x4.c b/shake256-avx2/hash_shake256x4.c index 105d1ed8..fcf0df47 100644 --- a/shake256-avx2/hash_shake256x4.c +++ b/shake256-avx2/hash_shake256x4.c @@ -1,12 +1,10 @@ #include #include +#include #include "address.h" #include "params.h" -#include "fips202x4.h" -#include "hashx4.h" - -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); +#include "f1600x4.h" /* Swap endianess */ static uint32_t swap32(uint32_t val) { @@ -55,7 +53,7 @@ void prf_addrx4(unsigned char *out0, state[i] = _mm256_set1_epi64x(0); } - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); for (int i = 0; i < SPX_N/8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state[i], 0); diff --git a/shake256-avx2/keccak4x/KeccakP-1600-times4-SIMD256.c b/shake256-avx2/keccak4x/KeccakP-1600-times4-SIMD256.c deleted file mode 100644 index 7a0428fb..00000000 --- a/shake256-avx2/keccak4x/KeccakP-1600-times4-SIMD256.c +++ /dev/null @@ -1,1030 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#include -#include -#include -#include -#include -#include -#include -#include "align.h" -#include "KeccakP-1600-times4-SnP.h" -#include "SIMD256-config.h" - -#include "brg_endian.h" -#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) -#error Expecting a little-endian platform -#endif - -typedef unsigned char UINT8; -typedef unsigned long long int UINT64; -typedef __m128i V128; -typedef __m256i V256; - -#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex) - -#if defined(KeccakP1600times4_useAVX2) - #define ANDnu256(a, b) _mm256_andnot_si256(a, b) - #define CONST256(a) _mm256_load_si256((const V256 *)&(a)) - #define CONST256_64(a) (V256)_mm256_broadcast_sd((const double*)(&a)) - #define LOAD256(a) _mm256_load_si256((const V256 *)&(a)) - #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a)) - #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d)) - #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) - #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) - #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) -static const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; -static const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; - #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b) - #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b) - #define STORE2_128(ah, al, v) _mm256_storeu2_m128d((V128*)&(ah), (V128*)&(al), v) - #define XOR256(a, b) _mm256_xor_si256(a, b) - #define XOReq256(a, b) a = _mm256_xor_si256(a, b) - #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) - #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) - #define 
PERM128( a, b, c ) (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c) - #define SHUFFLE64( a, b, c ) (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c) - - #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \ - lanesH01 = UNPACKH( lanes0, lanes1 ), \ - lanesL23 = UNPACKL( lanes2, lanes3 ), \ - lanesH23 = UNPACKH( lanes2, lanes3 ), \ - lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \ - lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \ - lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \ - lanes3 = PERM128( lanesH01, lanesH23, 0x31 ) - - #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \ - lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \ - lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \ - lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \ - lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \ - lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \ - lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \ - lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F ) - -#endif - -#define SnP_laneLengthInBytes 8 - -void KeccakP1600times4_InitializeAll(void *states) -{ - memset(states, 0, KeccakP1600times4_statesSizeInBytes); -} - -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length) -{ - unsigned int sizeLeft = length; - unsigned int lanePosition = offset/SnP_laneLengthInBytes; - unsigned int offsetInLane = offset%SnP_laneLengthInBytes; - const unsigned char *curData = data; - UINT64 *statesAsLanes = (UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = 0; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64*)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - UINT64 lane = 0; - memcpy(&lane, curData, sizeLeft); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane; - } -} - -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) -{ - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - - #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - - #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - - if ( laneCount >= 16 ) { - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - 
if ( laneCount >= 20 ) { - Xor_In4( 16 ); - for(i=20; i 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - UINT64 lane = *((const UINT64*)curData); - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft); - } -} - -void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset) -{ - V256 *stateAsLanes = (V256 *)states; - unsigned int i; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - - #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - - #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - STORE256( stateAsLanes[argIndex+0], lanes0 ),\ - STORE256( stateAsLanes[argIndex+1], lanes1 ),\ - STORE256( stateAsLanes[argIndex+2], lanes2 ),\ - STORE256( stateAsLanes[argIndex+3], lanes3 ) - - if ( laneCount >= 16 ) { - OverWr4( 0 ); - OverWr4( 4 ); - OverWr4( 8 ); - OverWr4( 12 ); - if ( laneCount >= 20 ) { - OverWr4( 16 ); - for(i=20; i= SnP_laneLengthInBytes) { - statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - } - - if (sizeLeft > 0) { - memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft); - } -} - -void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length) -{ - unsigned int sizeLeft = length; - unsigned int lanePosition = offset/SnP_laneLengthInBytes; - unsigned int offsetInLane = offset%SnP_laneLengthInBytes; - unsigned char *curData = data; - const UINT64 *statesAsLanes = (const UINT64 *)states; - - if ((sizeLeft > 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane); - sizeLeft -= bytesInLane; - lanePosition++; - curData += bytesInLane; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curData += SnP_laneLengthInBytes; - } - - if (sizeLeft > 0) { - memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft); - } -} - -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int 
laneOffset) -{ - UINT64 *curData0 = (UINT64 *)data; - UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes); - UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes); - UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64*)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - - #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \ - curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \ - curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \ - curData3[argIndex] = stateAsLanes64[4*(argIndex)+3] - - #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \ - lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \ - lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \ - lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \ - UNINTLEAVE(), \ - STORE256u( curData0[argIndex], lanes0 ), \ - STORE256u( curData1[argIndex], lanes1 ), \ - STORE256u( curData2[argIndex], lanes2 ), \ - STORE256u( curData3[argIndex], lanes3 ) - - if ( laneCount >= 16 ) { - Extr4( 0 ); - Extr4( 4 ); - Extr4( 8 ); - Extr4( 12 ); - if ( laneCount >= 20 ) { - Extr4( 16 ); - for(i=20; i 0) && (offsetInLane != 0)) { - unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane; - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane); - if (bytesInLane > sizeLeft) - bytesInLane = sizeLeft; - sizeLeft -= bytesInLane; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while ( --bytesInLane != 0); - lanePosition++; - } - - while(sizeLeft >= SnP_laneLengthInBytes) { - *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - sizeLeft -= SnP_laneLengthInBytes; - lanePosition++; - curInput += SnP_laneLengthInBytes; - curOutput += SnP_laneLengthInBytes; - } - - if (sizeLeft != 0) { - UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)]; - do { - *(curOutput++) = *(curInput++) ^ (unsigned char)lane; - lane >>= 8; - } while ( --sizeLeft != 0); - } -} - -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset) -{ - const UINT64 *curInput0 = (UINT64 *)input; - const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes); - const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes); - const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes); - UINT64 *curOutput0 = (UINT64 *)output; - UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes); - UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes); - UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes); - - const V256 *stateAsLanes = (const V256 *)states; - const UINT64 *stateAsLanes64 = (const UINT64*)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - unsigned int i; - - #define ExtrXor( argIndex ) \ - curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\ - curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\ - curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\ - curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3] - - #define ExtrXor4( argIndex ) \ - lanes0 = LOAD256( 
stateAsLanes[argIndex+0] ),\ - lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\ - lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\ - lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\ - UNINTLEAVE(),\ - lanesL01 = LOAD256u( curInput0[argIndex]),\ - lanesH01 = LOAD256u( curInput1[argIndex]),\ - lanesL23 = LOAD256u( curInput2[argIndex]),\ - lanesH23 = LOAD256u( curInput3[argIndex]),\ - XOReq256( lanes0, lanesL01 ),\ - XOReq256( lanes1, lanesH01 ),\ - XOReq256( lanes2, lanesL23 ),\ - XOReq256( lanes3, lanesH23 ),\ - STORE256u( curOutput0[argIndex], lanes0 ),\ - STORE256u( curOutput1[argIndex], lanes1 ),\ - STORE256u( curOutput2[argIndex], lanes2 ),\ - STORE256u( curOutput3[argIndex], lanes3 ) - - if ( laneCount >= 16 ) { - ExtrXor4( 0 ); - ExtrXor4( 4 ); - ExtrXor4( 8 ); - ExtrXor4( 12 ); - if ( laneCount >= 20 ) { - ExtrXor4( 16 ); - for(i=20; i= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = (V256 *)states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - #define Xor_In( argIndex ) \ - XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - #define Xor_In4( argIndex ) \ - lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); - #undef Xor_In - #undef Xor_In4 - KeccakP1600times4_PermuteAll_24rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else -// unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - V256 *statesAsLanes = (V256 *)states; - declareABCDE - - copyFromState(A, statesAsLanes) - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - #define XOR_In( Xxx, argIndex ) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - XOR_In( Aba, 0 ); - XOR_In( Abe, 1 ); - XOR_In( Abi, 2 ); - XOR_In( Abo, 3 ); - XOR_In( Abu, 4 ); - XOR_In( Aga, 5 ); - XOR_In( Age, 6 ); - XOR_In( Agi, 7 ); - XOR_In( Ago, 8 ); - XOR_In( Agu, 9 ); - XOR_In( Aka, 10 ); - XOR_In( Ake, 11 ); - XOR_In( Aki, 12 ); - XOR_In( Ako, 13 ); - XOR_In( Aku, 14 ); - XOR_In( Ama, 15 ); - XOR_In( Ame, 16 ); - XOR_In( Ami, 17 ); - XOR_In( Amo, 18 ); - XOR_In( Amu, 19 ); - XOR_In( Asa, 20 ); - #undef XOR_In - rounds24 - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - copyToState(statesAsLanes, A) - return (const unsigned char *)curData0 - dataStart; -#endif - } - else { -// unsigned int i; - const unsigned char *dataStart = data; - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - 
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); - KeccakP1600times4_PermuteAll_24rounds(states); - data += laneOffsetSerial*8; - dataByteLen -= laneOffsetSerial*8; - } - return data - dataStart; - } -} - -size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen) -{ - if (laneCount == 21) { -#if 0 - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - V256 *stateAsLanes = states; - V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23; - #define Xor_In( argIndex ) \ - XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - #define Xor_In4( argIndex ) \ - lanes0 = LOAD256u( curData0[argIndex]),\ - lanes1 = LOAD256u( curData1[argIndex]),\ - lanes2 = LOAD256u( curData2[argIndex]),\ - lanes3 = LOAD256u( curData3[argIndex]),\ - INTLEAVE(),\ - XOReq256( stateAsLanes[argIndex+0], lanes0 ),\ - XOReq256( stateAsLanes[argIndex+1], lanes1 ),\ - XOReq256( stateAsLanes[argIndex+2], lanes2 ),\ - XOReq256( stateAsLanes[argIndex+3], lanes3 ) - Xor_In4( 0 ); - Xor_In4( 4 ); - Xor_In4( 8 ); - Xor_In4( 12 ); - Xor_In4( 16 ); - Xor_In( 20 ); - #undef Xor_In - #undef Xor_In4 - KeccakP1600times4_PermuteAll_12rounds(states); - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - return (const unsigned char *)curData0 - dataStart; -#else -// unsigned int i; - const unsigned char *dataStart = data; - const UINT64 *curData0 = (const UINT64 *)data; - const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes); - const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes); - const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes); - V256 *statesAsLanes = states; - declareABCDE - - copyFromState(A, statesAsLanes) - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - #define XOR_In( Xxx, argIndex ) \ - XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex])) - XOR_In( Aba, 0 ); - XOR_In( Abe, 1 ); - XOR_In( Abi, 2 ); - XOR_In( Abo, 3 ); - XOR_In( Abu, 4 ); - XOR_In( Aga, 5 ); - XOR_In( Age, 6 ); - XOR_In( Agi, 7 ); - XOR_In( Ago, 8 ); - XOR_In( Agu, 9 ); - XOR_In( Aka, 10 ); - XOR_In( Ake, 11 ); - XOR_In( Aki, 12 ); - XOR_In( Ako, 13 ); - XOR_In( Aku, 14 ); - XOR_In( Ama, 15 ); - XOR_In( Ame, 16 ); - XOR_In( Ami, 17 ); - XOR_In( Amo, 18 ); - XOR_In( Amu, 19 ); - XOR_In( Asa, 20 ); - #undef XOR_In - rounds12 - curData0 += laneOffsetSerial; - curData1 += laneOffsetSerial; - curData2 += laneOffsetSerial; - curData3 += laneOffsetSerial; - dataByteLen -= laneOffsetSerial*8; - } - copyToState(statesAsLanes, A) - return (const unsigned char *)curData0 - dataStart; -#endif - } - else { -// unsigned int i; - const unsigned char *dataStart = data; - - while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) { - 
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel); - KeccakP1600times4_PermuteAll_12rounds(states); - data += laneOffsetSerial*8; - dataByteLen -= laneOffsetSerial*8; - } - return data - dataStart; - } -} diff --git a/shake256-avx2/keccak4x/KeccakP-1600-times4-SnP.h b/shake256-avx2/keccak4x/KeccakP-1600-times4-SnP.h deleted file mode 100644 index 60338488..00000000 --- a/shake256-avx2/keccak4x/KeccakP-1600-times4-SnP.h +++ /dev/null @@ -1,50 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _KeccakP_1600_times4_SnP_h_ -#define _KeccakP_1600_times4_SnP_h_ - -/** For the documentation, see PlSnP-documentation.h. - */ - -#include "SIMD256-config.h" - -#define KeccakP1600times4_implementation "256-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" -#define KeccakP1600times4_statesSizeInBytes 800 -#define KeccakP1600times4_statesAlignment 32 -#define KeccakF1600times4_FastLoop_supported -#define KeccakP1600times4_12rounds_FastLoop_supported - -#include - -#define KeccakP1600times4_StaticInitialize() -void KeccakP1600times4_InitializeAll(void *states); -#define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \ - ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) -void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount); -void KeccakP1600times4_PermuteAll_12rounds(void *states); -void KeccakP1600times4_PermuteAll_24rounds(void *states); -void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset); -void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); -void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset); -size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); -size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, 
const unsigned char *data, size_t dataByteLen); - -#endif diff --git a/shake256-avx2/keccak4x/KeccakP-1600-unrolling.macros b/shake256-avx2/keccak4x/KeccakP-1600-unrolling.macros deleted file mode 100644 index 3180bb06..00000000 --- a/shake256-avx2/keccak4x/KeccakP-1600-unrolling.macros +++ /dev/null @@ -1,198 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#if (defined(FullUnrolling)) -#define rounds24 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(11, E, A) \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 12) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=12) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - 
thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 6) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (Unrolling == 4) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (Unrolling == 3) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#elif (Unrolling == 2) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#elif (Unrolling == 1) -#define rounds24 \ - prepareTheta \ - for(i=0; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#else -#error "Unrolling is not correctly specified!" 
-#endif - -#define roundsN(__nrounds) \ - prepareTheta \ - i = 24 - (__nrounds); \ - if ((i&1) != 0) { \ - thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - copyStateVariables(A, E) \ - ++i; \ - } \ - for( /* empty */; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } diff --git a/shake256-avx2/keccak4x/SIMD256-config.h b/shake256-avx2/keccak4x/SIMD256-config.h deleted file mode 100644 index 1c65fe29..00000000 --- a/shake256-avx2/keccak4x/SIMD256-config.h +++ /dev/null @@ -1,3 +0,0 @@ -#define KeccakP1600times4_implementation_config "AVX2, all rounds unrolled" -#define KeccakP1600times4_fullUnrolling -#define KeccakP1600times4_useAVX2 diff --git a/shake256-avx2/keccak4x/align.h b/shake256-avx2/keccak4x/align.h deleted file mode 100644 index e29771ed..00000000 --- a/shake256-avx2/keccak4x/align.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni, -Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby -denoted as "the implementer". - -For more information, feedback or questions, please refer to our websites: -http://keccak.noekeon.org/ -http://keyak.noekeon.org/ -http://ketje.noekeon.org/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _align_h_ -#define _align_h_ - -/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */ -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -#endif diff --git a/shake256-avx2/keccak4x/brg_endian.h b/shake256-avx2/keccak4x/brg_endian.h deleted file mode 100644 index 7226eb3b..00000000 --- a/shake256-avx2/keccak4x/brg_endian.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The redistribution and use of this software (with or without changes) - is allowed without the payment of fees or royalties provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. 
- --------------------------------------------------------------------------- - Issue Date: 20/12/2007 - Changes for ARM 9/9/2010 -*/ - -#ifndef _BRG_ENDIAN_H -#define _BRG_ENDIAN_H - -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ - -#if 0 -/* Include files where endian defines and byteswap functions may reside */ -#if defined( __sun ) -# include -#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) -# include -#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ - defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) -# include -#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) -# if !defined( __MINGW32__ ) && !defined( _AIX ) -# include -# if !defined( __BEOS__ ) -# include -# endif -# endif -#endif -#endif - -/* Now attempt to set the define for platform byte order using any */ -/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ -/* seem to encompass most endian symbol definitions */ - -#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) -# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) -# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( _BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( _LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) -# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) -# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -/* if the platform byte order could not be determined, then try to */ -/* set this define using common machine defines */ -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || 
defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ - defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif defined(__arm__) -# ifdef __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# else -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif 1 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order -#endif - -#endif - -#endif diff --git a/shake256-avx2/thash_shake256_robustx4.c b/shake256-avx2/thash_shake256_robustx4.c index 483987b9..4a26b756 100644 --- a/shake256-avx2/thash_shake256_robustx4.c +++ b/shake256-avx2/thash_shake256_robustx4.c @@ -1,13 +1,14 @@ #include +#include #include +#include #include "thashx4.h" #include "address.h" #include "params.h" -#include "fips202x4.h" +#include "f1600x4.h" -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); static uint32_t swap32(uint32_t val) { val = ((val << 8) & 0xFF00FF00 ) | ((val >> 8) & 0xFF00FF ); @@ -64,7 +65,7 @@ void thashx4(unsigned char *out0, __m256i state2[25]; memcpy(state2, state, 800); - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); /* By copying from state, state2 already contains the pub_seed * and addres. We just need to copy in the input blocks xorred with @@ -91,7 +92,7 @@ void thashx4(unsigned char *out0, _mm256_set1_epi64x(0x1f) ); - KeccakP1600times4_PermuteAll_24rounds(&state2[0]); + f1600x4AVX2((uint64_t*)&state2[0], &keccak_rc[0]); for (int i = 0; i < SPX_N/8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state2[i], 0); @@ -136,7 +137,7 @@ void thashx4(unsigned char *out0, __m256i state2[25]; memcpy(state2, state, 800); - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); /* We will won't be able to fit all input in on go. * By copying from state, state2 already contains the pub_seed @@ -154,7 +155,7 @@ void thashx4(unsigned char *out0, ); } - KeccakP1600times4_PermuteAll_24rounds(&state2[0]); + f1600x4AVX2((uint64_t*)&state2[0], &keccak_rc[0]); /* Final input. 
*/ for (unsigned int i = 0; i < 3+8*(inblocks-1); i++) { @@ -177,7 +178,7 @@ void thashx4(unsigned char *out0, _mm256_set1_epi64x(0x1f)); state2[16] = _mm256_xor_si256(state2[16], _mm256_set1_epi64x(0x80ll << 56)); - KeccakP1600times4_PermuteAll_24rounds(&state2[0]); + f1600x4AVX2((uint64_t*)&state2[0], &keccak_rc[0]); for (int i = 0; i < 8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state2[i], 0); @@ -186,36 +187,6 @@ void thashx4(unsigned char *out0, ((int64_t*)out3)[i] = _mm256_extract_epi64(state2[i], 3); } } else { - unsigned char buf0[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf1[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf2[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf3[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char bitmask0[inblocks * SPX_N]; - unsigned char bitmask1[inblocks * SPX_N]; - unsigned char bitmask2[inblocks * SPX_N]; - unsigned char bitmask3[inblocks * SPX_N]; - unsigned int i; - - memcpy(buf0, pub_seed, SPX_N); - memcpy(buf1, pub_seed, SPX_N); - memcpy(buf2, pub_seed, SPX_N); - memcpy(buf3, pub_seed, SPX_N); - addr_to_bytes(buf0 + SPX_N, addrx4 + 0*8); - addr_to_bytes(buf1 + SPX_N, addrx4 + 1*8); - addr_to_bytes(buf2 + SPX_N, addrx4 + 2*8); - addr_to_bytes(buf3 + SPX_N, addrx4 + 3*8); - - shake256x4(bitmask0, bitmask1, bitmask2, bitmask3, inblocks * SPX_N, - buf0, buf1, buf2, buf3, SPX_N + SPX_ADDR_BYTES); - - for (i = 0; i < inblocks * SPX_N; i++) { - buf0[SPX_N + SPX_ADDR_BYTES + i] = in0[i] ^ bitmask0[i]; - buf1[SPX_N + SPX_ADDR_BYTES + i] = in1[i] ^ bitmask1[i]; - buf2[SPX_N + SPX_ADDR_BYTES + i] = in2[i] ^ bitmask2[i]; - buf3[SPX_N + SPX_ADDR_BYTES + i] = in3[i] ^ bitmask3[i]; - } - - shake256x4(out0, out1, out2, out3, SPX_N, - buf0, buf1, buf2, buf3, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + assert(0); } } diff --git a/shake256-avx2/thash_shake256_simplex4.c b/shake256-avx2/thash_shake256_simplex4.c index ef0b0cb2..257eef56 100644 --- a/shake256-avx2/thash_shake256_simplex4.c +++ b/shake256-avx2/thash_shake256_simplex4.c @@ -1,13 +1,12 @@ #include #include +#include #include "thashx4.h" +#include "f1600x4.h" #include "address.h" #include "params.h" -#include "fips202x4.h" - -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); static uint32_t swap32(uint32_t val) { val = ((val << 8) & 0xFF00FF00 ) | ((val >> 8) & 0xFF00FF ); @@ -70,7 +69,7 @@ void thashx4(unsigned char *out0, state[i] = _mm256_set1_epi64x(0); } - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); for (int i = 0; i < SPX_N/8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state[i], 0); @@ -112,7 +111,7 @@ void thashx4(unsigned char *out0, ); } - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); /* Final input.
*/ for (unsigned int i = 0; i < 3+8*(inblocks-1); i++) { @@ -132,7 +131,7 @@ void thashx4(unsigned char *out0, _mm256_set1_epi64x(0x1f)); state[16] = _mm256_xor_si256(state[16], _mm256_set1_epi64x(0x80ll << 56)); - KeccakP1600times4_PermuteAll_24rounds(&state[0]); + f1600x4AVX2((uint64_t*)&state[0], &keccak_rc[0]); for (int i = 0; i < 8; i++) { ((int64_t*)out0)[i] = _mm256_extract_epi64(state[i], 0); @@ -141,25 +140,6 @@ void thashx4(unsigned char *out0, ((int64_t*)out3)[i] = _mm256_extract_epi64(state[i], 3); } } else { - unsigned char buf0[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf1[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf2[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - unsigned char buf3[SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N]; - - memcpy(buf0, pub_seed, SPX_N); - memcpy(buf1, pub_seed, SPX_N); - memcpy(buf2, pub_seed, SPX_N); - memcpy(buf3, pub_seed, SPX_N); - addr_to_bytes(buf0 + SPX_N, addrx4 + 0*8); - addr_to_bytes(buf1 + SPX_N, addrx4 + 1*8); - addr_to_bytes(buf2 + SPX_N, addrx4 + 2*8); - addr_to_bytes(buf3 + SPX_N, addrx4 + 3*8); - memcpy(buf0 + SPX_N + SPX_ADDR_BYTES, in0, inblocks * SPX_N); - memcpy(buf1 + SPX_N + SPX_ADDR_BYTES, in1, inblocks * SPX_N); - memcpy(buf2 + SPX_N + SPX_ADDR_BYTES, in2, inblocks * SPX_N); - memcpy(buf3 + SPX_N + SPX_ADDR_BYTES, in3, inblocks * SPX_N); - - shake256x4(out0, out1, out2, out3, SPX_N, - buf0, buf1, buf2, buf3, SPX_N + SPX_ADDR_BYTES + inblocks*SPX_N); + assert(0); } }
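Note on the call sites changed above: the patch replaces the Keccak-team routine KeccakP1600times4_PermuteAll_24rounds with the PeachPy-generated f1600x4AVX2, which takes the interleaved state as a uint64_t pointer plus a pointer to the round-constant table keccak_rc. The exact prototypes live in the new f1600x4.h, which is not reproduced in this diff, so the extern declarations in the minimal sketch below are assumptions that merely mirror the call sites; it is an illustration, not code from the patch.

#include <stdint.h>
#include <string.h>
#include <immintrin.h>

/* Assumed to match f1600x4.h from this patch (not shown in the diff). */
extern uint64_t keccak_rc[24];                        /* Keccak round constants */
extern void f1600x4AVX2(uint64_t *state, const uint64_t *rc);

int main(void) {
    /* Four Keccak-f[1600] states interleaved lane by lane: state[i] holds
     * lane i of all four instances, the same layout as the __m256i state[25]
     * buffers in thash_shake256_robustx4.c and thash_shake256_simplex4.c. */
    __m256i state[25];
    memset(state, 0, sizeof state);

    /* Permute all four states at once (24 rounds of Keccak-f[1600]). */
    f1600x4AVX2((uint64_t *)&state[0], &keccak_rc[0]);

    return 0;
}

Building such a test requires linking against the f1600x4.s object produced by the new Makefile rules. The cast to uint64_t* works because each __m256i entry is four contiguous 64-bit lanes, one per instance, and the array is 32-byte aligned, which the vmovdqa loads in f1600x4.S require.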