diff --git a/Makefile.am b/Makefile.am
index f26f59059..1068723fa 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -19,6 +19,9 @@ minerd_SOURCES = elist.h miner.h compat.h \
 		  cpu-miner.c util.c \
 		  sha2.c scrypt.c \
 		  neoscrypt.c neoscrypt.h
+if USE_AVX
+minerd_SOURCES += chacha_20_sidm.c salsa_20_sidm.c
+endif
 if USE_ASM
 minerd_SOURCES += neoscrypt_asm.S
 if ARCH_x86
diff --git a/README b/README
index afb1b7ddb..6420e7e8b 100644
--- a/README
+++ b/README
@@ -14,7 +14,10 @@ Dependencies:
 Basic *nix build instructions:
 	./autogen.sh	# only needed if building from git repo
 	./nomacro.pl	# only needed if building on Mac OS X or with Clang
+Either:
 	./configure CFLAGS="-O2 -fomit-frame-pointer -DASM -DOPT -DMINER_4WAY -DSHA256"
+or, for the AVX build:
+	./configure CFLAGS="-O2 -march=native -fomit-frame-pointer -DAVX -DOPT -DSHA256"
 	make
 
 Notes for AIX users:
diff --git a/blake2s_sidm.c b/blake2s_sidm.c
new file mode 100644
index 000000000..55ecb535c
--- /dev/null
+++ b/blake2s_sidm.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright 2015 gerko.deroo@kangaderoo.nl
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* BLAKE2s */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <immintrin.h>
+
+#if defined(__x86_64__)
+
+#define BLAKE2S_BLOCK_SIZE_SIDM 64U
+
+typedef struct blake2s_state_sidm_t {
+	uint32_t h[8];
+	uint32_t t[2];
+	uint32_t f[2];
+	u_char buf[2 * BLAKE2S_BLOCK_SIZE_SIDM];
+	uint32_t buflen;
+} blake2s_state_sidm;
+
+static const uint32_t blake2s_IV_sidm[8] __attribute__((aligned(32))) = {
+	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+};
+
+static inline void load_message(uint32_t* message, uint32_t offset, uint32_t m0, uint32_t m1, uint32_t m2, uint32_t m3)
+{
+	message[offset++] = m0;
+	message[offset++] = m1;
+	message[offset++] = m2;
+	message[offset] = m3;
+}
+
+static inline void blake2_round_sidm(__m128i* state, __m128i* Message)
+{
+	__m128i _calc;
+	__m128i const _rotate_16 = _mm_setr_epi8(2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13);
+//	__m128i const _rotate_8_l = _mm_setr_epi8(3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14);
+	__m128i const _rotate_8_r = _mm_setr_epi8(1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12);
+
+	/* first row */
+	state[0] = _mm_add_epi32(state[0], state[1]);
+	state[0] = _mm_add_epi32(state[0], Message[0]);
+	state[3] = _mm_xor_si128(state[3], state[0]);
+	state[3] = _mm_shuffle_epi8(state[3],_rotate_16);
+
+	/* second row */
+	state[2] = _mm_add_epi32(state[2], state[3]);
+	_calc = _mm_xor_si128(state[1], state[2]);
+	state[1] = _mm_srli_epi32(_calc, (12));
+	_calc = _mm_slli_epi32(_calc,(32 - 12));
+	state[1] = _mm_xor_si128(state[1], _calc);
+
+	/* third row */
+	state[0] = _mm_add_epi32(state[0], state[1]);
+	state[0] = _mm_add_epi32(state[0], Message[1]);
+	state[3] = _mm_xor_si128(state[3], state[0]);
+	state[3] = _mm_shuffle_epi8(state[3],_rotate_8_r);
+
+	/* fourth row */
+	state[2] = _mm_add_epi32(state[2], state[3]);
+	_calc = _mm_xor_si128(state[1], state[2]);
+	state[1] = _mm_srli_epi32(_calc, (7));
+	_calc = _mm_slli_epi32(_calc,(32 - 7));
+	state[1] = _mm_xor_si128(state[1], _calc);
+
+// row a	 0  1  2  3	     0  1  2  3
+// row b	 4  5  6  7	-->  5  6  7  4
+// row c	 8  9 10 11	..> 10 11  8  9
+// row d	12 13 14 15	    15 12 13 14
+	// transpose_matrix(row1, row2, row3, row4, row_to_column);
+	state[1] = _mm_shuffle_epi32(state[1],0x39); // 10 01 00 11
+	state[2] = _mm_shuffle_epi32(state[2],0x4e); // 01 00 11 10
+	state[3] = _mm_shuffle_epi32(state[3],0x93); // 00 11 10 01
+	// end transpose
+
+	/* first column */
+	state[0] = _mm_add_epi32(state[0], state[1]);
+	state[0] = _mm_add_epi32(state[0], Message[2]);
+	state[3] = _mm_xor_si128(state[3], state[0]);
+	state[3] = _mm_shuffle_epi8(state[3],_rotate_16);
+
+	/* second column */
+	state[2] = _mm_add_epi32(state[2], state[3]);
+	_calc = _mm_xor_si128(state[1], state[2]);
+	state[1] = _mm_srli_epi32(_calc, (12));
+	_calc = _mm_slli_epi32(_calc,(32 - 12));
+	state[1] = _mm_xor_si128(state[1], _calc);
+
+	/* third column */
+	state[0] = _mm_add_epi32(state[0], state[1]);
+	state[0] = _mm_add_epi32(state[0], Message[3]);
+	state[3] = _mm_xor_si128(state[3], state[0]);
+	state[3] = _mm_shuffle_epi8(state[3],_rotate_8_r);
+
+	/* fourth column */
+	state[2] = _mm_add_epi32(state[2], state[3]);
+	_calc = _mm_xor_si128(state[1], state[2]);
+	state[1] = _mm_srli_epi32(_calc, (7));
+	_calc = _mm_slli_epi32(_calc,(32 - 7));
+	state[1] = _mm_xor_si128(state[1], _calc);
+
+	// transpose_matrix(row1, row2, row3, row4, row_to_column);
+	state[1] = _mm_shuffle_epi32(state[1],0x93);
+	state[2] = _mm_shuffle_epi32(state[2],0x4e);
+	state[3] = _mm_shuffle_epi32(state[3],0x39);
+	// end transpose
+
+}
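+
+/*
+ * Reference sketch, added for clarity; it is not part of the original patch
+ * and nothing in this file calls it.  This is the scalar BLAKE2s G function
+ * that each 32-bit lane of blake2_round_sidm computes: the two half-rounds
+ * above each run four copies of G in parallel, first straight down the
+ * columns and then, after the _mm_shuffle_epi32 diagonalisation, along the
+ * diagonals.  The rotations by 16 and 8 map to _mm_shuffle_epi8 and the
+ * rotations by 12 and 7 to the srli/slli/xor pairs.
+ */
+static inline uint32_t rotr32_ref(uint32_t x, uint32_t c)
+{
+	return (x >> c) | (x << (32 - c));
+}
+
+static inline void blake2s_g_ref(uint32_t *a, uint32_t *b, uint32_t *c,
+				 uint32_t *d, uint32_t m0, uint32_t m1)
+{
+	*a += *b + m0; *d = rotr32_ref(*d ^ *a, 16);
+	*c += *d;      *b = rotr32_ref(*b ^ *c, 12);
+	*a += *b + m1; *d = rotr32_ref(*d ^ *a, 8);
+	*c += *d;      *b = rotr32_ref(*b ^ *c, 7);
+}
+
+void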
blake2_compress_sidm(blake2s_state_sidm *S) +{ + uint32_t v[16] __attribute__((aligned(32))); + uint32_t *m = (uint *) S->buf; + __m128i *_mm_v = (__m128i*) &v; + __m128i Message[4]; + + v[0] = S->h[0]; + v[1] = S->h[1]; + v[2] = S->h[2]; + v[3] = S->h[3]; + v[4] = S->h[4]; + v[5] = S->h[5]; + v[6] = S->h[6]; + v[7] = S->h[7]; + v[8] = blake2s_IV_sidm[0]; + v[9] = blake2s_IV_sidm[1]; + v[10] = blake2s_IV_sidm[2]; + v[11] = blake2s_IV_sidm[3]; + v[12] = S->t[0] ^ blake2s_IV_sidm[4]; + v[13] = S->t[1] ^ blake2s_IV_sidm[5]; + v[14] = S->f[0] ^ blake2s_IV_sidm[6]; + v[15] = S->f[1] ^ blake2s_IV_sidm[7]; + + // round 1 + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + + Message[0]=_mm_setr_epi32(m[0], m[2], m[4], m[6]); + Message[1]=_mm_setr_epi32(m[1], m[3], m[5], m[7]); + Message[2]=_mm_setr_epi32(m[8], m[10], m[12], m[14]); + Message[3]=_mm_setr_epi32(m[9], m[11], m[13], m[15]); + + blake2_round_sidm((__m128i *)v, Message); + +// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + Message[0]=_mm_setr_epi32(m[14], m[4], m[9], m[13]); + Message[1]=_mm_setr_epi32(m[10], m[8], m[15], m[6]); + Message[2]=_mm_setr_epi32(m[1], m[0], m[11], m[5]); + Message[3]=_mm_setr_epi32(m[12], m[2], m[7], m[3]); + + blake2_round_sidm((__m128i *)v, Message); +// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + Message[0]=_mm_setr_epi32(m[11], m[12], m[5], m[15]); + Message[1]=_mm_setr_epi32(m[8], m[0], m[2], m[13]); + Message[2]=_mm_setr_epi32(m[10], m[3], m[7], m[9]); + Message[3]=_mm_setr_epi32(m[14], m[6], m[1], m[4]); + + blake2_round_sidm((__m128i *)v, Message); +// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + Message[0]=_mm_setr_epi32(m[7], m[3], m[13], m[11]); + Message[1]=_mm_setr_epi32(m[9], m[1], m[12], m[14]); + Message[2]=_mm_setr_epi32(m[2], m[5], m[4], m[15]); + Message[3]=_mm_setr_epi32(m[6], m[10], m[0], m[8]); + + blake2_round_sidm((__m128i *)v, Message); +// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + Message[0]=_mm_setr_epi32(m[9], m[5], m[2], m[10]); + Message[1]=_mm_setr_epi32(m[0], m[7], m[4], m[15]); + Message[2]=_mm_setr_epi32(m[14], m[11], m[6], m[3]); + Message[3]=_mm_setr_epi32(m[1], m[12], m[8], m[13]); + + blake2_round_sidm((__m128i *)v, Message); +// { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + Message[0]=_mm_setr_epi32(m[2], m[6], m[0], m[8]); + Message[1]=_mm_setr_epi32(m[12], m[10], m[11], m[3]); + Message[2]=_mm_setr_epi32(m[4], m[7], m[15], m[1]); + Message[3]=_mm_setr_epi32(m[13], m[5], m[14], m[9]); + + blake2_round_sidm((__m128i *)v, Message); +// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + Message[0]=_mm_setr_epi32(m[12], m[1], m[14], m[4]); + Message[1]=_mm_setr_epi32(m[5], m[15], m[13], m[10]); + Message[2]=_mm_setr_epi32(m[0], m[6], m[9], m[8]); + Message[3]=_mm_setr_epi32(m[7], m[3], m[2], m[11]); + + blake2_round_sidm((__m128i *)v, Message); +// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + Message[0]=_mm_setr_epi32(m[13], m[7], m[12], m[3]); + Message[1]=_mm_setr_epi32(m[11], m[14], m[1], m[9]); + Message[2]=_mm_setr_epi32(m[5], m[15], m[8], m[2]); + Message[3]=_mm_setr_epi32(m[0], m[4], m[6], m[10]); + + blake2_round_sidm((__m128i *)v, Message); +// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + Message[0]=_mm_setr_epi32(m[6], m[14], m[11], m[0]); + Message[1]=_mm_setr_epi32(m[15], m[9], m[3], m[8]); + Message[2]=_mm_setr_epi32(m[12], m[13], m[1], m[10]); + Message[3]=_mm_setr_epi32(m[2], m[7], m[4], m[5]); + + blake2_round_sidm((__m128i 
*)v, Message); +// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , + Message[0]=_mm_setr_epi32(m[10], m[8], m[7], m[1]); + Message[1]=_mm_setr_epi32(m[2], m[4], m[6], m[5]); + Message[2]=_mm_setr_epi32(m[15], m[9], m[3], m[13]); + Message[3]=_mm_setr_epi32(m[11], m[14], m[12], m[0]); + + blake2_round_sidm((__m128i *)v, Message); + + _mm_v[0] = _mm_xor_si128(_mm_v[0],_mm_v[2]); + _mm_v[1] = _mm_xor_si128(_mm_v[1],_mm_v[3]); + + S->h[0] ^= v[0];// ^ v[8]; + S->h[1] ^= v[1];// ^ v[9]; + S->h[2] ^= v[2];// ^ v[10]; + S->h[3] ^= v[3];// ^ v[11]; + S->h[4] ^= v[4];// ^ v[12]; + S->h[5] ^= v[5];// ^ v[13]; + S->h[6] ^= v[6];// ^ v[14]; + S->h[7] ^= v[7];// ^ v[15]; +} + +/* + * three fold + */ + +static inline void blake2_round_sidm_X3(__m128i* state, __m128i* Message) +{ + __m128i _calc1, _calc2; + __m128i const _rotate_16 = _mm_setr_epi8(2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13); + __m128i const _rotate_8_r = _mm_setr_epi8(1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12); + + __m128i row0_1 = state[0]; + __m128i row1_1 = state[1]; + __m128i row2_1 = state[2]; + __m128i row3_1 = state[3]; + __m128i row0_2 = state[4]; + __m128i row1_2 = state[5]; + __m128i row2_2 = state[6]; + __m128i row3_2 = state[7]; + __m128i row0_3 = state[8]; + __m128i row1_3 = state[9]; + __m128i row2_3 = state[10]; + __m128i row3_3 = state[11]; + + /* first row */ + row0_1 = _mm_add_epi32(row0_1, row1_1); + row0_2 = _mm_add_epi32(row0_2, row1_2); + row0_1 = _mm_add_epi32(row0_1, Message[0]); + row0_3 = _mm_add_epi32(row0_3, row1_3); + row0_2 = _mm_add_epi32(row0_2, Message[4]); + row0_3 = _mm_add_epi32(row0_3, Message[8]); + row3_1 = _mm_xor_si128(row3_1, row0_1); + row3_2 = _mm_xor_si128(row3_2, row0_2); + row3_3 = _mm_xor_si128(row3_3, row0_3); + row3_1 = _mm_shuffle_epi8(row3_1,_rotate_16); + row3_2 = _mm_shuffle_epi8(row3_2,_rotate_16); + row3_3 = _mm_shuffle_epi8(row3_3,_rotate_16); + + /* second row */ + row2_1 = _mm_add_epi32(row2_1, row3_1); + row2_2 = _mm_add_epi32(row2_2, row3_2); + _calc1 = _mm_xor_si128(row1_1, row2_1); + row2_3 = _mm_add_epi32(row2_3, row3_3); + row1_1 = _mm_srli_epi32(_calc1, (12)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_xor_si128(row1_2, row2_2); + row1_1 = _mm_xor_si128(row1_1, _calc1); + _calc1 = _mm_xor_si128(row1_3, row2_3); + row1_2 = _mm_srli_epi32(_calc2, (12)); + row1_3 = _mm_srli_epi32(_calc1, (12)); + _calc2 = _mm_slli_epi32(_calc2,(32 - 12)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 12)); + row1_2 = _mm_xor_si128(row1_2, _calc2); + row1_3 = _mm_xor_si128(row1_3, _calc1); + + /* third row */ + row0_1 = _mm_add_epi32(row0_1, row1_1); + row0_2 = _mm_add_epi32(row0_2, row1_2); + row0_1 = _mm_add_epi32(row0_1, Message[1]); + row0_3 = _mm_add_epi32(row0_3, row1_3); + row0_2 = _mm_add_epi32(row0_2, Message[5]); + row0_3 = _mm_add_epi32(row0_3, Message[9]); + row3_1 = _mm_xor_si128(row3_1, row0_1); + row3_2 = _mm_xor_si128(row3_2, row0_2); + row3_3 = _mm_xor_si128(row3_3, row0_3); + row3_1 = _mm_shuffle_epi8(row3_1,_rotate_8_r); + row3_2 = _mm_shuffle_epi8(row3_2,_rotate_8_r); + row3_3 = _mm_shuffle_epi8(row3_3,_rotate_8_r); + + /* fourth row */ + row2_1 = _mm_add_epi32(row2_1, row3_1); + row2_2 = _mm_add_epi32(row2_2, row3_2); + _calc1 = _mm_xor_si128(row1_1, row2_1); + row2_3 = _mm_add_epi32(row2_3, row3_3); + row1_1 = _mm_srli_epi32(_calc1, (7)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_xor_si128(row1_2, row2_2); + row1_1 = _mm_xor_si128(row1_1, _calc1); + _calc1 = _mm_xor_si128(row1_3, row2_3); + row1_2 = _mm_srli_epi32(_calc2, (7)); + row1_3 = 
_mm_srli_epi32(_calc1, (7)); + _calc2 = _mm_slli_epi32(_calc2,(32 - 7)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 7)); + row1_2 = _mm_xor_si128(row1_2, _calc2); + row1_3 = _mm_xor_si128(row1_3, _calc1); + +// row a 0 1 2 3 0 1 2 3 +// row b 4 5 6 7 --> 5 6 7 4 +// row c 8 9 10 11 ..> 10 11 8 9 +// row d 12 13 14 15 15 12 13 14 + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row1_1 = _mm_shuffle_epi32(row1_1,0x39); // 10 01 00 11 + row2_1 = _mm_shuffle_epi32(row2_1,0x4e); // 01 00 11 10 + row3_1 = _mm_shuffle_epi32(row3_1,0x93); // 00 11 10 01 + row1_2 = _mm_shuffle_epi32(row1_2,0x39); // 10 01 00 11 + row2_2 = _mm_shuffle_epi32(row2_2,0x4e); // 01 00 11 10 + row3_2 = _mm_shuffle_epi32(row3_2,0x93); // 00 11 10 01 + row1_3 = _mm_shuffle_epi32(row1_3,0x39); // 10 01 00 11 + row2_3 = _mm_shuffle_epi32(row2_3,0x4e); // 01 00 11 10 + row3_3 = _mm_shuffle_epi32(row3_3,0x93); // 00 11 10 01 + // end transpose + + /* first column */ + row0_1 = _mm_add_epi32(row0_1, row1_1); + row0_2 = _mm_add_epi32(row0_2, row1_2); + row0_1 = _mm_add_epi32(row0_1, Message[2]); + row0_3 = _mm_add_epi32(row0_3, row1_3); + row0_2 = _mm_add_epi32(row0_2, Message[6]); + row0_3 = _mm_add_epi32(row0_3, Message[10]); + row3_1 = _mm_xor_si128(row3_1, row0_1); + row3_2 = _mm_xor_si128(row3_2, row0_2); + row3_3 = _mm_xor_si128(row3_3, row0_3); + row3_1 = _mm_shuffle_epi8(row3_1,_rotate_16); + row3_2 = _mm_shuffle_epi8(row3_2,_rotate_16); + row3_3 = _mm_shuffle_epi8(row3_3,_rotate_16); + + /* second column */ + row2_1 = _mm_add_epi32(row2_1, row3_1); + row2_2 = _mm_add_epi32(row2_2, row3_2); + _calc1 = _mm_xor_si128(row1_1, row2_1); + row2_3 = _mm_add_epi32(row2_3, row3_3); + row1_1 = _mm_srli_epi32(_calc1, (12)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_xor_si128(row1_2, row2_2); + row1_1 = _mm_xor_si128(row1_1, _calc1); + _calc1 = _mm_xor_si128(row1_3, row2_3); + row1_2 = _mm_srli_epi32(_calc2, (12)); + row1_3 = _mm_srli_epi32(_calc1, (12)); + _calc2 = _mm_slli_epi32(_calc2,(32 - 12)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 12)); + row1_2 = _mm_xor_si128(row1_2, _calc2); + row1_3 = _mm_xor_si128(row1_3, _calc1); + + /* third column */ + row0_1 = _mm_add_epi32(row0_1, row1_1); + row0_2 = _mm_add_epi32(row0_2, row1_2); + row0_1 = _mm_add_epi32(row0_1, Message[3]); + row0_3 = _mm_add_epi32(row0_3, row1_3); + row0_2 = _mm_add_epi32(row0_2, Message[7]); + row0_3 = _mm_add_epi32(row0_3, Message[11]); + row3_1 = _mm_xor_si128(row3_1, row0_1); + row3_2 = _mm_xor_si128(row3_2, row0_2); + row3_3 = _mm_xor_si128(row3_3, row0_3); + row3_1 = _mm_shuffle_epi8(row3_1,_rotate_8_r); + row3_2 = _mm_shuffle_epi8(row3_2,_rotate_8_r); + row3_3 = _mm_shuffle_epi8(row3_3,_rotate_8_r); + + /* fourth column */ + row2_1 = _mm_add_epi32(row2_1, row3_1); + row2_2 = _mm_add_epi32(row2_2, row3_2); + _calc1 = _mm_xor_si128(row1_1, row2_1); + row2_3 = _mm_add_epi32(row2_3, row3_3); + row1_1 = _mm_srli_epi32(_calc1, (7)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_xor_si128(row1_2, row2_2); + row1_1 = _mm_xor_si128(row1_1, _calc1); + _calc1 = _mm_xor_si128(row1_3, row2_3); + row1_2 = _mm_srli_epi32(_calc2, (7)); + row1_3 = _mm_srli_epi32(_calc1, (7)); + _calc2 = _mm_slli_epi32(_calc2,(32 - 7)); + _calc1 = _mm_slli_epi32(_calc1,(32 - 7)); + row1_2 = _mm_xor_si128(row1_2, _calc2); + row1_3 = _mm_xor_si128(row1_3, _calc1); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row1_1 = _mm_shuffle_epi32(row1_1,0x93); + row2_1 = _mm_shuffle_epi32(row2_1,0x4e); + row3_1 = 
_mm_shuffle_epi32(row3_1,0x39); + row1_2 = _mm_shuffle_epi32(row1_2,0x93); + row2_2 = _mm_shuffle_epi32(row2_2,0x4e); + row3_2 = _mm_shuffle_epi32(row3_2,0x39); + row1_3 = _mm_shuffle_epi32(row1_3,0x93); + row2_3 = _mm_shuffle_epi32(row2_3,0x4e); + row3_3 = _mm_shuffle_epi32(row3_3,0x39); + + state[0] = row0_1; + state[1] = row1_1; + state[2] = row2_1; + state[3] = row3_1; + state[4] = row0_2; + state[5] = row1_2; + state[6] = row2_2; + state[7] = row3_2; + state[8] = row0_3; + state[9] = row1_3; + state[10] = row2_3; + state[11] = row3_3; + // end transpose + +} + +void blake2_compress_sidm_X3(blake2s_state_sidm *S_1, blake2s_state_sidm *S_2, blake2s_state_sidm *S_3) +{ + uint32_t v[16*3] __attribute__((aligned(32))); + __m128i *_mm_v_1 = (__m128i*) &v[0]; + __m128i *_mm_v_2 = (__m128i*) &v[16]; + __m128i *_mm_v_3 = (__m128i*) &v[32]; + uint32_t *m_1 = (uint *) S_1->buf; + uint32_t *m_2 = (uint *) S_2->buf; + uint32_t *m_3 = (uint *) S_3->buf; + uint32_t Message[16*3] __attribute__((aligned(32))); + + v[0] = S_1->h[0]; + v[1] = S_1->h[1]; + v[2] = S_1->h[2]; + v[3] = S_1->h[3]; + v[4] = S_1->h[4]; + v[5] = S_1->h[5]; + v[6] = S_1->h[6]; + v[7] = S_1->h[7]; + v[8] = blake2s_IV_sidm[0]; + v[9] = blake2s_IV_sidm[1]; + v[10] = blake2s_IV_sidm[2]; + v[11] = blake2s_IV_sidm[3]; + v[12] = S_1->t[0] ^ blake2s_IV_sidm[4]; + v[13] = S_1->t[1] ^ blake2s_IV_sidm[5]; + v[14] = S_1->f[0] ^ blake2s_IV_sidm[6]; + v[15] = S_1->f[1] ^ blake2s_IV_sidm[7]; + v[16] = S_2->h[0]; + v[17] = S_2->h[1]; + v[18] = S_2->h[2]; + v[19] = S_2->h[3]; + v[20] = S_2->h[4]; + v[21] = S_2->h[5]; + v[22] = S_2->h[6]; + v[23] = S_2->h[7]; + v[24] = blake2s_IV_sidm[0]; + v[25] = blake2s_IV_sidm[1]; + v[26] = blake2s_IV_sidm[2]; + v[27] = blake2s_IV_sidm[3]; + v[28] = S_2->t[0] ^ blake2s_IV_sidm[4]; + v[29] = S_2->t[1] ^ blake2s_IV_sidm[5]; + v[30] = S_2->f[0] ^ blake2s_IV_sidm[6]; + v[31] = S_2->f[1] ^ blake2s_IV_sidm[7]; + v[32] = S_3->h[0]; + v[33] = S_3->h[1]; + v[34] = S_3->h[2]; + v[35] = S_3->h[3]; + v[36] = S_3->h[4]; + v[37] = S_3->h[5]; + v[38] = S_3->h[6]; + v[39] = S_3->h[7]; + v[40] = blake2s_IV_sidm[0]; + v[41] = blake2s_IV_sidm[1]; + v[42] = blake2s_IV_sidm[2]; + v[43] = blake2s_IV_sidm[3]; + v[44] = S_3->t[0] ^ blake2s_IV_sidm[4]; + v[45] = S_3->t[1] ^ blake2s_IV_sidm[5]; + v[46] = S_3->f[0] ^ blake2s_IV_sidm[6]; + v[47] = S_3->f[1] ^ blake2s_IV_sidm[7]; + // round 1 + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + + load_message(Message, 0, m_1[0], m_1[2], m_1[4], m_1[6]); + load_message(Message, 4, m_1[1], m_1[3], m_1[5], m_1[7]); + load_message(Message, 8, m_1[8], m_1[10], m_1[12], m_1[14]); + load_message(Message, 12, m_1[9], m_1[11], m_1[13], m_1[15]); + load_message(Message+16, 0, m_2[0], m_2[2], m_2[4], m_2[6]); + load_message(Message+16, 4, m_2[1], m_2[3], m_2[5], m_2[7]); + load_message(Message+16, 8, m_2[8], m_2[10], m_2[12], m_2[14]); + load_message(Message+16, 12, m_2[9], m_2[11], m_2[13], m_2[15]); + load_message(Message+32, 0, m_3[0], m_3[2], m_3[4], m_3[6]); + load_message(Message+32, 4, m_3[1], m_3[3], m_3[5], m_3[7]); + load_message(Message+32, 8, m_3[8], m_3[10], m_3[12], m_3[14]); + load_message(Message+32, 12, m_3[9], m_3[11], m_3[13], m_3[15]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); + +// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + load_message(Message, 0, m_1[14], m_1[4], m_1[9], m_1[13]); + load_message(Message, 4, m_1[10], m_1[8], m_1[15], m_1[6]); + load_message(Message, 8, m_1[1], m_1[0], m_1[11], m_1[5]); + load_message(Message, 
12, m_1[12], m_1[2], m_1[7], m_1[3]); + load_message(Message+16, 0, m_2[14], m_2[4], m_2[9], m_2[13]); + load_message(Message+16, 4, m_2[10], m_2[8], m_2[15], m_2[6]); + load_message(Message+16, 8, m_2[1], m_2[0], m_2[11], m_2[5]); + load_message(Message+16, 12, m_2[12], m_2[2], m_2[7], m_2[3]); + load_message(Message+32, 0, m_3[14], m_3[4], m_3[9], m_3[13]); + load_message(Message+32, 4, m_3[10], m_3[8], m_3[15], m_3[6]); + load_message(Message+32, 8, m_3[1], m_3[0], m_3[11], m_3[5]); + load_message(Message+32, 12, m_3[12], m_3[2], m_3[7], m_3[3]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); + +// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + load_message(Message, 0, m_1[11], m_1[12], m_1[5], m_1[15]); + load_message(Message, 4, m_1[8], m_1[0], m_1[2], m_1[13]); + load_message(Message, 8, m_1[10], m_1[3], m_1[7], m_1[9]); + load_message(Message, 12, m_1[14], m_1[6], m_1[1], m_1[4]); + load_message(Message+16, 0, m_2[11], m_2[12], m_2[5], m_2[15]); + load_message(Message+16, 4, m_2[8], m_2[0], m_2[2], m_2[13]); + load_message(Message+16, 8, m_2[10], m_2[3], m_2[7], m_2[9]); + load_message(Message+16, 12, m_2[14], m_2[6], m_2[1], m_2[4]); + load_message(Message+32, 0, m_3[11], m_3[12], m_3[5], m_3[15]); + load_message(Message+32, 4, m_3[8], m_3[0], m_3[2], m_3[13]); + load_message(Message+32, 8, m_3[10], m_3[3], m_3[7], m_3[9]); + load_message(Message+32, 12, m_3[14], m_3[6], m_3[1], m_3[4]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); +// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + load_message(Message, 0, m_1[7], m_1[3], m_1[13], m_1[11]); + load_message(Message, 4, m_1[9], m_1[1], m_1[12], m_1[14]); + load_message(Message, 8, m_1[2], m_1[5], m_1[4], m_1[15]); + load_message(Message, 12, m_1[6], m_1[10], m_1[0], m_1[8]); + load_message(Message+16, 0, m_2[7], m_2[3], m_2[13], m_2[11]); + load_message(Message+16, 4, m_2[9], m_2[1], m_2[12], m_2[14]); + load_message(Message+16, 8, m_2[2], m_2[5], m_2[4], m_2[15]); + load_message(Message+16, 12, m_2[6], m_2[10], m_2[0], m_2[8]); + load_message(Message+32, 0, m_3[7], m_3[3], m_3[13], m_3[11]); + load_message(Message+32, 4, m_3[9], m_3[1], m_3[12], m_3[14]); + load_message(Message+32, 8, m_3[2], m_3[5], m_3[4], m_3[15]); + load_message(Message+32, 12, m_3[6], m_3[10], m_3[0], m_3[8]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); +// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + load_message(Message, 0, m_1[9], m_1[5], m_1[2], m_1[10]); + load_message(Message, 4, m_1[0], m_1[7], m_1[4], m_1[15]); + load_message(Message, 8, m_1[14], m_1[11], m_1[6], m_1[3]); + load_message(Message, 12, m_1[1], m_1[12], m_1[8], m_1[13]); + load_message(Message+16, 0, m_2[9], m_2[5], m_2[2], m_2[10]); + load_message(Message+16, 4, m_2[0], m_2[7], m_2[4], m_2[15]); + load_message(Message+16, 8, m_2[14], m_2[11], m_2[6], m_2[3]); + load_message(Message+16, 12, m_2[1], m_2[12], m_2[8], m_2[13]); + load_message(Message+32, 0, m_3[9], m_3[5], m_3[2], m_3[10]); + load_message(Message+32, 4, m_3[0], m_3[7], m_3[4], m_3[15]); + load_message(Message+32, 8, m_3[14], m_3[11], m_3[6], m_3[3]); + load_message(Message+32, 12, m_3[1], m_3[12], m_3[8], m_3[13]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); +// { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + load_message(Message, 0, m_1[2], m_1[6], m_1[0], m_1[8]); + load_message(Message, 4, m_1[12], m_1[10], m_1[11], m_1[3]); + load_message(Message, 8, m_1[4], m_1[7], m_1[15], m_1[1]); + load_message(Message, 
12, m_1[13], m_1[5], m_1[14], m_1[9]); + load_message(Message+16, 0, m_2[2], m_2[6], m_2[0], m_2[8]); + load_message(Message+16, 4, m_2[12], m_2[10], m_2[11], m_2[3]); + load_message(Message+16, 8, m_2[4], m_2[7], m_2[15], m_2[1]); + load_message(Message+16, 12, m_2[13], m_2[5], m_2[14], m_2[9]); + load_message(Message+32, 0, m_3[2], m_3[6], m_3[0], m_3[8]); + load_message(Message+32, 4, m_3[12], m_3[10], m_3[11], m_3[3]); + load_message(Message+32, 8, m_3[4], m_3[7], m_3[15], m_3[1]); + load_message(Message+32, 12, m_3[13], m_3[5], m_3[14], m_3[9]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); +// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + load_message(Message, 0, m_1[12], m_1[1], m_1[14], m_1[4]); + load_message(Message, 4, m_1[5], m_1[15], m_1[13], m_1[10]); + load_message(Message, 8, m_1[0], m_1[6], m_1[9], m_1[8]); + load_message(Message, 12, m_1[7], m_1[3], m_1[2], m_1[11]); + load_message(Message+16, 0, m_2[12], m_2[1], m_2[14], m_2[4]); + load_message(Message+16, 4, m_2[5], m_2[15], m_2[13], m_2[10]); + load_message(Message+16, 8, m_2[0], m_2[6], m_2[9], m_2[8]); + load_message(Message+16, 12, m_2[7], m_2[3], m_2[2], m_2[11]); + load_message(Message+32, 0, m_3[12], m_3[1], m_3[14], m_3[4]); + load_message(Message+32, 4, m_3[5], m_3[15], m_3[13], m_3[10]); + load_message(Message+32, 8, m_3[0], m_3[6], m_3[9], m_3[8]); + load_message(Message+32, 12, m_3[7], m_3[3], m_3[2], m_3[11]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); +// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + load_message(Message, 0, m_1[13], m_1[7], m_1[12], m_1[3]); + load_message(Message, 4, m_1[11], m_1[14], m_1[1], m_1[9]); + load_message(Message, 8, m_1[5], m_1[15], m_1[8], m_1[2]); + load_message(Message, 12, m_1[0], m_1[4], m_1[6], m_1[10]); + load_message(Message+16, 0, m_2[13], m_2[7], m_2[12], m_2[3]); + load_message(Message+16, 4, m_2[11], m_2[14], m_2[1], m_2[9]); + load_message(Message+16, 8, m_2[5], m_2[15], m_2[8], m_2[2]); + load_message(Message+16, 12, m_2[0], m_2[4], m_2[6], m_2[10]); + load_message(Message+32, 0, m_3[13], m_3[7], m_3[12], m_3[3]); + load_message(Message+32, 4, m_3[11], m_3[14], m_3[1], m_3[9]); + load_message(Message+32, 8, m_3[5], m_3[15], m_3[8], m_3[2]); + load_message(Message+32, 12, m_3[0], m_3[4], m_3[6], m_3[10]); + + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); +// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + load_message(Message, 0, m_1[6], m_1[14], m_1[11], m_1[0]); + load_message(Message, 4, m_1[15], m_1[9], m_1[3], m_1[8]); + load_message(Message, 8, m_1[12], m_1[13], m_1[1], m_1[10]); + load_message(Message, 12, m_1[2], m_1[7], m_1[4], m_1[5]); + load_message(Message+16, 0, m_2[6], m_2[14], m_2[11], m_2[0]); + load_message(Message+16, 4, m_2[15], m_2[9], m_2[3], m_2[8]); + load_message(Message+16, 8, m_2[12], m_2[13], m_2[1], m_2[10]); + load_message(Message+16, 12, m_2[2], m_2[7], m_2[4], m_2[5]); + load_message(Message+32, 0, m_3[6], m_3[14], m_3[11], m_3[0]); + load_message(Message+32, 4, m_3[15], m_3[9], m_3[3], m_3[8]); + load_message(Message+32, 8, m_3[12], m_3[13], m_3[1], m_3[10]); + load_message(Message+32, 12, m_3[2], m_3[7], m_3[4], m_3[5]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); +// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , + load_message(Message, 0, m_1[10], m_1[8], m_1[7], m_1[1]); + load_message(Message, 4, m_1[2], m_1[4], m_1[6], m_1[5]); + load_message(Message, 8, m_1[15], m_1[9], m_1[3], m_1[13]); + load_message(Message, 
12, m_1[11], m_1[14], m_1[12], m_1[0]); + load_message(Message+16, 0, m_2[10], m_2[8], m_2[7], m_2[1]); + load_message(Message+16, 4, m_2[2], m_2[4], m_2[6], m_2[5]); + load_message(Message+16, 8, m_2[15], m_2[9], m_2[3], m_2[13]); + load_message(Message+16, 12, m_2[11], m_2[14], m_2[12], m_2[0]); + load_message(Message+32, 0, m_3[10], m_3[8], m_3[7], m_3[1]); + load_message(Message+32, 4, m_3[2], m_3[4], m_3[6], m_3[5]); + load_message(Message+32, 8, m_3[15], m_3[9], m_3[3], m_3[13]); + load_message(Message+32, 12, m_3[11], m_3[14], m_3[12], m_3[0]); + + blake2_round_sidm_X3((__m128i *)v, (__m128i *)Message); + + _mm_v_1[0] = _mm_xor_si128(_mm_v_1[0],_mm_v_1[2]); + _mm_v_1[1] = _mm_xor_si128(_mm_v_1[1],_mm_v_1[3]); + _mm_v_2[0] = _mm_xor_si128(_mm_v_2[0],_mm_v_2[2]); + _mm_v_2[1] = _mm_xor_si128(_mm_v_2[1],_mm_v_2[3]); + _mm_v_3[0] = _mm_xor_si128(_mm_v_3[0],_mm_v_3[2]); + _mm_v_3[1] = _mm_xor_si128(_mm_v_3[1],_mm_v_3[3]); + + S_1->h[0] ^= v[0];// ^ v[8]; + S_1->h[1] ^= v[1];// ^ v[9]; + S_1->h[2] ^= v[2];// ^ v[10]; + S_1->h[3] ^= v[3];// ^ v[11]; + S_1->h[4] ^= v[4];// ^ v[12]; + S_1->h[5] ^= v[5];// ^ v[13]; + S_1->h[6] ^= v[6];// ^ v[14]; + S_1->h[7] ^= v[7];// ^ v[15]; + S_2->h[0] ^= v[16];// ^ v[8]; + S_2->h[1] ^= v[17];// ^ v[9]; + S_2->h[2] ^= v[18];// ^ v[10]; + S_2->h[3] ^= v[19];// ^ v[11]; + S_2->h[4] ^= v[20];// ^ v[12]; + S_2->h[5] ^= v[21];// ^ v[13]; + S_2->h[6] ^= v[22];// ^ v[14]; + S_2->h[7] ^= v[23];// ^ v[15]; + S_3->h[0] ^= v[32];// ^ v[8]; + S_3->h[1] ^= v[33];// ^ v[9]; + S_3->h[2] ^= v[34];// ^ v[10]; + S_3->h[3] ^= v[35];// ^ v[11]; + S_3->h[4] ^= v[36];// ^ v[12]; + S_3->h[5] ^= v[37];// ^ v[13]; + S_3->h[6] ^= v[38];// ^ v[14]; + S_3->h[7] ^= v[39];// ^ v[15]; +} + +/* + * end threefold + */ +#endif + + + diff --git a/chacha_20_sidm.c b/chacha_20_sidm.c new file mode 100644 index 000000000..875f2b7b8 --- /dev/null +++ b/chacha_20_sidm.c @@ -0,0 +1,939 @@ +/* + * Copyright 2013 gerko.deroo@kangaderoo.nl + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <immintrin.h>
+
+#if defined(__x86_64__)
+
+static inline void xor_chacha_sidm(__m128i *calc_16, __m128i *calc_12, __m128i *calc_8, __m128i *calc_7,
+		__m128i *calc_1, __m128i *calc_2, __m128i *calc_3, __m128i *calc_4,
+		uint32_t double_rounds)
+{
+	int i;
+	__m128i _calc;
+	__m128i _rotate_16 = _mm_setr_epi8(2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13);
+	__m128i _rotate_8 = _mm_setr_epi8(3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14);
+	__m128i rowa = _mm_xor_si128(*calc_16, *calc_1);
+	__m128i rowb = _mm_xor_si128(*calc_12, *calc_2);
+	__m128i rowc = _mm_xor_si128(*calc_8, *calc_3);
+	__m128i rowd = _mm_xor_si128(*calc_7, *calc_4);
+
+	*calc_16 = _mm_xor_si128(*calc_16, *calc_1);
+	*calc_12 = _mm_xor_si128(*calc_12, *calc_2);
+	*calc_8 = _mm_xor_si128(*calc_8, *calc_3);
+	*calc_7 = _mm_xor_si128(*calc_7, *calc_4);
+
+	for (i = 0; i < double_rounds; i++) {
+		/* first row */
+		rowa = _mm_add_epi32(rowa, rowb);
+		rowd = _mm_xor_si128(rowd, rowa);
+		rowd = _mm_shuffle_epi8(rowd,_rotate_16);
+
+		/* second row */
+		rowc = _mm_add_epi32(rowc, rowd);
+		_calc = _mm_xor_si128(rowb, rowc);
+		rowb = _mm_slli_epi32(_calc, (12));
+		_calc = _mm_srli_epi32(_calc,(32 - 12));
+		rowb = _mm_xor_si128(rowb, _calc);
+
+		/* third row */
+		rowa = _mm_add_epi32(rowa, rowb);
+		rowd = _mm_xor_si128(rowd, rowa);
+		rowd = _mm_shuffle_epi8(rowd,_rotate_8);
+
+		/* fourth row */
+		rowc = _mm_add_epi32(rowc, rowd);
+		_calc = _mm_xor_si128(rowb, rowc);
+		rowb = _mm_slli_epi32(_calc, (7));
+		_calc = _mm_srli_epi32(_calc,(32 - 7));
+		rowb = _mm_xor_si128(rowb, _calc);
+
+// row a	 0  1  2  3	     0  1  2  3
+// row b	 4  5  6  7	-->  5  6  7  4
+// row c	 8  9 10 11	..> 10 11  8  9
+// row d	12 13 14 15	    15 12 13 14
+		// transpose_matrix(row1, row2, row3, row4, row_to_column);
+		rowb = _mm_shuffle_epi32(rowb,0x39); // 10 01 00 11
+		rowc = _mm_shuffle_epi32(rowc,0x4e); // 01 00 11 10
+		rowd = _mm_shuffle_epi32(rowd,0x93); // 00 11 10 01
+		// end transpose
+
+		/* first column */
+		rowa = _mm_add_epi32(rowa, rowb);
+		rowd = _mm_xor_si128(rowd, rowa);
+		rowd = _mm_shuffle_epi8(rowd,_rotate_16);
+
+		/* second column */
+		rowc = _mm_add_epi32(rowc, rowd);
+		_calc = _mm_xor_si128(rowb, rowc);
+		rowb = _mm_slli_epi32(_calc, (12));
+		_calc = _mm_srli_epi32(_calc,(32 - 12));
+		rowb = _mm_xor_si128(rowb, _calc);
+
+		/* third column */
+		rowa = _mm_add_epi32(rowa, rowb);
+		rowd = _mm_xor_si128(rowd, rowa);
+		rowd = _mm_shuffle_epi8(rowd,_rotate_8);
+
+		/* fourth column */
+		rowc = _mm_add_epi32(rowc, rowd);
+		_calc = _mm_xor_si128(rowb, rowc);
+		rowb = _mm_slli_epi32(_calc, (7));
+		_calc = _mm_srli_epi32(_calc,(32 - 7));
+		rowb = _mm_xor_si128(rowb, _calc);
+
+		// transpose_matrix(row1, row2, row3, row4, row_to_column);
+		rowb = _mm_shuffle_epi32(rowb,0x93);
+		rowc = _mm_shuffle_epi32(rowc,0x4e);
+		rowd = _mm_shuffle_epi32(rowd,0x39);
+		// end transpose
+	}
+	*calc_16 = _mm_add_epi32(*calc_16,rowa);
+	*calc_12 = _mm_add_epi32(*calc_12, rowb);
+	*calc_8 = _mm_add_epi32(*calc_8, rowc);
+	*calc_7 = _mm_add_epi32(*calc_7, rowd);
+}
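+
+/*
+ * Reference sketch, added for clarity; it is not part of the original patch
+ * and nothing in this file calls it.  This is the scalar ChaCha quarter-round
+ * that each 32-bit lane of the SSE rows in xor_chacha_sidm computes: every
+ * double round runs it four lanes at a time down the columns and, after the
+ * _mm_shuffle_epi32 diagonalisation, along the diagonals.  The 16- and 8-bit
+ * rotates map to _mm_shuffle_epi8, the 12- and 7-bit rotates to the
+ * slli/srli/xor pairs.  xor_chacha_sidm itself is the scrypt-style core:
+ * XOR the input block in, run double_rounds double rounds, add the input
+ * back; chacha_core_r2_sidm below builds the two-phase scratchpad walk
+ * (sequential fill, then reads indexed by the low lane of calc_31) on top.
+ */
+static inline uint32_t rotl32_ref(uint32_t x, uint32_t c)
+{
+	return (x << c) | (x >> (32 - c));
+}
+
+static inline void chacha_qr_ref(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
+{
+	*a += *b; *d = rotl32_ref(*d ^ *a, 16);
+	*c += *d; *b = rotl32_ref(*b ^ *c, 12);
+	*a += *b; *d = rotl32_ref(*d ^ *a, 8);
+	*c += *d; *b = rotl32_ref(*b ^ *c, 7);
+}
+
+static inline void xor_chacha_sidm_swap(__m128i *calc_16, __m128i *calc_12, __m128i *calc_8, __m128i *calc_7,
+		__m128i *calc_1, __m128i *calc_2, __m128i *calc_3, __m128i *calc_4,
+		uint32_t double_rounds)
+{
+	int i;
+	__m128i _calc;
+	__m128i _rotate_16 = _mm_setr_epi8(2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13);
+	__m128i _rotate_8 = _mm_setr_epi8(3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14);
+	__m128i rowa = _mm_xor_si128(*calc_16, *calc_1);
+	__m128i rowb = _mm_xor_si128(*calc_12, *calc_2);
+	__m128i rowc =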
_mm_xor_si128(*calc_8, *calc_3); + __m128i rowd = _mm_xor_si128(*calc_7, *calc_4); + + *calc_16 = _mm_xor_si128(*calc_16, *calc_1); + *calc_12 = _mm_xor_si128(*calc_12, *calc_2); + *calc_8 = _mm_xor_si128(*calc_8, *calc_3); + *calc_7 = _mm_xor_si128(*calc_7, *calc_4); + + for (i = 0; i < double_rounds; i++) { + /* first row */ + rowa = _mm_add_epi32(rowa, rowb); + rowd = _mm_xor_si128(rowd, rowa); + rowd = _mm_shuffle_epi8(rowd,_rotate_16); + + /* second row */ + rowc = _mm_add_epi32(rowc, rowd); + _calc = _mm_xor_si128(rowb, rowc); + rowb = _mm_slli_epi32(_calc, (12)); + _calc = _mm_srli_epi32(_calc,(32 - 12)); + rowb = _mm_xor_si128(rowb, _calc); + + /* third row */ + rowa = _mm_add_epi32(rowa, rowb); + rowd = _mm_xor_si128(rowd, rowa); + rowd = _mm_shuffle_epi8(rowd,_rotate_8); + + /* fourth row */ + rowc = _mm_add_epi32(rowc, rowd); + _calc = _mm_xor_si128(rowb, rowc); + rowb = _mm_slli_epi32(_calc, (7)); + _calc = _mm_srli_epi32(_calc,(32 - 7)); + rowb = _mm_xor_si128(rowb, _calc); + +// row a 0 1 2 3 0 1 2 3 +// row b 4 5 6 7 --> 5 6 7 4 +// row c 8 9 10 11 ..> 10 11 8 9 +// row d 12 13 14 15 15 12 13 14 + // transpose_matrix(row1, row2, row3, row4, row_to_column); + rowb = _mm_shuffle_epi32(rowb,0x39); // 10 01 00 11 + rowc = _mm_shuffle_epi32(rowc,0x4e); // 01 00 11 10 + rowd = _mm_shuffle_epi32(rowd,0x93); // 00 11 10 01 + // end transpose + + /* first column */ + rowa = _mm_add_epi32(rowa, rowb); + rowd = _mm_xor_si128(rowd, rowa); + rowd = _mm_shuffle_epi8(rowd,_rotate_16); + + /* second column */ + rowc = _mm_add_epi32(rowc, rowd); + _calc = _mm_xor_si128(rowb, rowc); + rowb = _mm_slli_epi32(_calc, (12)); + _calc = _mm_srli_epi32(_calc,(32 - 12)); + rowb = _mm_xor_si128(rowb, _calc); + + /* third column */ + rowa = _mm_add_epi32(rowa, rowb); + rowd = _mm_xor_si128(rowd, rowa); + rowd = _mm_shuffle_epi8(rowd,_rotate_8); + + /* fourth column */ + rowc = _mm_add_epi32(rowc, rowd); + _calc = _mm_xor_si128(rowb, rowc); + rowb = _mm_slli_epi32(_calc, (7)); + _calc = _mm_srli_epi32(_calc,(32 - 7)); + rowb = _mm_xor_si128(rowb, _calc); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + rowb = _mm_shuffle_epi32(rowb,0x93); + rowc = _mm_shuffle_epi32(rowc,0x4e); + rowd = _mm_shuffle_epi32(rowd,0x39); + // end transpose + } + + rowa = _mm_add_epi32(*calc_16,rowa); + rowb = _mm_add_epi32(*calc_12, rowb); + rowc = _mm_add_epi32(*calc_8, rowc); + rowd = _mm_add_epi32(*calc_7, rowd); + + *calc_16 = *calc_1; + *calc_12 = *calc_2; + *calc_8 = *calc_3; + *calc_7 = *calc_4; + + *calc_1 = rowa; + *calc_2 = rowb; + *calc_3 = rowc; + *calc_4 = rowd; +} + +static inline void chacha_core_r2_sidm(__m128i *X , uint32_t Loops, uint32_t double_rounds) +{ + uint32_t i, j; + __m128i scratch[Loops * 8 * 4]; + + __m128i *calc_1 = (__m128i*) &X[0]; + __m128i *calc_2 = (__m128i*) &X[1]; + __m128i *calc_3 = (__m128i*) &X[2]; + __m128i *calc_4 = (__m128i*) &X[3]; + + __m128i *calc_11 = (__m128i*) &X[4]; + __m128i *calc_12 = (__m128i*) &X[5]; + __m128i *calc_13 = (__m128i*) &X[6]; + __m128i *calc_14 = (__m128i*) &X[7]; + + __m128i *calc_21 = (__m128i*) &X[8]; + __m128i *calc_22 = (__m128i*) &X[9]; + __m128i *calc_23 = (__m128i*) &X[10]; + __m128i *calc_24 = (__m128i*) &X[11]; + + __m128i *calc_31 = (__m128i*) &X[12]; + __m128i *calc_32 = (__m128i*) &X[13]; + __m128i *calc_33 = (__m128i*) &X[14]; + __m128i *calc_34 = (__m128i*) &X[15]; + + for (i = 0; i < Loops; i++) { + scratch[i * 16 + 0] = *calc_1; scratch[i * 16 + 1] = *calc_2; + scratch[i * 16 + 2] = *calc_3; scratch[i * 16 + 3] = *calc_4; + 
scratch[i * 16 + 4] = *calc_11; scratch[i * 16 + 5] = *calc_12; + scratch[i * 16 + 6] = *calc_13; scratch[i * 16 + 7] = *calc_14; + scratch[i * 16 + 8] = *calc_21; scratch[i * 16 + 9] = *calc_22; + scratch[i * 16 + 10] = *calc_23; scratch[i * 16 + 11] = *calc_24; + scratch[i * 16 + 12] = *calc_31; scratch[i * 16 + 13] = *calc_32; + scratch[i * 16 + 14] = *calc_33; scratch[i * 16 + 15] = *calc_34; + + xor_chacha_sidm( calc_1, calc_2, calc_3, calc_4, calc_31,calc_32,calc_33,calc_34, double_rounds); + xor_chacha_sidm(calc_11,calc_12,calc_13,calc_14, calc_1, calc_2, calc_3, calc_4, double_rounds); + xor_chacha_sidm_swap(calc_21,calc_22,calc_23,calc_24, calc_11,calc_12,calc_13,calc_14, double_rounds); + xor_chacha_sidm(calc_31,calc_32,calc_33,calc_34, calc_11,calc_12,calc_13,calc_14, double_rounds); + // swap calc_2x with calc_1x + } + + for (i = 0; i < Loops; i++) { + j = 16 * (_mm_extract_epi16(*calc_31,0x00) & (Loops-1)); + + *calc_1 = _mm_xor_si128(*calc_1, scratch[j]); + *calc_2 = _mm_xor_si128(*calc_2, scratch[j+1]); + *calc_3 = _mm_xor_si128(*calc_3, scratch[j+2]); + *calc_4 = _mm_xor_si128(*calc_4, scratch[j+3]); + *calc_11 = _mm_xor_si128(*calc_11, scratch[j+4]); + *calc_12 = _mm_xor_si128(*calc_12, scratch[j+5]); + *calc_13 = _mm_xor_si128(*calc_13, scratch[j+6]); + *calc_14 = _mm_xor_si128(*calc_14, scratch[j+7]); + *calc_21 = _mm_xor_si128(*calc_21, scratch[j+8]); + *calc_22 = _mm_xor_si128(*calc_22, scratch[j+9]); + *calc_23 = _mm_xor_si128(*calc_23, scratch[j+10]); + *calc_24 = _mm_xor_si128(*calc_24, scratch[j+11]); + *calc_31 = _mm_xor_si128(*calc_31, scratch[j+12]); + *calc_32 = _mm_xor_si128(*calc_32, scratch[j+13]); + *calc_33 = _mm_xor_si128(*calc_33, scratch[j+14]); + *calc_34 = _mm_xor_si128(*calc_34, scratch[j+15]); + + xor_chacha_sidm( calc_1, calc_2, calc_3, calc_4, calc_31,calc_32,calc_33,calc_34, double_rounds); + xor_chacha_sidm(calc_11,calc_12,calc_13,calc_14, calc_1, calc_2, calc_3, calc_4, double_rounds); + xor_chacha_sidm_swap(calc_21,calc_22,calc_23,calc_24, calc_11,calc_12,calc_13,calc_14, double_rounds); + xor_chacha_sidm(calc_31,calc_32,calc_33,calc_34, calc_11,calc_12,calc_13,calc_14, double_rounds); + } + +} + +//--------------------------------------------------------------------------------------------- +// threefold +//--------------------------------------------------------------------------------------------- + + +static inline void xor_chacha_sidm_X3( + __m128i *calc_16_1, __m128i *calc_12_1, __m128i *calc_8_1, __m128i *calc_7_1, + __m128i *calc_1_1, __m128i *calc_2_1, __m128i *calc_3_1, __m128i *calc_4_1, + __m128i *calc_16_2, __m128i *calc_12_2, __m128i *calc_8_2, __m128i *calc_7_2, + __m128i *calc_1_2, __m128i *calc_2_2, __m128i *calc_3_2, __m128i *calc_4_2, + __m128i *calc_16_3, __m128i *calc_12_3, __m128i *calc_8_3, __m128i *calc_7_3, + __m128i *calc_1_3, __m128i *calc_2_3, __m128i *calc_3_3, __m128i *calc_4_3, + uint32_t double_rounds) +{ + int i; + __m128i _calc1, _calc2; + __m128i _rotate_16 = _mm_setr_epi8(2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13); + __m128i _rotate_8 = _mm_setr_epi8(3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14); + __m128i rowa_1 = _mm_xor_si128(*calc_16_1, *calc_1_1); + __m128i rowb_1 = _mm_xor_si128(*calc_12_1, *calc_2_1); + __m128i rowc_1 = _mm_xor_si128(*calc_8_1, *calc_3_1); + __m128i rowd_1 = _mm_xor_si128(*calc_7_1, *calc_4_1); + + __m128i rowa_2 = _mm_xor_si128(*calc_16_2, *calc_1_2); + __m128i rowb_2 = _mm_xor_si128(*calc_12_2, *calc_2_2); + __m128i rowc_2 = _mm_xor_si128(*calc_8_2, *calc_3_2); + __m128i rowd_2 = 
_mm_xor_si128(*calc_7_2, *calc_4_2); + + __m128i rowa_3 = _mm_xor_si128(*calc_16_3, *calc_1_3); + __m128i rowb_3 = _mm_xor_si128(*calc_12_3, *calc_2_3); + __m128i rowc_3 = _mm_xor_si128(*calc_8_3, *calc_3_3); + __m128i rowd_3 = _mm_xor_si128(*calc_7_3, *calc_4_3); + + *calc_16_1 = _mm_xor_si128(*calc_16_1, *calc_1_1); + *calc_12_1 = _mm_xor_si128(*calc_12_1, *calc_2_1); + *calc_8_1 = _mm_xor_si128(*calc_8_1, *calc_3_1); + *calc_7_1 = _mm_xor_si128(*calc_7_1, *calc_4_1); + *calc_16_2 = _mm_xor_si128(*calc_16_2, *calc_1_2); + *calc_12_2 = _mm_xor_si128(*calc_12_2, *calc_2_2); + *calc_8_2 = _mm_xor_si128(*calc_8_2, *calc_3_2); + *calc_7_2 = _mm_xor_si128(*calc_7_2, *calc_4_2); + *calc_16_3 = _mm_xor_si128(*calc_16_3, *calc_1_3); + *calc_12_3 = _mm_xor_si128(*calc_12_3, *calc_2_3); + *calc_8_3 = _mm_xor_si128(*calc_8_3, *calc_3_3); + *calc_7_3 = _mm_xor_si128(*calc_7_3, *calc_4_3); + + for (i = 0; i < double_rounds; i++) { + /* first row */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_16); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_16); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_16); + + /* second row */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 = _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (12)); + rowb_3 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 12)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + + /* third row */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_8); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_8); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_8); + + /* fourth row */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 = _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (7)); + rowb_3 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 7)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + +// row a 0 1 2 3 0 1 2 3 +// row b 4 5 6 7 --> 5 6 7 4 +// row c 8 9 10 11 ..> 10 11 8 9 +// row d 12 13 14 15 15 12 13 14 + // transpose_matrix(row1, row2, row3, row4, row_to_column); + rowb_1 = _mm_shuffle_epi32(rowb_1,0x39); // 10 01 00 11 + rowc_1 = _mm_shuffle_epi32(rowc_1,0x4e); // 01 00 11 10 + rowd_1 = _mm_shuffle_epi32(rowd_1,0x93); // 00 11 10 01 + rowb_2 = _mm_shuffle_epi32(rowb_2,0x39); // 10 01 00 11 + rowc_2 = _mm_shuffle_epi32(rowc_2,0x4e); // 01 00 11 10 + rowd_2 
= _mm_shuffle_epi32(rowd_2,0x93); // 00 11 10 01 + rowb_3 = _mm_shuffle_epi32(rowb_3,0x39); // 10 01 00 11 + rowc_3 = _mm_shuffle_epi32(rowc_3,0x4e); // 01 00 11 10 + rowd_3 = _mm_shuffle_epi32(rowd_3,0x93); // 00 11 10 01 + // end transpose + + /* first column */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_16); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_16); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_16); + + /* second column */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 = _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (12)); + rowb_3 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 12)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + + /* third column */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_8); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_8); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_8); + + /* fourth column */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 = _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (7)); + rowb_3 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 7)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + // transpose_matrix(row1, row2, row3, row4, row_to_column); + rowb_1 = _mm_shuffle_epi32(rowb_1,0x93); + rowc_1 = _mm_shuffle_epi32(rowc_1,0x4e); + rowd_1 = _mm_shuffle_epi32(rowd_1,0x39); + rowb_2 = _mm_shuffle_epi32(rowb_2,0x93); + rowc_2 = _mm_shuffle_epi32(rowc_2,0x4e); + rowd_2 = _mm_shuffle_epi32(rowd_2,0x39); + rowb_3 = _mm_shuffle_epi32(rowb_3,0x93); + rowc_3 = _mm_shuffle_epi32(rowc_3,0x4e); + rowd_3 = _mm_shuffle_epi32(rowd_3,0x39); + // end transpose + } + *calc_16_1 = _mm_add_epi32(*calc_16_1, rowa_1); + *calc_12_1 = _mm_add_epi32(*calc_12_1, rowb_1); + *calc_8_1 = _mm_add_epi32( *calc_8_1, rowc_1); + *calc_7_1 = _mm_add_epi32( *calc_7_1, rowd_1); + *calc_16_2 = _mm_add_epi32(*calc_16_2, rowa_2); + *calc_12_2 = _mm_add_epi32(*calc_12_2, rowb_2); + *calc_8_2 = _mm_add_epi32( *calc_8_2, rowc_2); + *calc_7_2 = _mm_add_epi32( *calc_7_2, rowd_2); + *calc_16_3 = _mm_add_epi32(*calc_16_3, rowa_3); + *calc_12_3 = _mm_add_epi32(*calc_12_3, rowb_3); + *calc_8_3 = _mm_add_epi32( *calc_8_3, rowc_3); + *calc_7_3 = _mm_add_epi32( *calc_7_3, rowd_3); +} + +static inline void xor_chacha_sidm_swap_X3( + __m128i *calc_16_1, __m128i *calc_12_1, 
__m128i *calc_8_1, __m128i *calc_7_1, + __m128i *calc_1_1, __m128i *calc_2_1, __m128i *calc_3_1, __m128i *calc_4_1, + __m128i *calc_16_2, __m128i *calc_12_2, __m128i *calc_8_2, __m128i *calc_7_2, + __m128i *calc_1_2, __m128i *calc_2_2, __m128i *calc_3_2, __m128i *calc_4_2, + __m128i *calc_16_3, __m128i *calc_12_3, __m128i *calc_8_3, __m128i *calc_7_3, + __m128i *calc_1_3, __m128i *calc_2_3, __m128i *calc_3_3, __m128i *calc_4_3, + uint32_t double_rounds) +{ + int i; + __m128i _calc1, _calc2; + __m128i _rotate_16 = _mm_setr_epi8(2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13); + __m128i _rotate_8 = _mm_setr_epi8(3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14); + __m128i rowa_1 = _mm_xor_si128(*calc_16_1, *calc_1_1); + __m128i rowb_1 = _mm_xor_si128(*calc_12_1, *calc_2_1); + __m128i rowc_1 = _mm_xor_si128(*calc_8_1, *calc_3_1); + __m128i rowd_1 = _mm_xor_si128(*calc_7_1, *calc_4_1); + + __m128i rowa_2 = _mm_xor_si128(*calc_16_2, *calc_1_2); + __m128i rowb_2 = _mm_xor_si128(*calc_12_2, *calc_2_2); + __m128i rowc_2 = _mm_xor_si128(*calc_8_2, *calc_3_2); + __m128i rowd_2 = _mm_xor_si128(*calc_7_2, *calc_4_2); + + __m128i rowa_3 = _mm_xor_si128(*calc_16_3, *calc_1_3); + __m128i rowb_3 = _mm_xor_si128(*calc_12_3, *calc_2_3); + __m128i rowc_3 = _mm_xor_si128(*calc_8_3, *calc_3_3); + __m128i rowd_3 = _mm_xor_si128(*calc_7_3, *calc_4_3); + + *calc_16_1 = _mm_xor_si128(*calc_16_1, *calc_1_1); + *calc_12_1 = _mm_xor_si128(*calc_12_1, *calc_2_1); + *calc_8_1 = _mm_xor_si128(*calc_8_1, *calc_3_1); + *calc_7_1 = _mm_xor_si128(*calc_7_1, *calc_4_1); + *calc_16_2 = _mm_xor_si128(*calc_16_2, *calc_1_2); + *calc_12_2 = _mm_xor_si128(*calc_12_2, *calc_2_2); + *calc_8_2 = _mm_xor_si128(*calc_8_2, *calc_3_2); + *calc_7_2 = _mm_xor_si128(*calc_7_2, *calc_4_2); + *calc_16_3 = _mm_xor_si128(*calc_16_3, *calc_1_3); + *calc_12_3 = _mm_xor_si128(*calc_12_3, *calc_2_3); + *calc_8_3 = _mm_xor_si128(*calc_8_3, *calc_3_3); + *calc_7_3 = _mm_xor_si128(*calc_7_3, *calc_4_3); + + for (i = 0; i < double_rounds; i++) { + /* first row */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_16); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_16); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_16); + + /* second row */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 = _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (12)); + rowb_3 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 12)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + + /* third row */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_8); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_8); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_8); + + /* fourth row */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 
= _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (7)); + rowb_3 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 7)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + +// row a 0 1 2 3 0 1 2 3 +// row b 4 5 6 7 --> 5 6 7 4 +// row c 8 9 10 11 ..> 10 11 8 9 +// row d 12 13 14 15 15 12 13 14 + // transpose_matrix(row1, row2, row3, row4, row_to_column); + rowb_1 = _mm_shuffle_epi32(rowb_1,0x39); // 10 01 00 11 + rowc_1 = _mm_shuffle_epi32(rowc_1,0x4e); // 01 00 11 10 + rowd_1 = _mm_shuffle_epi32(rowd_1,0x93); // 00 11 10 01 + rowb_2 = _mm_shuffle_epi32(rowb_2,0x39); // 10 01 00 11 + rowc_2 = _mm_shuffle_epi32(rowc_2,0x4e); // 01 00 11 10 + rowd_2 = _mm_shuffle_epi32(rowd_2,0x93); // 00 11 10 01 + rowb_3 = _mm_shuffle_epi32(rowb_3,0x39); // 10 01 00 11 + rowc_3 = _mm_shuffle_epi32(rowc_3,0x4e); // 01 00 11 10 + rowd_3 = _mm_shuffle_epi32(rowd_3,0x93); // 00 11 10 01 + // end transpose + + /* first column */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_16); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_16); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_16); + + /* second column */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 = _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (12)); + rowb_3 = _mm_slli_epi32(_calc1, (12)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 12)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 12)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + + /* third column */ + rowa_1 = _mm_add_epi32(rowa_1, rowb_1); + rowa_2 = _mm_add_epi32(rowa_2, rowb_2); + rowa_3 = _mm_add_epi32(rowa_3, rowb_3); + rowd_1 = _mm_xor_si128(rowd_1, rowa_1); + rowd_2 = _mm_xor_si128(rowd_2, rowa_2); + rowd_3 = _mm_xor_si128(rowd_3, rowa_3); + rowd_1 = _mm_shuffle_epi8(rowd_1,_rotate_8); + rowd_2 = _mm_shuffle_epi8(rowd_2,_rotate_8); + rowd_3 = _mm_shuffle_epi8(rowd_3,_rotate_8); + + /* fourth column */ + rowc_1 = _mm_add_epi32(rowc_1, rowd_1); + rowc_2 = _mm_add_epi32(rowc_2, rowd_2); + _calc1 = _mm_xor_si128(rowb_1, rowc_1); + rowc_3 = _mm_add_epi32(rowc_3, rowd_3); + rowb_1 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_xor_si128(rowb_2, rowc_2); + rowb_1 = _mm_xor_si128(rowb_1, _calc1); + _calc1 = _mm_xor_si128(rowb_3, rowc_3); + rowb_2 = _mm_slli_epi32(_calc2, (7)); + rowb_3 = _mm_slli_epi32(_calc1, (7)); + _calc1 = _mm_srli_epi32(_calc1,(32 - 7)); + _calc2 = _mm_srli_epi32(_calc2,(32 - 7)); + rowb_2 = _mm_xor_si128(rowb_2, _calc2); + rowb_3 = _mm_xor_si128(rowb_3, _calc1); + // transpose_matrix(row1, row2, row3, row4, row_to_column); + rowb_1 = 
_mm_shuffle_epi32(rowb_1,0x93); + rowc_1 = _mm_shuffle_epi32(rowc_1,0x4e); + rowd_1 = _mm_shuffle_epi32(rowd_1,0x39); + rowb_2 = _mm_shuffle_epi32(rowb_2,0x93); + rowc_2 = _mm_shuffle_epi32(rowc_2,0x4e); + rowd_2 = _mm_shuffle_epi32(rowd_2,0x39); + rowb_3 = _mm_shuffle_epi32(rowb_3,0x93); + rowc_3 = _mm_shuffle_epi32(rowc_3,0x4e); + rowd_3 = _mm_shuffle_epi32(rowd_3,0x39); + // end transpose + } + rowa_1 = _mm_add_epi32(*calc_16_1, rowa_1); + rowb_1 = _mm_add_epi32(*calc_12_1, rowb_1); + rowc_1 = _mm_add_epi32( *calc_8_1, rowc_1); + rowd_1 = _mm_add_epi32( *calc_7_1, rowd_1); + rowa_2 = _mm_add_epi32(*calc_16_2, rowa_2); + rowb_2 = _mm_add_epi32(*calc_12_2, rowb_2); + rowc_2 = _mm_add_epi32( *calc_8_2, rowc_2); + rowd_2 = _mm_add_epi32( *calc_7_2, rowd_2); + rowa_3 = _mm_add_epi32(*calc_16_3, rowa_3); + rowb_3 = _mm_add_epi32(*calc_12_3, rowb_3); + rowc_3 = _mm_add_epi32( *calc_8_3, rowc_3); + rowd_3 = _mm_add_epi32( *calc_7_3, rowd_3); + + *calc_16_1 = *calc_1_1; + *calc_12_1 = *calc_2_1; + *calc_8_1 = *calc_3_1; + *calc_7_1 = *calc_4_1; + *calc_16_2= *calc_1_2; + *calc_12_2 = *calc_2_2; + *calc_8_2 = *calc_3_2; + *calc_7_2 = *calc_4_2; + *calc_16_3 = *calc_1_3; + *calc_12_3 = *calc_2_3; + *calc_8_3 = *calc_3_3; + *calc_7_3 = *calc_4_3; + + *calc_1_1 = rowa_1; + *calc_2_1 = rowb_1; + *calc_3_1 = rowc_1; + *calc_4_1 = rowd_1; + *calc_1_2 = rowa_2; + *calc_2_2 = rowb_2; + *calc_3_2 = rowc_2; + *calc_4_2 = rowd_2; + *calc_1_3 = rowa_3; + *calc_2_3 = rowb_3; + *calc_3_3 = rowc_3; + *calc_4_3 = rowd_3; +} + +static inline void chacha_core_r2_sidm_X3(__m128i *X_1, __m128i *X_2, __m128i *X_3, uint32_t Loops, uint32_t double_rounds) +{ + uint32_t i, j1, j2, j3; + __m128i scratch_1[Loops * 8 * 4]; + __m128i scratch_2[Loops * 8 * 4]; + __m128i scratch_3[Loops * 8 * 4]; + + // 1 + __m128i *calc_1_1 = (__m128i*) &X_1[0]; + __m128i *calc_2_1 = (__m128i*) &X_1[1]; + __m128i *calc_3_1 = (__m128i*) &X_1[2]; + __m128i *calc_4_1 = (__m128i*) &X_1[3]; + + __m128i *calc_11_1 = (__m128i*) &X_1[4]; + __m128i *calc_12_1 = (__m128i*) &X_1[5]; + __m128i *calc_13_1 = (__m128i*) &X_1[6]; + __m128i *calc_14_1 = (__m128i*) &X_1[7]; + + __m128i *calc_21_1 = (__m128i*) &X_1[8]; + __m128i *calc_22_1 = (__m128i*) &X_1[9]; + __m128i *calc_23_1 = (__m128i*) &X_1[10]; + __m128i *calc_24_1 = (__m128i*) &X_1[11]; + + __m128i *calc_31_1 = (__m128i*) &X_1[12]; + __m128i *calc_32_1 = (__m128i*) &X_1[13]; + __m128i *calc_33_1 = (__m128i*) &X_1[14]; + __m128i *calc_34_1 = (__m128i*) &X_1[15]; + // 2 + __m128i *calc_1_2 = (__m128i*) &X_2[0]; + __m128i *calc_2_2 = (__m128i*) &X_2[1]; + __m128i *calc_3_2 = (__m128i*) &X_2[2]; + __m128i *calc_4_2 = (__m128i*) &X_2[3]; + + __m128i *calc_11_2 = (__m128i*) &X_2[4]; + __m128i *calc_12_2 = (__m128i*) &X_2[5]; + __m128i *calc_13_2 = (__m128i*) &X_2[6]; + __m128i *calc_14_2 = (__m128i*) &X_2[7]; + + __m128i *calc_21_2 = (__m128i*) &X_2[8]; + __m128i *calc_22_2 = (__m128i*) &X_2[9]; + __m128i *calc_23_2 = (__m128i*) &X_2[10]; + __m128i *calc_24_2 = (__m128i*) &X_2[11]; + + __m128i *calc_31_2 = (__m128i*) &X_2[12]; + __m128i *calc_32_2 = (__m128i*) &X_2[13]; + __m128i *calc_33_2 = (__m128i*) &X_2[14]; + __m128i *calc_34_2 = (__m128i*) &X_2[15]; + // 3 + __m128i *calc_1_3 = (__m128i*) &X_3[0]; + __m128i *calc_2_3 = (__m128i*) &X_3[1]; + __m128i *calc_3_3 = (__m128i*) &X_3[2]; + __m128i *calc_4_3 = (__m128i*) &X_3[3]; + + __m128i *calc_11_3 = (__m128i*) &X_3[4]; + __m128i *calc_12_3 = (__m128i*) &X_3[5]; + __m128i *calc_13_3 = (__m128i*) &X_3[6]; + __m128i *calc_14_3 = (__m128i*) &X_3[7]; + + 
__m128i *calc_21_3 = (__m128i*) &X_3[8]; + __m128i *calc_22_3 = (__m128i*) &X_3[9]; + __m128i *calc_23_3 = (__m128i*) &X_3[10]; + __m128i *calc_24_3 = (__m128i*) &X_3[11]; + + __m128i *calc_31_3 = (__m128i*) &X_3[12]; + __m128i *calc_32_3 = (__m128i*) &X_3[13]; + __m128i *calc_33_3 = (__m128i*) &X_3[14]; + __m128i *calc_34_3 = (__m128i*) &X_3[15]; + + + for (i = 0; i < Loops; i++) { + scratch_1[i * 16 + 0] = *calc_1_1; scratch_1[i * 16 + 1] = *calc_2_1; + scratch_1[i * 16 + 2] = *calc_3_1; scratch_1[i * 16 + 3] = *calc_4_1; + scratch_2[i * 16 + 0] = *calc_1_2; scratch_2[i * 16 + 1] = *calc_2_2; + scratch_2[i * 16 + 2] = *calc_3_2; scratch_2[i * 16 + 3] = *calc_4_2; + scratch_3[i * 16 + 0] = *calc_1_3; scratch_3[i * 16 + 1] = *calc_2_3; + scratch_3[i * 16 + 2] = *calc_3_3; scratch_3[i * 16 + 3] = *calc_4_3; + + scratch_1[i * 16 + 12] = *calc_31_1; scratch_1[i * 16 + 13] = *calc_32_1; + scratch_1[i * 16 + 14] = *calc_33_1; scratch_1[i * 16 + 15] = *calc_34_1; + scratch_2[i * 16 + 12] = *calc_31_2; scratch_2[i * 16 + 13] = *calc_32_2; + scratch_2[i * 16 + 14] = *calc_33_2; scratch_2[i * 16 + 15] = *calc_34_2; + scratch_3[i * 16 + 12] = *calc_31_3; scratch_3[i * 16 + 13] = *calc_32_3; + scratch_3[i * 16 + 14] = *calc_33_3; scratch_3[i * 16 + 15] = *calc_34_3; + + xor_chacha_sidm_X3( calc_1_1, calc_2_1, calc_3_1, calc_4_1, calc_31_1,calc_32_1,calc_33_1,calc_34_1, + calc_1_2, calc_2_2, calc_3_2, calc_4_2, calc_31_2,calc_32_2,calc_33_2,calc_34_2, + calc_1_3, calc_2_3, calc_3_3, calc_4_3, calc_31_3,calc_32_3,calc_33_3,calc_34_3, + double_rounds); + + scratch_1[i * 16 + 4] = *calc_11_1; scratch_1[i * 16 + 5] = *calc_12_1; + scratch_1[i * 16 + 6] = *calc_13_1; scratch_1[i * 16 + 7] = *calc_14_1; + scratch_2[i * 16 + 4] = *calc_11_2; scratch_2[i * 16 + 5] = *calc_12_2; + scratch_2[i * 16 + 6] = *calc_13_2; scratch_2[i * 16 + 7] = *calc_14_2; + scratch_3[i * 16 + 4] = *calc_11_3; scratch_3[i * 16 + 5] = *calc_12_3; + scratch_3[i * 16 + 6] = *calc_13_3; scratch_3[i * 16 + 7] = *calc_14_3; + + xor_chacha_sidm_X3( calc_11_1, calc_12_1, calc_13_1, calc_14_1, calc_1_1,calc_2_1,calc_3_1,calc_4_1, + calc_11_2, calc_12_2, calc_13_2, calc_14_2, calc_1_2,calc_2_2,calc_3_2,calc_4_2, + calc_11_3, calc_12_3, calc_13_3, calc_14_3, calc_1_3,calc_2_3,calc_3_3,calc_4_3, + double_rounds); + + scratch_1[i * 16 + 8] = *calc_21_1; scratch_1[i * 16 + 9] = *calc_22_1; + scratch_1[i * 16 + 10] = *calc_23_1; scratch_1[i * 16 + 11] = *calc_24_1; + scratch_2[i * 16 + 8] = *calc_21_2; scratch_2[i * 16 + 9] = *calc_22_2; + scratch_2[i * 16 + 10] = *calc_23_2; scratch_2[i * 16 + 11] = *calc_24_2; + scratch_3[i * 16 + 8] = *calc_21_3; scratch_3[i * 16 + 9] = *calc_22_3; + scratch_3[i * 16 + 10] = *calc_23_3; scratch_3[i * 16 + 11] = *calc_24_3; + + xor_chacha_sidm_swap_X3( calc_21_1, calc_22_1, calc_23_1, calc_24_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_21_2, calc_22_2, calc_23_2, calc_24_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_21_3, calc_22_3, calc_23_3, calc_24_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); + + xor_chacha_sidm_X3( calc_31_1, calc_32_1, calc_33_1, calc_34_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_31_2, calc_32_2, calc_33_2, calc_34_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_31_3, calc_32_3, calc_33_3, calc_34_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); // swap calc_2x with calc_1x + } + + for (i = 0; i < Loops; i++) { + j1 = 16 * (_mm_extract_epi16(*calc_31_1,0x00) & (Loops-1)); + j2 = 16 * (_mm_extract_epi16(*calc_31_2,0x00) & (Loops-1)); + j3 
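+ /* Editor's note: this is scrypt's Integerify step. The low 16 bits of the
+  * first word of the last 64-byte block pick one of the N previously
+  * stored states; masking with (Loops - 1) assumes Loops is a power of
+  * two, which holds for the N = 128 profile this miner uses. */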
= 16 * (_mm_extract_epi16(*calc_31_3,0x00) & (Loops-1)); + + //1 + *calc_1_1 = _mm_xor_si128(*calc_1_1, scratch_1[j1]); + *calc_2_1 = _mm_xor_si128(*calc_2_1, scratch_1[j1+1]); + *calc_3_1 = _mm_xor_si128(*calc_3_1, scratch_1[j1+2]); + *calc_4_1 = _mm_xor_si128(*calc_4_1, scratch_1[j1+3]); + + *calc_31_1 = _mm_xor_si128(*calc_31_1, scratch_1[j1+12]); + *calc_32_1 = _mm_xor_si128(*calc_32_1, scratch_1[j1+13]); + *calc_33_1 = _mm_xor_si128(*calc_33_1, scratch_1[j1+14]); + *calc_34_1 = _mm_xor_si128(*calc_34_1, scratch_1[j1+15]); + //2 + *calc_1_2 = _mm_xor_si128(*calc_1_2, scratch_2[j2]); + *calc_2_2 = _mm_xor_si128(*calc_2_2, scratch_2[j2+1]); + *calc_3_2 = _mm_xor_si128(*calc_3_2, scratch_2[j2+2]); + *calc_4_2 = _mm_xor_si128(*calc_4_2, scratch_2[j2+3]); + + *calc_31_2 = _mm_xor_si128(*calc_31_2, scratch_2[j2+12]); + *calc_32_2 = _mm_xor_si128(*calc_32_2, scratch_2[j2+13]); + *calc_33_2 = _mm_xor_si128(*calc_33_2, scratch_2[j2+14]); + *calc_34_2 = _mm_xor_si128(*calc_34_2, scratch_2[j2+15]); + //3 + *calc_1_3 = _mm_xor_si128(*calc_1_3, scratch_3[j3]); + *calc_2_3 = _mm_xor_si128(*calc_2_3, scratch_3[j3+1]); + *calc_3_3 = _mm_xor_si128(*calc_3_3, scratch_3[j3+2]); + *calc_4_3 = _mm_xor_si128(*calc_4_3, scratch_3[j3+3]); + + *calc_31_3 = _mm_xor_si128(*calc_31_3, scratch_3[j3+12]); + *calc_32_3 = _mm_xor_si128(*calc_32_3, scratch_3[j3+13]); + *calc_33_3 = _mm_xor_si128(*calc_33_3, scratch_3[j3+14]); + *calc_34_3 = _mm_xor_si128(*calc_34_3, scratch_3[j3+15]); + + xor_chacha_sidm_X3( calc_1_1, calc_2_1, calc_3_1, calc_4_1, calc_31_1,calc_32_1,calc_33_1,calc_34_1, + calc_1_2, calc_2_2, calc_3_2, calc_4_2, calc_31_2,calc_32_2,calc_33_2,calc_34_2, + calc_1_3, calc_2_3, calc_3_3, calc_4_3, calc_31_3,calc_32_3,calc_33_3,calc_34_3, + double_rounds); + + //1 + *calc_11_1 = _mm_xor_si128(*calc_11_1, scratch_1[j1+4]); + *calc_12_1 = _mm_xor_si128(*calc_12_1, scratch_1[j1+5]); + *calc_13_1 = _mm_xor_si128(*calc_13_1, scratch_1[j1+6]); + *calc_14_1 = _mm_xor_si128(*calc_14_1, scratch_1[j1+7]); + //2 + *calc_11_2 = _mm_xor_si128(*calc_11_2, scratch_2[j2+4]); + *calc_12_2 = _mm_xor_si128(*calc_12_2, scratch_2[j2+5]); + *calc_13_2 = _mm_xor_si128(*calc_13_2, scratch_2[j2+6]); + *calc_14_2 = _mm_xor_si128(*calc_14_2, scratch_2[j2+7]); + //3 + *calc_11_3 = _mm_xor_si128(*calc_11_3, scratch_3[j3+4]); + *calc_12_3 = _mm_xor_si128(*calc_12_3, scratch_3[j3+5]); + *calc_13_3 = _mm_xor_si128(*calc_13_3, scratch_3[j3+6]); + *calc_14_3 = _mm_xor_si128(*calc_14_3, scratch_3[j3+7]); + + xor_chacha_sidm_X3( calc_11_1, calc_12_1, calc_13_1, calc_14_1, calc_1_1,calc_2_1,calc_3_1,calc_4_1, + calc_11_2, calc_12_2, calc_13_2, calc_14_2, calc_1_2,calc_2_2,calc_3_2,calc_4_2, + calc_11_3, calc_12_3, calc_13_3, calc_14_3, calc_1_3,calc_2_3,calc_3_3,calc_4_3, + double_rounds); + + //1 + *calc_21_1 = _mm_xor_si128(*calc_21_1, scratch_1[j1+8]); + *calc_22_1 = _mm_xor_si128(*calc_22_1, scratch_1[j1+9]); + *calc_23_1 = _mm_xor_si128(*calc_23_1, scratch_1[j1+10]); + *calc_24_1 = _mm_xor_si128(*calc_24_1, scratch_1[j1+11]); + //2 + *calc_21_2 = _mm_xor_si128(*calc_21_2, scratch_2[j2+8]); + *calc_22_2 = _mm_xor_si128(*calc_22_2, scratch_2[j2+9]); + *calc_23_2 = _mm_xor_si128(*calc_23_2, scratch_2[j2+10]); + *calc_24_2 = _mm_xor_si128(*calc_24_2, scratch_2[j2+11]); + //3 + *calc_21_3 = _mm_xor_si128(*calc_21_3, scratch_3[j3+8]); + *calc_22_3 = _mm_xor_si128(*calc_22_3, scratch_3[j3+9]); + *calc_23_3 = _mm_xor_si128(*calc_23_3, scratch_3[j3+10]); + *calc_24_3 = _mm_xor_si128(*calc_24_3, scratch_3[j3+11]); + + xor_chacha_sidm_swap_X3( 
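+ /* editor's note: the _swap variants exchange blocks as a side effect --
+  * in their feed-forward tail the *calc_16_x slots receive the old
+  * *calc_1_x inputs and *calc_1_x the new results -- which saves the
+  * explicit neoscrypt_blkswp copy the scalar code needs between mixes */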
calc_21_1, calc_22_1, calc_23_1, calc_24_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_21_2, calc_22_2, calc_23_2, calc_24_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_21_3, calc_22_3, calc_23_3, calc_24_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); + + xor_chacha_sidm_X3( calc_31_1, calc_32_1, calc_33_1, calc_34_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_31_2, calc_32_2, calc_33_2, calc_34_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_31_3, calc_32_3, calc_33_3, calc_34_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); // swap calc_2x with calc_1x + } + +} +//--------------------------------------------------------------------------------------------- +// end threefold +//--------------------------------------------------------------------------------------------- + + + + +static inline void xor_sidm(__m128i *dest, __m128i *src, uint32_t size) +{ + uint32_t i; + for (i=0; i< size; i++){ + dest[i] = _mm_xor_si128(dest[i], src[i]); + } +} + +#endif + diff --git a/cpu-miner.c b/cpu-miner.c index 32a8e241b..4f58df597 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1164,11 +1164,52 @@ static int scanhash_neoscrypt(int thr_id, uint *pdata, const uint *ptarget, pdata[19] += inc_nonce; } + *hashes_done = pdata[19] - inc_nonce - start_nonce; + return(0); +} + +static int scanhash_neoscrypt_X3(int thr_id, uint *pdata, const uint *ptarget, + uint max_nonce, ulong *hashes_done, uint profile) { + uint hash_X3[8*3] __attribute__ ((aligned (32))); + const uint targint = ptarget[7]; + uint start_nonce = pdata[19], inc_nonce = 3, k; + /* Load the password and increment nonces */ + while((pdata[19] < max_nonce) && !work_restart[thr_id].restart) { + + neoscrypt_X3((uchar *) pdata, (uchar *) hash_X3); + + /* Quick hash check */ + if(hash_X3[7] <= targint) { + /* Complete hash check */ + if(fulltest_le(&hash_X3[0], ptarget)) { + *hashes_done = pdata[19] - start_nonce; + return(1); + } + } + if(hash_X3[15] <= targint) { + /* Complete hash check */ + if(fulltest_le(&hash_X3[8], ptarget)) { + pdata[19] += 1; + *hashes_done = pdata[19] - start_nonce; + return(1); + } + } + if(hash_X3[23] <= targint) { + /* Complete hash check */ + if(fulltest_le(&hash_X3[16], ptarget)) { + pdata[19] += 2; + *hashes_done = pdata[19] - start_nonce; + return(1); + } + } + pdata[19] += inc_nonce; + } *hashes_done = pdata[19] - inc_nonce - start_nonce; return(0); } + static int scanhash_altscrypt(int thr_id, uint *pdata, const uint *ptarget, uint max_nonce, ulong *hashes_done, uint profile) { uint hash[8], data[20]; @@ -1447,7 +1488,7 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); else #endif - rc = scanhash_neoscrypt(thr_id, work.data, work.target, + rc = scanhash_neoscrypt_X3(thr_id, work.data, work.target, max_nonce, &hashes_done, opt_neoscrypt_profile); break; @@ -1657,7 +1698,7 @@ static void *stratum_thread(void *userdata) { struct thr_info *mythr = userdata; char *s; - + s = NULL; stratum.url = tq_pop(mythr->q, NULL); if (!stratum.url) goto out; diff --git a/neoscrypt.c b/neoscrypt.c index affaef58f..96f67e250 100644 --- a/neoscrypt.c +++ b/neoscrypt.c @@ -33,7 +33,9 @@ #include #include "neoscrypt.h" - +#include "salsa_20_sidm.c" +#include "chacha_20_sidm.c" +#include "blake2s_sidm.c" #if (SHA256) @@ -1056,14 +1058,13 @@ static const uint blake2s_IV_P_XOR[8] = { /* Performance optimised FastKDF with BLAKE2s integrated */ void neoscrypt_fastkdf_opt(const uchar *password, const uchar *salt, uchar *output, uint mode) { - const size_t stack_align = 0x40; 
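+ /* Editorial note on this hunk: the manual alignment dance (over-allocate
+  * by stack_align, round the pointer up) is replaced by
+  * __attribute__((aligned(32))), which already guarantees the 16-byte
+  * alignment the __m128i loads and stores require; 32 appears to be chosen
+  * simply to match the other aligned buffers in this patch. */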
uint bufptr, output_len, i, j; uchar *A, *B; uint *S; /* Align and set up the buffers in stack */ - uchar stack[788 + stack_align]; - A = (uchar *) (((size_t)stack & ~(stack_align - 1)) + stack_align); + uchar stack[788] __attribute__((aligned(32))); + A = (uchar *) (size_t)stack ; B = &A[320]; S = (uint *) &A[608]; @@ -1102,7 +1103,8 @@ void neoscrypt_fastkdf_opt(const uchar *password, const uchar *salt, /* BLAKE2s: compress */ S[8] = 64; - blake2s_compress((blake2s_state *) S); +// blake2s_compress((blake2s_state *) S); + blake2_compress_sidm((blake2s_state_sidm *) S); S[44] = 64; neoscrypt_copy(&S[12], &S[28], 64); @@ -1110,7 +1112,8 @@ void neoscrypt_fastkdf_opt(const uchar *password, const uchar *salt, S[8] = 128; S[10] = ~0U; neoscrypt_erase(&S[28], 64); - blake2s_compress((blake2s_state *) S); +// blake2s_compress((blake2s_state *) S); + blake2_compress_sidm((blake2s_state_sidm *) S); for(j = 0, bufptr = 0; j < 8; j++) { bufptr += S[j]; @@ -1143,9 +1146,236 @@ void neoscrypt_fastkdf_opt(const uchar *password, const uchar *salt, } -#endif /* (OPT) */ +inline void neoscrypt_fastkdf_opt_X3(const uchar *password_1, const uchar *password_2, const uchar *password_3, + const uchar *salt_1, const uchar *salt_2, const uchar *salt_3, + uchar *output_1, uchar *output_2, uchar *output_3, uint mode) { + uint output_len, i, j; + uint bufptr_1, bufptr_2, bufptr_3; + uchar *A_1, *A_2, *A_3, *B_1, *B_2, *B_3; + uint *S_1, *S_2, *S_3; + uint32_t *prtptr_1, *prtptr_2, *prtptr_3; + + /* Align and set up the buffers in stack */ + uchar stack_1[788] __attribute__((aligned(0x80))); + uchar stack_2[788] __attribute__((aligned(0x80))); + uchar stack_3[788] __attribute__((aligned(0x80))); + A_1 = (uchar *) (size_t)stack_1 ; + A_2 = (uchar *) (size_t)stack_2 ; + A_3 = (uchar *) (size_t)stack_3 ; + B_1 = &A_1[320]; + B_2 = &A_2[320]; + B_3 = &A_3[320]; + S_1 = (uint *) &A_1[608]; + S_2 = (uint *) &A_2[608]; + S_3 = (uint *) &A_3[608]; + prtptr_1 = (uint32_t *) &S_1; + prtptr_2 = (uint32_t *) &S_2; + prtptr_3 = (uint32_t *) &S_3; + + memcpy( &A_1[0], &password_1[0], 80); + memcpy( &A_1[80], &password_1[0], 80); + memcpy(&A_1[160], &password_1[0], 80); + memcpy(&A_1[240], &password_1[0], 16); + memcpy(&A_1[256], &password_1[0], 64); + + memcpy( &A_2[0], &password_2[0], 80); + memcpy( &A_2[80], &password_2[0], 80); + memcpy(&A_2[160], &password_2[0], 80); + memcpy(&A_2[240], &password_2[0], 16); + memcpy(&A_2[256], &password_2[0], 64); + + memcpy( &A_3[0], &password_3[0], 80); + memcpy( &A_3[80], &password_3[0], 80); + memcpy(&A_3[160], &password_3[0], 80); + memcpy(&A_3[240], &password_3[0], 16); + memcpy(&A_3[256], &password_3[0], 64); + + if(!mode) { + output_len = 256; + memcpy(&B_1[0], &salt_1[0], 80); + memcpy(&B_1[80], &salt_1[0], 80); + memcpy(&B_1[160], &salt_1[0], 80); + memcpy(&B_1[240], &salt_1[0], 16); + memcpy(&B_1[256], &salt_1[0], 32); + memcpy(&B_2[0], &salt_2[0], 80); + memcpy(&B_2[80], &salt_2[0], 80); + memcpy(&B_2[160], &salt_2[0], 80); + memcpy(&B_2[240], &salt_2[0], 16); + memcpy(&B_2[256], &salt_2[0], 32); + memcpy(&B_3[0], &salt_3[0], 80); + memcpy(&B_3[80], &salt_3[0], 80); + memcpy(&B_3[160], &salt_3[0], 80); + memcpy(&B_3[240], &salt_3[0], 16); + memcpy(&B_3[256], &salt_3[0], 32); + } else { + output_len = 32; + memcpy(&B_1[0], &salt_1[0], 256); + memcpy(&B_1[256], &salt_1[0], 32); + memcpy(&B_2[0], &salt_2[0], 256); // check offset here its big for neoscrypt + memcpy(&B_2[256], &salt_2[0], 32); + memcpy(&B_3[0], &salt_3[0], 256); + memcpy(&B_3[256], &salt_3[0], 32); + } + + for(i = 0, 
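+ /* FastKDF outline (editor's summary of this loop): 32 rounds, each
+  * compressing 64 password bytes and 32 bytes of the 256-byte salt ring
+  * buffer through two BLAKE2s compressions per stream; the byte sum of the
+  * resulting state, masked to 0xFF, selects the next buffer offset, so the
+  * access pattern is data dependent. */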
bufptr_1 = 0, bufptr_2 = 0, bufptr_3 = 0; i < 32; i++) { + + /* BLAKE2s: initialise */ + memcpy(&S_1[0], blake2s_IV_P_XOR, 32); + memcpy(&S_2[0], blake2s_IV_P_XOR, 32); + memcpy(&S_3[0], blake2s_IV_P_XOR, 32); +// neoscrypt_copy(&S[0], blake2s_IV_P_XOR, 32); + neoscrypt_erase(&S_1[8], 16); + neoscrypt_erase(&S_2[8], 16); + neoscrypt_erase(&S_3[8], 16); + + /* BLAKE2s: update key */ + memcpy(&S_1[12], &B_1[bufptr_1], 32); + memcpy(&S_2[12], &B_2[bufptr_2], 32); + memcpy(&S_3[12], &B_3[bufptr_3], 32); +// neoscrypt_copy(&S[12], &B[bufptr], 32); + //neoscrypt_erase(&S[20], 32); + memset(&S_1[20],0, 32); + memset(&S_2[20],0, 32); + memset(&S_3[20],0, 32); + + /* BLAKE2s: update input */ + memcpy(&S_1[28], &A_1[bufptr_1], 64); + memcpy(&S_2[28], &A_2[bufptr_2], 64); + memcpy(&S_3[28], &A_3[bufptr_3], 64); +// neoscrypt_copy(&S[28], &A[bufptr], 64); + S_1[44] = 128; + S_2[44] = 128; + S_3[44] = 128; + + /* BLAKE2s: compress */ + S_1[8] = 64; + S_2[8] = 64; + S_3[8] = 64; +// applog(LOG_DEBUG, "S[0] = %08x", prtptr[0]); + + blake2_compress_sidm_X3((blake2s_state_sidm *) S_1, (blake2s_state_sidm *) S_2, (blake2s_state_sidm *) S_3); +// blake2s_compress((blake2s_state *) S_1); +// blake2s_compress((blake2s_state *) S_2); +// blake2s_compress((blake2s_state *) S_3); + + S_1[44] = 64; + S_2[44] = 64; + S_3[44] = 64; + memcpy(&S_1[12], &S_1[28], 64); + memcpy(&S_2[12], &S_2[28], 64); + memcpy(&S_3[12], &S_3[28], 64); +// neoscrypt_copy(&S[12], &S[28], 64); + + /* BLAKE2s: compress again */ + S_1[8] = 128; + S_2[8] = 128; + S_3[8] = 128; + S_1[10] = ~0U; + S_2[10] = ~0U; + S_3[10] = ~0U; + neoscrypt_erase(&S_1[28], 64); + neoscrypt_erase(&S_2[28], 64); + neoscrypt_erase(&S_3[28], 64); + + blake2_compress_sidm_X3((blake2s_state_sidm *) S_1, (blake2s_state_sidm *) S_2, (blake2s_state_sidm *) S_3); +// blake2s_compress((blake2s_state *) S_1); +// blake2s_compress((blake2s_state *) S_2); +// blake2s_compress((blake2s_state *) S_3); +// 1 + for(j = 0, bufptr_1 = 0; j < 8; j++) { + bufptr_1 += S_1[j]; + bufptr_1 += (S_1[j] >> 8); + bufptr_1 += (S_1[j] >> 16); + bufptr_1 += (S_1[j] >> 24); + } + bufptr_1 &= 0xFF; + + neoscrypt_xor(&B_1[bufptr_1], &S_1[0], 32); + + if(bufptr_1 < 32){ + memcpy(&B_1[256 + bufptr_1], &B_1[bufptr_1], 32 - bufptr_1); + } + if(bufptr_1 > 224){ + memcpy(&B_1[0], &B_1[256], bufptr_1 - 224); + } +// 2 + for(j = 0, bufptr_2 = 0; j < 8; j++) { + bufptr_2 += S_2[j]; + bufptr_2 += (S_2[j] >> 8); + bufptr_2 += (S_2[j] >> 16); + bufptr_2 += (S_2[j] >> 24); + } + bufptr_2 &= 0xFF; + + neoscrypt_xor(&B_2[bufptr_2], &S_2[0], 32); + + if(bufptr_2 < 32){ + memcpy(&B_2[256 + bufptr_2], &B_2[bufptr_2], 32 - bufptr_2); +// neoscrypt_copy(&B[256 + bufptr], &B[bufptr], 32 - bufptr); + } + if(bufptr_2 > 224){ + memcpy(&B_2[0], &B_2[256], bufptr_2 - 224); +// neoscrypt_copy(&B[0], &B[256], bufptr - 224); + } +// 3 + for(j = 0, bufptr_3 = 0; j < 8; j++) { + bufptr_3 += S_3[j]; + bufptr_3 += (S_3[j] >> 8); + bufptr_3 += (S_3[j] >> 16); + bufptr_3 += (S_3[j] >> 24); + } + bufptr_3 &= 0xFF; + + neoscrypt_xor(&B_3[bufptr_3], &S_3[0], 32); + + if(bufptr_3 < 32){ + memcpy(&B_3[256 + bufptr_3], &B_3[bufptr_3], 32 - bufptr_3); +// neoscrypt_copy(&B[256 + bufptr], &B[bufptr], 32 - bufptr); + } + if(bufptr_3 > 224){ + memcpy(&B_3[0], &B_3[256], bufptr_3 - 224); +// neoscrypt_copy(&B[0], &B[256], bufptr - 224); + } + + } +// 1 + i = 256 - bufptr_1; + if(i >= output_len) { + neoscrypt_xor(&B_1[bufptr_1], &A_1[0], output_len); + memcpy(&output_1[0], &B_1[bufptr_1], output_len); + } else { + 
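+ /* Editor's note: ring-buffer wraparound. When fewer than output_len bytes
+  * remain before the end of the 256-byte window (i = 256 - bufptr), the
+  * output is assembled from the tail B[bufptr..255] plus the head B[0..],
+  * one XOR/copy pair for each part. */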
neoscrypt_xor(&B_1[bufptr_1], &A_1[0], i); + neoscrypt_xor(&B_1[0], &A_1[i], output_len - i); + memcpy(&output_1[0], &B_1[bufptr_1], i); + memcpy(&output_1[i], &B_1[0], output_len - i); + } +//2 + i = 256 - bufptr_2; + if(i >= output_len) { + neoscrypt_xor(&B_2[bufptr_2], &A_2[0], output_len); + memcpy(&output_2[0], &B_2[bufptr_2], output_len); + } else { + neoscrypt_xor(&B_2[bufptr_2], &A_2[0], i); + neoscrypt_xor(&B_2[0], &A_2[i], output_len - i); + memcpy(&output_2[0], &B_2[bufptr_2], i); + memcpy(&output_2[i], &B_2[0], output_len - i); + } +// 3 + i = 256 - bufptr_3; + if(i >= output_len) { + neoscrypt_xor(&B_3[bufptr_3], &A_3[0], output_len); + memcpy(&output_3[0], &B_3[bufptr_3], output_len); + } else { + neoscrypt_xor(&B_3[bufptr_3], &A_3[0], i); + neoscrypt_xor(&B_3[0], &A_3[i], output_len - i); + memcpy(&output_3[0], &B_3[bufptr_3], i); + memcpy(&output_3[i], &B_3[0], output_len - i); + } +} +#endif /* (OPT) */ + #if !(ASM) /* Configurable optimised block mixer */ @@ -1252,10 +1482,11 @@ static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { * 11110 = N of 2147483648; * profile bits 30 to 13 are reserved */ void neoscrypt(const uchar *password, uchar *output, uint profile) { - const size_t stack_align = 0x40; +// const size_t stack_align = 0x40; uint N = 128, r = 2, dblmix = 1, mixmode = 0x14; uint kdf, i, j; - uint *X, *Y, *Z, *V; + uint *X, *Z; +// uint *X, *Y, *Z, *V; if(profile & 0x1) { N = 1024; /* N = (1 << (Nfactor + 1)); */ @@ -1269,15 +1500,15 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { r = (1 << ((profile >> 5) & 0x7)); } - uchar stack[(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align]; + uchar stack[N * r * 2 * SCRYPT_BLOCK_SIZE] __attribute__((aligned(32))); /* X = r * 2 * SCRYPT_BLOCK_SIZE */ - X = (uint *) (((size_t)stack & ~(stack_align - 1)) + stack_align); + X = (uint *) ((size_t)stack ); /* Z is a copy of X for ChaCha */ Z = &X[32 * r]; /* Y is an X sized temporal space */ - Y = &X[64 * r]; + // Y = &X[64 * r]; /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ - V = &X[96 * r]; + // V = &X[96 * r]; /* X = KDF(password, salt) */ kdf = (profile >> 1) & 0xF; @@ -1314,6 +1545,11 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { if(dblmix) { /* blkcpy(Z, X) */ +#if 1 + memcpy(Z, X, r * 2 * SCRYPT_BLOCK_SIZE); +// neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + chacha_core_r2_sidm((__m128i *)Z, N, (mixmode & 0xFF)/2); +#else neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); /* Z = SMix(Z) */ @@ -1332,8 +1568,11 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { /* blkmix(Z, Y) */ neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); } +#endif } - +#if 1 + scrypt_core_r2_sidm((__m128i *) X, N, (mixmode & 0xFF) /2); +#else /* X = SMix(X) */ for(i = 0; i < N; i++) { /* blkcpy(V, X) */ @@ -1349,10 +1588,11 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { /* blkmix(X, Y) */ neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); } - +#endif if(dblmix) /* blkxor(X, Z) */ - neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + xor_sidm((__m128i*)X,(__m128i*)Z, 16); +// neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); /* output = KDF(password, X) */ switch(kdf) { @@ -1385,6 +1625,81 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { } +void neoscrypt_X3(const uchar *password, uchar *output) { + const size_t stack_align = 0x40; + uint N = 128, r = 2, dblmix = 1, mixmode = 0x14; + uint double_rounds; + uint i, j, k, kdf; + uint 
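+ /* neoscrypt_X3 (editor's summary): hashes three consecutive nonces per
+  * call so the three independent Salsa/ChaCha streams can be interleaved
+  * in the _X3 mixers, presumably to hide SSE instruction latency; each
+  * stream owns one (N + 3) * r * 2 * SCRYPT_BLOCK_SIZE arena carved from
+  * the stack buffer declared below. */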
*X_1, *Y_1, *Z_1, *V_1; + uint *X_2, *Y_2, *Z_2, *V_2; + uint *X_3, *Y_3, *Z_3, *V_3; + double_rounds = (mixmode & 0xFF)/2; + uchar passwd_buf[3*80] __attribute__ ((aligned (64))); + uint *passwd=(uint*)passwd_buf; + + uchar stack[3 * (N + 3) * r * 2 * SCRYPT_BLOCK_SIZE] __attribute__ ((aligned (64))); + X_1 = (uint *) &stack[0]; + X_2 = (uint *) &stack[(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE]; + X_3 = (uint *) &stack[2*(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE]; + /* Z is a copy of X for ChaCha */ + Z_1 = &X_1[32 * r]; + Z_2 = &X_2[32 * r]; + Z_3 = &X_3[32 * r]; + /* Y is an X sized temporal space */ + Y_1 = &X_1[64 * r]; + Y_2 = &X_2[64 * r]; + Y_3 = &X_3[64 * r]; + /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + V_1 = &X_1[96 * r]; + V_2 = &X_2[96 * r]; + V_3 = &X_3[96 * r]; + + /* Load the password and increment nonces */ + for(k = 0; k < 3; k++) { + memcpy(&passwd[k * 20], password, 80); + passwd[(k + 1) * 20 - 1] += k; + } +#if 1 + neoscrypt_fastkdf_opt(&passwd_buf[0], &passwd_buf[0], (uchar *) X_1, 0); + neoscrypt_fastkdf_opt(&passwd_buf[80], &passwd_buf[80], (uchar *) X_2, 0); + neoscrypt_fastkdf_opt(&passwd_buf[160], &passwd_buf[160], (uchar *) X_3, 0); +#else + neoscrypt_fastkdf_opt_X3(&passwd_buf[0], &passwd_buf[80],&passwd_buf[160], + &passwd_buf[0], &passwd_buf[80],&passwd_buf[160], + (uchar *) X_1, (uchar *) X_2, (uchar *) X_3, 0); +#endif + /* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */ + + memcpy(Z_1, X_1, r * 2 * SCRYPT_BLOCK_SIZE); + memcpy(Z_2, X_2, r * 2 * SCRYPT_BLOCK_SIZE); + memcpy(Z_3, X_3, r * 2 * SCRYPT_BLOCK_SIZE); + +// chacha_core_r2_sidm((__m128i *)Z_1, N, double_rounds); +// chacha_core_r2_sidm((__m128i *)Z_2, N, double_rounds); +// chacha_core_r2_sidm((__m128i *)Z_3, N, double_rounds); + chacha_core_r2_sidm_X3((__m128i *)Z_1,(__m128i *)Z_2,(__m128i *)Z_3, N, double_rounds); + +// scrypt_core_r2_sidm((__m128i *) X_1, N, double_rounds); +// scrypt_core_r2_sidm((__m128i *) X_2, N, double_rounds); +// scrypt_core_r2_sidm((__m128i *) X_3, N, double_rounds); + scrypt_core_r2_sidm_X3((__m128i *) X_1,(__m128i *) X_2,(__m128i *) X_3, N, double_rounds); + /* blkxor(X, Z) */ + + xor_sidm((__m128i*)X_1,(__m128i*)Z_1, 16); + xor_sidm((__m128i*)X_2,(__m128i*)Z_2, 16); + xor_sidm((__m128i*)X_3,(__m128i*)Z_3, 16); + +#if 1 + neoscrypt_fastkdf_opt(&passwd_buf[0], (uchar *) X_1, output, 1); + neoscrypt_fastkdf_opt(&passwd_buf[80], (uchar *) X_2, output+32, 1); + neoscrypt_fastkdf_opt(&passwd_buf[160], (uchar *) X_3, output+64, 1); +#else + neoscrypt_fastkdf_opt_X3(&passwd_buf[0], &passwd_buf[80],&passwd_buf[160], + (uchar *) X_1, (uchar *) X_2, (uchar *) X_3, + output, output+32, output+64, 1); +#endif + +} #endif /* !(ASM) */ @@ -1593,7 +1908,6 @@ void neoscrypt_4way(const uchar *password, uchar *output, uint profile) { neoscrypt_xor_salsa_4way(&X[192], &X[128], &Y[0], double_rounds); neoscrypt_blkswp(&X[64], &X[128], r * 128); } - neoscrypt_blkxor(&X[0], &Z[0], 4 * r * 128); neoscrypt_unpack_4way(&Y[0], &X[0], 4 * r * 128); diff --git a/neoscrypt.h b/neoscrypt.h index 32990a6ec..26ae17b83 100644 --- a/neoscrypt.h +++ b/neoscrypt.h @@ -2,6 +2,8 @@ extern "C" { #endif +void neoscrypt_X3(const unsigned char *input, unsigned char *output); + void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); diff --git a/salsa_20_sidm.c b/salsa_20_sidm.c new file mode 100644 index 000000000..6b7f47c42 --- /dev/null +++ b/salsa_20_sidm.c @@ -0,0 +1,1841 @@ +/* + * Copyright 2013 gerko.deroo@kangaderoo.nl + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#if defined(__x86_64__) + +static inline void xor_salsa_sidm(__m128i *calc_18, __m128i *calc_13, __m128i *calc_9, __m128i *calc_7, + __m128i *calc_1, __m128i *calc_4, __m128i *calc_3, __m128i *calc_2, + uint32_t double_rounds) +{ + int i; + __m128i _calc; + __m128i _shift_left; + __m128i row1 = _mm_xor_si128(*calc_18, *calc_1);; + __m128i row2 = _mm_xor_si128(*calc_7, *calc_2);; + __m128i row3 = _mm_xor_si128(*calc_9, *calc_3);; + __m128i row4 = _mm_xor_si128(*calc_13, *calc_4);; + + *calc_18 = _mm_xor_si128(*calc_18, *calc_1); + *calc_7 = _mm_xor_si128(*calc_7, *calc_2); + *calc_9 = _mm_xor_si128(*calc_9, *calc_3); + *calc_13 = _mm_xor_si128(*calc_13, *calc_4); + + for (i = 0; i < double_rounds; i++) { + /* first row */ + _calc = _mm_add_epi32(row1, row4); + _shift_left = _mm_slli_epi32(_calc, 7); + _calc = _mm_srli_epi32(_calc,(32 - 7)); + row2 = _mm_xor_si128(row2, _shift_left); + row2 = _mm_xor_si128(row2, _calc); + + /* second row */ + _calc = _mm_add_epi32(row2, row1); + _shift_left = _mm_slli_epi32(_calc, 9); + _calc = _mm_srli_epi32(_calc,(32 - 9)); + row3 = _mm_xor_si128(row3, _shift_left); + row3 = _mm_xor_si128(row3, _calc); + + /* third row */ + _calc = _mm_add_epi32(row3, row2); + _shift_left = _mm_slli_epi32(_calc, 13); + _calc = _mm_srli_epi32(_calc,(32 - 13)); + row4 = _mm_xor_si128(row4, _shift_left); + row4 = _mm_xor_si128(row4, _calc); + + /* fourth row */ + _calc = _mm_add_epi32(row4, row3); + _shift_left = _mm_slli_epi32(_calc, 18); + _calc = _mm_srli_epi32(_calc,(32 - 18)); + row1 = _mm_xor_si128(row1, _shift_left); + row1 = _mm_xor_si128(row1, _calc); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row2 = _mm_shuffle_epi32(row2,0x93); + row3 = _mm_shuffle_epi32(row3,0x4e); + row4 = _mm_shuffle_epi32(row4,0x39); + // end transpose + + // switch *calc_13 and * calc_7 usage compared to rows + /* first column */ + _calc = _mm_add_epi32(row1, row2); + _shift_left = _mm_slli_epi32(_calc, 7); + _calc = _mm_srli_epi32(_calc,(32 - 7)); + row4 = _mm_xor_si128(row4, _shift_left); + row4 = _mm_xor_si128(row4, _calc); + + /* second column */ + _calc = _mm_add_epi32(row4, row1); + _shift_left = _mm_slli_epi32(_calc, 9); + _calc = 
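+ /* Editor's note: SSE2 has no 32-bit rotate, so every Salsa20 rotation is
+  * built from a pair of shifts that are both XORed into the target row;
+  * the shifted halves occupy disjoint bits, so XOR equals OR and the net
+  * effect is rotN(x) = (x << N) | (x >> (32 - N)), here with N = 9. */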
_mm_srli_epi32(_calc,(32 - 9)); + row3 = _mm_xor_si128(row3, _shift_left); + row3 = _mm_xor_si128(row3, _calc); + + /* third column */ + _calc = _mm_add_epi32(row3, row4); + _shift_left = _mm_slli_epi32(_calc, 13); + _calc = _mm_srli_epi32(_calc,(32 - 13)); + row2 = _mm_xor_si128(row2, _shift_left); + row2 = _mm_xor_si128(row2, _calc); + + /* fourth column */ + _calc = _mm_add_epi32(row2, row3); + _shift_left = _mm_slli_epi32(_calc, 18); + _calc = _mm_srli_epi32(_calc,(32 - 18)); + row1 = _mm_xor_si128(row1, _shift_left); + row1 = _mm_xor_si128(row1, _calc); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row2 = _mm_shuffle_epi32(row2,0x39); + row3 = _mm_shuffle_epi32(row3,0x4e); + row4 = _mm_shuffle_epi32(row4,0x93); + // end transpose + } + *calc_18 = _mm_add_epi32(*calc_18,row1); + *calc_7 = _mm_add_epi32(*calc_7, row2); + *calc_9 = _mm_add_epi32(*calc_9, row3); + *calc_13 = _mm_add_epi32(*calc_13, row4); +} + +static inline void xor_salsa_sidm_swap(__m128i *calc_18, __m128i *calc_13, __m128i *calc_9, __m128i *calc_7, + __m128i *calc_1, __m128i *calc_4, __m128i *calc_3, __m128i *calc_2, + uint32_t double_rounds) +{ + int i; + __m128i _calc; + __m128i _shift_left; + __m128i row1 = _mm_xor_si128(*calc_18, *calc_1);; + __m128i row2 = _mm_xor_si128(*calc_7, *calc_2);; + __m128i row3 = _mm_xor_si128(*calc_9, *calc_3);; + __m128i row4 = _mm_xor_si128(*calc_13, *calc_4);; + + *calc_18 = _mm_xor_si128(*calc_18, *calc_1); + *calc_7 = _mm_xor_si128(*calc_7, *calc_2); + *calc_9 = _mm_xor_si128(*calc_9, *calc_3); + *calc_13 = _mm_xor_si128(*calc_13, *calc_4); + + for (i = 0; i < double_rounds; i++) { + /* first row */ + _calc = _mm_add_epi32(row1, row4); + _shift_left = _mm_slli_epi32(_calc, 7); + _calc = _mm_srli_epi32(_calc,(32 - 7)); + row2 = _mm_xor_si128(row2, _calc); + row2 = _mm_xor_si128(row2, _shift_left); + + /* second row */ + _calc = _mm_add_epi32(row2, row1); + _shift_left = _mm_slli_epi32(_calc, 9); + _calc = _mm_srli_epi32(_calc,(32 - 9)); + row3 = _mm_xor_si128(row3, _calc); + row3 = _mm_xor_si128(row3, _shift_left); + + /* third row */ + _calc = _mm_add_epi32(row3, row2); + _shift_left = _mm_slli_epi32(_calc, 13); + _calc = _mm_srli_epi32(_calc,(32 - 13)); + row4 = _mm_xor_si128(row4, _calc); + row4 = _mm_xor_si128(row4, _shift_left); + + /* fourth row */ + _calc = _mm_add_epi32(row4, row3); + _shift_left = _mm_slli_epi32(_calc, 18); + _calc = _mm_srli_epi32(_calc,(32 - 18)); + row1 = _mm_xor_si128(row1, _calc); + row1 = _mm_xor_si128(row1, _shift_left); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row2 = _mm_shuffle_epi32(row2,0x93); + row3 = _mm_shuffle_epi32(row3,0x4e); + row4 = _mm_shuffle_epi32(row4,0x39); + // end transpose + + // switch *calc_13 and * calc_7 usage compared to rows + /* first column */ + _calc = _mm_add_epi32(row1, row2); + _shift_left = _mm_slli_epi32(_calc, 7); + _calc = _mm_srli_epi32(_calc,(32 - 7)); + row4 = _mm_xor_si128(row4, _calc); + row4 = _mm_xor_si128(row4, _shift_left); + + /* second column */ + _calc = _mm_add_epi32(row4, row1); + _shift_left = _mm_slli_epi32(_calc, 9); + _calc = _mm_srli_epi32(_calc,(32 - 9)); + row3 = _mm_xor_si128(row3, _calc); + row3 = _mm_xor_si128(row3, _shift_left); + + /* third column */ + _calc = _mm_add_epi32(row3, row4); + _shift_left = _mm_slli_epi32(_calc, 13); + _calc = _mm_srli_epi32(_calc,(32 - 13)); + row2 = _mm_xor_si128(row2, _calc); + row2 = _mm_xor_si128(row2, _shift_left); + + /* fourth column */ + _calc = _mm_add_epi32(row2, row3); + _shift_left = 
_mm_slli_epi32(_calc, 18); + _calc = _mm_srli_epi32(_calc,(32 - 18)); + row1 = _mm_xor_si128(row1, _calc); + row1 = _mm_xor_si128(row1, _shift_left); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row2 = _mm_shuffle_epi32(row2,0x39); + row3 = _mm_shuffle_epi32(row3,0x4e); + row4 = _mm_shuffle_epi32(row4,0x93); + // end transpose + } + + row1 = _mm_add_epi32(*calc_18,row1); + row2 = _mm_add_epi32(*calc_7, row2); + row3 = _mm_add_epi32(*calc_9, row3); + row4 = _mm_add_epi32(*calc_13, row4); + + *calc_18 = *calc_1; + *calc_7 = *calc_2; + *calc_9 = *calc_3; + *calc_13 = *calc_4; + + *calc_1 = row1; + *calc_2 = row2; + *calc_3 = row3; + *calc_4 = row4; +} + +static inline void scrypt_core_r2_sidm(__m128i *X , uint32_t Loops, uint32_t double_rounds) +{ + uint32_t i, j; + __m128i scratch[Loops * 8 * 4]; + + __m128i *calc_1 = (__m128i*) &X[0]; + __m128i *calc_2 = (__m128i*) &X[1]; + __m128i *calc_3 = (__m128i*) &X[2]; + __m128i *calc_4 = (__m128i*) &X[3]; + + __m128i *calc_11 = (__m128i*) &X[4]; + __m128i *calc_12 = (__m128i*) &X[5]; + __m128i *calc_13 = (__m128i*) &X[6]; + __m128i *calc_14 = (__m128i*) &X[7]; + + __m128i *calc_21 = (__m128i*) &X[8]; + __m128i *calc_22 = (__m128i*) &X[9]; + __m128i *calc_23 = (__m128i*) &X[10]; + __m128i *calc_24 = (__m128i*) &X[11]; + + __m128i *calc_31 = (__m128i*) &X[12]; + __m128i *calc_32 = (__m128i*) &X[13]; + __m128i *calc_33 = (__m128i*) &X[14]; + __m128i *calc_34 = (__m128i*) &X[15]; + + __m128i _calc5; + __m128i _calc6; + __m128i _calc7; + __m128i _calc8; + + /* transpose the data from *X */ + _calc5 =_mm_blend_epi16(*calc_31, *calc_33, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32, *calc_34, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33, *calc_31, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34, *calc_32, 0x0f); + *calc_31 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21, *calc_23, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22, *calc_24, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23, *calc_21, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24, *calc_22, 0x0f); + *calc_21 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11, *calc_13, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12, *calc_14, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13, *calc_11, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14, *calc_12, 0x0f); + *calc_11 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_13 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1, *calc_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2, *calc_4, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3, *calc_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4, *calc_2, 0x0f); + *calc_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + for (i = 0; i < Loops; i++) { + scratch[i * 16 + 0] = *calc_1; scratch[i * 16 + 1] = *calc_2; + scratch[i * 16 + 2] = *calc_3; scratch[i * 16 + 3] = *calc_4; + scratch[i * 16 + 4] = *calc_11; scratch[i * 16 + 5] = *calc_12; + scratch[i * 16 + 6] = *calc_13; scratch[i * 16 + 7] = *calc_14; + scratch[i * 
16 + 8] = *calc_21; scratch[i * 16 + 9] = *calc_22; + scratch[i * 16 + 10] = *calc_23; scratch[i * 16 + 11] = *calc_24; + scratch[i * 16 + 12] = *calc_31; scratch[i * 16 + 13] = *calc_32; + scratch[i * 16 + 14] = *calc_33; scratch[i * 16 + 15] = *calc_34; + + xor_salsa_sidm( calc_1, calc_2, calc_3, calc_4,calc_31,calc_32,calc_33,calc_34, double_rounds); + xor_salsa_sidm(calc_11,calc_12,calc_13,calc_14, calc_1, calc_2, calc_3, calc_4, double_rounds); + xor_salsa_sidm_swap(calc_21,calc_22,calc_23,calc_24, calc_11, calc_12, calc_13, calc_14, double_rounds); + xor_salsa_sidm(calc_31,calc_32,calc_33,calc_34, calc_11, calc_12, calc_13, calc_14, double_rounds); + } + + for (i = 0; i < Loops; i++) { + j = 16 * (_mm_extract_epi16(*calc_31,0x00) & (Loops-1)); + + *calc_1 = _mm_xor_si128(*calc_1, scratch[j]); + *calc_2 = _mm_xor_si128(*calc_2, scratch[j+1]); + *calc_3 = _mm_xor_si128(*calc_3, scratch[j+2]); + *calc_4 = _mm_xor_si128(*calc_4, scratch[j+3]); + *calc_11 = _mm_xor_si128(*calc_11, scratch[j+4]); + *calc_12 = _mm_xor_si128(*calc_12, scratch[j+5]); + *calc_13 = _mm_xor_si128(*calc_13, scratch[j+6]); + *calc_14 = _mm_xor_si128(*calc_14, scratch[j+7]); + *calc_21 = _mm_xor_si128(*calc_21, scratch[j+8]); + *calc_22 = _mm_xor_si128(*calc_22, scratch[j+9]); + *calc_23 = _mm_xor_si128(*calc_23, scratch[j+10]); + *calc_24 = _mm_xor_si128(*calc_24, scratch[j+11]); + *calc_31 = _mm_xor_si128(*calc_31, scratch[j+12]); + *calc_32 = _mm_xor_si128(*calc_32, scratch[j+13]); + *calc_33 = _mm_xor_si128(*calc_33, scratch[j+14]); + *calc_34 = _mm_xor_si128(*calc_34, scratch[j+15]); + + xor_salsa_sidm( calc_1, calc_2, calc_3, calc_4,calc_31,calc_32,calc_33,calc_34, double_rounds); + xor_salsa_sidm(calc_11,calc_12,calc_13,calc_14, calc_1, calc_2, calc_3, calc_4, double_rounds); + xor_salsa_sidm_swap(calc_21,calc_22,calc_23,calc_24, calc_11, calc_12, calc_13, calc_14, double_rounds); + xor_salsa_sidm(calc_31,calc_32,calc_33,calc_34, calc_11, calc_12, calc_13, calc_14, double_rounds); + } + + _calc5 =_mm_blend_epi16(*calc_31, *calc_33, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32, *calc_34, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33, *calc_31, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34, *calc_32, 0x0f); + *calc_31 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21, *calc_23, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22, *calc_24, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23, *calc_21, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24, *calc_22, 0x0f); + *calc_21 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11, *calc_13, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12, *calc_14, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13, *calc_11, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14, *calc_12, 0x0f); + *calc_11 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_13 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1, *calc_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2, *calc_4, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3, *calc_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4, *calc_2, 0x0f); + *calc_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2 = 
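+ /* Editorial sketch of the blend transpose: _mm_blend_epi16 with masks
+  * 0xf0 and 0x0f mixes the 64-bit halves of two registers and 0xcc mixes
+  * their alternating 32-bit lanes, so each group of four blends
+  * re-shuffles a 4x4 matrix of 32-bit words between linear and Salsa row
+  * order; this intrinsic requires SSE4.1. */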
_mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4 = _mm_blend_epi16(_calc8, _calc7, 0xcc); +} + +//--------------------------------------------------------------------------------------------- +// threefold +//--------------------------------------------------------------------------------------------- +static inline void xor_salsa_sidm_X3( + __m128i *calc_18_1, __m128i *calc_13_1, __m128i *calc_9_1, __m128i *calc_7_1, + __m128i *calc_1_1, __m128i *calc_4_1, __m128i *calc_3_1, __m128i *calc_2_1, + __m128i *calc_18_2, __m128i *calc_13_2, __m128i *calc_9_2, __m128i *calc_7_2, + __m128i *calc_1_2, __m128i *calc_4_2, __m128i *calc_3_2, __m128i *calc_2_2, + __m128i *calc_18_3, __m128i *calc_13_3, __m128i *calc_9_3, __m128i *calc_7_3, + __m128i *calc_1_3, __m128i *calc_4_3, __m128i *calc_3_3, __m128i *calc_2_3, + uint32_t double_rounds) +{ + int i; + __m128i _calc_1, _calc_2, _calc_3; + __m128i _shift_left; + __m128i row1_1 = _mm_xor_si128(*calc_18_1, *calc_1_1); + __m128i row2_1 = _mm_xor_si128(*calc_7_1, *calc_2_1); + __m128i row3_1 = _mm_xor_si128(*calc_9_1, *calc_3_1); + __m128i row4_1 = _mm_xor_si128(*calc_13_1, *calc_4_1); + __m128i row1_2 = _mm_xor_si128(*calc_18_2, *calc_1_2); + __m128i row2_2 = _mm_xor_si128(*calc_7_2, *calc_2_2); + __m128i row3_2 = _mm_xor_si128(*calc_9_2, *calc_3_2); + __m128i row4_2 = _mm_xor_si128(*calc_13_2, *calc_4_2); + __m128i row1_3 = _mm_xor_si128(*calc_18_3, *calc_1_3); + __m128i row2_3 = _mm_xor_si128(*calc_7_3, *calc_2_3); + __m128i row3_3 = _mm_xor_si128(*calc_9_3, *calc_3_3); + __m128i row4_3 = _mm_xor_si128(*calc_13_3, *calc_4_3); + + + *calc_18_1 = _mm_xor_si128(*calc_18_1, *calc_1_1); + *calc_7_1 = _mm_xor_si128(*calc_7_1, *calc_2_1); + *calc_9_1 = _mm_xor_si128(*calc_9_1, *calc_3_1); + *calc_13_1 = _mm_xor_si128(*calc_13_1, *calc_4_1); + + *calc_18_2 = _mm_xor_si128(*calc_18_2, *calc_1_2); + *calc_7_2 = _mm_xor_si128(*calc_7_2, *calc_2_2); + *calc_9_2 = _mm_xor_si128(*calc_9_2, *calc_3_2); + *calc_13_2 = _mm_xor_si128(*calc_13_2, *calc_4_2); + + *calc_18_3 = _mm_xor_si128(*calc_18_3, *calc_1_3); + *calc_7_3 = _mm_xor_si128(*calc_7_3, *calc_2_3); + *calc_9_3 = _mm_xor_si128(*calc_9_3, *calc_3_3); + *calc_13_3 = _mm_xor_si128(*calc_13_3, *calc_4_3); + + for (i = 0; i < double_rounds; i++) { + /* first row */ + _calc_1 = _mm_add_epi32(row1_1, row4_1); + _calc_2 = _mm_add_epi32(row1_2, row4_2); + _shift_left = _mm_slli_epi32(_calc_1, 7); + _calc_3 = _mm_add_epi32(row1_3, row4_3); + row2_1 = _mm_xor_si128(row2_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_2, 7); + row2_1 = _mm_xor_si128(row2_1, _calc_1); + row2_2 = _mm_xor_si128(row2_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_3, 7); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 7)); + row2_3 = _mm_xor_si128(row2_3, _shift_left); + row2_2 = _mm_xor_si128(row2_2, _calc_2); + row2_3 = _mm_xor_si128(row2_3, _calc_3); + + /* second row */ + _calc_1 = _mm_add_epi32(row2_1, row1_1); + _calc_2 = _mm_add_epi32(row2_2, row1_2); + _shift_left = _mm_slli_epi32(_calc_1, 9); + _calc_3 = _mm_add_epi32(row2_3, row1_3); + row3_1 = _mm_xor_si128(row3_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_2, 9); + row3_1 = _mm_xor_si128(row3_1, _calc_1); + row3_2 = _mm_xor_si128(row3_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_3, 9); + _calc_3 = _mm_srli_epi32(_calc_3,(32 
- 9)); + row3_3 = _mm_xor_si128(row3_3, _shift_left); + row3_2 = _mm_xor_si128(row3_2, _calc_2); + row3_3 = _mm_xor_si128(row3_3, _calc_3); + + /* third row */ + _calc_1 = _mm_add_epi32(row3_1, row2_1); + _calc_2 = _mm_add_epi32(row3_2, row2_2); + _shift_left = _mm_slli_epi32(_calc_1, 13); + _calc_3 = _mm_add_epi32(row3_3, row2_3); + row4_1 = _mm_xor_si128(row4_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_2, 13); + row4_1 = _mm_xor_si128(row4_1, _calc_1); + row4_2 = _mm_xor_si128(row4_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_3, 13); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 13)); + row4_3 = _mm_xor_si128(row4_3, _shift_left); + row4_2 = _mm_xor_si128(row4_2, _calc_2); + row4_3 = _mm_xor_si128(row4_3, _calc_3); + + /* fourth row */ + _calc_1 = _mm_add_epi32(row4_1, row3_1); + _calc_2 = _mm_add_epi32(row4_2, row3_2); + _shift_left = _mm_slli_epi32(_calc_1, 18); + _calc_3 = _mm_add_epi32(row4_3, row3_3); + row1_1 = _mm_xor_si128(row1_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_2, 18); + row1_1 = _mm_xor_si128(row1_1, _calc_1); + row1_2 = _mm_xor_si128(row1_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_3, 18); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 18)); + row1_3 = _mm_xor_si128(row1_3, _shift_left); + row1_2 = _mm_xor_si128(row1_2, _calc_2); + row1_3 = _mm_xor_si128(row1_3, _calc_3); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row2_1 = _mm_shuffle_epi32(row2_1,0x93); + row3_1 = _mm_shuffle_epi32(row3_1,0x4e); + row4_1 = _mm_shuffle_epi32(row4_1,0x39); + row2_2 = _mm_shuffle_epi32(row2_2,0x93); + row3_2 = _mm_shuffle_epi32(row3_2,0x4e); + row4_2 = _mm_shuffle_epi32(row4_2,0x39); + row2_3 = _mm_shuffle_epi32(row2_3,0x93); + row3_3 = _mm_shuffle_epi32(row3_3,0x4e); + row4_3 = _mm_shuffle_epi32(row4_3,0x39); + // end transpose + + // switch *calc_13 and * calc_7 usage compared to rows + /* first column */ + _calc_1 = _mm_add_epi32(row1_1, row2_1); + _calc_2 = _mm_add_epi32(row1_2, row2_2); + _shift_left = _mm_slli_epi32(_calc_1, 7); + _calc_3 = _mm_add_epi32(row1_3, row2_3); + row4_1 = _mm_xor_si128(row4_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_2, 7); + row4_1 = _mm_xor_si128(row4_1, _calc_1); + row4_2 = _mm_xor_si128(row4_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_3, 7); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 7)); + row4_3 = _mm_xor_si128(row4_3, _shift_left); + row4_2 = _mm_xor_si128(row4_2, _calc_2); + row4_3 = _mm_xor_si128(row4_3, _calc_3); + + /* second column */ + _calc_1 = _mm_add_epi32(row4_1, row1_1); + _calc_2 = _mm_add_epi32(row4_2, row1_2); + _shift_left = _mm_slli_epi32(_calc_1, 9); + _calc_3 = _mm_add_epi32(row4_3, row1_3); + row3_1 = _mm_xor_si128(row3_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_2, 9); + row3_1 = _mm_xor_si128(row3_1, _calc_1); + row3_2 = _mm_xor_si128(row3_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_3, 9); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 9)); + row3_3 = _mm_xor_si128(row3_3, _shift_left); + row3_2 = _mm_xor_si128(row3_2, _calc_2); + row3_3 = _mm_xor_si128(row3_3, _calc_3); + + /* third column */ + _calc_1 = _mm_add_epi32(row3_1, row4_1); + _calc_2 = _mm_add_epi32(row3_2, row4_2); + 
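+ /* Editor's note on the braiding: the _1/_2/_3 streams are independent, so
+  * their add/shift/xor chains are deliberately interleaved; while the
+  * shift for stream 1 is still in flight the next add for stream 2 can
+  * already issue, which is the point of the X3 variants. */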
_shift_left = _mm_slli_epi32(_calc_1, 13); + _calc_3 = _mm_add_epi32(row3_3, row4_3); + row2_1 = _mm_xor_si128(row2_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_2, 13); + row2_1 = _mm_xor_si128(row2_1, _calc_1); + row2_2 = _mm_xor_si128(row2_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_3, 13); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 13)); + row2_3 = _mm_xor_si128(row2_3, _shift_left); + row2_2 = _mm_xor_si128(row2_2, _calc_2); + row2_3 = _mm_xor_si128(row2_3, _calc_3); + + /* fourth column */ + _calc_1 = _mm_add_epi32(row2_1, row3_1); + _calc_2 = _mm_add_epi32(row2_2, row3_2); + _shift_left = _mm_slli_epi32(_calc_1, 18); + _calc_3 = _mm_add_epi32(row2_3, row3_3); + row1_1 = _mm_xor_si128(row1_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_2, 18); + row1_1 = _mm_xor_si128(row1_1, _calc_1); + row1_2 = _mm_xor_si128(row1_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_3, 18); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 18)); + row1_3 = _mm_xor_si128(row1_3, _shift_left); + row1_2 = _mm_xor_si128(row1_2, _calc_2); + row1_3 = _mm_xor_si128(row1_3, _calc_3); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row2_1 = _mm_shuffle_epi32(row2_1,0x39); + row3_1 = _mm_shuffle_epi32(row3_1,0x4e); + row4_1 = _mm_shuffle_epi32(row4_1,0x93); + row2_2 = _mm_shuffle_epi32(row2_2,0x39); + row3_2 = _mm_shuffle_epi32(row3_2,0x4e); + row4_2 = _mm_shuffle_epi32(row4_2,0x93); + row2_3 = _mm_shuffle_epi32(row2_3,0x39); + row3_3 = _mm_shuffle_epi32(row3_3,0x4e); + row4_3 = _mm_shuffle_epi32(row4_3,0x93); + // end transpose + } + *calc_18_1 = _mm_add_epi32(*calc_18_1,row1_1); + *calc_7_1 = _mm_add_epi32(*calc_7_1, row2_1); + *calc_9_1 = _mm_add_epi32(*calc_9_1, row3_1); + *calc_13_1 = _mm_add_epi32(*calc_13_1, row4_1); + + *calc_18_2 = _mm_add_epi32(*calc_18_2,row1_2); + *calc_7_2 = _mm_add_epi32(*calc_7_2, row2_2); + *calc_9_2 = _mm_add_epi32(*calc_9_2, row3_2); + *calc_13_2 = _mm_add_epi32(*calc_13_2, row4_2); + + *calc_18_3 = _mm_add_epi32(*calc_18_3,row1_3); + *calc_7_3 = _mm_add_epi32(*calc_7_3, row2_3); + *calc_9_3 = _mm_add_epi32(*calc_9_3, row3_3); + *calc_13_3 = _mm_add_epi32(*calc_13_3, row4_3); +} + +static inline void xor_salsa_sidm_swap_X3( + __m128i *calc_18_1, __m128i *calc_13_1, __m128i *calc_9_1, __m128i *calc_7_1, + __m128i *calc_1_1, __m128i *calc_4_1, __m128i *calc_3_1, __m128i *calc_2_1, + __m128i *calc_18_2, __m128i *calc_13_2, __m128i *calc_9_2, __m128i *calc_7_2, + __m128i *calc_1_2, __m128i *calc_4_2, __m128i *calc_3_2, __m128i *calc_2_2, + __m128i *calc_18_3, __m128i *calc_13_3, __m128i *calc_9_3, __m128i *calc_7_3, + __m128i *calc_1_3, __m128i *calc_4_3, __m128i *calc_3_3, __m128i *calc_2_3, + uint32_t double_rounds) +{ + int i; + __m128i _calc_1, _calc_2, _calc_3; + __m128i _shift_left; + __m128i row1_1 = _mm_xor_si128(*calc_18_1, *calc_1_1); + __m128i row2_1 = _mm_xor_si128(*calc_7_1, *calc_2_1); + __m128i row3_1 = _mm_xor_si128(*calc_9_1, *calc_3_1); + __m128i row4_1 = _mm_xor_si128(*calc_13_1, *calc_4_1); + __m128i row1_2 = _mm_xor_si128(*calc_18_2, *calc_1_2); + __m128i row2_2 = _mm_xor_si128(*calc_7_2, *calc_2_2); + __m128i row3_2 = _mm_xor_si128(*calc_9_2, *calc_3_2); + __m128i row4_2 = _mm_xor_si128(*calc_13_2, *calc_4_2); + __m128i row1_3 = _mm_xor_si128(*calc_18_3, *calc_1_3); + __m128i row2_3 = _mm_xor_si128(*calc_7_3, *calc_2_3); + __m128i row3_3 = 
_mm_xor_si128(*calc_9_3, *calc_3_3); + __m128i row4_3 = _mm_xor_si128(*calc_13_3, *calc_4_3); + + + *calc_18_1 = _mm_xor_si128(*calc_18_1, *calc_1_1); + *calc_7_1 = _mm_xor_si128(*calc_7_1, *calc_2_1); + *calc_9_1 = _mm_xor_si128(*calc_9_1, *calc_3_1); + *calc_13_1 = _mm_xor_si128(*calc_13_1, *calc_4_1); + + *calc_18_2 = _mm_xor_si128(*calc_18_2, *calc_1_2); + *calc_7_2 = _mm_xor_si128(*calc_7_2, *calc_2_2); + *calc_9_2 = _mm_xor_si128(*calc_9_2, *calc_3_2); + *calc_13_2 = _mm_xor_si128(*calc_13_2, *calc_4_2); + + *calc_18_3 = _mm_xor_si128(*calc_18_3, *calc_1_3); + *calc_7_3 = _mm_xor_si128(*calc_7_3, *calc_2_3); + *calc_9_3 = _mm_xor_si128(*calc_9_3, *calc_3_3); + *calc_13_3 = _mm_xor_si128(*calc_13_3, *calc_4_3); + + for (i = 0; i < double_rounds; i++) { + /* first row */ + _calc_1 = _mm_add_epi32(row1_1, row4_1); + _calc_2 = _mm_add_epi32(row1_2, row4_2); + _shift_left = _mm_slli_epi32(_calc_1, 7); + _calc_3 = _mm_add_epi32(row1_3, row4_3); + row2_1 = _mm_xor_si128(row2_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_2, 7); + row2_1 = _mm_xor_si128(row2_1, _calc_1); + row2_2 = _mm_xor_si128(row2_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_3, 7); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 7)); + row2_3 = _mm_xor_si128(row2_3, _shift_left); + row2_2 = _mm_xor_si128(row2_2, _calc_2); + row2_3 = _mm_xor_si128(row2_3, _calc_3); + + /* second row */ + _calc_1 = _mm_add_epi32(row2_1, row1_1); + _calc_2 = _mm_add_epi32(row2_2, row1_2); + _shift_left = _mm_slli_epi32(_calc_1, 9); + _calc_3 = _mm_add_epi32(row2_3, row1_3); + row3_1 = _mm_xor_si128(row3_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_2, 9); + row3_1 = _mm_xor_si128(row3_1, _calc_1); + row3_2 = _mm_xor_si128(row3_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_3, 9); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 9)); + row3_3 = _mm_xor_si128(row3_3, _shift_left); + row3_2 = _mm_xor_si128(row3_2, _calc_2); + row3_3 = _mm_xor_si128(row3_3, _calc_3); + + /* third row */ + _calc_1 = _mm_add_epi32(row3_1, row2_1); + _calc_2 = _mm_add_epi32(row3_2, row2_2); + _shift_left = _mm_slli_epi32(_calc_1, 13); + _calc_3 = _mm_add_epi32(row3_3, row2_3); + row4_1 = _mm_xor_si128(row4_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_2, 13); + row4_1 = _mm_xor_si128(row4_1, _calc_1); + row4_2 = _mm_xor_si128(row4_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_3, 13); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 13)); + row4_3 = _mm_xor_si128(row4_3, _shift_left); + row4_2 = _mm_xor_si128(row4_2, _calc_2); + row4_3 = _mm_xor_si128(row4_3, _calc_3); + + /* fourth row */ + _calc_1 = _mm_add_epi32(row4_1, row3_1); + _calc_2 = _mm_add_epi32(row4_2, row3_2); + _shift_left = _mm_slli_epi32(_calc_1, 18); + _calc_3 = _mm_add_epi32(row4_3, row3_3); + row1_1 = _mm_xor_si128(row1_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_2, 18); + row1_1 = _mm_xor_si128(row1_1, _calc_1); + row1_2 = _mm_xor_si128(row1_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_3, 18); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 18)); + row1_3 = _mm_xor_si128(row1_3, _shift_left); + row1_2 = _mm_xor_si128(row1_2, _calc_2); + row1_3 = _mm_xor_si128(row1_3, _calc_3); + + // 
transpose_matrix(row1, row2, row3, row4, row_to_column); + row2_1 = _mm_shuffle_epi32(row2_1,0x93); + row3_1 = _mm_shuffle_epi32(row3_1,0x4e); + row4_1 = _mm_shuffle_epi32(row4_1,0x39); + row2_2 = _mm_shuffle_epi32(row2_2,0x93); + row3_2 = _mm_shuffle_epi32(row3_2,0x4e); + row4_2 = _mm_shuffle_epi32(row4_2,0x39); + row2_3 = _mm_shuffle_epi32(row2_3,0x93); + row3_3 = _mm_shuffle_epi32(row3_3,0x4e); + row4_3 = _mm_shuffle_epi32(row4_3,0x39); + // end transpose + + // switch *calc_13 and * calc_7 usage compared to rows + /* first column */ + _calc_1 = _mm_add_epi32(row1_1, row2_1); + _calc_2 = _mm_add_epi32(row1_2, row2_2); + _shift_left = _mm_slli_epi32(_calc_1, 7); + _calc_3 = _mm_add_epi32(row1_3, row2_3); + row4_1 = _mm_xor_si128(row4_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_2, 7); + row4_1 = _mm_xor_si128(row4_1, _calc_1); + row4_2 = _mm_xor_si128(row4_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_3, 7); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 7)); + row4_3 = _mm_xor_si128(row4_3, _shift_left); + row4_2 = _mm_xor_si128(row4_2, _calc_2); + row4_3 = _mm_xor_si128(row4_3, _calc_3); + + /* second column */ + _calc_1 = _mm_add_epi32(row4_1, row1_1); + _calc_2 = _mm_add_epi32(row4_2, row1_2); + _shift_left = _mm_slli_epi32(_calc_1, 9); + _calc_3 = _mm_add_epi32(row4_3, row1_3); + row3_1 = _mm_xor_si128(row3_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_2, 9); + row3_1 = _mm_xor_si128(row3_1, _calc_1); + row3_2 = _mm_xor_si128(row3_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_3, 9); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 9)); + row3_3 = _mm_xor_si128(row3_3, _shift_left); + row3_2 = _mm_xor_si128(row3_2, _calc_2); + row3_3 = _mm_xor_si128(row3_3, _calc_3); + + /* third column */ + _calc_1 = _mm_add_epi32(row3_1, row4_1); + _calc_2 = _mm_add_epi32(row3_2, row4_2); + _shift_left = _mm_slli_epi32(_calc_1, 13); + _calc_3 = _mm_add_epi32(row3_3, row4_3); + row2_1 = _mm_xor_si128(row2_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_2, 13); + row2_1 = _mm_xor_si128(row2_1, _calc_1); + row2_2 = _mm_xor_si128(row2_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_3, 13); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 13)); + row2_3 = _mm_xor_si128(row2_3, _shift_left); + row2_2 = _mm_xor_si128(row2_2, _calc_2); + row2_3 = _mm_xor_si128(row2_3, _calc_3); + + /* fourth column */ + _calc_1 = _mm_add_epi32(row2_1, row3_1); + _calc_2 = _mm_add_epi32(row2_2, row3_2); + _shift_left = _mm_slli_epi32(_calc_1, 18); + _calc_3 = _mm_add_epi32(row2_3, row3_3); + row1_1 = _mm_xor_si128(row1_1, _shift_left); + _calc_1 = _mm_srli_epi32(_calc_1,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_2, 18); + row1_1 = _mm_xor_si128(row1_1, _calc_1); + row1_2 = _mm_xor_si128(row1_2, _shift_left); + _calc_2 = _mm_srli_epi32(_calc_2,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_3, 18); + _calc_3 = _mm_srli_epi32(_calc_3,(32 - 18)); + row1_3 = _mm_xor_si128(row1_3, _shift_left); + row1_2 = _mm_xor_si128(row1_2, _calc_2); + row1_3 = _mm_xor_si128(row1_3, _calc_3); + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + row2_1 = _mm_shuffle_epi32(row2_1,0x39); + row3_1 = _mm_shuffle_epi32(row3_1,0x4e); + row4_1 = _mm_shuffle_epi32(row4_1,0x93); + row2_2 = _mm_shuffle_epi32(row2_2,0x39); + row3_2 = 
_mm_shuffle_epi32(row3_2,0x4e); + row4_2 = _mm_shuffle_epi32(row4_2,0x93); + row2_3 = _mm_shuffle_epi32(row2_3,0x39); + row3_3 = _mm_shuffle_epi32(row3_3,0x4e); + row4_3 = _mm_shuffle_epi32(row4_3,0x93); + // end transpose + } + row1_1 = _mm_add_epi32(*calc_18_1,row1_1); + row2_1 = _mm_add_epi32(*calc_7_1, row2_1); + row3_1 = _mm_add_epi32(*calc_9_1, row3_1); + row4_1 = _mm_add_epi32(*calc_13_1, row4_1); + + row1_2 = _mm_add_epi32(*calc_18_2,row1_2); + row2_2 = _mm_add_epi32(*calc_7_2, row2_2); + row3_2 = _mm_add_epi32(*calc_9_2, row3_2); + row4_2 = _mm_add_epi32(*calc_13_2, row4_2); + + row1_3 = _mm_add_epi32(*calc_18_3,row1_3); + row2_3 = _mm_add_epi32(*calc_7_3, row2_3); + row3_3 = _mm_add_epi32(*calc_9_3, row3_3); + row4_3 = _mm_add_epi32(*calc_13_3, row4_3); + + *calc_18_1 = *calc_1_1; + *calc_7_1 = *calc_2_1; + *calc_9_1 = *calc_3_1; + *calc_13_1 = *calc_4_1; + *calc_18_2 = *calc_1_2; + *calc_7_2 = *calc_2_2; + *calc_9_2 = *calc_3_2; + *calc_13_2 = *calc_4_2; + *calc_18_3 = *calc_1_3; + *calc_7_3 = *calc_2_3; + *calc_9_3 = *calc_3_3; + *calc_13_3 = *calc_4_3; + + *calc_1_1 = row1_1; + *calc_2_1 = row2_1; + *calc_3_1 = row3_1; + *calc_4_1 = row4_1; + *calc_1_2 = row1_2; + *calc_2_2 = row2_2; + *calc_3_2 = row3_2; + *calc_4_2 = row4_2; + *calc_1_3 = row1_3; + *calc_2_3 = row2_3; + *calc_3_3 = row3_3; + *calc_4_3 = row4_3; +} + +static inline void scrypt_core_r2_sidm_X3(__m128i *X_1, __m128i *X_2, __m128i *X_3, uint32_t Loops, uint32_t double_rounds) +{ + uint32_t i, j1, j2, j3; + __m128i scratch_1[Loops * 8 * 4]; + __m128i scratch_2[Loops * 8 * 4]; + __m128i scratch_3[Loops * 8 * 4]; +// 1 + __m128i *calc_1_1 = (__m128i*) &X_1[0]; + __m128i *calc_2_1 = (__m128i*) &X_1[1]; + __m128i *calc_3_1 = (__m128i*) &X_1[2]; + __m128i *calc_4_1 = (__m128i*) &X_1[3]; + + __m128i *calc_11_1 = (__m128i*) &X_1[4]; + __m128i *calc_12_1 = (__m128i*) &X_1[5]; + __m128i *calc_13_1 = (__m128i*) &X_1[6]; + __m128i *calc_14_1 = (__m128i*) &X_1[7]; + + __m128i *calc_21_1 = (__m128i*) &X_1[8]; + __m128i *calc_22_1 = (__m128i*) &X_1[9]; + __m128i *calc_23_1 = (__m128i*) &X_1[10]; + __m128i *calc_24_1 = (__m128i*) &X_1[11]; + + __m128i *calc_31_1 = (__m128i*) &X_1[12]; + __m128i *calc_32_1 = (__m128i*) &X_1[13]; + __m128i *calc_33_1 = (__m128i*) &X_1[14]; + __m128i *calc_34_1 = (__m128i*) &X_1[15]; +// 2 + __m128i *calc_1_2 = (__m128i*) &X_2[0]; + __m128i *calc_2_2 = (__m128i*) &X_2[1]; + __m128i *calc_3_2 = (__m128i*) &X_2[2]; + __m128i *calc_4_2 = (__m128i*) &X_2[3]; + + __m128i *calc_11_2 = (__m128i*) &X_2[4]; + __m128i *calc_12_2 = (__m128i*) &X_2[5]; + __m128i *calc_13_2 = (__m128i*) &X_2[6]; + __m128i *calc_14_2 = (__m128i*) &X_2[7]; + + __m128i *calc_21_2 = (__m128i*) &X_2[8]; + __m128i *calc_22_2 = (__m128i*) &X_2[9]; + __m128i *calc_23_2 = (__m128i*) &X_2[10]; + __m128i *calc_24_2 = (__m128i*) &X_2[11]; + + __m128i *calc_31_2 = (__m128i*) &X_2[12]; + __m128i *calc_32_2 = (__m128i*) &X_2[13]; + __m128i *calc_33_2 = (__m128i*) &X_2[14]; + __m128i *calc_34_2 = (__m128i*) &X_2[15]; +// 3 + __m128i *calc_1_3 = (__m128i*) &X_3[0]; + __m128i *calc_2_3 = (__m128i*) &X_3[1]; + __m128i *calc_3_3 = (__m128i*) &X_3[2]; + __m128i *calc_4_3 = (__m128i*) &X_3[3]; + + __m128i *calc_11_3 = (__m128i*) &X_3[4]; + __m128i *calc_12_3 = (__m128i*) &X_3[5]; + __m128i *calc_13_3 = (__m128i*) &X_3[6]; + __m128i *calc_14_3 = (__m128i*) &X_3[7]; + + __m128i *calc_21_3 = (__m128i*) &X_3[8]; + __m128i *calc_22_3 = (__m128i*) &X_3[9]; + __m128i *calc_23_3 = (__m128i*) &X_3[10]; + __m128i *calc_24_3 = (__m128i*) &X_3[11]; + + __m128i 
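+ /* Editorial observation: scratch_1..3 are sized Loops * 8 * 4 __m128i (32
+  * per iteration) but the loops below only ever index i * 16 + 0..15, so
+  * Loops * 16 would be sufficient and would halve the stack footprint at
+  * no cost to the results. */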
*calc_31_3 = (__m128i*) &X_3[12]; + __m128i *calc_32_3 = (__m128i*) &X_3[13]; + __m128i *calc_33_3 = (__m128i*) &X_3[14]; + __m128i *calc_34_3 = (__m128i*) &X_3[15]; + + + __m128i _calc5; + __m128i _calc6; + __m128i _calc7; + __m128i _calc8; + + /* transpose the data from *X_1 */ + _calc5 =_mm_blend_epi16(*calc_31_1, *calc_33_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32_1, *calc_34_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33_1, *calc_31_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34_1, *calc_32_1, 0x0f); + *calc_31_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32_1 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21_1, *calc_23_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22_1, *calc_24_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23_1, *calc_21_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24_1, *calc_22_1, 0x0f); + *calc_21_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22_1 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11_1, *calc_13_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12_1, *calc_14_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13_1, *calc_11_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14_1, *calc_12_1, 0x0f); + *calc_11_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12_1 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_13_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1_1, *calc_3_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2_1, *calc_4_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3_1, *calc_1_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4_1, *calc_2_1, 0x0f); + *calc_1_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2_1 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + /* transpose the data from *X_2 */ + _calc5 =_mm_blend_epi16(*calc_31_2, *calc_33_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32_2, *calc_34_2, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33_2, *calc_31_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34_2, *calc_32_2, 0x0f); + *calc_31_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21_2, *calc_23_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22_2, *calc_24_2, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23_2, *calc_21_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24_2, *calc_22_2, 0x0f); + *calc_21_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11_2, *calc_13_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12_2, *calc_14_2, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13_2, *calc_11_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14_2, *calc_12_2, 0x0f); + *calc_11_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_13_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1_2, *calc_3_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2_2, *calc_4_2, 0x0f); + _calc7 
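// Note on the "transpose" blocks: strictly they do not transpose, they gather
// the four diagonals of the 4x4 dword matrix (the usual SSE Salsa20 layout) so
// that a round can operate on whole registers. The three-stage blend pattern
// (SSE4.1 _mm_blend_epi16; masks 0xf0/0x0f select 64-bit halves, 0xcc selects
// dwords 1 and 3) works out as follows for rows r0..r3 (sketch):
//
//   t0 = _mm_blend_epi16(r0, r2, 0xf0);      // r0.lo64 | r2.hi64
//   t1 = _mm_blend_epi16(r1, r3, 0x0f);      // r3.lo64 | r1.hi64
//   t2 = _mm_blend_epi16(r2, r0, 0xf0);      // r2.lo64 | r0.hi64
//   t3 = _mm_blend_epi16(r3, r1, 0x0f);      // r1.lo64 | r3.hi64
//   d0 = _mm_blend_epi16(t0, t3, 0xcc);      // { r0[0], r1[1], r2[2], r3[3] }
//   d1 = _mm_blend_epi16(t1, t0, 0xcc);      // { r3[0], r0[1], r1[2], r2[3] }
//   d2 = _mm_blend_epi16(t2, t1, 0xcc);      // { r2[0], r3[1], r0[2], r1[3] }
//   d3 = _mm_blend_epi16(t3, t2, 0xcc);      // { r1[0], r2[1], r3[2], r0[3] }
//
// Applying the same pattern a second time restores the original layout, which
// is why an identical block per 64-byte group appears again at function exit.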
=_mm_blend_epi16(*calc_3_2, *calc_1_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4_2, *calc_2_2, 0x0f); + *calc_1_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + /* transpose the data from *X_3 */ + _calc5 =_mm_blend_epi16(*calc_31_3, *calc_33_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32_3, *calc_34_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33_3, *calc_31_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34_3, *calc_32_3, 0x0f); + *calc_31_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32_3 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21_3, *calc_23_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22_3, *calc_24_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23_3, *calc_21_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24_3, *calc_22_3, 0x0f); + *calc_21_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22_3 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11_3, *calc_13_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12_3, *calc_14_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13_3, *calc_11_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14_3, *calc_12_3, 0x0f); + *calc_11_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12_3 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_13_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1_3, *calc_3_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2_3, *calc_4_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3_3, *calc_1_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4_3, *calc_2_3, 0x0f); + *calc_1_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2_3 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + + for (i = 0; i < Loops; i++) { + scratch_1[i * 16 + 0] = *calc_1_1; scratch_1[i * 16 + 1] = *calc_2_1; + scratch_1[i * 16 + 2] = *calc_3_1; scratch_1[i * 16 + 3] = *calc_4_1; + scratch_2[i * 16 + 0] = *calc_1_2; scratch_2[i * 16 + 1] = *calc_2_2; + scratch_2[i * 16 + 2] = *calc_3_2; scratch_2[i * 16 + 3] = *calc_4_2; + scratch_3[i * 16 + 0] = *calc_1_3; scratch_3[i * 16 + 1] = *calc_2_3; + scratch_3[i * 16 + 2] = *calc_3_3; scratch_3[i * 16 + 3] = *calc_4_3; + + scratch_1[i * 16 + 12] = *calc_31_1; scratch_1[i * 16 + 13] = *calc_32_1; + scratch_1[i * 16 + 14] = *calc_33_1; scratch_1[i * 16 + 15] = *calc_34_1; + scratch_2[i * 16 + 12] = *calc_31_2; scratch_2[i * 16 + 13] = *calc_32_2; + scratch_2[i * 16 + 14] = *calc_33_2; scratch_2[i * 16 + 15] = *calc_34_2; + scratch_3[i * 16 + 12] = *calc_31_3; scratch_3[i * 16 + 13] = *calc_32_3; + scratch_3[i * 16 + 14] = *calc_33_3; scratch_3[i * 16 + 15] = *calc_34_3; + + xor_salsa_sidm_X3( calc_1_1, calc_2_1, calc_3_1, calc_4_1, calc_31_1,calc_32_1,calc_33_1,calc_34_1, + calc_1_2, calc_2_2, calc_3_2, calc_4_2, calc_31_2,calc_32_2,calc_33_2,calc_34_2, + calc_1_3, calc_2_3, calc_3_3, calc_4_3, calc_31_3,calc_32_3,calc_33_3,calc_34_3, + double_rounds); + + scratch_1[i * 16 + 4] = *calc_11_1; scratch_1[i * 16 + 5] = *calc_12_1; + scratch_1[i * 16 + 6] = *calc_13_1; scratch_1[i * 16 + 7] = *calc_14_1; + scratch_2[i * 16 + 4] = *calc_11_2; scratch_2[i * 16 + 5] = 
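// Fill phase: each iteration stores the current 256-byte state into
// scratch_n[i*16 .. i*16+15], then performs one BlockMix. For r = 2 the
// reference chaining is (RFC 7914 notation, illustrative):
//
//   Y0 = salsa20_8(B0 ^ B3);
//   Y1 = salsa20_8(B1 ^ Y0);
//   Y2 = salsa20_8(B2 ^ Y1);
//   Y3 = salsa20_8(B3 ^ Y2);
//   B' = { Y0, Y2, Y1, Y3 };             // even sub-blocks first, then odd
//
// The xor_salsa_sidm_X3 calls below implement those four lines for all three
// lanes at once; the _swap variant appears to fold the even/odd output
// reordering into the buffer bookkeeping, which would explain why the last
// sub-block is chained off calc_11_x rather than calc_21_x.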
*calc_12_2; + scratch_2[i * 16 + 6] = *calc_13_2; scratch_2[i * 16 + 7] = *calc_14_2; + scratch_3[i * 16 + 4] = *calc_11_3; scratch_3[i * 16 + 5] = *calc_12_3; + scratch_3[i * 16 + 6] = *calc_13_3; scratch_3[i * 16 + 7] = *calc_14_3; + + xor_salsa_sidm_X3( calc_11_1, calc_12_1, calc_13_1, calc_14_1, calc_1_1,calc_2_1,calc_3_1,calc_4_1, + calc_11_2, calc_12_2, calc_13_2, calc_14_2, calc_1_2,calc_2_2,calc_3_2,calc_4_2, + calc_11_3, calc_12_3, calc_13_3, calc_14_3, calc_1_3,calc_2_3,calc_3_3,calc_4_3, + double_rounds); + + scratch_1[i * 16 + 8] = *calc_21_1; scratch_1[i * 16 + 9] = *calc_22_1; + scratch_1[i * 16 + 10] = *calc_23_1; scratch_1[i * 16 + 11] = *calc_24_1; + scratch_2[i * 16 + 8] = *calc_21_2; scratch_2[i * 16 + 9] = *calc_22_2; + scratch_2[i * 16 + 10] = *calc_23_2; scratch_2[i * 16 + 11] = *calc_24_2; + scratch_3[i * 16 + 8] = *calc_21_3; scratch_3[i * 16 + 9] = *calc_22_3; + scratch_3[i * 16 + 10] = *calc_23_3; scratch_3[i * 16 + 11] = *calc_24_3; + + xor_salsa_sidm_swap_X3( calc_21_1, calc_22_1, calc_23_1, calc_24_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_21_2, calc_22_2, calc_23_2, calc_24_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_21_3, calc_22_3, calc_23_3, calc_24_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); + + xor_salsa_sidm_X3( calc_31_1, calc_32_1, calc_33_1, calc_34_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_31_2, calc_32_2, calc_33_2, calc_34_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_31_3, calc_32_3, calc_33_3, calc_34_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); + } + + for (i = 0; i < Loops; i++) { + j1 = 16 * (_mm_extract_epi16(*calc_31_1,0x00) & (Loops-1)); + j2 = 16 * (_mm_extract_epi16(*calc_31_2,0x00) & (Loops-1)); + j3 = 16 * (_mm_extract_epi16(*calc_31_3,0x00) & (Loops-1)); + + //1 + *calc_1_1 = _mm_xor_si128(*calc_1_1, scratch_1[j1]); + *calc_2_1 = _mm_xor_si128(*calc_2_1, scratch_1[j1+1]); + *calc_3_1 = _mm_xor_si128(*calc_3_1, scratch_1[j1+2]); + *calc_4_1 = _mm_xor_si128(*calc_4_1, scratch_1[j1+3]); + + *calc_31_1 = _mm_xor_si128(*calc_31_1, scratch_1[j1+12]); + *calc_32_1 = _mm_xor_si128(*calc_32_1, scratch_1[j1+13]); + *calc_33_1 = _mm_xor_si128(*calc_33_1, scratch_1[j1+14]); + *calc_34_1 = _mm_xor_si128(*calc_34_1, scratch_1[j1+15]); + //2 + *calc_1_2 = _mm_xor_si128(*calc_1_2, scratch_2[j2]); + *calc_2_2 = _mm_xor_si128(*calc_2_2, scratch_2[j2+1]); + *calc_3_2 = _mm_xor_si128(*calc_3_2, scratch_2[j2+2]); + *calc_4_2 = _mm_xor_si128(*calc_4_2, scratch_2[j2+3]); + + *calc_31_2 = _mm_xor_si128(*calc_31_2, scratch_2[j2+12]); + *calc_32_2 = _mm_xor_si128(*calc_32_2, scratch_2[j2+13]); + *calc_33_2 = _mm_xor_si128(*calc_33_2, scratch_2[j2+14]); + *calc_34_2 = _mm_xor_si128(*calc_34_2, scratch_2[j2+15]); + //3 + *calc_1_3 = _mm_xor_si128(*calc_1_3, scratch_3[j3]); + *calc_2_3 = _mm_xor_si128(*calc_2_3, scratch_3[j3+1]); + *calc_3_3 = _mm_xor_si128(*calc_3_3, scratch_3[j3+2]); + *calc_4_3 = _mm_xor_si128(*calc_4_3, scratch_3[j3+3]); + + *calc_31_3 = _mm_xor_si128(*calc_31_3, scratch_3[j3+12]); + *calc_32_3 = _mm_xor_si128(*calc_32_3, scratch_3[j3+13]); + *calc_33_3 = _mm_xor_si128(*calc_33_3, scratch_3[j3+14]); + *calc_34_3 = _mm_xor_si128(*calc_34_3, scratch_3[j3+15]); + + + xor_salsa_sidm_X3( calc_1_1, calc_2_1, calc_3_1, calc_4_1, calc_31_1,calc_32_1,calc_33_1,calc_34_1, + calc_1_2, calc_2_2, calc_3_2, calc_4_2, calc_31_2,calc_32_2,calc_33_2,calc_34_2, + calc_1_3, calc_2_3, calc_3_3, calc_4_3, calc_31_3,calc_32_3,calc_33_3,calc_34_3, + double_rounds); + + //1 + *calc_11_1 = 
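// Second phase: j1/j2/j3 are scrypt's "Integerify" step, once per lane. After
// the diagonal gather, element 0 of *calc_31_x still holds word 48 of the
// state (the first word of the last 64-byte sub-block), so
// _mm_extract_epi16(v, 0) reading only its low 16 bits is sufficient as long
// as Loops is a power of two no larger than 65536. Scalar equivalent
// (illustrative):
//
//   j = (B[48] & (Loops - 1)) * 16;      // 16 __m128i per stored 256-byte block
//   // then B ^= scratch[j .. j+15], and BlockMix runs again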
_mm_xor_si128(*calc_11_1, scratch_1[j1+4]); + *calc_12_1 = _mm_xor_si128(*calc_12_1, scratch_1[j1+5]); + *calc_13_1 = _mm_xor_si128(*calc_13_1, scratch_1[j1+6]); + *calc_14_1 = _mm_xor_si128(*calc_14_1, scratch_1[j1+7]); + //2 + *calc_11_2 = _mm_xor_si128(*calc_11_2, scratch_2[j2+4]); + *calc_12_2 = _mm_xor_si128(*calc_12_2, scratch_2[j2+5]); + *calc_13_2 = _mm_xor_si128(*calc_13_2, scratch_2[j2+6]); + *calc_14_2 = _mm_xor_si128(*calc_14_2, scratch_2[j2+7]); + //3 + *calc_11_3 = _mm_xor_si128(*calc_11_3, scratch_3[j3+4]); + *calc_12_3 = _mm_xor_si128(*calc_12_3, scratch_3[j3+5]); + *calc_13_3 = _mm_xor_si128(*calc_13_3, scratch_3[j3+6]); + *calc_14_3 = _mm_xor_si128(*calc_14_3, scratch_3[j3+7]); + + xor_salsa_sidm_X3( calc_11_1, calc_12_1, calc_13_1, calc_14_1, calc_1_1,calc_2_1,calc_3_1,calc_4_1, + calc_11_2, calc_12_2, calc_13_2, calc_14_2, calc_1_2,calc_2_2,calc_3_2,calc_4_2, + calc_11_3, calc_12_3, calc_13_3, calc_14_3, calc_1_3,calc_2_3,calc_3_3,calc_4_3, + double_rounds); + + //1 + *calc_21_1 = _mm_xor_si128(*calc_21_1, scratch_1[j1+8]); + *calc_22_1 = _mm_xor_si128(*calc_22_1, scratch_1[j1+9]); + *calc_23_1 = _mm_xor_si128(*calc_23_1, scratch_1[j1+10]); + *calc_24_1 = _mm_xor_si128(*calc_24_1, scratch_1[j1+11]); + //2 + *calc_21_2 = _mm_xor_si128(*calc_21_2, scratch_2[j2+8]); + *calc_22_2 = _mm_xor_si128(*calc_22_2, scratch_2[j2+9]); + *calc_23_2 = _mm_xor_si128(*calc_23_2, scratch_2[j2+10]); + *calc_24_2 = _mm_xor_si128(*calc_24_2, scratch_2[j2+11]); + //3 + *calc_21_3 = _mm_xor_si128(*calc_21_3, scratch_3[j3+8]); + *calc_22_3 = _mm_xor_si128(*calc_22_3, scratch_3[j3+9]); + *calc_23_3 = _mm_xor_si128(*calc_23_3, scratch_3[j3+10]); + *calc_24_3 = _mm_xor_si128(*calc_24_3, scratch_3[j3+11]); + + xor_salsa_sidm_swap_X3( calc_21_1, calc_22_1, calc_23_1, calc_24_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_21_2, calc_22_2, calc_23_2, calc_24_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_21_3, calc_22_3, calc_23_3, calc_24_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); + + xor_salsa_sidm_X3( calc_31_1, calc_32_1, calc_33_1, calc_34_1, calc_11_1,calc_12_1,calc_13_1,calc_14_1, + calc_31_2, calc_32_2, calc_33_2, calc_34_2, calc_11_2,calc_12_2,calc_13_2,calc_14_2, + calc_31_3, calc_32_3, calc_33_3, calc_34_3, calc_11_3,calc_12_3,calc_13_3,calc_14_3, + double_rounds); + } +// 1 + _calc5 =_mm_blend_epi16(*calc_31_1, *calc_33_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32_1, *calc_34_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33_1, *calc_31_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34_1, *calc_32_1, 0x0f); + *calc_31_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32_1 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21_1, *calc_23_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22_1, *calc_24_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23_1, *calc_21_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24_1, *calc_22_1, 0x0f); + *calc_21_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22_1 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11_1, *calc_13_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12_1, *calc_14_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13_1, *calc_11_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14_1, *calc_12_1, 0x0f); + *calc_11_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12_1 = _mm_blend_epi16(_calc6, 
_calc5, 0xcc); + *calc_13_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1_1, *calc_3_1, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2_1, *calc_4_1, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3_1, *calc_1_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4_1, *calc_2_1, 0x0f); + *calc_1_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2_1 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3_1 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4_1 = _mm_blend_epi16(_calc8, _calc7, 0xcc); +// 2 + _calc5 =_mm_blend_epi16(*calc_31_2, *calc_33_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32_2, *calc_34_2, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33_2, *calc_31_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34_2, *calc_32_2, 0x0f); + *calc_31_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21_2, *calc_23_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22_2, *calc_24_2, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23_2, *calc_21_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24_2, *calc_22_2, 0x0f); + *calc_21_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11_2, *calc_13_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12_2, *calc_14_2, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13_2, *calc_11_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14_2, *calc_12_2, 0x0f); + *calc_11_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_13_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1_2, *calc_3_2, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2_2, *calc_4_2, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3_2, *calc_1_2, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4_2, *calc_2_2, 0x0f); + *calc_1_2 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3_2 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4_2 = _mm_blend_epi16(_calc8, _calc7, 0xcc); +// 3 + _calc5 =_mm_blend_epi16(*calc_31_3, *calc_33_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_32_3, *calc_34_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_33_3, *calc_31_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_34_3, *calc_32_3, 0x0f); + *calc_31_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_32_3 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_33_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_34_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_21_3, *calc_23_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_22_3, *calc_24_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_23_3, *calc_21_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_24_3, *calc_22_3, 0x0f); + *calc_21_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_22_3 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_23_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_24_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_11_3, *calc_13_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_12_3, *calc_14_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_13_3, *calc_11_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_14_3, *calc_12_3, 0x0f); + *calc_11_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_12_3 = _mm_blend_epi16(_calc6, 
_calc5, 0xcc); + *calc_13_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_14_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1_3, *calc_3_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2_3, *calc_4_3, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3_3, *calc_1_3, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4_3, *calc_2_3, 0x0f); + *calc_1_3 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2_3 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4_3 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + +} + +//--------------------------------------------------------------------------------------------- +// end threefold +//--------------------------------------------------------------------------------------------- + +static inline void scrypt_core_sidm(__m128i *X , uint32_t Loops, uint32_t double_rounds) +{ + uint32_t i, j; + __m128i scratch[Loops * 8]; + + __m128i *calc_1 = (__m128i*) &X[0]; + __m128i *calc_2 = (__m128i*) &X[1]; + __m128i *calc_3 = (__m128i*) &X[2]; + __m128i *calc_4 = (__m128i*) &X[3]; + + __m128i *calc_11 = (__m128i*) &X[4]; + __m128i *calc_21 = (__m128i*) &X[5]; + __m128i *calc_31 = (__m128i*) &X[6]; + __m128i *calc_41 = (__m128i*) &X[7]; + + __m128i _calc5; + __m128i _calc6; + __m128i _calc7; + __m128i _calc8; + + /* transpose the data from *X */ + _calc5 =_mm_blend_epi16(*calc_11, *calc_31, 0xf0); + _calc6 =_mm_blend_epi16(*calc_21, *calc_41, 0x0f); + _calc7 =_mm_blend_epi16(*calc_31, *calc_11, 0xf0); + _calc8 =_mm_blend_epi16(*calc_41, *calc_21, 0x0f); + *calc_11 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_21 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_31 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_41 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1, *calc_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2, *calc_4, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3, *calc_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4, *calc_2, 0x0f); + *calc_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + for (i = 0; i < Loops; i++) { + scratch[i * 8 + 0] = *calc_1; + scratch[i * 8 + 1] = *calc_2; + scratch[i * 8 + 2] = *calc_3; + scratch[i * 8 + 3] = *calc_4; + scratch[i * 8 + 4] = *calc_11; + scratch[i * 8 + 5] = *calc_21; + scratch[i * 8 + 6] = *calc_31; + scratch[i * 8 + 7] = *calc_41; + + xor_salsa_sidm( calc_1, calc_2, calc_3, calc_4,calc_11,calc_21,calc_31,calc_41, double_rounds); + xor_salsa_sidm(calc_11,calc_21,calc_31,calc_41, calc_1, calc_2, calc_3, calc_4, double_rounds); + } + + for (i = 0; i < Loops; i++) { + j = 8 * (_mm_extract_epi16(*calc_11,0x00) & (Loops-1)); + + *calc_1 = _mm_xor_si128(*calc_1, scratch[j]); + *calc_2 = _mm_xor_si128(*calc_2, scratch[j+1]); + *calc_3 = _mm_xor_si128(*calc_3, scratch[j+2]); + *calc_4 = _mm_xor_si128(*calc_4, scratch[j+3]); + *calc_11 = _mm_xor_si128(*calc_11, scratch[j+4]); + *calc_21 = _mm_xor_si128(*calc_21, scratch[j+5]); + *calc_31 = _mm_xor_si128(*calc_31, scratch[j+6]); + *calc_41 = _mm_xor_si128(*calc_41, scratch[j+7]); + + xor_salsa_sidm( calc_1, calc_2, calc_3, calc_4,calc_11,calc_21,calc_31,calc_41, double_rounds); + xor_salsa_sidm(calc_11,calc_21,calc_31,calc_41, calc_1, calc_2, calc_3, calc_4, double_rounds); + } + + _calc5 =_mm_blend_epi16(*calc_11, *calc_31, 0xf0); + _calc6 =_mm_blend_epi16(*calc_21, *calc_41, 0x0f); + _calc7 =_mm_blend_epi16(*calc_31, *calc_11, 0xf0); + _calc8 
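// scrypt_core_sidm is the single-state, r = 1 variant: the working block is
// 128 bytes (8 __m128i) and the scratchpad is a stack VLA of Loops * 8
// __m128i, i.e. Loops * 128 bytes. With the classic scrypt-1024 parameters
// that is 128 KiB of stack per call, so the calling threads presumably run
// with an enlarged stack size (an assumption; thread creation is outside this
// patch). A heap-based alternative would look like:
//
//   __m128i *scratch = (__m128i*) _mm_malloc((size_t)Loops * 8 * sizeof(__m128i), 16);
//   if (!scratch) return;                // handle allocation failure
//   ...                                  // same body as below
//   _mm_free(scratch);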
=_mm_blend_epi16(*calc_41, *calc_21, 0x0f); + *calc_11 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_21 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_31 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_41 = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(*calc_1, *calc_3, 0xf0); + _calc6 =_mm_blend_epi16(*calc_2, *calc_4, 0x0f); + _calc7 =_mm_blend_epi16(*calc_3, *calc_1, 0xf0); + _calc8 =_mm_blend_epi16(*calc_4, *calc_2, 0x0f); + *calc_1 = _mm_blend_epi16(_calc5, _calc8, 0xcc); + *calc_2 = _mm_blend_epi16(_calc6, _calc5, 0xcc); + *calc_3 = _mm_blend_epi16(_calc7, _calc6, 0xcc); + *calc_4 = _mm_blend_epi16(_calc8, _calc7, 0xcc); +} + +static inline void xor_salsa_sidm_3way(__m128i *calc_11, __m128i *calc_21, __m128i *calc_31, + uint32_t double_rounds) +{ + int i; + __m128i _calc_x1; + __m128i _calc_x2; + __m128i _calc_x3; + __m128i _shift_left; + __m128i X1[4]; + __m128i X2[4]; + __m128i X3[4]; + + X1[0] = calc_11[0]; + X1[1] = calc_11[1]; + X1[2] = calc_11[2]; + X1[3] = calc_11[3]; + + X2[0] = calc_21[0]; + X2[1] = calc_21[1]; + X2[2] = calc_21[2]; + X2[3] = calc_21[3]; + + X3[0] = calc_31[0]; + X3[1] = calc_31[1]; + X3[2] = calc_31[2]; + X3[3] = calc_31[3]; + + for (i = 0; i < double_rounds; i++) { + + /* first row X[3]=f(X0,X1) */ + _calc_x1 = _mm_add_epi32(X1[0], X1[1]); //X[0] and X[1] + _calc_x2 = _mm_add_epi32(X2[0], X2[1]); //X[0] and X[1] + _shift_left = _mm_slli_epi32(_calc_x1, 7); + _calc_x3 = _mm_add_epi32(X3[0], X3[1]); //X[0] and X[1] + X1[3] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_x2, 7); + X1[3] ^= _calc_x1; + X2[3] ^= _shift_left; + _calc_x2 = _mm_srli_epi32(_calc_x2,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_x3, 7); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 7)); + X3[3] ^= _shift_left; + X2[3] ^= _calc_x2; + X3[3] ^= _calc_x3; + + /* second rows X[2]=f(X3,X0) */ + _calc_x1 = _mm_add_epi32(X1[3], X1[0]); //X[3] and X[0] + _calc_x2 = _mm_add_epi32(X2[3], X2[0]); //X[3] and X[0] + _shift_left = _mm_slli_epi32(_calc_x1, 9); + _calc_x3 = _mm_add_epi32(X3[3], X3[0]); //X[3] and X[0] + X1[2] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_x2, 9); + X1[2] ^= _calc_x1; + X2[2] ^= _shift_left; + _calc_x2 = _mm_srli_epi32(_calc_x2,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_x3, 9); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 9)); + X3[2] ^= _shift_left; + X2[2] ^= _calc_x2; + X3[2] ^= _calc_x3; + + /* third rows X[1]=f(X2,X3) */ + _calc_x1 = _mm_add_epi32(X1[2], X1[3]); //X[2] and X[3] + _calc_x2 = _mm_add_epi32(X2[2], X2[3]); //X[2] and X[3] + _shift_left = _mm_slli_epi32(_calc_x1, 13); + _calc_x3 = _mm_add_epi32(X3[2], X3[3]); //X[2] and X[3] + X1[1] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_x2, 13); + X1[1] ^= _calc_x1; + X2[1] ^= _shift_left; + _calc_x2 = _mm_srli_epi32(_calc_x2,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_x3, 13); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 13)); + X3[1] ^= _shift_left; + X2[1] ^= _calc_x2; + X3[1] ^= _calc_x3; + + /* fourth rows X[0]=f(X1,X2) */ + _calc_x1 = _mm_add_epi32(X1[1], X1[2]); //X[1] and X[2] + _calc_x2 = _mm_add_epi32(X2[1], X2[2]); //X[1] and X[2] + _shift_left = _mm_slli_epi32(_calc_x1, 18); + _calc_x3 = _mm_add_epi32(X3[1], X3[2]); //X[1] and X[2] + X1[0] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_x2, 18); + X1[0] ^= _calc_x1; + X2[0] ^= _shift_left; + _calc_x2 = 
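// SSE2 has no 32-bit vector rotate, so every Salsa20 rotation is emitted as a
// shift pair. The code XORs the two halves into the target separately rather
// than combining them first; an equivalent one-register formulation would be
// (illustrative helper, not part of this file):
//
//   static inline __m128i rotl32_epi32(__m128i v, int k)
//   {
//       return _mm_or_si128(_mm_slli_epi32(v, k), _mm_srli_epi32(v, 32 - k));
//   }
//
// Splitting the XORs keeps the three lanes' dependency chains independent a
// little longer, which is the point of interleaving them.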
_mm_srli_epi32(_calc_x2,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_x3, 18); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 18)); + X3[0] ^= _shift_left; + X2[0] ^= _calc_x2; + X3[0] ^= _calc_x3; + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + X1[3] = _mm_shuffle_epi32(X1[3],0x93); //x[3] + X2[3] = _mm_shuffle_epi32(X2[3],0x93); //x[3] + X3[3] = _mm_shuffle_epi32(X3[3],0x93); //x[3] + X1[2] = _mm_shuffle_epi32(X1[2],0x4e); //x[2] + X2[2] = _mm_shuffle_epi32(X2[2],0x4e); //x[2] + X3[2] = _mm_shuffle_epi32(X3[2],0x4e); //x[2] + X1[1] = _mm_shuffle_epi32(X1[1],0x39); //x[1] + X2[1] = _mm_shuffle_epi32(X2[1],0x39); //x[1] + X3[1] = _mm_shuffle_epi32(X3[1],0x39); //x[1] + // end transpose + + // switch *calc_13 and * calc_7 usage compared to rows + /* first column X[1]=f(X0,X3) */ + _calc_x1 = _mm_add_epi32(X1[0], X1[3]); //X[0] and X[3] + _calc_x2 = _mm_add_epi32(X2[0], X2[3]); //X[0] and X[3] + _shift_left = _mm_slli_epi32(_calc_x1, 7); + _calc_x3 = _mm_add_epi32(X3[0], X3[3]); //X[0] and X[3] + X1[1] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_x2, 7); + X1[1] ^= _calc_x1; + X2[1] ^= _shift_left; + _calc_x2 = _mm_srli_epi32(_calc_x2,(32 - 7)); + _shift_left = _mm_slli_epi32(_calc_x3, 7); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 7)); + X3[1] ^= _shift_left; + X2[1] ^= _calc_x2; + X3[1] ^= _calc_x3; + + /* second column X[2]=f(X1,X0) */ + _calc_x1 = _mm_add_epi32(X1[1], X1[0]); //X[1] and X[0] + _calc_x2 = _mm_add_epi32(X2[1], X2[0]); //X[1] and X[0] + _shift_left = _mm_slli_epi32(_calc_x1, 9); + _calc_x3 = _mm_add_epi32(X3[1], X3[0]); //X[1] and X[0] + X1[2] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_x2, 9); + X1[2] ^= _calc_x1; + X2[2] ^= _shift_left; + _calc_x2 = _mm_srli_epi32(_calc_x2,(32 - 9)); + _shift_left = _mm_slli_epi32(_calc_x3, 9); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 9)); + X3[2] ^= _shift_left; + X2[2] ^= _calc_x2; + X3[2] ^= _calc_x3; + + /* third column X[3]=f(X2,X1) */ + _calc_x1 = _mm_add_epi32(X1[2], X1[1]); //X[2] and X[1] + _calc_x2 = _mm_add_epi32(X2[2], X2[1]); //X[2] and X[1] + _shift_left = _mm_slli_epi32(_calc_x1, 13); + _calc_x3 = _mm_add_epi32(X3[2], X3[1]); //X[2] and X[1] + X1[3] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_x2, 13); + X1[3] ^= _calc_x1; + X2[3] ^= _shift_left; + _calc_x2 = _mm_srli_epi32(_calc_x2,(32 - 13)); + _shift_left = _mm_slli_epi32(_calc_x3, 13); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 13)); + X3[3] ^= _shift_left; + X2[3] ^= _calc_x2; + X3[3] ^= _calc_x3; + + /* fourth column X[0]=f(X3,X2) */ + _calc_x1 = _mm_add_epi32(X1[3], X1[2]); //X[3] and X[2] + _calc_x2 = _mm_add_epi32(X2[3], X2[2]); //X[3] and X[2] + _shift_left = _mm_slli_epi32(_calc_x1, 18); + _calc_x3 = _mm_add_epi32(X3[3], X3[2]); //X[3] and X[2] + X1[0] ^= _shift_left; + _calc_x1 = _mm_srli_epi32(_calc_x1,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_x2, 18); + X1[0] ^= _calc_x1; //X[0] + X2[0] ^= _shift_left; + _calc_x2 = _mm_srli_epi32(_calc_x2,(32 - 18)); + _shift_left = _mm_slli_epi32(_calc_x3, 18); + _calc_x3 = _mm_srli_epi32(_calc_x3,(32 - 18)); + X3[0] ^= _shift_left; + X2[0] ^= _calc_x2; //X[0] + X3[0] ^= _calc_x3; //X[0] + + // transpose_matrix(row1, row2, row3, row4, row_to_column); + X1[3] = _mm_shuffle_epi32(X1[3],0x39); //x[3] + X2[3] = _mm_shuffle_epi32(X2[3],0x39); //x[3] + X3[3] = _mm_shuffle_epi32(X3[3],0x39); //x[3] + X1[2] = _mm_shuffle_epi32(X1[2],0x4e); 
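// The _mm_shuffle_epi32 immediates rotate each diagonal register between the
// row half-round and the column half-round: 0x39 maps lanes {0,1,2,3} to
// {1,2,3,0}, 0x4e to {2,3,0,1}, 0x93 to {3,0,1,2}. 0x39 and 0x93 are mutual
// inverses and 0x4e is its own inverse, so the 0x93/0x4e/0x39 set applied
// before the column half-round is undone by the 0x39/0x4e/0x93 set after it:
//
//   __m128i v = _mm_setr_epi32(0, 1, 2, 3);
//   v = _mm_shuffle_epi32(v, 0x39);          // lanes now {1, 2, 3, 0}
//   v = _mm_shuffle_epi32(v, 0x93);          // back to {0, 1, 2, 3}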
//x[2] + X2[2] = _mm_shuffle_epi32(X2[2],0x4e); //x[2] + X3[2] = _mm_shuffle_epi32(X3[2],0x4e); //x[2] + X1[1] = _mm_shuffle_epi32(X1[1],0x93); //x[1] + X2[1] = _mm_shuffle_epi32(X2[1],0x93); //x[1] + X3[1] = _mm_shuffle_epi32(X3[1],0x93); //x[1] + + // end transpose + } + + calc_11[0] = _mm_add_epi32(calc_11[0], X1[0]); + calc_11[1] = _mm_add_epi32(calc_11[1], X1[1]); + calc_11[2] = _mm_add_epi32(calc_11[2], X1[2]); + calc_11[3] = _mm_add_epi32(calc_11[3], X1[3]); + + calc_21[0] = _mm_add_epi32(calc_21[0], X2[0]); + calc_21[1] = _mm_add_epi32(calc_21[1], X2[1]); + calc_21[2] = _mm_add_epi32(calc_21[2], X2[2]); + calc_21[3] = _mm_add_epi32(calc_21[3], X2[3]); + + calc_31[0] = _mm_add_epi32(calc_31[0], X3[0]); + calc_31[1] = _mm_add_epi32(calc_31[1], X3[1]); + calc_31[2] = _mm_add_epi32(calc_31[2], X3[2]); + calc_31[3] = _mm_add_epi32(calc_31[3], X3[3]); + +} + + +static inline void scrypt_core_sidm_3way(__m128i *X , uint32_t Loops, uint32_t double_rounds) +{ + uint32_t i, j; + + __m128i scratch[Loops * 8 * 3]; + __m128i *SourcePtr = (__m128i*) X; + __m128i X11[4]; + __m128i X12[4]; + __m128i X21[4]; + __m128i X22[4]; + __m128i X31[4]; + __m128i X32[4]; + + __m128i *calc_11 = (__m128i*) X11; + __m128i *calc_21 = (__m128i*) X21; + __m128i *calc_31 = (__m128i*) X31; + __m128i *calc_12 = (__m128i*) X12; + __m128i *calc_22 = (__m128i*) X22; + __m128i *calc_32 = (__m128i*) X32; + + __m128i _calc5; + __m128i _calc6; + __m128i _calc7; + __m128i _calc8; + + // working with multiple pointers for the scratch-pad results in minimized instruction count. + __m128i *scratchPrt1 = &scratch[0]; + __m128i *scratchPrt2 = &scratch[1]; + __m128i *scratchPrt3 = &scratch[2]; + __m128i *scratchPrt4 = &scratch[3]; + __m128i *scratchPrt5 = &scratch[4]; + __m128i *scratchPrt6 = &scratch[5]; + __m128i *scratchPrt7 = &scratch[6]; + __m128i *scratchPrt8 = &scratch[7]; + + /* transpose the data from *X1x */ + _calc5 =_mm_blend_epi16(SourcePtr[0], SourcePtr[2], 0xf0); + _calc6 =_mm_blend_epi16(SourcePtr[1], SourcePtr[3], 0x0f); + _calc7 =_mm_blend_epi16(SourcePtr[2], SourcePtr[0], 0xf0); + _calc8 =_mm_blend_epi16(SourcePtr[3], SourcePtr[1], 0x0f); + calc_11[0] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + calc_11[1] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + calc_11[2] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + calc_11[3] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(SourcePtr[4], SourcePtr[6], 0xf0); + _calc6 =_mm_blend_epi16(SourcePtr[5], SourcePtr[7], 0x0f); + _calc7 =_mm_blend_epi16(SourcePtr[6], SourcePtr[4], 0xf0); + _calc8 =_mm_blend_epi16(SourcePtr[7], SourcePtr[5], 0x0f); + calc_12[0] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + calc_12[1] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + calc_12[2] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + calc_12[3] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + /* transpose the data from *X2x */ + _calc5 =_mm_blend_epi16(SourcePtr[8], SourcePtr[10], 0xf0); + _calc6 =_mm_blend_epi16(SourcePtr[9], SourcePtr[11], 0x0f); + _calc7 =_mm_blend_epi16(SourcePtr[10], SourcePtr[8], 0xf0); + _calc8 =_mm_blend_epi16(SourcePtr[11], SourcePtr[9], 0x0f); + calc_21[0] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + calc_21[1] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + calc_21[2] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + calc_21[3] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(SourcePtr[12], SourcePtr[14], 0xf0); + _calc6 =_mm_blend_epi16(SourcePtr[13], SourcePtr[15], 0x0f); + _calc7 =_mm_blend_epi16(SourcePtr[14], SourcePtr[12], 0xf0); + _calc8 
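// scrypt_core_sidm_3way runs the r = 1 core over three independent 128-byte
// states, each kept as two 64-byte halves (X11/X12, X21/X22, X31/X32), so one
// scratchpad iteration stores 24 __m128i (3 lanes x 2 halves x 4 registers).
// Per lane and iteration the classic half-block chaining runs once:
//
//   B0 ^= B1;  B0 = salsa20_8(B0);   // first halves:  xor_salsa_sidm_3way(calc_11, calc_21, calc_31, ..)
//   B1 ^= B0;  B1 = salsa20_8(B1);   // second halves: xor_salsa_sidm_3way(calc_12, calc_22, calc_32, ..)
//
// with all three lanes interleaved inside xor_salsa_sidm_3way so that roughly
// three independent instructions are available at each step of the chain.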
=_mm_blend_epi16(SourcePtr[15], SourcePtr[13], 0x0f); + calc_22[0] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + calc_22[1] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + calc_22[2] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + calc_22[3] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + /* transpose the data from *X3x */ + _calc5 =_mm_blend_epi16(SourcePtr[16], SourcePtr[18], 0xf0); + _calc6 =_mm_blend_epi16(SourcePtr[17], SourcePtr[19], 0x0f); + _calc7 =_mm_blend_epi16(SourcePtr[18], SourcePtr[16], 0xf0); + _calc8 =_mm_blend_epi16(SourcePtr[19], SourcePtr[17], 0x0f); + calc_31[0] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + calc_31[1] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + calc_31[2] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + calc_31[3] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(SourcePtr[20], SourcePtr[22], 0xf0); + _calc6 =_mm_blend_epi16(SourcePtr[21], SourcePtr[23], 0x0f); + _calc7 =_mm_blend_epi16(SourcePtr[22], SourcePtr[20], 0xf0); + _calc8 =_mm_blend_epi16(SourcePtr[23], SourcePtr[21], 0x0f); + calc_32[0] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + calc_32[1] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + calc_32[2] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + calc_32[3] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + for (i = 0; i < Loops; i++) { + for (j=0; j<4; j++){ + scratch[i * 24 + 0 + j] = calc_11[j]; + scratch[i * 24 + 4 + j] = calc_12[j]; + scratch[i * 24 + 8 + j] = calc_21[j]; + scratch[i * 24 + 12 + j] = calc_22[j]; + scratch[i * 24 + 16 + j] = calc_31[j]; + scratch[i * 24 + 20 + j] = calc_32[j]; + } + calc_11[0] ^= calc_12[0]; + calc_11[1] ^= calc_12[1]; + calc_11[2] ^= calc_12[2]; + calc_11[3] ^= calc_12[3]; + + calc_21[0] ^= calc_22[0]; + calc_21[1] ^= calc_22[1]; + calc_21[2] ^= calc_22[2]; + calc_21[3] ^= calc_22[3]; + + calc_31[0] ^= calc_32[0]; + calc_31[1] ^= calc_32[1]; + calc_31[2] ^= calc_32[2]; + calc_31[3] ^= calc_32[3]; + + xor_salsa_sidm_3way(calc_11, calc_21, calc_31, double_rounds); + + calc_12[0] ^= calc_11[0]; + calc_12[1] ^= calc_11[1]; + calc_12[2] ^= calc_11[2]; + calc_12[3] ^= calc_11[3]; + + calc_22[0] ^= calc_21[0]; + calc_22[1] ^= calc_21[1]; + calc_22[2] ^= calc_21[2]; + calc_22[3] ^= calc_21[3]; + + calc_32[0] ^= calc_31[0]; + calc_32[1] ^= calc_31[1]; + calc_32[2] ^= calc_31[2]; + calc_32[3] ^= calc_31[3]; + + xor_salsa_sidm_3way(calc_12, calc_22, calc_32, double_rounds); + } + for (i = 0; i < Loops; i++) { + j = 24 * (_mm_extract_epi16(calc_12[0],0x00) & (Loops-1)); + + calc_11[0] ^= scratchPrt1[j]; + calc_11[1] ^= scratchPrt2[j]; + calc_11[2] ^= scratchPrt3[j]; + calc_11[3] ^= scratchPrt4[j]; + calc_12[0] ^= scratchPrt5[j]; + calc_12[1] ^= scratchPrt6[j]; + calc_12[2] ^= scratchPrt7[j]; + calc_12[3] ^= scratchPrt8[j]; + + j = 8 + 24 * (_mm_extract_epi16(calc_22[0],0x00) & (Loops-1)); + + calc_21[0] ^= scratchPrt1[j]; + calc_21[1] ^= scratchPrt2[j]; + calc_21[2] ^= scratchPrt3[j]; + calc_21[3] ^= scratchPrt4[j]; + calc_22[0] ^= scratchPrt5[j]; + calc_22[1] ^= scratchPrt6[j]; + calc_22[2] ^= scratchPrt7[j]; + calc_22[3] ^= scratchPrt8[j]; + + j = 16 + 24 * (_mm_extract_epi16(calc_32[0],0x00) & (Loops-1)); + + calc_31[0] ^= scratchPrt1[j]; + calc_31[1] ^= scratchPrt2[j]; + calc_31[2] ^= scratchPrt3[j]; + calc_31[3] ^= scratchPrt4[j]; + calc_32[0] ^= scratchPrt5[j]; + calc_32[1] ^= scratchPrt6[j]; + calc_32[2] ^= scratchPrt7[j]; + calc_32[3] ^= scratchPrt8[j]; + + calc_11[0] ^= calc_12[0]; + calc_11[1] ^= calc_12[1]; + calc_11[2] ^= calc_12[2]; + calc_11[3] ^= calc_12[3]; + + calc_21[0] ^= calc_22[0]; + calc_21[1] ^= calc_22[1]; + 
calc_21[2] ^= calc_22[2]; + calc_21[3] ^= calc_22[3]; + + calc_31[0] ^= calc_32[0]; + calc_31[1] ^= calc_32[1]; + calc_31[2] ^= calc_32[2]; + calc_31[3] ^= calc_32[3]; + + xor_salsa_sidm_3way(calc_11, calc_21, calc_31, double_rounds); + + calc_12[0] ^= calc_11[0]; + calc_12[1] ^= calc_11[1]; + calc_12[2] ^= calc_11[2]; + calc_12[3] ^= calc_11[3]; + + calc_22[0] ^= calc_21[0]; + calc_22[1] ^= calc_21[1]; + calc_22[2] ^= calc_21[2]; + calc_22[3] ^= calc_21[3]; + + calc_32[0] ^= calc_31[0]; + calc_32[1] ^= calc_31[1]; + calc_32[2] ^= calc_31[2]; + calc_32[3] ^= calc_31[3]; + + xor_salsa_sidm_3way(calc_12, calc_22, calc_32, double_rounds); + } +// return the values to X + _calc5 =_mm_blend_epi16(calc_11[0], calc_11[2], 0xf0); + _calc6 =_mm_blend_epi16(calc_11[1], calc_11[3], 0x0f); + _calc7 =_mm_blend_epi16(calc_11[2], calc_11[0], 0xf0); + _calc8 =_mm_blend_epi16(calc_11[3], calc_11[1], 0x0f); + SourcePtr[0] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + SourcePtr[1] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + SourcePtr[2] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + SourcePtr[3] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(calc_12[0], calc_12[2], 0xf0); + _calc6 =_mm_blend_epi16(calc_12[1], calc_12[3], 0x0f); + _calc7 =_mm_blend_epi16(calc_12[2], calc_12[0], 0xf0); + _calc8 =_mm_blend_epi16(calc_12[3], calc_12[1], 0x0f); + SourcePtr[4] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + SourcePtr[5] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + SourcePtr[6] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + SourcePtr[7] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(calc_21[0], calc_21[2], 0xf0); + _calc6 =_mm_blend_epi16(calc_21[1], calc_21[3], 0x0f); + _calc7 =_mm_blend_epi16(calc_21[2], calc_21[0], 0xf0); + _calc8 =_mm_blend_epi16(calc_21[3], calc_21[1], 0x0f); + SourcePtr[8] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + SourcePtr[9] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + SourcePtr[10] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + SourcePtr[11] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(calc_22[0], calc_22[2], 0xf0); + _calc6 =_mm_blend_epi16(calc_22[1], calc_22[3], 0x0f); + _calc7 =_mm_blend_epi16(calc_22[2], calc_22[0], 0xf0); + _calc8 =_mm_blend_epi16(calc_22[3], calc_22[1], 0x0f); + SourcePtr[12] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + SourcePtr[13] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + SourcePtr[14] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + SourcePtr[15] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(calc_31[0], calc_31[2], 0xf0); + _calc6 =_mm_blend_epi16(calc_31[1], calc_31[3], 0x0f); + _calc7 =_mm_blend_epi16(calc_31[2], calc_31[0], 0xf0); + _calc8 =_mm_blend_epi16(calc_31[3], calc_31[1], 0x0f); + SourcePtr[16] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + SourcePtr[17] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + SourcePtr[18] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + SourcePtr[19] = _mm_blend_epi16(_calc8, _calc7, 0xcc); + + _calc5 =_mm_blend_epi16(calc_32[0], calc_32[2], 0xf0); + _calc6 =_mm_blend_epi16(calc_32[1], calc_32[3], 0x0f); + _calc7 =_mm_blend_epi16(calc_32[2], calc_32[0], 0xf0); + _calc8 =_mm_blend_epi16(calc_32[3], calc_32[1], 0x0f); + SourcePtr[20] = _mm_blend_epi16(_calc5, _calc8, 0xcc); + SourcePtr[21] = _mm_blend_epi16(_calc6, _calc5, 0xcc); + SourcePtr[22] = _mm_blend_epi16(_calc7, _calc6, 0xcc); + SourcePtr[23] = _mm_blend_epi16(_calc8, _calc7, 0xcc); +} + +#endif +
diff --git a/util.c b/util.c index 6608fd727..9f1bb10c8 100644 --- a/util.c +++ b/util.c @@ -1022,11 +1022,55 @@ static const char *get_stratum_session_id(json_t *val) return NULL; } +static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, int pndx) +{ + const char* xnonce1; + int xn2_size; + + xnonce1 = json_string_value(json_array_get(params, pndx)); + if (!xnonce1) { + applog(LOG_ERR, "Failed to get extranonce1"); + goto out; + } + xn2_size = json_integer_value(json_array_get(params, pndx+1)); + if (!xn2_size) { + applog(LOG_ERR, "Failed to get extranonce2_size"); + goto out; + } + if (xn2_size < 2 || xn2_size > 16) { + applog(LOG_INFO, "Invalid extranonce2_size %d in parse_extranonce", xn2_size); + goto out; + } + + pthread_mutex_lock(&sctx->work_lock); + if (sctx->xnonce1) + free(sctx->xnonce1); + sctx->xnonce1_size = strlen(xnonce1) / 2; + sctx->xnonce1 = malloc(sctx->xnonce1_size); + if (unlikely(!sctx->xnonce1)) { + applog(LOG_ERR, "Failed to alloc xnonce1"); + pthread_mutex_unlock(&sctx->work_lock); + goto out; + } + hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); + sctx->xnonce2_size = xn2_size; + pthread_mutex_unlock(&sctx->work_lock); + + if (pndx == 0 && opt_debug) /* pool dynamic change */ + applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d", + xnonce1, xn2_size); + + return true; +out: + return false; +} + + + bool stratum_subscribe(struct stratum_ctx *sctx) { char *s, *sret = NULL; - const char *sid, *xnonce1; - int xn2_size; + const char *sid; json_t *val = NULL, *res_val, *err_val; json_error_t err; bool ret = false, retry = false;
@@ -1078,29 +1122,19 @@ bool stratum_subscribe(struct stratum_ctx *sctx) } sid = get_stratum_session_id(res_val); - if (opt_debug && !sid) - applog(LOG_DEBUG, "Failed to get Stratum session id"); - xnonce1 = json_string_value(json_array_get(res_val, 1)); - if (!xnonce1) { - applog(LOG_ERR, "Failed to get extranonce1"); - goto out; - } - xn2_size = json_integer_value(json_array_get(res_val, 2)); - if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); + + pthread_mutex_lock(&sctx->work_lock); + if (sctx->session_id) + free(sctx->session_id); + sctx->session_id = sid ? strdup(sid) : NULL; + sctx->next_diff = 1.0; + pthread_mutex_unlock(&sctx->work_lock); + +// in the subscribe result the session id sits inside element 0; extranonce1 and extranonce2_size are elements 1 and 2 + if (!stratum_parse_extranonce(sctx, res_val, 1)) { goto out; } - pthread_mutex_lock(&sctx->work_lock); - free(sctx->session_id); - free(sctx->xnonce1); - sctx->session_id = sid ? strdup(sid) : NULL; - sctx->xnonce1_size = strlen(xnonce1) / 2; - sctx->xnonce1 = malloc(sctx->xnonce1_size); - hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); - sctx->xnonce2_size = xn2_size; - sctx->next_diff = 1.0; - pthread_mutex_unlock(&sctx->work_lock); if (opt_debug && sid) applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id);
@@ -1162,6 +1196,44 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p } ret = true; + // subscribe to extranonce (optional) + sprintf(s, "{\"id\": 3, \"method\": \"mining.extranonce.subscribe\", \"params\": []}"); + + if (!stratum_send_line(sctx, s)) + goto out; + + while (1) { + if (!socket_full(sctx->sock, 3)) { + if (opt_debug) + applog(LOG_DEBUG, "stratum extranonce subscribe timed out"); + goto out; + } + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + if (!stratum_handle_method(sctx, sret)) + break; + free(sret); + } + + // the line that ended the loop is the pool's answer to the subscribe + if (sret) { + json_t *extra = JSON_LOADS(sret, &err); + if (!extra) { + applog(LOG_WARNING, "JSON decode failed(%d): %s", err.line, err.text); + } else { + if (json_integer_value(json_object_get(extra, "id")) != 3) { + // we receive a standard method if extranonce is ignored + if (!stratum_handle_method(sctx, sret)) + applog(LOG_WARNING, "Stratum answer id is not correct!"); + } + res_val = json_object_get(extra, "result"); + if (opt_debug && (!res_val || json_is_false(res_val))) + applog(LOG_DEBUG, "extranonce subscribe not supported"); + json_decref(extra); + } + free(sret); + } out: free(s);
@@ -1364,6 +1436,7 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) } method = json_string_value(json_object_get(val, "method")); +// applog(LOG_DEBUG, "Parse_Method: %s", method); if (!method) goto out; id = json_object_get(val, "id"); @@ -1377,6 +1450,10 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) ret = stratum_set_difficulty(sctx, params); goto out; } + if (!strcasecmp(method, "mining.set_extranonce")) { + ret = stratum_parse_extranonce(sctx, params, 0); + goto out; + } if (!strcasecmp(method, "client.reconnect")) { ret = stratum_reconnect(sctx, params); goto out;
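For reference, the wire exchange behind the new stratum code looks roughly like this (values are illustrative; extranonce1 and the size differ per pool):

  >> {"id": 3, "method": "mining.extranonce.subscribe", "params": []}
  << {"id": 3, "result": true, "error": null}
  << {"id": null, "method": "mining.set_extranonce", "params": ["08000002", 4]}

stratum_handle_method() routes a mining.set_extranonce notification into stratum_parse_extranonce() with pndx = 0 (extranonce1 at params[0], extranonce2_size at params[1]); the same parser consumes the mining.subscribe result with pndx = 1, where element 0 carries the subscription details including the session id.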