From aa82b5836bc778f5c57d824e09a1d4884938ef33 Mon Sep 17 00:00:00 2001 From: Sun Yimin Date: Mon, 11 Nov 2024 17:40:41 +0800 Subject: [PATCH] sm4: use package level instead of local for shared variables --- sm4/aesni_macros_amd64.s | 227 +++++++++++++++------------------------ sm4/aesni_macros_arm64.s | 38 +------ sm4/asm_amd64.s | 73 +++++++++++-- sm4/asm_arm64.s | 33 ++++-- sm4/cbc_amd64.s | 6 +- sm4/cbc_arm64.s | 20 ++-- sm4/ecb_amd64.s | 6 +- sm4/ecb_arm64.s | 18 ++-- sm4/gcm_amd64.s | 36 +++---- sm4/gcm_arm64.s | 15 +-- sm4/xts_amd64.s | 26 ++--- sm4/xts_arm64.s | 14 +-- zuc/asm_ppc64x.s | 1 - 13 files changed, 247 insertions(+), 266 deletions(-) diff --git a/sm4/aesni_macros_amd64.s b/sm4/aesni_macros_amd64.s index 4fd1b40..4cb4066 100644 --- a/sm4/aesni_macros_amd64.s +++ b/sm4/aesni_macros_amd64.s @@ -1,62 +1,3 @@ -// shuffle byte order from LE to BE -DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203 -DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b -GLOBL flip_mask<>(SB), 8, $16 - -// shuffle byte and word order -DATA bswap_mask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f -DATA bswap_mask<>+0x08(SB)/8, $0x0001020304050607 -GLOBL bswap_mask<>(SB), 8, $16 - -//nibble mask -DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F -DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F -GLOBL nibble_mask<>(SB), 8, $16 - -// inverse shift rows -DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 -DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 -DATA inverse_shift_rows<>+0x10(SB)/8, $0x0B0E0104070A0D00 -DATA inverse_shift_rows<>+0x18(SB)/8, $0x0306090C0F020508 -GLOBL inverse_shift_rows<>(SB), 8, $32 - -// Affine transform 1 (low and high nibbles) -DATA m1_low<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 -DATA m1_low<>+0x08(SB)/8, $0x3045F98CEF9A2653 -DATA m1_low<>+0x10(SB)/8, $0x0A7FC3B6D5A01C69 -DATA m1_low<>+0x18(SB)/8, $0x3045F98CEF9A2653 -GLOBL m1_low<>(SB), 8, $32 - -DATA m1_high<>+0x00(SB)/8, $0xC35BF46CAF379800 -DATA m1_high<>+0x08(SB)/8, $0x68F05FC7049C33AB -DATA m1_high<>+0x10(SB)/8, $0xC35BF46CAF379800 -DATA m1_high<>+0x18(SB)/8, $0x68F05FC7049C33AB -GLOBL m1_high<>(SB), 8, $32 - -// Affine transform 2 (low and high nibbles) -DATA m2_low<>+0x00(SB)/8, $0x9A950A05FEF16E61 -DATA m2_low<>+0x08(SB)/8, $0x0E019E916A65FAF5 -DATA m2_low<>+0x10(SB)/8, $0x9A950A05FEF16E61 -DATA m2_low<>+0x18(SB)/8, $0x0E019E916A65FAF5 -GLOBL m2_low<>(SB), 8, $32 - -DATA m2_high<>+0x00(SB)/8, $0x892D69CD44E0A400 -DATA m2_high<>+0x08(SB)/8, $0x2C88CC68E14501A5 -DATA m2_high<>+0x10(SB)/8, $0x892D69CD44E0A400 -DATA m2_high<>+0x18(SB)/8, $0x2C88CC68E14501A5 -GLOBL m2_high<>(SB), 8, $32 - -// left rotations of 32-bit words by 8-bit increments -DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 -DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA r08_mask<>+0x10(SB)/8, $0x0605040702010003 -DATA r08_mask<>+0x18(SB)/8, $0x0E0D0C0F0A09080B -GLOBL r08_mask<>(SB), 8, $32 - -DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 -DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 -GLOBL fk_mask<>(SB), 8, $16 - // Transpose matrix with PUNPCKHDQ/PUNPCKLDQ/PUNPCKHQDQ/PUNPCKLQDQ instructions. 
// input: from high to low // r0 = [w3, w2, w1, w0] @@ -110,26 +51,26 @@ GLOBL fk_mask<>(SB), 8, $16 #define SM4_SBOX(x, y, z) \ ; \ //############################# inner affine ############################// MOVOU x, z; \ - PAND nibble_mask<>(SB), z; \ //y = _mm_and_si128(x, c0f); - MOVOU m1_low<>(SB), y; \ + PAND ·nibble_mask(SB), z; \ //y = _mm_and_si128(x, c0f); + MOVOU ·m1_low(SB), y; \ PSHUFB z, y; \ //y = _mm_shuffle_epi8(m1l, y); PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); - PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); - MOVOU m1_high<>(SB), z; \ + PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f); + MOVOU ·m1_high(SB), z; \ PSHUFB x, z; \ //x = _mm_shuffle_epi8(m1h, x); MOVOU z, x; \ //x = _mm_shuffle_epi8(m1h, x); PXOR y, x; \ //x = _mm_shuffle_epi8(m1h, x) ^ y; ; \ // inverse ShiftRows - PSHUFB inverse_shift_rows<>(SB), x; \ //x = _mm_shuffle_epi8(x, shr); - AESENCLAST nibble_mask<>(SB), x; \ // AESNI instruction + PSHUFB ·inverse_shift_rows(SB), x; \ //x = _mm_shuffle_epi8(x, shr); + AESENCLAST ·nibble_mask(SB), x; \ // AESNI instruction ; \ //############################# outer affine ############################// MOVOU x, z; \ - PANDN nibble_mask<>(SB), z; \ //z = _mm_andnot_si128(x, c0f); - MOVOU m2_low<>(SB), y; \ + PANDN ·nibble_mask(SB), z; \ //z = _mm_andnot_si128(x, c0f); + MOVOU ·m2_low(SB), y; \ PSHUFB z, y; \ //y = _mm_shuffle_epi8(m2l, z) PSRLQ $4, x; \ //x = _mm_srli_epi64(x, 4); - PAND nibble_mask<>(SB), x; \ //x = _mm_and_si128(x, c0f); - MOVOU m2_high<>(SB), z; \ + PAND ·nibble_mask(SB), x; \ //x = _mm_and_si128(x, c0f); + MOVOU ·m2_high(SB), z; \ PSHUFB x, z; \ MOVOU z, x; \ //x = _mm_shuffle_epi8(m2h, x) PXOR y, x //x = _mm_shuffle_epi8(m2h, x) ^ y; @@ -143,12 +84,12 @@ GLOBL fk_mask<>(SB), 8, $16 SM4_SBOX(x, y, z); \ ; \ //#################### 4 parallel L1 linear transforms ##################// MOVOU x, y; \ - PSHUFB r08_mask<>(SB), y; \ //y = x <<< 8 + PSHUFB ·r08_mask(SB), y; \ //y = x <<< 8 MOVOU y, z; \ - PSHUFB r08_mask<>(SB), z; \ //z = x <<< 16 + PSHUFB ·r08_mask(SB), z; \ //z = x <<< 16 PXOR x, y; \ //y = x ^ (x <<< 8) PXOR z, y; \ //y = x ^ (x <<< 8) ^ (x <<< 16) - PSHUFB r08_mask<>(SB), z; \ //z = x <<< 24 + PSHUFB ·r08_mask(SB), z; \ //z = x <<< 24 PXOR z, x; \ //x = x ^ (x <<< 24) MOVOU y, z; \ PSLLL $2, z; \ @@ -214,7 +155,7 @@ GLOBL fk_mask<>(SB), 8, $16 // Requires: SSSE3 #define SM4_SINGLE_BLOCK(RK, rk128, x, y, z, t0, t1, t2, t3) \ - PSHUFB flip_mask<>(SB), t0; \ + PSHUFB ·flip_mask(SB), t0; \ PSHUFD $1, t0, t1; \ PSHUFD $2, t0, t2; \ PSHUFD $3, t0, t3; \ @@ -238,13 +179,13 @@ GLOBL fk_mask<>(SB), 8, $16 PALIGNR $4, t3, t2; \ PALIGNR $4, t2, t1; \ PALIGNR $4, t1, t0; \ - PSHUFB flip_mask<>(SB), t0 + PSHUFB ·flip_mask(SB), t0 #define SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \ - PSHUFB flip_mask<>(SB), t0; \ - PSHUFB flip_mask<>(SB), t1; \ - PSHUFB flip_mask<>(SB), t2; \ - PSHUFB flip_mask<>(SB), t3; \ + PSHUFB ·flip_mask(SB), t0; \ + PSHUFB ·flip_mask(SB), t1; \ + PSHUFB ·flip_mask(SB), t2; \ + PSHUFB ·flip_mask(SB), t3; \ SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) #define SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) \ @@ -266,10 +207,10 @@ GLOBL fk_mask<>(SB), 8, $16 MOVOU (7*16)(RK), rk128; \ SM4_4BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3); \ SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ - PSHUFB bswap_mask<>(SB), t3; \ - PSHUFB bswap_mask<>(SB), t2; \ - PSHUFB bswap_mask<>(SB), t1; \ - PSHUFB bswap_mask<>(SB), t0 + PSHUFB ·bswap_mask(SB), t3; \ + PSHUFB ·bswap_mask(SB), t2; \ + PSHUFB 
·bswap_mask(SB), t1; \ + PSHUFB ·bswap_mask(SB), t0 #define SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ PSHUFD $0, rk128, x; \ @@ -290,14 +231,14 @@ GLOBL fk_mask<>(SB), 8, $16 SM4_ONE_ROUND_SSE(x, y, z, t7, t4, t5, t6); \ #define SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ - PSHUFB flip_mask<>(SB), t0; \ - PSHUFB flip_mask<>(SB), t1; \ - PSHUFB flip_mask<>(SB), t2; \ - PSHUFB flip_mask<>(SB), t3; \ - PSHUFB flip_mask<>(SB), t4; \ - PSHUFB flip_mask<>(SB), t5; \ - PSHUFB flip_mask<>(SB), t6; \ - PSHUFB flip_mask<>(SB), t7; \ + PSHUFB ·flip_mask(SB), t0; \ + PSHUFB ·flip_mask(SB), t1; \ + PSHUFB ·flip_mask(SB), t2; \ + PSHUFB ·flip_mask(SB), t3; \ + PSHUFB ·flip_mask(SB), t4; \ + PSHUFB ·flip_mask(SB), t5; \ + PSHUFB ·flip_mask(SB), t6; \ + PSHUFB ·flip_mask(SB), t7; \ SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) #define SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ @@ -321,14 +262,14 @@ GLOBL fk_mask<>(SB), 8, $16 SM4_8BLOCKS_4ROUNDS(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7); \ SSE_TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y); \ SSE_TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y); \ - PSHUFB bswap_mask<>(SB), t3; \ - PSHUFB bswap_mask<>(SB), t2; \ - PSHUFB bswap_mask<>(SB), t1; \ - PSHUFB bswap_mask<>(SB), t0; \ - PSHUFB bswap_mask<>(SB), t7; \ - PSHUFB bswap_mask<>(SB), t6; \ - PSHUFB bswap_mask<>(SB), t5; \ - PSHUFB bswap_mask<>(SB), t4 + PSHUFB ·bswap_mask(SB), t3; \ + PSHUFB ·bswap_mask(SB), t2; \ + PSHUFB ·bswap_mask(SB), t1; \ + PSHUFB ·bswap_mask(SB), t0; \ + PSHUFB ·bswap_mask(SB), t7; \ + PSHUFB ·bswap_mask(SB), t6; \ + PSHUFB ·bswap_mask(SB), t5; \ + PSHUFB ·bswap_mask(SB), t4 // SM4 sbox function, AVX version // parameters: @@ -336,22 +277,22 @@ GLOBL fk_mask<>(SB), 8, $16 // - y: 128 bits temp register // - tmp: 128 bits temp register #define AVX_SM4_SBOX(x, y, tmp) \ - VPAND nibble_mask<>(SB), x, tmp; \ - VMOVDQU m1_low<>(SB), y; \ + VPAND ·nibble_mask(SB), x, tmp; \ + VMOVDQU ·m1_low(SB), y; \ VPSHUFB tmp, y, y; \ VPSRLQ $4, x, x; \ - VPAND nibble_mask<>(SB), x, x; \ - VMOVDQU m1_high<>(SB), tmp; \ + VPAND ·nibble_mask(SB), x, x; \ + VMOVDQU ·m1_high(SB), tmp; \ VPSHUFB x, tmp, x; \ VPXOR y, x, x; \ - VPSHUFB inverse_shift_rows<>(SB), x, x; \ - VAESENCLAST nibble_mask<>(SB), x, x; \ - VPANDN nibble_mask<>(SB), x, tmp; \ - VMOVDQU m2_low<>(SB), y; \ + VPSHUFB ·inverse_shift_rows(SB), x, x; \ + VAESENCLAST ·nibble_mask(SB), x, x; \ + VPANDN ·nibble_mask(SB), x, tmp; \ + VMOVDQU ·m2_low(SB), y; \ VPSHUFB tmp, y, y; \ VPSRLQ $4, x, x; \ - VPAND nibble_mask<>(SB), x, x; \ - VMOVDQU m2_high<>(SB), tmp; \ + VPAND ·nibble_mask(SB), x, x; \ + VMOVDQU ·m2_high(SB), tmp; \ VPSHUFB x, tmp, x; \ VPXOR y, x, x @@ -362,11 +303,11 @@ GLOBL fk_mask<>(SB), 8, $16 // - tmp: 128 bits temp register #define AVX_SM4_TAO_L1(x, y, tmp) \ AVX_SM4_SBOX(x, y, tmp); \ - VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8 - VPSHUFB r08_mask<>(SB), y, tmp; \ // tmp = x <<< 16 + VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8 + VPSHUFB ·r08_mask(SB), y, tmp; \ // tmp = x <<< 16 VPXOR x, y, y; \ // y = x ^ (x <<< 8) VPXOR tmp, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16) - VPSHUFB r08_mask<>(SB), tmp, tmp; \ // tmp = x <<< 24 + VPSHUFB ·r08_mask(SB), tmp, tmp; \ // tmp = x <<< 24 VPXOR x, tmp, x; \ // x = x ^ (x <<< 24) VPSLLD $2, y, tmp; \ VPSRLD $30, y, y; \ @@ -429,10 +370,10 @@ GLOBL fk_mask<>(SB), 8, $16 SM4_ONE_ROUND_AVX(x, y, z, t3, t0, t1, t2); \ #define AVX_SM4_4BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3) \ - VPSHUFB 
flip_mask<>(SB), t0, t0 \ - VPSHUFB flip_mask<>(SB), t1, t1 \ - VPSHUFB flip_mask<>(SB), t2, t2 \ - VPSHUFB flip_mask<>(SB), t3, t3 \ + VPSHUFB ·flip_mask(SB), t0, t0 \ + VPSHUFB ·flip_mask(SB), t1, t1 \ + VPSHUFB ·flip_mask(SB), t2, t2 \ + VPSHUFB ·flip_mask(SB), t3, t3 \ ; \ AVX_SM4_4BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3) @@ -456,10 +397,10 @@ GLOBL fk_mask<>(SB), 8, $16 SM4_4BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3); \ ; \ // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ - VPSHUFB bswap_mask<>(SB), t0, t0 \ - VPSHUFB bswap_mask<>(SB), t1, t1 \ - VPSHUFB bswap_mask<>(SB), t2, t2 \ - VPSHUFB bswap_mask<>(SB), t3, t3 \ + VPSHUFB ·bswap_mask(SB), t0, t0 \ + VPSHUFB ·bswap_mask(SB), t1, t1 \ + VPSHUFB ·bswap_mask(SB), t2, t2 \ + VPSHUFB ·bswap_mask(SB), t3, t3 \ #define SM4_8BLOCKS_4ROUNDS_AVX(rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ VPSHUFD $0, rk128, x; \ @@ -480,14 +421,14 @@ GLOBL fk_mask<>(SB), 8, $16 SM4_ONE_ROUND_AVX(x, y, z, t7, t4, t5, t6); \ #define AVX_SM4_8BLOCKS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) \ - VPSHUFB flip_mask<>(SB), t0, t0 \ - VPSHUFB flip_mask<>(SB), t1, t1 \ - VPSHUFB flip_mask<>(SB), t2, t2 \ - VPSHUFB flip_mask<>(SB), t3, t3 \ - VPSHUFB flip_mask<>(SB), t4, t4 \ - VPSHUFB flip_mask<>(SB), t5, t5 \ - VPSHUFB flip_mask<>(SB), t6, t6 \ - VPSHUFB flip_mask<>(SB), t7, t7 \ + VPSHUFB ·flip_mask(SB), t0, t0 \ + VPSHUFB ·flip_mask(SB), t1, t1 \ + VPSHUFB ·flip_mask(SB), t2, t2 \ + VPSHUFB ·flip_mask(SB), t3, t3 \ + VPSHUFB ·flip_mask(SB), t4, t4 \ + VPSHUFB ·flip_mask(SB), t5, t5 \ + VPSHUFB ·flip_mask(SB), t6, t6 \ + VPSHUFB ·flip_mask(SB), t7, t7 \ ; \ AVX_SM4_8BLOCKS_WO_BS(RK, rk128, x, y, z, t0, t1, t2, t3, t4, t5, t6, t7) @@ -513,14 +454,14 @@ GLOBL fk_mask<>(SB), 8, $16 ; \ // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(t0, t1, t2, t3, x, y) \ TRANSPOSE_MATRIX(t4, t5, t6, t7, x, y) \ - VPSHUFB bswap_mask<>(SB), t0, t0 \ - VPSHUFB bswap_mask<>(SB), t1, t1 \ - VPSHUFB bswap_mask<>(SB), t2, t2 \ - VPSHUFB bswap_mask<>(SB), t3, t3 \ - VPSHUFB bswap_mask<>(SB), t4, t4 \ - VPSHUFB bswap_mask<>(SB), t5, t5 \ - VPSHUFB bswap_mask<>(SB), t6, t6 \ - VPSHUFB bswap_mask<>(SB), t7, t7 \ + VPSHUFB ·bswap_mask(SB), t0, t0 \ + VPSHUFB ·bswap_mask(SB), t1, t1 \ + VPSHUFB ·bswap_mask(SB), t2, t2 \ + VPSHUFB ·bswap_mask(SB), t3, t3 \ + VPSHUFB ·bswap_mask(SB), t4, t4 \ + VPSHUFB ·bswap_mask(SB), t5, t5 \ + VPSHUFB ·bswap_mask(SB), t6, t6 \ + VPSHUFB ·bswap_mask(SB), t7, t7 \ // SM4 sbox function, AVX2 version // parameters: @@ -533,24 +474,24 @@ GLOBL fk_mask<>(SB), 8, $16 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. 
#define AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ VPAND yNibbleMask, x, z; \ - VMOVDQU m1_low<>(SB), y; \ + VMOVDQU ·m1_low(SB), y; \ VPSHUFB z, y, y; \ VPSRLQ $4, x, x; \ VPAND yNibbleMask, x, x; \ - VMOVDQU m1_high<>(SB), z; \ + VMOVDQU ·m1_high(SB), z; \ VPSHUFB x, z, x; \ VPXOR y, x, x; \ - VPSHUFB inverse_shift_rows<>(SB), x, x; \ + VPSHUFB ·inverse_shift_rows(SB), x, x; \ VEXTRACTI128 $1, x, yw \ VAESENCLAST xNibbleMask, xw, xw; \ VAESENCLAST xNibbleMask, yw, yw; \ VINSERTI128 $1, yw, x, x; \ VPANDN yNibbleMask, x, z; \ - VMOVDQU m2_low<>(SB), y; \ + VMOVDQU ·m2_low(SB), y; \ VPSHUFB z, y, y; \ VPSRLQ $4, x, x; \ VPAND yNibbleMask, x, x; \ - VMOVDQU m2_high<>(SB), z; \ + VMOVDQU ·m2_high(SB), z; \ VPSHUFB x, z, x; \ VPXOR y, x, x @@ -565,11 +506,11 @@ GLOBL fk_mask<>(SB), 8, $16 // - yNibbleMask: 256 bits register stored nibble mask, should be loaded earlier. #define AVX2_SM4_TAO_L1(x, y, z, xw, yw, xNibbleMask, yNibbleMask) \ AVX2_SM4_SBOX(x, y, z, xw, yw, xNibbleMask, yNibbleMask); \ - VPSHUFB r08_mask<>(SB), x, y; \ // y = x <<< 8 - VPSHUFB r08_mask<>(SB), y, z; \ // z = x <<< 16 + VPSHUFB ·r08_mask(SB), x, y; \ // y = x <<< 8 + VPSHUFB ·r08_mask(SB), y, z; \ // z = x <<< 16 VPXOR x, y, y; \ // y = x ^ (x <<< 8) VPXOR z, y, y; \ // y = x ^ (x <<< 8) ^ (x <<< 16) - VPSHUFB r08_mask<>(SB), z, z; \ // z = x <<< 24 + VPSHUFB ·r08_mask(SB), z, z; \ // z = x <<< 24 VPXOR x, z, x; \ // x = x ^ (x <<< 24) VPSLLD $2, y, z; \ VPSRLD $30, y, y; \ diff --git a/sm4/aesni_macros_arm64.s b/sm4/aesni_macros_arm64.s index 0a57a3a..151971a 100644 --- a/sm4/aesni_macros_arm64.s +++ b/sm4/aesni_macros_arm64.s @@ -1,37 +1,9 @@ -// inverse shift rows -DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00 -DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508 -GLOBL inverse_shift_rows<>(SB), (16+8), $16 - -// Affine transform 1 & 2 (low and high nibbles) -DATA m1_2<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69 -DATA m1_2<>+0x08(SB)/8, $0x3045F98CEF9A2653 -DATA m1_2<>+0x10(SB)/8, $0xC35BF46CAF379800 -DATA m1_2<>+0x18(SB)/8, $0x68F05FC7049C33AB -DATA m1_2<>+0x20(SB)/8, $0x9A950A05FEF16E61 -DATA m1_2<>+0x28(SB)/8, $0x0E019E916A65FAF5 -DATA m1_2<>+0x30(SB)/8, $0x892D69CD44E0A400 -DATA m1_2<>+0x38(SB)/8, $0x2C88CC68E14501A5 -GLOBL m1_2<>(SB), (16+8), $64 - -// left rotations of 32-bit words by 8-bit increments -DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003 -DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -GLOBL r08_mask<>(SB), (16+8), $16 - -DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6 -DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197 -GLOBL fk_mask<>(SB), (16+8), $16 - #define LOAD_SM4_AESNI_CONSTS() \ - MOVW $0x0F0F0F0F, R20 \ - VDUP R20, NIBBLE_MASK.S4 \ - MOVD $m1_2<>(SB), R20 \ - VLD1 (R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \ - MOVD $inverse_shift_rows<>(SB), R20 \ - VLD1 (R20), [INVERSE_SHIFT_ROWS.B16] \ - MOVD $r08_mask<>(SB), R20 \ - VLD1 (R20), [R08_MASK.B16] \ + MOVW $0x0F0F0F0F, R20 \ + VDUP R20, NIBBLE_MASK.S4 \ + MOVD $·rcon(SB), R20 \ + VLD1.P 64(R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \ + VLD1 (R20), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16] // input: from high to low // t0 = t0.S3, t0.S2, t0.S1, t0.S0 diff --git a/sm4/asm_amd64.s b/sm4/asm_amd64.s index 974315c..0248535 100644 --- a/sm4/asm_amd64.s +++ b/sm4/asm_amd64.s @@ -13,6 +13,61 @@ #define XTMP6 X10 #define XTMP7 X11 +// shuffle byte order from LE to BE +DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203 +DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b +GLOBL ·flip_mask(SB), RODATA, $16 + +// shuffle byte and word order 
+DATA ·bswap_mask+0x00(SB)/8, $0x08090a0b0c0d0e0f +DATA ·bswap_mask+0x08(SB)/8, $0x0001020304050607 +GLOBL ·bswap_mask(SB), RODATA, $16 + +//nibble mask +DATA ·nibble_mask+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F +DATA ·nibble_mask+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F +GLOBL ·nibble_mask(SB), RODATA, $16 + +// inverse shift rows +DATA ·inverse_shift_rows+0x00(SB)/8, $0x0B0E0104070A0D00 +DATA ·inverse_shift_rows+0x08(SB)/8, $0x0306090C0F020508 +DATA ·inverse_shift_rows+0x10(SB)/8, $0x0B0E0104070A0D00 +DATA ·inverse_shift_rows+0x18(SB)/8, $0x0306090C0F020508 +GLOBL ·inverse_shift_rows(SB), RODATA, $32 + +// Affine transform 1 (low and high nibbles) +DATA ·m1_low+0x00(SB)/8, $0x0A7FC3B6D5A01C69 +DATA ·m1_low+0x08(SB)/8, $0x3045F98CEF9A2653 +DATA ·m1_low+0x10(SB)/8, $0x0A7FC3B6D5A01C69 +DATA ·m1_low+0x18(SB)/8, $0x3045F98CEF9A2653 +GLOBL ·m1_low(SB), RODATA, $32 + +DATA ·m1_high+0x00(SB)/8, $0xC35BF46CAF379800 +DATA ·m1_high+0x08(SB)/8, $0x68F05FC7049C33AB +DATA ·m1_high+0x10(SB)/8, $0xC35BF46CAF379800 +DATA ·m1_high+0x18(SB)/8, $0x68F05FC7049C33AB +GLOBL ·m1_high(SB), RODATA, $32 + +// Affine transform 2 (low and high nibbles) +DATA ·m2_low+0x00(SB)/8, $0x9A950A05FEF16E61 +DATA ·m2_low+0x08(SB)/8, $0x0E019E916A65FAF5 +DATA ·m2_low+0x10(SB)/8, $0x9A950A05FEF16E61 +DATA ·m2_low+0x18(SB)/8, $0x0E019E916A65FAF5 +GLOBL ·m2_low(SB), RODATA, $32 + +DATA ·m2_high+0x00(SB)/8, $0x892D69CD44E0A400 +DATA ·m2_high+0x08(SB)/8, $0x2C88CC68E14501A5 +DATA ·m2_high+0x10(SB)/8, $0x892D69CD44E0A400 +DATA ·m2_high+0x18(SB)/8, $0x2C88CC68E14501A5 +GLOBL ·m2_high(SB), RODATA, $32 + +// left rotations of 32-bit words by 8-bit increments +DATA ·r08_mask+0x00(SB)/8, $0x0605040702010003 +DATA ·r08_mask+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA ·r08_mask+0x10(SB)/8, $0x0605040702010003 +DATA ·r08_mask+0x18(SB)/8, $0x0E0D0C0F0A09080B +GLOBL ·r08_mask(SB), RODATA, $32 + #include "aesni_macros_amd64.s" // SM4 TAO L2 function, used for key expand @@ -105,8 +160,8 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0 MOVQ dec+24(FP), DI MOVUPS 0(AX), t0 - PSHUFB flip_mask<>(SB), t0 - PXOR fk_mask<>(SB), t0 + PSHUFB ·flip_mask(SB), t0 + PXOR ·fk(SB), t0 PSHUFD $1, t0, t1 PSHUFD $2, t0, t2 PSHUFD $3, t0, t3 @@ -225,7 +280,7 @@ avx_done_sm4: RET avx2: - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK CMPQ DI, $256 JEQ avx2_16blocks @@ -235,7 +290,7 @@ avx2_8blocks: VMOVDQU 32(DX), XDWORD1 VMOVDQU 64(DX), XDWORD2 VMOVDQU 96(DX), XDWORD3 - VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK + VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK // Apply Byte Flip Mask: LE -> BE VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 @@ -251,7 +306,7 @@ avx2_8blocks: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) - VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK + VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 @@ -275,7 +330,7 @@ avx2_16blocks: VMOVDQU 192(DX), XDWORD6 VMOVDQU 224(DX), XDWORD7 - VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK + VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK // Apply Byte Flip Mask: LE -> BE VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0 @@ -297,7 +352,7 @@ avx2_16blocks: TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2) TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2) - VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK + VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK VPSHUFB BYTE_FLIP_MASK, XDWORD0, 
XDWORD0 VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1 VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2 @@ -328,7 +383,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0 MOVQ src+16(FP), DX MOVUPS (DX), t0 - PSHUFB flip_mask<>(SB), t0 + PSHUFB ·flip_mask(SB), t0 PSHUFD $1, t0, t1 PSHUFD $2, t0, t2 PSHUFD $3, t0, t3 @@ -353,7 +408,7 @@ loop: PUNPCKLLQ t2, t3 PUNPCKLLQ t0, t1 PUNPCKLQDQ t1, t3 - PSHUFB flip_mask<>(SB), t3 + PSHUFB ·flip_mask(SB), t3 MOVUPS t3, (BX) done_sm4: diff --git a/sm4/asm_arm64.s b/sm4/asm_arm64.s index ace14de..cafdfc1 100644 --- a/sm4/asm_arm64.s +++ b/sm4/asm_arm64.s @@ -20,10 +20,26 @@ #define M2H V23 #define R08_MASK V24 #define INVERSE_SHIFT_ROWS V25 -#define NIBBLE_MASK V26 -#define FK_MASK V27 +#define FK_MASK V26 +#define NIBBLE_MASK V27 #define ZERO V28 +DATA ·rcon+0x00(SB)/8, $0x0A7FC3B6D5A01C69 // m1l +DATA ·rcon+0x08(SB)/8, $0x3045F98CEF9A2653 +DATA ·rcon+0x10(SB)/8, $0xC35BF46CAF379800 // m1h +DATA ·rcon+0x18(SB)/8, $0x68F05FC7049C33AB +DATA ·rcon+0x20(SB)/8, $0x9A950A05FEF16E61 // m2l +DATA ·rcon+0x28(SB)/8, $0x0E019E916A65FAF5 +DATA ·rcon+0x30(SB)/8, $0x892D69CD44E0A400 // m2h +DATA ·rcon+0x38(SB)/8, $0x2C88CC68E14501A5 +DATA ·rcon+0x40(SB)/8, $0x0605040702010003 // left rotations of 32-bit words by 8-bit increments +DATA ·rcon+0x48(SB)/8, $0x0E0D0C0F0A09080B +DATA ·rcon+0x50(SB)/8, $0x0B0E0104070A0D00 // inverse shift rows +DATA ·rcon+0x58(SB)/8, $0x0306090C0F020508 +DATA ·rcon+0x60(SB)/8, $0x56aa3350a3b1bac6 // fk +DATA ·rcon+0x68(SB)/8, $0xb27022dc677d9197 +GLOBL ·rcon(SB), RODATA, $112 + #include "aesni_macros_arm64.s" #define SM4_TAO_L2(x, y) \ @@ -49,14 +65,11 @@ MOVW.P R2, -4(R11) #define LOAD_SM4KEY_AESNI_CONSTS() \ - MOVW $0x0F0F0F0F, R0 \ - VDUP R0, NIBBLE_MASK.S4 \ - MOVD $m1_2<>(SB), R0 \ - VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \ - MOVD $fk_mask<>(SB), R0 \ - VLD1 (R0), [FK_MASK.B16] \ - MOVD $inverse_shift_rows<>(SB), R0 \ - VLD1 (R0), [INVERSE_SHIFT_ROWS.B16] + MOVW $0x0F0F0F0F, R0 \ + VDUP R0, NIBBLE_MASK.S4 \ + MOVD $·rcon(SB), R0 \ + VLD1.P 64(R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \ + VLD1 (R0), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16, FK.B16] #define SM4EKEY_EXPORT_KEYS() \ VREV64 V8.S4, V11.S4 \ diff --git a/sm4/cbc_amd64.s b/sm4/cbc_amd64.s index 2ee4da7..44c98b6 100644 --- a/sm4/cbc_amd64.s +++ b/sm4/cbc_amd64.s @@ -360,9 +360,9 @@ avxCbcSm4Done: RET avx2Start: - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK - VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK + VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK + VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK VMOVDQU -16(DX), X15 diff --git a/sm4/cbc_arm64.s b/sm4/cbc_arm64.s index 746b2ec..4fd7fac 100644 --- a/sm4/cbc_arm64.s +++ b/sm4/cbc_arm64.s @@ -16,15 +16,15 @@ #define t7 V13 #define IV V18 +#define LAST_BLOCK V15 #define ZERO V16 -#define NIBBLE_MASK V20 -#define INVERSE_SHIFT_ROWS V21 -#define M1L V22 -#define M1H V23 -#define M2L V24 -#define M2H V25 -#define R08_MASK V26 -#define FK_MASK V27 +#define M1L V20 +#define M1H V21 +#define M2L V22 +#define M2H V23 +#define R08_MASK V24 +#define INVERSE_SHIFT_ROWS V25 +#define NIBBLE_MASK V26 #include "aesni_macros_arm64.s" @@ -49,7 +49,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0 ADD srcPtr, srcPtrLen, R10 SUB $16, R10, R10 - VLD1 (R10), [V15.S4] + VLD1 (R10), [LAST_BLOCK.S4] cbcSm4Octets: CMP $128, srcPtrLen @@ -293,5 +293,5 @@ cbc4BlocksLoop48: VST1 [t0.S4, t1.S4, t2.S4], (dstPtr) cbcSm4Done: - VST1 [V15.S4], (R6) + VST1 [LAST_BLOCK.S4], (R6) RET diff 
--git a/sm4/ecb_amd64.s b/sm4/ecb_amd64.s index 00cf937..48c83e9 100644 --- a/sm4/ecb_amd64.s +++ b/sm4/ecb_amd64.s @@ -219,9 +219,9 @@ avxEcbSm4Done: RET avx2_start: - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK - VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK + VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK + VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK avx2_16blocks: CMPQ DI, $256 diff --git a/sm4/ecb_arm64.s b/sm4/ecb_arm64.s index 5f8e4a3..8a8a151 100644 --- a/sm4/ecb_arm64.s +++ b/sm4/ecb_arm64.s @@ -8,15 +8,6 @@ #define t1 V3 #define t2 V4 #define t3 V5 -#define ZERO V16 -#define NIBBLE_MASK V20 -#define INVERSE_SHIFT_ROWS V21 -#define M1L V22 -#define M1H V23 -#define M2L V24 -#define M2H V25 -#define R08_MASK V26 -#define FK_MASK V27 #define XTMP6 V6 #define XTMP7 V7 #define t4 V10 @@ -24,6 +15,15 @@ #define t6 V12 #define t7 V13 +#define ZERO V16 +#define M1L V20 +#define M1H V21 +#define M2L V22 +#define M2H V23 +#define R08_MASK V24 +#define INVERSE_SHIFT_ROWS V25 +#define NIBBLE_MASK V26 + #include "aesni_macros_arm64.s" // func encryptSm4Ecb(xk *uint32, dst, src []byte) diff --git a/sm4/gcm_amd64.s b/sm4/gcm_amd64.s index a39970f..92b47e7 100644 --- a/sm4/gcm_amd64.s +++ b/sm4/gcm_amd64.s @@ -95,7 +95,7 @@ TEXT ·gcmSm4Finish(SB),NOSPLIT,$0 MOVOU (tPtr), ACC0 MOVOU (tMsk), T2 - MOVOU bswap_mask<>(SB), BSWAP + MOVOU ·bswap_mask(SB), BSWAP MOVOU gcmPoly<>(SB), POLY SHLQ $3, plen @@ -279,7 +279,7 @@ TEXT ·gcmSm4Data(SB),NOSPLIT,$0 PXOR ACC0, ACC0 // MOVOU (tPtr), ACC0 // originally we passed in tag initial value - MOVOU bswap_mask<>(SB), BSWAP + MOVOU ·bswap_mask(SB), BSWAP MOVOU gcmPoly<>(SB), POLY TESTQ autLen, autLen @@ -527,14 +527,14 @@ TEXT ·gcmSm4Enc(SB),0,$256-96 CMPB ·useAVX(SB), $1 JE avxGcmSm4Enc - MOVOU bswap_mask<>(SB), BSWAP + MOVOU ·bswap_mask(SB), BSWAP MOVOU gcmPoly<>(SB), POLY MOVOU (tPtr), ACC0 PXOR ACC1, ACC1 PXOR ACCM, ACCM MOVOU (ctrPtr), T0 - PSHUFB flip_mask<>(SB), T0 + PSHUFB ·flip_mask(SB), T0 PEXTRD $3, T0, aluCTR MOVOU T0, (8*16 + 0*16)(SP) @@ -870,14 +870,14 @@ gcmSm4EncDone: RET avxGcmSm4Enc: - VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU ·bswap_mask(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 VPXOR ACC1, ACC1, ACC1 VPXOR ACCM, ACCM, ACCM VMOVDQU (ctrPtr), T0 - VPSHUFB flip_mask<>(SB), T0, T0 + VPSHUFB ·flip_mask(SB), T0, T0 VPEXTRD $3, T0, aluCTR VMOVDQU T0, (8*16 + 0*16)(SP) @@ -1198,14 +1198,14 @@ avxGcmSm4EncDone: RET avx2GcmSm4Enc: - VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU ·bswap_mask(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 VPXOR ACC1, ACC1, ACC1 VPXOR ACCM, ACCM, ACCM VMOVDQU (ctrPtr), T0 - VPSHUFB flip_mask<>(SB), T0, T0 + VPSHUFB ·flip_mask(SB), T0, T0 VPEXTRD $3, T0, aluCTR VINSERTI128 $1, T0, Y11, Y11 @@ -1228,7 +1228,7 @@ avx2GcmSm4Enc: increment(6) increment(7) - VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + VBROADCASTI128 ·bswap_mask(SB), DWBSWAP // load 8 ctrs for encryption VMOVDQU (4*32 + 0*32)(SP), DWB0 VMOVDQU (4*32 + 1*32)(SP), DWB1 @@ -1239,7 +1239,7 @@ avx2GcmSm4Enc: // Transpose matrix 4 x 4 32bits word TRANSPOSE_MATRIX(DWB0, DWB1, DWB2, DWB3, XDWORD, YDWORD) - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK increment(1) AVX2_SM4_8BLOCKS(rk, XDWORD, YDWORD, X1, X3, XDWTMP0, DWB0, DWB1, DWB2, DWB3) increment(2) @@ -1613,14 +1613,14 @@ TEXT ·gcmSm4Dec(SB),0,$128-96 CMPB ·useAVX(SB), $1 JE avxGcmSm4Dec - MOVOU bswap_mask<>(SB), BSWAP + MOVOU ·bswap_mask(SB), BSWAP 
MOVOU gcmPoly<>(SB), POLY MOVOU (tPtr), ACC0 PXOR ACC1, ACC1 PXOR ACCM, ACCM MOVOU (ctrPtr), T0 - PSHUFB flip_mask<>(SB), T0 + PSHUFB ·flip_mask(SB), T0 PEXTRD $3, T0, aluCTR MOVOU T0, (0*16)(SP) @@ -1841,14 +1841,14 @@ gcmSm4DecDone: RET avxGcmSm4Dec: - VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU ·bswap_mask(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 VPXOR ACC1, ACC1, ACC1 VPXOR ACCM, ACCM, ACCM VMOVDQU (ctrPtr), T0 - VPSHUFB flip_mask<>(SB), T0, T0 + VPSHUFB ·flip_mask(SB), T0, T0 VPEXTRD $3, T0, aluCTR VMOVDQU T0, (0*16)(SP) @@ -2065,14 +2065,14 @@ avxGcmSm4DecDone: RET avx2GcmSm4Dec: - VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU ·bswap_mask(SB), BSWAP VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (tPtr), ACC0 VPXOR ACC1, ACC1, ACC1 VPXOR ACCM, ACCM, ACCM VMOVDQU (ctrPtr), T0 - VPSHUFB flip_mask<>(SB), T0, T0 + VPSHUFB ·flip_mask(SB), T0, T0 VPEXTRD $3, T0, aluCTR VINSERTI128 $1, T0, Y11, Y11 @@ -2094,8 +2094,8 @@ avx2GcmSm4Dec: increment(6) increment(7) - VBROADCASTI128 bswap_mask<>(SB), DWBSWAP - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK + VBROADCASTI128 ·bswap_mask(SB), DWBSWAP + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK avx2GcmSm4DecOctetsLoop: CMPQ ptxLen, $128 diff --git a/sm4/gcm_arm64.s b/sm4/gcm_arm64.s index bdaf920..3994d5e 100644 --- a/sm4/gcm_arm64.s +++ b/sm4/gcm_arm64.s @@ -29,13 +29,14 @@ #define K1 V20 #define K2 V21 #define K3 V22 -#define NIBBLE_MASK V23 -#define INVERSE_SHIFT_ROWS V24 -#define M1L V25 -#define M1H V26 -#define M2L V27 -#define M2H V28 -#define R08_MASK V29 + +#define M1L V23 +#define M1H V24 +#define M2L V25 +#define M2H V26 +#define R08_MASK V27 +#define INVERSE_SHIFT_ROWS V28 +#define NIBBLE_MASK V29 #define reduce() \ VEOR ACC0.B16, ACCM.B16, ACCM.B16 \ diff --git a/sm4/xts_amd64.s b/sm4/xts_amd64.s index b96812c..7d7b387 100644 --- a/sm4/xts_amd64.s +++ b/sm4/xts_amd64.s @@ -329,7 +329,7 @@ GLOBL gbGcmPoly<>(SB), (NOPTR+RODATA), $16 VPXOR (32*7)(SP), Y7, Y7 #define avx2LE2BE8Blocks \ - VBROADCASTI128 flip_mask<>(SB), Y11; \ + VBROADCASTI128 ·flip_mask(SB), Y11; \ VPSHUFB Y11, Y0, Y0; \ VPSHUFB Y11, Y1, Y1; \ VPSHUFB Y11, Y2, Y2; \ @@ -589,8 +589,8 @@ avxXtsSm4EncDone: avx2XtsSm4Enc: VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (0*16)(BX), TW - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK + VBROADCASTI128 ·bswap_mask(SB), DWBSWAP avx2XtsSm4Enc16Blocks: CMPQ DI, $256 @@ -735,7 +735,7 @@ TEXT ·encryptSm4XtsGB(SB),0,$256-64 JE avxXtsSm4Enc MOVOU gbGcmPoly<>(SB), POLY - MOVOU bswap_mask<>(SB), BSWAP + MOVOU ·bswap_mask(SB), BSWAP MOVOU (0*16)(BX), TW xtsSm4EncOctets: @@ -834,7 +834,7 @@ xtsSm4EncDone: avxXtsSm4Enc: VMOVDQU gbGcmPoly<>(SB), POLY - VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU ·bswap_mask(SB), BSWAP VMOVDQU (0*16)(BX), TW avxXtsSm4EncOctets: @@ -934,8 +934,8 @@ avxXtsSm4EncDone: avx2XtsSm4Enc: VMOVDQU gbGcmPoly<>(SB), POLY VMOVDQU (0*16)(BX), TW - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK + VBROADCASTI128 ·bswap_mask(SB), DWBSWAP avx2XtsSm4Enc16Blocks: CMPQ DI, $256 @@ -1327,8 +1327,8 @@ avxXtsSm4DecDone: avx2XtsSm4Dec: VMOVDQU gcmPoly<>(SB), POLY VMOVDQU (0*16)(BX), TW - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK + VBROADCASTI128 ·bswap_mask(SB), DWBSWAP avx2XtsSm4Dec16Blocks: CMPQ DI, $256 @@ -1498,7 +1498,7 @@ TEXT ·decryptSm4XtsGB(SB),0,$256-64 JE avxXtsSm4Dec 
MOVOU gbGcmPoly<>(SB), POLY - MOVOU bswap_mask<>(SB), BSWAP + MOVOU ·bswap_mask(SB), BSWAP MOVOU (0*16)(BX), TW xtsSm4DecOctets: @@ -1622,7 +1622,7 @@ xtsSm4DecDone: avxXtsSm4Dec: VMOVDQU gbGcmPoly<>(SB), POLY - VMOVDQU bswap_mask<>(SB), BSWAP + VMOVDQU ·bswap_mask(SB), BSWAP VMOVDQU (0*16)(BX), TW avxXtsSm4DecOctets: @@ -1747,8 +1747,8 @@ avxXtsSm4DecDone: avx2XtsSm4Dec: VMOVDQU gbGcmPoly<>(SB), POLY VMOVDQU (0*16)(BX), TW - VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK - VBROADCASTI128 bswap_mask<>(SB), DWBSWAP + VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK + VBROADCASTI128 ·bswap_mask(SB), DWBSWAP avx2XtsSm4Dec16Blocks: CMPQ DI, $256 diff --git a/sm4/xts_arm64.s b/sm4/xts_arm64.s index 8e810b1..5820534 100644 --- a/sm4/xts_arm64.s +++ b/sm4/xts_arm64.s @@ -29,13 +29,13 @@ #define K2 V21 #define K3 V22 -#define NIBBLE_MASK V23 -#define INVERSE_SHIFT_ROWS V24 -#define M1L V25 -#define M1H V26 -#define M2L V27 -#define M2H V28 -#define R08_MASK V29 +#define M1L V23 +#define M1H V24 +#define M2L V25 +#define M2H V26 +#define R08_MASK V27 +#define INVERSE_SHIFT_ROWS V28 +#define NIBBLE_MASK V29 #include "aesni_macros_arm64.s" #include "xts_macros_arm64.s" diff --git a/zuc/asm_ppc64x.s b/zuc/asm_ppc64x.s index 188d64a..907cf9c 100644 --- a/zuc/asm_ppc64x.s +++ b/zuc/asm_ppc64x.s @@ -28,7 +28,6 @@ DATA rcon<>+0x90(SB)/8, $0x00ff00ff00ff00ff // S1 DATA rcon<>+0x98(SB)/8, $0x00ff00ff00ff00ff GLOBL rcon<>(SB), RODATA, $160 - #define M1L V20 #define M1H V21 #define M2L V22
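
For context, a minimal sketch (not part of the patch) of the pattern this change adopts: in Go assembly, a symbol written as name<>(SB) is file-local, so every .s file that needs a constant table must carry its own DATA/GLOBL copy, while a symbol written with the package qualifier, ·name(SB), is package-level and can be defined once and referenced from any assembly file in the same package. That is why the tables move out of sm4/aesni_macros_amd64.s into sm4/asm_amd64.s above. The file and symbol names below are hypothetical, chosen only for illustration.

// consts_amd64.s (hypothetical file in package sm4)
#include "textflag.h"

// Package-level read-only constant, defined exactly once for the whole package.
DATA ·example_mask+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA ·example_mask+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL ·example_mask(SB), RODATA, $16

// other_amd64.s (hypothetical second file in the same package)
#include "textflag.h"

// The package-level symbol is visible here without re-declaring the DATA;
// a file-local example_mask<>(SB) would have required a duplicate definition
// in this file as well.
TEXT ·maskLowNibbles(SB), NOSPLIT, $0-0
	PAND ·example_mask(SB), X0   // mask X0 to its low nibbles, as the SM4 sbox macros do
	RET

The arm64 side goes a step further and packs the shared constants into a single ·rcon table (see the DATA/GLOBL block added to sm4/asm_arm64.s above), so that LOAD_SM4_AESNI_CONSTS can fill all the mask registers with one VLD1.P/VLD1 pair instead of several separate loads.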