sm4: use package level instead of local for shared variables
emmansun authored Nov 11, 2024
1 parent b721bed commit aa82b58
Showing 13 changed files with 247 additions and 266 deletions.
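The heart of the change is a switch from file-local assembly symbols (the name<>(SB) form, visible only inside the .s file that defines it) to package-level symbols (·name(SB)), so shared constants are defined once and referenced from every assembly file in the package. The following is a minimal sketch of the two forms, reusing the flip_mask value from this commit but otherwise illustrative rather than taken from the diff:

#include "textflag.h"

// Illustrative sketch, not part of this commit.
// File-local (old style): the <> suffix limits visibility to the defining
// .s file, so each assembly file had to carry its own copy of the constant.
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $16

// Package-level (new style): the leading middle dot scopes the symbol to the
// package, so any .s file in the same package can reference it.
DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203
DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL ·flip_mask(SB), RODATA, $16

A use site in another assembly file of the same package then reads, for example, PSHUFB ·flip_mask(SB), X0, as seen throughout the diffs below.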
227 changes: 84 additions & 143 deletions sm4/aesni_macros_amd64.s

Large diffs are not rendered by default.

38 changes: 5 additions & 33 deletions sm4/aesni_macros_arm64.s
@@ -1,37 +1,9 @@
// inverse shift rows
DATA inverse_shift_rows<>+0x00(SB)/8, $0x0B0E0104070A0D00
DATA inverse_shift_rows<>+0x08(SB)/8, $0x0306090C0F020508
GLOBL inverse_shift_rows<>(SB), (16+8), $16

// Affine transform 1 & 2 (low and high nibbles)
DATA m1_2<>+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA m1_2<>+0x08(SB)/8, $0x3045F98CEF9A2653
DATA m1_2<>+0x10(SB)/8, $0xC35BF46CAF379800
DATA m1_2<>+0x18(SB)/8, $0x68F05FC7049C33AB
DATA m1_2<>+0x20(SB)/8, $0x9A950A05FEF16E61
DATA m1_2<>+0x28(SB)/8, $0x0E019E916A65FAF5
DATA m1_2<>+0x30(SB)/8, $0x892D69CD44E0A400
DATA m1_2<>+0x38(SB)/8, $0x2C88CC68E14501A5
GLOBL m1_2<>(SB), (16+8), $64

// left rotations of 32-bit words by 8-bit increments
DATA r08_mask<>+0x00(SB)/8, $0x0605040702010003
DATA r08_mask<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08_mask<>(SB), (16+8), $16

DATA fk_mask<>+0x00(SB)/8, $0x56aa3350a3b1bac6
DATA fk_mask<>+0x08(SB)/8, $0xb27022dc677d9197
GLOBL fk_mask<>(SB), (16+8), $16

#define LOAD_SM4_AESNI_CONSTS() \
MOVW $0x0F0F0F0F, R20 \
VDUP R20, NIBBLE_MASK.S4 \
MOVD $m1_2<>(SB), R20 \
VLD1 (R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
MOVD $inverse_shift_rows<>(SB), R20 \
VLD1 (R20), [INVERSE_SHIFT_ROWS.B16] \
MOVD $r08_mask<>(SB), R20 \
VLD1 (R20), [R08_MASK.B16] \
MOVW $0x0F0F0F0F, R20 \
VDUP R20, NIBBLE_MASK.S4 \
MOVD $·rcon(SB), R20 \
VLD1.P 64(R20), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
VLD1 (R20), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16]

// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
73 changes: 64 additions & 9 deletions sm4/asm_amd64.s
@@ -13,6 +13,61 @@
#define XTMP6 X10
#define XTMP7 X11

// shuffle byte order from LE to BE
DATA ·flip_mask+0x00(SB)/8, $0x0405060700010203
DATA ·flip_mask+0x08(SB)/8, $0x0c0d0e0f08090a0b
GLOBL ·flip_mask(SB), RODATA, $16

// shuffle byte and word order
DATA ·bswap_mask+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA ·bswap_mask+0x08(SB)/8, $0x0001020304050607
GLOBL ·bswap_mask(SB), RODATA, $16

// nibble mask
DATA ·nibble_mask+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA ·nibble_mask+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL ·nibble_mask(SB), RODATA, $16

// inverse shift rows
DATA ·inverse_shift_rows+0x00(SB)/8, $0x0B0E0104070A0D00
DATA ·inverse_shift_rows+0x08(SB)/8, $0x0306090C0F020508
DATA ·inverse_shift_rows+0x10(SB)/8, $0x0B0E0104070A0D00
DATA ·inverse_shift_rows+0x18(SB)/8, $0x0306090C0F020508
GLOBL ·inverse_shift_rows(SB), RODATA, $32

// Affine transform 1 (low and high nibbles)
DATA ·m1_low+0x00(SB)/8, $0x0A7FC3B6D5A01C69
DATA ·m1_low+0x08(SB)/8, $0x3045F98CEF9A2653
DATA ·m1_low+0x10(SB)/8, $0x0A7FC3B6D5A01C69
DATA ·m1_low+0x18(SB)/8, $0x3045F98CEF9A2653
GLOBL ·m1_low(SB), RODATA, $32

DATA ·m1_high+0x00(SB)/8, $0xC35BF46CAF379800
DATA ·m1_high+0x08(SB)/8, $0x68F05FC7049C33AB
DATA ·m1_high+0x10(SB)/8, $0xC35BF46CAF379800
DATA ·m1_high+0x18(SB)/8, $0x68F05FC7049C33AB
GLOBL ·m1_high(SB), RODATA, $32

// Affine transform 2 (low and high nibbles)
DATA ·m2_low+0x00(SB)/8, $0x9A950A05FEF16E61
DATA ·m2_low+0x08(SB)/8, $0x0E019E916A65FAF5
DATA ·m2_low+0x10(SB)/8, $0x9A950A05FEF16E61
DATA ·m2_low+0x18(SB)/8, $0x0E019E916A65FAF5
GLOBL ·m2_low(SB), RODATA, $32

DATA ·m2_high+0x00(SB)/8, $0x892D69CD44E0A400
DATA ·m2_high+0x08(SB)/8, $0x2C88CC68E14501A5
DATA ·m2_high+0x10(SB)/8, $0x892D69CD44E0A400
DATA ·m2_high+0x18(SB)/8, $0x2C88CC68E14501A5
GLOBL ·m2_high(SB), RODATA, $32

// left rotations of 32-bit words by 8-bit increments
DATA ·r08_mask+0x00(SB)/8, $0x0605040702010003
DATA ·r08_mask+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·r08_mask+0x10(SB)/8, $0x0605040702010003
DATA ·r08_mask+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·r08_mask(SB), RODATA, $32

#include "aesni_macros_amd64.s"

// SM4 TAO L2 function, used for key expand
@@ -105,8 +160,8 @@ TEXT ·expandKeyAsm(SB),NOSPLIT,$0
MOVQ dec+24(FP), DI

MOVUPS 0(AX), t0
PSHUFB flip_mask<>(SB), t0
PXOR fk_mask<>(SB), t0
PSHUFB ·flip_mask(SB), t0
PXOR ·fk(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
@@ -225,7 +280,7 @@ avx_done_sm4:
RET

avx2:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK

CMPQ DI, $256
JEQ avx2_16blocks
@@ -235,7 +290,7 @@ avx2_8blocks:
VMOVDQU 32(DX), XDWORD1
VMOVDQU 64(DX), XDWORD2
VMOVDQU 96(DX), XDWORD3
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK

// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@@ -251,7 +306,7 @@ avx2_8blocks:
// Transpose a 4x4 matrix of 32-bit words
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)

VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@@ -275,7 +330,7 @@ avx2_16blocks:
VMOVDQU 192(DX), XDWORD6
VMOVDQU 224(DX), XDWORD7

VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK

// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
@@ -297,7 +352,7 @@ avx2_16blocks:
TRANSPOSE_MATRIX(XDWORD0, XDWORD1, XDWORD2, XDWORD3, XDWTMP1, XDWTMP2)
TRANSPOSE_MATRIX(XDWORD4, XDWORD5, XDWORD6, XDWORD7, XDWTMP1, XDWTMP2)

VBROADCASTI128 bswap_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BYTE_FLIP_MASK
VPSHUFB BYTE_FLIP_MASK, XDWORD0, XDWORD0
VPSHUFB BYTE_FLIP_MASK, XDWORD1, XDWORD1
VPSHUFB BYTE_FLIP_MASK, XDWORD2, XDWORD2
@@ -328,7 +383,7 @@ TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
MOVQ src+16(FP), DX

MOVUPS (DX), t0
PSHUFB flip_mask<>(SB), t0
PSHUFB ·flip_mask(SB), t0
PSHUFD $1, t0, t1
PSHUFD $2, t0, t2
PSHUFD $3, t0, t3
@@ -353,7 +408,7 @@ loop:
PUNPCKLLQ t2, t3
PUNPCKLLQ t0, t1
PUNPCKLQDQ t1, t3
PSHUFB flip_mask<>(SB), t3
PSHUFB ·flip_mask(SB), t3
MOVUPS t3, (BX)

done_sm4:
33 changes: 23 additions & 10 deletions sm4/asm_arm64.s
@@ -20,10 +20,26 @@
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26
#define FK_MASK V27
#define FK_MASK V26
#define NIBBLE_MASK V27
#define ZERO V28

DATA ·rcon+0x00(SB)/8, $0x0A7FC3B6D5A01C69 // m1l
DATA ·rcon+0x08(SB)/8, $0x3045F98CEF9A2653
DATA ·rcon+0x10(SB)/8, $0xC35BF46CAF379800 // m1h
DATA ·rcon+0x18(SB)/8, $0x68F05FC7049C33AB
DATA ·rcon+0x20(SB)/8, $0x9A950A05FEF16E61 // m2l
DATA ·rcon+0x28(SB)/8, $0x0E019E916A65FAF5
DATA ·rcon+0x30(SB)/8, $0x892D69CD44E0A400 // m2h
DATA ·rcon+0x38(SB)/8, $0x2C88CC68E14501A5
DATA ·rcon+0x40(SB)/8, $0x0605040702010003 // left rotations of 32-bit words by 8-bit increments
DATA ·rcon+0x48(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rcon+0x50(SB)/8, $0x0B0E0104070A0D00 // inverse shift rows
DATA ·rcon+0x58(SB)/8, $0x0306090C0F020508
DATA ·rcon+0x60(SB)/8, $0x56aa3350a3b1bac6 // fk
DATA ·rcon+0x68(SB)/8, $0xb27022dc677d9197
GLOBL ·rcon(SB), RODATA, $112

#include "aesni_macros_arm64.s"

#define SM4_TAO_L2(x, y) \
@@ -49,14 +65,11 @@
MOVW.P R2, -4(R11)

#define LOAD_SM4KEY_AESNI_CONSTS() \
MOVW $0x0F0F0F0F, R0 \
VDUP R0, NIBBLE_MASK.S4 \
MOVD $m1_2<>(SB), R0 \
VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
MOVD $fk_mask<>(SB), R0 \
VLD1 (R0), [FK_MASK.B16] \
MOVD $inverse_shift_rows<>(SB), R0 \
VLD1 (R0), [INVERSE_SHIFT_ROWS.B16]
MOVW $0x0F0F0F0F, R0 \
VDUP R0, NIBBLE_MASK.S4 \
MOVD $·rcon(SB), R0 \
VLD1.P 64(R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \
VLD1 (R0), [R08_MASK.B16, INVERSE_SHIFT_ROWS.B16, FK_MASK.B16]

#define SM4EKEY_EXPORT_KEYS() \
VREV64 V8.S4, V11.S4 \
6 changes: 3 additions & 3 deletions sm4/cbc_amd64.s
@@ -360,9 +360,9 @@ avxCbcSm4Done:
RET

avx2Start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK

VMOVDQU -16(DX), X15

20 changes: 10 additions & 10 deletions sm4/cbc_arm64.s
@@ -16,15 +16,15 @@
#define t7 V13
#define IV V18

#define LAST_BLOCK V15
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26

#include "aesni_macros_arm64.s"

@@ -49,7 +49,7 @@ TEXT ·decryptBlocksChain(SB),NOSPLIT,$0

ADD srcPtr, srcPtrLen, R10
SUB $16, R10, R10
VLD1 (R10), [V15.S4]
VLD1 (R10), [LAST_BLOCK.S4]

cbcSm4Octets:
CMP $128, srcPtrLen
@@ -293,5 +293,5 @@ cbc4BlocksLoop48:
VST1 [t0.S4, t1.S4, t2.S4], (dstPtr)

cbcSm4Done:
VST1 [V15.S4], (R6)
VST1 [LAST_BLOCK.S4], (R6)
RET
6 changes: 3 additions & 3 deletions sm4/ecb_amd64.s
@@ -219,9 +219,9 @@ avxEcbSm4Done:
RET

avx2_start:
VBROADCASTI128 nibble_mask<>(SB), NIBBLE_MASK
VBROADCASTI128 flip_mask<>(SB), BYTE_FLIP_MASK
VBROADCASTI128 bswap_mask<>(SB), BSWAP_MASK
VBROADCASTI128 ·nibble_mask(SB), NIBBLE_MASK
VBROADCASTI128 ·flip_mask(SB), BYTE_FLIP_MASK
VBROADCASTI128 ·bswap_mask(SB), BSWAP_MASK

avx2_16blocks:
CMPQ DI, $256
18 changes: 9 additions & 9 deletions sm4/ecb_arm64.s
@@ -8,22 +8,22 @@
#define t1 V3
#define t2 V4
#define t3 V5
#define ZERO V16
#define NIBBLE_MASK V20
#define INVERSE_SHIFT_ROWS V21
#define M1L V22
#define M1H V23
#define M2L V24
#define M2H V25
#define R08_MASK V26
#define FK_MASK V27
#define XTMP6 V6
#define XTMP7 V7
#define t4 V10
#define t5 V11
#define t6 V12
#define t7 V13

#define ZERO V16
#define M1L V20
#define M1H V21
#define M2L V22
#define M2H V23
#define R08_MASK V24
#define INVERSE_SHIFT_ROWS V25
#define NIBBLE_MASK V26

#include "aesni_macros_arm64.s"

// func encryptSm4Ecb(xk *uint32, dst, src []byte)