Skip to content

SM4 with AESENCLAST

Sun Yimin edited this page Oct 11, 2023 · 37 revisions

简介

This is the pure golang code to study SM4 implementation with AESENCLAST instruction.

sm4 with AESENCLAST

We combine various linear operations into two affine transforms (one on each side), A1 and A2. Here affine transform consists of a multiplication with a 8x8 binary matrix M and addition of a 8-bit constant C.

SM4-S(x) = A2(AES-S(A1(x))
A1(x) = M1*x + C1
A2(x) = M2*x + C2

The combinations of (M1, C1, M2, C2) or (A1, A2) are not unique.

Known (M1, C1, M2, C2), please reference sm4 with AESENCLAST and AES 和 SM4 S盒複合域實現方法python code. In AES 和 SM4 S盒複合域實現方法python code, we found 8 groups of (M1, C1, M2, C2). 其实按A very compact Rijndael S-box,这八组对应E All Possible Bases的八组正规基:1 4 19 22 37 40 55 58,另外还有73 76 91 94 109 112 127 130八组正规基,其它都是混合基或者多项式基,共432组,它们都假设trace is unity。

{(M1, C1, M2, C2) | SM4-S(x) = A2(AES-S(A1(x)), A1(x) = M1*x + C1, A2(x) = M2*x + C2}

计算、收集的(M1, C1, M2, C2)列表:

M1= 0x96 ,0x47 ,0xe9 ,0x3d ,0xde ,0x65 ,0xac ,0xa7
C1= 0x69
M2= 0xfa ,0x64 ,0xb4 ,0x0a ,0x41 ,0xdd ,0x01 ,0xc1
C2= 0x61

//https://github.com/intel/ipp-crypto/blob/develop/sources/ippcp/pcpsms4_l9cn.h
//Intel也用了这组
M1= 0x52 ,0xbc ,0x2d ,0x02 ,0x9e ,0x25 ,0xac ,0x34
C1= 0x65
M2= 0xcb ,0x9a ,0x0a ,0xb4 ,0xc7 ,0xac ,0x87 ,0x4e
C2= 0x2f

M1= 0x5d ,0x50 ,0x22 ,0x1a ,0xb9 ,0x7d ,0x28 ,0x4c
C1= 0x3e
M2= 0xd3 ,0xba ,0x1d ,0x65 ,0x47 ,0x4c ,0x0e ,0x48
C2= 0x6c

M1= 0xe6 ,0xab ,0x99 ,0x5a ,0x86 ,0x42 ,0x28 ,0x24
C1= 0x8e
M2= 0x2d ,0x8b ,0x65 ,0x1d ,0xc8 ,0xfb ,0x81 ,0xce
C2= 0xe9

M1= 0xd1 ,0x37 ,0xae ,0xce ,0x05 ,0x45 ,0xec ,0xdd
C1= 0x86
M2= 0x50 ,0x16 ,0x5b ,0x2a ,0x53 ,0x92 ,0x62 ,0x33
C2= 0x3c

M1= 0xee ,0xb3 ,0x91 ,0x75 ,0xc1 ,0x81 ,0xec ,0x8a
C1= 0xd6
M2= 0x19 ,0x56 ,0x2a ,0x5b ,0xa4 ,0xea ,0x95 ,0x0b
C2= 0x4d

M1= 0x4d ,0x1f ,0x32 ,0xfe ,0x8e ,0xb1 ,0x17 ,0xd5
C1= 0xce
M2= 0xe8 ,0x28 ,0x74 ,0xc3 ,0xfc ,0x32 ,0x02 ,0x6b
C2= 0x81

M1= 0x0d ,0x9b ,0x72 ,0x3a ,0x35 ,0x0a ,0x17 ,0x06
C1= 0x23
M2= 0xa8 ,0x61 ,0xc3 ,0x74 ,0xc4 ,0x8c ,0x3a ,0x9c
C2= 0x3b

//以上都为计算获得

//https://github.com/mjosaarinen/sm4ni
M1= 0x14, 0x2e, 0x16, 0x8a, 0x60, 0x0d, 0x9b, 0x66
C1= 0x01
M2= 0xfe, 0x54, 0xaf, 0xdd, 0xf7, 0xf9, 0xac, 0xe2
C2= 0x34

Evolution path

sm4_box_aesenclast <-> sm4_box_aesbox_1 <-> sm4_box_aesbox_2 <-> sm4_box_aesbox_3 <-> sm4_box_aesbox_4

We note that each affine transform can be constructed from XOR of two 4x8-bit table lookups, which we implement with constant time byte shuffle instructions (each 16-entry table is in a single 128-bit register).

sm4_box_aesenclast
	y := mm_and_si128(x, const_0f)
	y = mm_shuffle_epi8(a1l, y)
	x = mm_srli_epi64(x, 4)
	x = mm_and_si128(x, const_0f)
	x = xor(mm_shuffle_epi8(a1h, x), y)

	x = mm_shuffle_epi8(x, shift_row_inv)
	x = mm_aesenclast_si128(x, const_0f)

	y = mm_andnot_si128(x, const_0f)
	y = mm_shuffle_epi8(a2l, y)
	x = mm_srli_epi64(x, 4)
	x = mm_and_si128(x, const_0f)
	x = xor(mm_shuffle_epi8(a2h, x), y)

sm4_box_aesbox_1 
	var y __m128i
	for i := 0; i < 16; i++ {
		y.bytes[i] = a1l.bytes[x.bytes[i]&0xf] ^ a1h.bytes[x.bytes[i]>>4]
	}
	x = y

	for i := 0; i < 16; i++ {
		x.bytes[i] = aes_sbox[x.bytes[i]] ^ 0xf
	}

	for i := 0; i < 16; i++ {
		y.bytes[i] = a2l.bytes[(^x.bytes[i])&0xf] ^ a2h.bytes[x.bytes[i]>>4]
	}

sm4_box_aesbox_2 
	for i := 0; i < 16; i++ {
		v := x.bytes[i]
		v = a1l.bytes[v&0xf] ^ a1h.bytes[v>>4]     // v = A1(x)
		v = aes_sbox[v] ^ 0xf                      // v = AES-S(A1(x)) XOR 0x0f, 相当于对低四位取反。
		v = a2l.bytes[^v&0xf] ^ a2h.bytes[v>>4]    // v = A2(AES-S(A1(x)))
		x.bytes[i] = v
	}

sm4_box_aesbox_3 
	for i := 0; i < 16; i++ {
		v := x.bytes[i]
		v = a1l.bytes[v&0xf] ^ a1h.bytes[v>>4]

		v = aes_sbox[v]

		v = a2l.bytes[v&0xf] ^ a2h.bytes[v>>4]

		x.bytes[i] = v
	}

sm4_box_aesbox_4
	for i := 0; i < 16; i++ {
		v := x.bytes[i]
		v = a1_table[v]

		v = aes_sbox[v]

		v = a2_table[v]

		x.bytes[i] = v
	} 

Intel的方法

pcpsms4_l9cn

var shift_row_inv = set64(0x0306090C0F020508, 0x0B0E0104070A0D00)
var intelm1l = set64(0xdcf84460b3972b0f, 0xb6922e0ad9fd4165)
var intelm1h = set64(0x64ad03cae42d834a, 0x2ee74980ae67c900)
var intelm2l = set64(0x48c2a32957ddbc36, 0xad2746ccb23859d3)
var intelm2h = set64(0x134307579aca8ede, 0xcd9dd98944145000)
var intelenckey = set64(0x6363636363636363, 0x6363636363636363)
var intelmaskSrows = shift_row_inv

func sm4_box_aesenclast_intel(rk uint32, t0, t1, t2, t3, a1l, a1h, a2l, a2h __m128i) __m128i {
	rk128 := mm_set_epi32(rk, rk, rk, rk)
	x := xor(xor(t1, t2), t3)
	x = xor(x, rk128)

	y := mm_and_si128(x, const_0f)
	y = mm_shuffle_epi8(a1l, y)
	x = mm_srli_epi64(x, 4)
	x = mm_and_si128(x, const_0f)
	x = xor(mm_shuffle_epi8(a1h, x), y)

	x = mm_aesenclast_si128(x, intelenckey)
	x = mm_shuffle_epi8(x, intelmaskSrows)

	y = mm_and_si128(x, const_0f)
	y = mm_shuffle_epi8(a2l, y)
	x = mm_srli_epi64(x, 4)
	x = mm_and_si128(x, const_0f)
	x = xor(mm_shuffle_epi8(a2h, x), y)

	return x
}

其实x = mm_shuffle_epi8(x, intelmaskSrows)在mm_aesenclast_si128之前调用,结果也是一样的。

类似于:

{(M1, C1, M2, C2) | SM4-S(x) = A2(AES-S(A1(x)), A1(x) = M1*x + C1, A2(x) = M2*(x+0x63) + C2 = M2*x + (M2*0x63 + C2)}

如何生成Intel算法的外层查找表?

from pyfinite import genericmatrix

def XOR(x, y): return x ^ y
def AND(x, y): return x & y
def DIV(x, y): return x

def genCMatrix(c):
    Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for j in range (8):
        Imatrix.SetRow(j, [(0x63 >> (7 - j)) & 1])
    return Imatrix

def matrix_from_cols(cols):
    m = genericmatrix.GenericMatrix(size=(8, 8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for i in range (8):
        k = 7 - i
        j = 1 << k
        m.SetRow(i, [(cols[0] & j) >> k, (cols[1] & j) >> k, (cols[2] & j) >> k, (cols[3] & j) >> k, (cols[4] & j) >> k, (cols[5] & j) >> k, (cols[6] & j) >> k, (cols[7] & j) >> k])    

    return m

def gen_matrix_based_table(table):
    return matrix_from_cols([table[0x80] ^ table[0], table[0x40] ^ table[0], table[0x20] ^ table[0], table[0x10] ^ table[0], table[0x08] ^ table[0], table[0x04] ^ table[0], table[0x02] ^ table[0], table[0x01] ^ table[0]])

def gen_matrix_based_high_low(high, low):
    table = []
    for i in range(16):
        for j in range(16):
            table.append(high[i] ^ low[j])    
    return gen_matrix_based_table(table) 

def matrix_col_byte(c):
    return (c[0] << 7) ^ (c[1] << 6) ^ (c[2] << 5) ^ (c[3] << 4) ^ (c[4] << 3) ^ (c[5] << 2) ^ (c[6] << 1) ^ (c[7] << 0)

def gen_lookup(m, c):
    table = []
    for i in range(256):
        Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for j in range (8):
            Imatrix.SetRow(j, [(i >> (7 - j)) & 1])
        tmp = m * Imatrix
        table.append(matrix_col_byte(tmp.GetColumn(0)) ^ c)    
    return table

def gen_lookup_low(m, c):
    table = []
    for i in range(256):
        Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for j in range (8):
            if j < 4:
                Imatrix.SetRow(j, [0])
            else:
                Imatrix.SetRow(j, [(i >> (7 - j)) & 1])        
        tmp = m * Imatrix
        table.append(matrix_col_byte(tmp.GetColumn(0)) ^ c)    
    return table

def gen_lookup_high(m):
    table = []
    for i in range(256):
        Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for j in range (8):
            if j < 4:
                Imatrix.SetRow(j, [(i >> (7 - j)) & 1])
            else:
                Imatrix.SetRow(j, [0])
        tmp = m * Imatrix
        table.append(matrix_col_byte(tmp.GetColumn(0)))    
    return table

def print_table(table):
    for i, s in enumerate(table):
        print(f'0x%02X'%s,',', end='')    
        if (i+1) % 16 == 0:
            print()

def to_matrix(x):
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for i in range(8):
        m.SetRow(i, [(x[i] & 0x80) >> 7, (x[i] & 0x40) >> 6, (x[i] & 0x20) >> 5, (x[i] & 0x10) >> 4, (x[i] & 0x08) >> 3, (x[i] & 0x04) >> 2, (x[i] & 0x02) >> 1, (x[i] & 0x01) >> 0]) 
    return m

def gen_intel_c(m, c):
    Cmatrix = genCMatrix(0x63)
    c1 = m*Cmatrix
    return matrix_col_byte(c1.GetColumn(0)) ^ c

Mmatrix = to_matrix([0xcb, 0x9a, 0x0a, 0xb4, 0xc7, 0xac, 0x87, 0x4e])
c1 = gen_intel_c(Mmatrix, 0x2f)
print(f'0x%02X'%c1)
print()
print_table(gen_lookup_high(Mmatrix))
print()
print_table(gen_lookup_low(Mmatrix, c1))

How to calculate lookup table from M, C?

${ M\times i + C \mid i \in [0,255] }$

这个查找表有256个元素,考虑到寄存器的使用,需要换个形式。
$M\times i + C = M\times i_{4highbits} + (M\times i_{4lowbits} + C) \mid i \in [0,255]$
我们可以看到 $M\times i_{4highbits} \mid i \in [0,255]$ 的每一列(16个字节)都是相同的。而 $M\times i_{4lowbits} + C \mid i \in [0,255]$ 的每一行(16个字节)都是相同的。
这样,我们去除重复,只用16*2个字节就可以存储这个查找表。

// {Mi+C | i>=0 && i<256}

// Generate lookup table based on M matrix and C
func gen_lookup_table(m [8]byte, c byte) {
	for i := 0; i < 16; i++ {
		for j := 0; j < 16; j++ {
			x := ((byte(bits.OnesCount8(byte(i*16+j)&m[0])) & 1) << 7) ^
				((byte(bits.OnesCount8(byte(i*16+j)&m[1])) & 1) << 6) ^
				((byte(bits.OnesCount8(byte(i*16+j)&m[2])) & 1) << 5) ^
				((byte(bits.OnesCount8(byte(i*16+j)&m[3])) & 1) << 4) ^
				((byte(bits.OnesCount8(byte(i*16+j)&m[4])) & 1) << 3) ^
				((byte(bits.OnesCount8(byte(i*16+j)&m[5])) & 1) << 2) ^
				((byte(bits.OnesCount8(byte(i*16+j)&m[6])) & 1) << 1) ^
				((byte(bits.OnesCount8(byte(i*16+j)&m[7])) & 1) << 0) ^ c
			fmt.Printf("0x%02X, ", x)
		}
		fmt.Println()
	}
}

Below python code is more intuitive:

from pyfinite import genericmatrix

XOR = lambda x,y:x^y
AND = lambda x,y:x&y
DIV = lambda x,y:x


def to_matrix(x):
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for i in range(8):
        m.SetRow(i, [(x[i] & 0x80) >> 7, (x[i] & 0x40) >> 6, (x[i] & 0x20) >> 5, (x[i] & 0x10) >> 4, (x[i] & 0x08) >> 3, (x[i] & 0x04) >> 2, (x[i] & 0x02) >> 1, (x[i] & 0x01) >> 0]) 
    return m

def matrix_col_byte(c):
    return (c[0] << 7) ^ (c[1] << 6) ^ (c[2] << 5) ^ (c[3] << 4) ^ (c[4] << 3) ^ (c[5] << 2) ^ (c[6] << 1) ^ (c[7] << 0)

def gen_lookup(m, c):
    Mmatrix = to_matrix(m)
    table = []
    for i in range(256):
        Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for j in range (8):
            Imatrix.SetRow(j, [(i >> (7 - j)) & 1])
        tmp = Mmatrix * Imatrix
        table.append(matrix_col_byte(tmp.GetColumn(0)) ^ c)    
    return table

def gen_lookup_low(m, c):
    Mmatrix = to_matrix(m)
    table = []
    for i in range(256):
        Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for j in range (8):
            if j < 4:
                Imatrix.SetRow(j, [0])
            else:
                Imatrix.SetRow(j, [(i >> (7 - j)) & 1])        
        tmp = Mmatrix * Imatrix
        table.append(matrix_col_byte(tmp.GetColumn(0)) ^ c)    
    return table

def gen_lookup_high(m, c):
    Mmatrix = to_matrix(m)
    table = []
    for i in range(256):
        Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for j in range (8):
            if j < 4:
                Imatrix.SetRow(j, [(i >> (7 - j)) & 1])
            else:
                Imatrix.SetRow(j, [0])
        tmp = Mmatrix * Imatrix
        table.append(matrix_col_byte(tmp.GetColumn(0)))    
    return table

def print_table(table):
    for i, s in enumerate(table):
        print(f'0x%02X'%s,',', end='')    
        if (i+1) % 16 == 0:
            print()

print_table(gen_lookup_low([0xfe, 0x54, 0xaf, 0xdd, 0xf7, 0xf9, 0xac, 0xe2], 0x34))
print()
print_table(gen_lookup_high([0xfe, 0x54, 0xaf, 0xdd, 0xf7, 0xf9, 0xac, 0xe2], 0x34))
print()
print_table(gen_lookup([0xfe, 0x54, 0xaf, 0xdd, 0xf7, 0xf9, 0xac, 0xe2], 0x34))
print()

示例结果:

0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,

0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,0x00 ,
0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,0xDC ,
0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,0xAF ,
0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,0x73 ,
0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,0xDD ,
0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,0x01 ,
0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,0x72 ,
0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,0xAE ,
0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,0xBF ,
0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,0x63 ,
0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,0x10 ,
0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,0xCC ,
0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,0x62 ,
0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,0xBE ,
0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,0xCD ,
0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,0x11 ,

0x34 ,0x08 ,0x9D ,0xA1 ,0xCE ,0xF2 ,0x67 ,0x5B ,0x82 ,0xBE ,0x2B ,0x17 ,0x78 ,0x44 ,0xD1 ,0xED ,
0xE8 ,0xD4 ,0x41 ,0x7D ,0x12 ,0x2E ,0xBB ,0x87 ,0x5E ,0x62 ,0xF7 ,0xCB ,0xA4 ,0x98 ,0x0D ,0x31 ,
0x9B ,0xA7 ,0x32 ,0x0E ,0x61 ,0x5D ,0xC8 ,0xF4 ,0x2D ,0x11 ,0x84 ,0xB8 ,0xD7 ,0xEB ,0x7E ,0x42 ,
0x47 ,0x7B ,0xEE ,0xD2 ,0xBD ,0x81 ,0x14 ,0x28 ,0xF1 ,0xCD ,0x58 ,0x64 ,0x0B ,0x37 ,0xA2 ,0x9E ,
0xE9 ,0xD5 ,0x40 ,0x7C ,0x13 ,0x2F ,0xBA ,0x86 ,0x5F ,0x63 ,0xF6 ,0xCA ,0xA5 ,0x99 ,0x0C ,0x30 ,
0x35 ,0x09 ,0x9C ,0xA0 ,0xCF ,0xF3 ,0x66 ,0x5A ,0x83 ,0xBF ,0x2A ,0x16 ,0x79 ,0x45 ,0xD0 ,0xEC ,
0x46 ,0x7A ,0xEF ,0xD3 ,0xBC ,0x80 ,0x15 ,0x29 ,0xF0 ,0xCC ,0x59 ,0x65 ,0x0A ,0x36 ,0xA3 ,0x9F ,
0x9A ,0xA6 ,0x33 ,0x0F ,0x60 ,0x5C ,0xC9 ,0xF5 ,0x2C ,0x10 ,0x85 ,0xB9 ,0xD6 ,0xEA ,0x7F ,0x43 ,
0x8B ,0xB7 ,0x22 ,0x1E ,0x71 ,0x4D ,0xD8 ,0xE4 ,0x3D ,0x01 ,0x94 ,0xA8 ,0xC7 ,0xFB ,0x6E ,0x52 ,
0x57 ,0x6B ,0xFE ,0xC2 ,0xAD ,0x91 ,0x04 ,0x38 ,0xE1 ,0xDD ,0x48 ,0x74 ,0x1B ,0x27 ,0xB2 ,0x8E ,
0x24 ,0x18 ,0x8D ,0xB1 ,0xDE ,0xE2 ,0x77 ,0x4B ,0x92 ,0xAE ,0x3B ,0x07 ,0x68 ,0x54 ,0xC1 ,0xFD ,
0xF8 ,0xC4 ,0x51 ,0x6D ,0x02 ,0x3E ,0xAB ,0x97 ,0x4E ,0x72 ,0xE7 ,0xDB ,0xB4 ,0x88 ,0x1D ,0x21 ,
0x56 ,0x6A ,0xFF ,0xC3 ,0xAC ,0x90 ,0x05 ,0x39 ,0xE0 ,0xDC ,0x49 ,0x75 ,0x1A ,0x26 ,0xB3 ,0x8F ,
0x8A ,0xB6 ,0x23 ,0x1F ,0x70 ,0x4C ,0xD9 ,0xE5 ,0x3C ,0x00 ,0x95 ,0xA9 ,0xC6 ,0xFA ,0x6F ,0x53 ,
0xF9 ,0xC5 ,0x50 ,0x6C ,0x03 ,0x3F ,0xAA ,0x96 ,0x4F ,0x73 ,0xE6 ,0xDA ,0xB5 ,0x89 ,0x1C ,0x20 ,
0x25 ,0x19 ,0x8C ,0xB0 ,0xDF ,0xE3 ,0x76 ,0x4A ,0x93 ,0xAF ,0x3A ,0x06 ,0x69 ,0x55 ,0xC0 ,0xFC ,

How to calculate M, C from lookup table?

1.The first element of the table, T[0] should be the C.
2.Use T[1] XOR T[0], T[2] XOR T[0], T[4] XOR T[0], T[8] XOR T[0], T[16] XOR T[0], T[32] XOR T[0], T[64] XOR T[0], T[128] XOR T[0] to calculate matrix M.

Below is sample

               1          2        3         4         5         6        7         8         

               00000110   01110011 01110101  11100101  11100011  10010110 10010000  01010110  
0x00,          0x06,      0x73     0x75      0xE5      0xE3      0x96     0x90      0x56      
1 10100010
2 01001001
3 11111011
4 00001001
5 10101011
6 01000000
7 11100010
8 00010010
9 

00010010
00001001
01001001
10100010
01010110
11100101         
01110011
00000110

               1          2        3         4         5         6        7         8         

               00111100   10101001           11111010                               10110110     
0x00           0x3c       0xa9               0xfa                                   0xb6
1 11011100
2 10101111
3
4 11011101 
5 
6
7
8 10111111

10111111
11011101
10101111
11011100
10110110
11111010
10101001
00111100
// Generate matrix based on lookup table
func gen_matrix(lookup [256]byte) (m [8]byte) {
	c := lookup[0]
	m80 := lookup[0x80] ^ c
	m40 := lookup[0x40] ^ c
	m20 := lookup[0x20] ^ c
	m10 := lookup[0x10] ^ c
	m08 := lookup[0x08] ^ c
	m04 := lookup[0x04] ^ c
	m02 := lookup[0x02] ^ c
	m01 := lookup[0x01] ^ c

	m[0] = (m80 & 0x80) ^ ((m40 & 0x80) >> 1) ^ ((m20 & 0x80) >> 2) ^ ((m10 & 0x80) >> 3) ^ ((m08 & 0x80) >> 4) ^ ((m04 & 0x80) >> 5) ^ ((m02 & 0x80) >> 6) ^ ((m01 & 0x80) >> 7)
	m[1] = ((m80 & 0x40) << 1) ^ (m40 & 0x40) ^ ((m20 & 0x40) >> 1) ^ ((m10 & 0x40) >> 2) ^ ((m08 & 0x40) >> 3) ^ ((m04 & 0x40) >> 4) ^ ((m02 & 0x40) >> 5) ^ ((m01 & 0x40) >> 6)
	m[2] = ((m80 & 0x20) << 2) ^ ((m40 & 0x20) << 1) ^ (m20 & 0x20) ^ ((m10 & 0x20) >> 1) ^ ((m08 & 0x20) >> 2) ^ ((m04 & 0x20) >> 3) ^ ((m02 & 0x20) >> 4) ^ ((m01 & 0x20) >> 5)
	m[3] = ((m80 & 0x10) << 3) ^ ((m40 & 0x10) << 2) ^ ((m20 & 0x10) << 1) ^ (m10 & 0x10) ^ ((m08 & 0x10) >> 1) ^ ((m04 & 0x10) >> 2) ^ ((m02 & 0x10) >> 3) ^ ((m01 & 0x10) >> 4)
	m[4] = ((m80 & 0x08) << 4) ^ ((m40 & 0x08) << 3) ^ ((m20 & 0x08) << 2) ^ ((m10 & 0x08) << 1) ^ (m08 & 0x08) ^ ((m04 & 0x08) >> 1) ^ ((m02 & 0x08) >> 2) ^ ((m01 & 0x08) >> 3)
	m[5] = ((m80 & 0x04) << 5) ^ ((m40 & 0x04) << 4) ^ ((m20 & 0x04) << 3) ^ ((m10 & 0x04) << 2) ^ ((m08 & 0x04) << 1) ^ (m04 & 0x04) ^ ((m02 & 0x04) >> 1) ^ ((m01 & 0x04) >> 2)
	m[6] = ((m80 & 0x02) << 6) ^ ((m40 & 0x02) << 5) ^ ((m20 & 0x02) << 4) ^ ((m10 & 0x02) << 3) ^ ((m08 & 0x02) << 2) ^ ((m04 & 0x02) << 1) ^ (m02 & 0x02) ^ ((m01 & 0x02) >> 1)
	m[7] = ((m80 & 0x01) << 7) ^ ((m40 & 0x01) << 6) ^ ((m20 & 0x01) << 5) ^ ((m10 & 0x01) << 4) ^ ((m08 & 0x01) << 2) ^ ((m04 & 0x01) << 2) ^ ((m02 & 0x01) << 1) ^ (m01 & 0x01)
	return
}

Similar python code:

from pyfinite import genericmatrix


def XOR(x, y): return x ^ y
def AND(x, y): return x & y
def DIV(x, y): return x


def matrix_from_cols(cols):
    m = genericmatrix.GenericMatrix(size=(8, 8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for i in range (8):
        k = 7 - i
        j = 1 << k
        m.SetRow(i, [(cols[0] & j) >> k, (cols[1] & j) >> k, (cols[2] & j) >> k, (cols[3] & j) >> k, (cols[4] & j) >> k, (cols[5] & j) >> k, (cols[6] & j) >> k, (cols[7] & j) >> k])    

    return m


def gen_matrix_based_table(table):
    return matrix_from_cols([table[0x80] ^ table[0], table[0x40] ^ table[0], table[0x20] ^ table[0], table[0x10] ^ table[0], table[0x08] ^ table[0], table[0x04] ^ table[0], table[0x02] ^ table[0], table[0x01] ^ table[0]])

def gen_matrix_based_high_low(high, low):
    table = []
    for i in range(16):
        for j in range(16):
            table.append(high[i] ^ low[j])    
    return gen_matrix_based_table(table) 

print(gen_matrix_based_high_low([0x00,0x50,0x14,0x44,0x89,0xd9,0x9d,0xcd,0xde,0x8e,0xca,0x9a,0x57,0x07,0x43,0x13], [0xd3,0x59,0x38,0xb2,0xcc,0x46,0x27,0xad,0x36,0xbc,0xdd,0x57,0x29,0xa3,0xc2,0x48]))

AES ShiftRows

image image

16字节State是这样存储的:
$in_0 \ in_1 \ in_2 \ in_3 \ in_4\ in_5\ in_6\ in_7\ in_8\ in_9\ in_{10}\ in_{11}\ in_{12}\ in_{13}\ in_{14}\ in_{15}$

STATE先逆ShiftRows, 再ShiftRows回到初始STATE。

0 4 8 c
1 5 9 d
2 6 a e
3 7 b f
逆ShiftRows后=>
0 4 8 c
d 1 5 9
a e 2 6
7 b f 3
再ShiftRows后=>
0 4 8 c
1 5 9 d
2 6 a e
3 7 b f

Reference