Support Arbitrary Matrix Shapes #4

Closed · wants to merge 3 commits into from
41 changes: 41 additions & 0 deletions .gitlab-ci.yml
@@ -29,6 +29,11 @@ generate_testvectors:
    - python testGenerator.py -H 1 -S 64 -E 64 -P 64 -F 64 --activation gelu
    - python testGenerator.py -H 1 -S 128 -E 192 -P 256 -F 256 --activation gelu
    - python testGenerator.py -H 1 -S 192 -E 256 -P 128 -F 128 --activation relu
+    # - python testGenerator.py -H 1 -S 1 -E 2 -P 3 -F 3 --activation gelu --no-bias
+    - python testGenerator.py -H 1 -S 1 -E 2 -P 3 -F 3 --activation relu --no-bias
+    - python testGenerator.py -H 1 -S 63 -E 62 -P 61 -F 61 --activation relu --no-bias
+    # - python testGenerator.py -H 1 -S 65 -E 130 -P 195 -F 195 --activation relu --no-bias
+    # - python testGenerator.py -H 1 -S 127 -E 190 -P 253 -F 253 --activation relu --no-bias
  artifacts:
    paths:
      - simvectors
@@ -60,6 +65,42 @@ run_sim:
    - make sim VSIM_FLAGS=-c s=$S e=$E p=$P f=$F bias=1 activation=$activation
    - ./modelsim/return_status.sh modelsim/build/transcript $S $E $P $F ita_tb

+run_sim_padding:
+  stage: sim
+  needs:
+    - generate_testvectors
+  parallel:
+    matrix:
+      # - S: 1
+      #   E: 2
+      #   P: 3
+      #   F: 3
+      #   activation: gelu
+      - S: 1
+        E: 2
+        P: 3
+        F: 3
+        activation: relu
+      - S: 63
+        E: 62
+        P: 61
+        F: 61
+        activation: relu
+      # - S: 65
+      #   E: 130
+      #   P: 195
+      #   F: 195
+      #   activation: relu
+      # - S: 127
+      #   E: 190
+      #   P: 253
+      #   F: 253
+      #   activation: relu
+  script:
+    - make bender
+    - make sim VSIM_FLAGS=-c s=$S e=$E p=$P f=$F bias=0 activation=$activation
+    - ./modelsim/return_status.sh modelsim/build/transcript $S $E $P $F ita_tb
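To reproduce one of these padded-shape runs outside CI, the same commands can be chained locally; a sketch using the S = 63 entry from the matrix above (assumes the repository's testGenerator.py and Makefile targets shown in these jobs):

python testGenerator.py -H 1 -S 63 -E 62 -P 61 -F 61 --activation relu --no-bias
make bender
make sim VSIM_FLAGS=-c s=63 e=62 p=61 f=61 bias=0 activation=relu
./modelsim/return_status.sh modelsim/build/transcript 63 62 61 61 ita_tb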

run_hwpe_sim:
  stage: sim
  needs:
1 change: 1 addition & 0 deletions .vscode/launch.json
@@ -17,6 +17,7 @@
                "-S${input:seq_len}",
                "-E${input:emb_len}",
                "-P${input:prj_len}",
+               "--no-bias"
            ],
        }
    ],
88 changes: 66 additions & 22 deletions PyITA/ITA.py
@@ -68,10 +68,10 @@ def __init__(self,

        self._init_paths(path)

-        self.S_ITA = max(64, S)
-        self.P_ITA = max(64, P)
-        self.E_ITA = max(64, E)
-        self.F_ITA = max(64, F)
+        self.S_ITA = ((S - 1) // self.ITA_M + 1) * self.ITA_M
+        self.P_ITA = ((P - 1) // self.ITA_M + 1) * self.ITA_M
+        self.E_ITA = ((E - 1) // self.ITA_M + 1) * self.ITA_M
+        self.F_ITA = ((F - 1) // self.ITA_M + 1) * self.ITA_M
        self.H_ITA = 4
        self.split = self.ITA_M // self.ITA_N
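The rewritten bounds round each dimension up to the next multiple of the tile size ITA_M instead of clamping at a fixed 64, which is what enables arbitrary shapes. A minimal sketch of the rounding, assuming ITA_M = 64 as implied by the old max(64, ·) bounds:

def pad_to_multiple(x: int, ita_m: int = 64) -> int:
    # Smallest multiple of ita_m that is >= x (ceiling division, scaled back up).
    return ((x - 1) // ita_m + 1) * ita_m

assert pad_to_multiple(1) == 64     # S = 1   -> one full tile
assert pad_to_multiple(63) == 64    # S = 63  -> one full tile
assert pad_to_multiple(64) == 64    # exact multiples stay unchanged
assert pad_to_multiple(65) == 128   # S = 65  -> two tiles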

@@ -109,10 +109,10 @@ def _validate_matrix_constraints(self, K: ArrayLike, V: ArrayLike):
        assert (np.all(K == V))

        # WIESEP: Current restrictions for ITA
-        assert (self.S % self.ITA_M == 0), "Sequence length must be divisible by ITA_M"
-        assert (self.P % self.ITA_M == 0), "Projection space must be divisible by ITA_M"
-        assert (self.E % self.ITA_M == 0), "Embedding size must be divisible by ITA_M"
-        assert (self.F % self.ITA_M == 0), "Feedforward size must be divisible by ITA_M"
+        # assert (self.S % self.ITA_M == 0), "Sequence length must be divisible by ITA_M"
+        # assert (self.P % self.ITA_M == 0), "Projection space must be divisible by ITA_M"
+        # assert (self.E % self.ITA_M == 0), "Embedding size must be divisible by ITA_M"
+        # assert (self.F % self.ITA_M == 0), "Feedforward size must be divisible by ITA_M"

        assert (
            self.E <= 512
@@ -171,46 +171,52 @@ def _initialize_tensors(self, Q, V, Wq, Wk, Wv, Wo, Bq, Bk, Bv, Bo, FF_in, Wff,
        else:
            self.Bq_in = np.zeros((self.H, self.P), dtype = np.int8)
        self.Bq = np.pad(self.Bq_in, ((0, 0), (0, self.P_ITA - self.P)))
-        self.Bq_broadcast = np.reshape(np.repeat(self.Bq, self.S, axis = 0), (self.H, self.S, self.P))
+        self.Bq_broadcast = np.reshape(np.repeat(self.Bq, self.S, axis = 0), (self.H, self.S, self.P_ITA))
+        self.Bq_broadcast = np.pad(self.Bq_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bk_in = random_shuffled_tensor(
                (self.H, self.P), int(np.log2(self.P)) + 8, type = np.int32) if Bk is None else Bk
        else:
            self.Bk_in = np.zeros((self.H, self.P), dtype = np.int8)
        self.Bk = np.pad(self.Bk_in, ((0, 0), (0, self.P_ITA - self.P)))
-        self.Bk_broadcast = np.reshape(np.repeat(self.Bk, self.S, axis = 0), (self.H, self.S, self.P))
+        self.Bk_broadcast = np.reshape(np.repeat(self.Bk, self.S, axis = 0), (self.H, self.S, self.P_ITA))
+        self.Bk_broadcast = np.pad(self.Bk_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bv_in = random_shuffled_tensor(
                (self.H, self.P), int(np.log2(self.P)) + 8, type = np.int32) if Bv is None else Bv
        else:
            self.Bv_in = np.zeros((self.H, self.P), dtype = np.int8)
        self.Bv = np.pad(self.Bv_in, ((0, 0), (0, self.P_ITA - self.P)))
-        self.Bv_broadcast = np.reshape(np.repeat(self.Bv, self.S, axis = 0), (self.H, self.S, self.P))
+        self.Bv_broadcast = np.reshape(np.repeat(self.Bv, self.S, axis = 0), (self.H, self.S, self.P_ITA))
+        self.Bv_broadcast = np.pad(self.Bv_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bo_in = random_shuffled_tensor(
                (self.H, self.E), int(np.log2(self.E)) + 8, type = np.int32) if Bo is None else Bo
        else:
            self.Bo_in = np.zeros((self.H, self.E), dtype = np.int8)
        self.Bo = np.pad(self.Bo_in, ((0, 0), (0, self.E_ITA - self.E)))
-        self.Bo_broadcast = np.reshape(np.repeat(self.Bo, self.S, axis = 0), (self.H, self.S, self.E))
+        self.Bo_broadcast = np.reshape(np.repeat(self.Bo, self.S, axis = 0), (self.H, self.S, self.E_ITA))
+        self.Bo_broadcast = np.pad(self.Bo_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bff_in = random_shuffled_tensor(
                (1, self.F), int(np.log2(self.F)) + 8, type = np.int32) if Bff is None else Bff
        else:
            self.Bff_in = np.zeros((1, self.F), dtype = np.int8)
        self.Bff = np.pad(self.Bff_in, ((0, 0), (0, self.F_ITA - self.F)))
-        self.Bff_broadcast = np.reshape(np.repeat(self.Bff, self.S, axis = 0), (1, self.S, self.F))
+        self.Bff_broadcast = np.reshape(np.repeat(self.Bff, self.S, axis = 0), (1, self.S, self.F_ITA))
+        self.Bff_broadcast = np.pad(self.Bff_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))
        if self.bias:
            self.Bff2_in = random_shuffled_tensor(
                (1, self.E), int(np.log2(self.E)) + 8, type = np.int32) if Bff2 is None else Bff2
        else:
            self.Bff2_in = np.zeros((1, self.E), dtype = np.int8)
        self.Bff2 = np.pad(self.Bff2_in, ((0, 0), (0, self.E_ITA - self.E)))
-        self.Bff2_broadcast = np.reshape(np.repeat(self.Bff2, self.S, axis = 0), (1, self.S, self.E))
+        self.Bff2_broadcast = np.reshape(np.repeat(self.Bff2, self.S, axis = 0), (1, self.S, self.E_ITA))
+        self.Bff2_broadcast = np.pad(self.Bff2_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))
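Every bias now follows the same pattern: pad the feature axis to its ITA size, broadcast across all S rows at the padded width, then zero-pad the sequence axis up to S_ITA. A self-contained sketch of that pattern (sizes here are illustrative only):

import numpy as np

H, S, P = 1, 3, 5          # illustrative unpadded sizes
S_ITA, P_ITA = 4, 8        # padded sizes (multiples of the tile size)

B_in = np.arange(H * P, dtype = np.int32).reshape(H, P)
B = np.pad(B_in, ((0, 0), (0, P_ITA - P)))                   # H x P_ITA
B_bc = np.reshape(np.repeat(B, S, axis = 0), (H, S, P_ITA))  # H x S x P_ITA
B_bc = np.pad(B_bc, ((0, 0), (0, S_ITA - S), (0, 0)))        # H x S_ITA x P_ITA

assert B_bc.shape == (H, S_ITA, P_ITA)
assert np.all(B_bc[:, S:, :] == 0) and np.all(B_bc[:, :, P:] == 0)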

        #### Intermediate tensors ####

@@ -372,7 +378,7 @@ def tiler_QK(self, qk: np.ndarray, weight: np.ndarray, bias: np.ndarray, output:

        # Bias Bqk is H x P
        # Broadcast Bias Bqk to H x S x P
-        bias = np.tile(bias, [1, self.S, 1])
+        bias = np.tile(bias, [1, self.S_ITA, 1])
        for h in range(self.H):
            Bias = split_matrix(bias[h], (self.ITA_M, self.ITA_N))
            write_matrix(Bias, f"{bias_file}_{h}", self.paths["standalone"])
@@ -415,7 +421,7 @@ def tiler_V(self, v, weight, bias, output, input_file, weight_file, bias_file, o

        # Bias Bv is H x P
        # Broadcast Bias Bv to H x S x P
-        bias = np.tile(bias, [1, self.S, 1])
+        bias = np.tile(bias, [1, self.S_ITA, 1])
        # Transpose Bias Bv to H x P x S
        bias = np.transpose(bias, (0, 2, 1))
        for h in range(self.H):
@@ -496,7 +502,7 @@ def tiler_Out(self, O, weight, bias, output, input_file, weight_file, bias_file,

        # Bias Bo is H x E
        # Broadcast Bias Bo to H x S x E
-        bias = np.tile(bias, [1, self.S, 1])
+        bias = np.tile(bias, [1, self.S_ITA, 1])
        for h in range(self.H):
            Bias = split_matrix(bias[h], (self.ITA_M, self.ITA_N))
            write_matrix(Bias, f"{bias_file}_{h}", self.paths["standalone"])
@@ -512,6 +518,12 @@ def step1_Qp(self):
        self.Qp_requant = requantize(self.Qp, self.requant_eps_mult[0], self.requant_right_shift[0],
                                     self.requant_add[0])

+        # Set padded values to zero
+        if (self.S_ITA - self.S) > 0:
+            self.Qp_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.Qp_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        self.tiler_QK(self.Q, self.Wq, self.Bq, self.Qp_requant, "Q", "Wq", "Bq", "Qp")
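Each projection step now clears the rows and columns that exist only due to padding, so stale requantization output there cannot leak into later matmuls. A standalone sketch of the masking (the helper name is illustrative, not part of the module):

import numpy as np

def zero_padded_region(x: np.ndarray, valid_rows: int, valid_cols: int) -> np.ndarray:
    # x: (H, S_ITA, P_ITA); only the top-left valid_rows x valid_cols block
    # per head carries real data -- everything outside it is forced to zero.
    x = x.copy()
    x[:, valid_rows:, :] = 0
    x[:, :, valid_cols:] = 0
    return x

qp = np.ones((1, 64, 64), dtype = np.int8)                     # S_ITA = P_ITA = 64
qp = zero_padded_region(qp, valid_rows = 63, valid_cols = 61)  # e.g. S = 63, P = 61
assert np.all(qp[:, 63:, :] == 0) and np.all(qp[:, :, 61:] == 0)
assert qp[0, 62, 60] == 1                                      # valid region untouched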

@@ -520,6 +532,11 @@ def step2_Kp(self):
        self.Kp_requant = requantize(self.Kp, self.requant_eps_mult[1], self.requant_right_shift[1],
                                     self.requant_add[1])

+        if (self.S_ITA - self.S) > 0:
+            self.Kp_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.Kp_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        self.tiler_QK(self.K, self.Wk, self.Bk, self.Kp_requant, "K", "Wk", "Bk", "Kp")

@@ -528,6 +545,11 @@ def step3_Vp(self):
        self.Vp_requant = requantize(self.Vp, self.requant_eps_mult[2], self.requant_right_shift[2],
                                     self.requant_add[2])

+        if (self.S_ITA - self.S) > 0:
+            self.Vp_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.Vp_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        # Compute Vp in transposed form
        self.tiler_V(self.V, self.Wv, self.Bv, self.Vp_requant, "V", "Wv", "Bv", "Vp")

@@ -536,16 +558,27 @@ def step4_QK(self, no_partial_softmax):
            [np.matmul(self.Qp_requant[i], np.transpose(self.Kp_requant[i]), dtype = np.int32) for i in range(self.H)])
        self.A = np.clip(self.A, -2**(self.WO - 1), 2**(self.WO - 1) - 1)
        self.A_requant = requantize(self.A, self.requant_eps_mult[3], self.requant_right_shift[3], self.requant_add[3])

+        if (self.S_ITA - self.S) > 0:
+            self.A_requant[:, -(self.S_ITA - self.S):, :] = 0
+            self.A_requant[:, :, -(self.S_ITA - self.S):] = 0
+
        self.soft(no_partial_softmax)

        self.tiler_AV(self.Qp_requant, self.Kp_requant, self.A_requant, "Qp_in", "Kp_in", "A")

    def soft(self, no_partial_softmax = False):
-        self.A_real_softmax = realSoftmax(self.A_requant)
+        self.A_real_softmax = realSoftmax(self.A_requant[:, :self.S, :self.S])
+        self.A_real_softmax = np.pad(self.A_real_softmax, ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S)))

        if no_partial_softmax:
-            self.A_partial_softmax = fastSoftmax(self.A_requant)
+            self.A_partial_softmax = fastSoftmax(self.A_requant[:, :self.S, :self.S])
+            self.A_partial_softmax = np.pad(self.A_partial_softmax,
+                                            ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S)))
        else:
-            self.A_partial_softmax = streamingPartialSoftmax(self.A_requant)
+            self.A_partial_softmax = streamingPartialSoftmax(self.A_requant[:, :self.S, :self.S])
+            self.A_partial_softmax = np.pad(self.A_partial_softmax,
+                                            ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S)))

        if self.H == 1:
            A_save = [np.tile(self.A_partial_softmax[i], [self.split, 1]) for i in range(self.H)]
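Softmax is now evaluated only on the valid S x S block and re-padded afterwards; running it across padded columns would assign probability mass to lanes that do not exist. A sketch of the slice-compute-pad pattern, with a plain float softmax standing in for the quantized fastSoftmax/streamingPartialSoftmax kernels:

import numpy as np

def softmax_on_valid_block(a: np.ndarray, s: int, s_ita: int) -> np.ndarray:
    # a: (H, S_ITA, S_ITA) attention logits; only a[:, :s, :s] is real data.
    valid = a[:, :s, :s].astype(np.float64)
    e = np.exp(valid - valid.max(axis = -1, keepdims = True))   # stable softmax
    sm = e / e.sum(axis = -1, keepdims = True)
    # Re-pad so downstream tiling still sees the full S_ITA x S_ITA shape.
    return np.pad(sm, ((0, 0), (0, s_ita - s), (0, s_ita - s)))

a = np.random.randint(-128, 128, (1, 64, 64))
p = softmax_on_valid_block(a, s = 63, s_ita = 64)
assert np.allclose(p[:, :63, :63].sum(axis = -1), 1.0)   # valid rows sum to 1
assert np.all(p[:, 63:, :] == 0) and np.all(p[:, :, 63:] == 0)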
@@ -561,6 +594,11 @@ def step5_AV(self):
        self.O_soft_requant = requantize(self.O_soft, self.requant_eps_mult[4], self.requant_right_shift[4],
                                         self.requant_add[4])

+        if (self.S_ITA - self.S) > 0:
+            self.O_soft_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.O_soft_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        self.tiler_AV(self.A_requant, np.transpose(self.Vp_requant, (0, 2, 1)), self.O_soft_requant, "A_stream_soft_in",
                      "Vp_in", "O_soft")

@@ -587,6 +625,12 @@ def step6_O(self):
        self.Out_soft = np.clip(self.Out_soft, -2**(self.WO - 1), 2**(self.WO - 1) - 1)
        self.Out_soft_requant = requantize(self.Out_soft, self.requant_eps_mult[5], self.requant_right_shift[5],
                                           self.requant_add[5])
+
+        if (self.S_ITA - self.S) > 0:
+            self.Out_soft_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.E_ITA - self.E) > 0:
+            self.Out_soft_requant[:, :, -(self.E_ITA - self.E):] = 0
+
        self.tiler_Out(self.O_soft_requant, self.Wo, self.Bo, self.Out_soft_requant, "O_soft_in", "Wo", "Bo",
                       "Out_soft")

@@ -931,8 +975,8 @@ def export_mempool(self, path):

    def export_numpy(self):
        assert np.all(np.equal(self.K, self.V)), "For ITA, keys and values have to be equal"
-        q = self.Q
-        k = self.K
+        q = self.Q_in
+        k = self.K_in
        w1 = self.Wq_in
        b1 = self.Bq_in
        w2 = self.Wk_in
4 changes: 2 additions & 2 deletions PyITA/ITA_onnx.py
@@ -259,8 +259,8 @@ def exportONNX(path, verbose = False, **kwargs):
    # Transform from MUL-DIV-ADD to MUL-ADD-DIV
    RQ_ADD = (RQ_ADD * 2**RQ_SHIFT.astype(np.float32))

-    input0_values = np.expand_dims(inputs['q'][:(S * E // 64), :].reshape(S, E), axis = 0)
-    input1_values = np.expand_dims(inputs['k'][:(S * E // 64), :].reshape(S, E), axis = 0)
+    input0_values = np.expand_dims(inputs['q'].reshape(S, E), axis = 0)
+    input1_values = np.expand_dims(inputs['k'].reshape(S, E), axis = 0)

    np.savez(path + "inputs.npz", input0_values, input1_values)
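The old slicing by (S * E // 64) only recovered the unpadded input when E was a multiple of 64; with arbitrary shapes the padding is interleaved with real data, so the export now uses the unpadded Q_in/K_in and a plain reshape. A small illustration of why flat slicing breaks on a padded buffer (sizes illustrative):

import numpy as np

valid, tile = 3, 4                                  # valid size vs padded tile
buf = np.zeros((tile, tile), dtype = np.int8)       # padded buffer
buf[:valid, :valid] = np.arange(9, dtype = np.int8).reshape(3, 3)

# Taking the first valid*valid flat elements mixes padding into the rows.
flat = buf.reshape(-1)[:valid * valid].reshape(valid, valid)
assert not np.array_equal(flat, buf[:valid, :valid])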
