Support Arbitrary Matrix Shapes #4

Closed · wants to merge 3 commits into from
41 changes: 41 additions & 0 deletions .gitlab-ci.yml
@@ -29,6 +29,11 @@ generate_testvectors:
    - python testGenerator.py -H 1 -S 64 -E 64 -P 64 -F 64 --activation gelu
    - python testGenerator.py -H 1 -S 128 -E 192 -P 256 -F 256 --activation gelu
    - python testGenerator.py -H 1 -S 192 -E 256 -P 128 -F 128 --activation relu
+    # - python testGenerator.py -H 1 -S 1 -E 2 -P 3 -F 3 --activation gelu --no-bias
+    - python testGenerator.py -H 1 -S 1 -E 2 -P 3 -F 3 --activation relu --no-bias
+    - python testGenerator.py -H 1 -S 63 -E 62 -P 61 -F 61 --activation relu --no-bias
+    # - python testGenerator.py -H 1 -S 65 -E 130 -P 195 -F 195 --activation relu --no-bias
+    # - python testGenerator.py -H 1 -S 127 -E 190 -P 253 -F 253 --activation relu --no-bias
  artifacts:
    paths:
      - simvectors
@@ -60,6 +65,42 @@ run_sim:
    - make sim VSIM_FLAGS=-c s=$S e=$E p=$P f=$F bias=1 activation=$activation
    - ./modelsim/return_status.sh modelsim/build/transcript $S $E $P $F ita_tb

+run_sim_padding:
+  stage: sim
+  needs:
+    - generate_testvectors
+  parallel:
+    matrix:
+      # - S: 1
+      #   E: 2
+      #   P: 3
+      #   F: 3
+      #   activation: gelu
+      - S: 1
+        E: 2
+        P: 3
+        F: 3
+        activation: relu
+      - S: 63
+        E: 62
+        P: 61
+        F: 61
+        activation: relu
+      # - S: 65
+      #   E: 130
+      #   P: 195
+      #   F: 195
+      #   activation: relu
+      # - S: 127
+      #   E: 190
+      #   P: 253
+      #   F: 253
+      #   activation: relu
+  script:
+    - make bender
+    - make sim VSIM_FLAGS=-c s=$S e=$E p=$P f=$F bias=0 activation=$activation
+    - ./modelsim/return_status.sh modelsim/build/transcript $S $E $P $F ita_tb
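To reproduce one of these padded-shape runs outside CI, the same commands can be chained locally; a sketch using the S = 63 entry from the matrix above (assumes the repository's testGenerator.py and Makefile targets shown in these jobs):

python testGenerator.py -H 1 -S 63 -E 62 -P 61 -F 61 --activation relu --no-bias
make bender
make sim VSIM_FLAGS=-c s=63 e=62 p=61 f=61 bias=0 activation=relu
./modelsim/return_status.sh modelsim/build/transcript 63 62 61 61 ita_tb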

run_hwpe_sim:
  stage: sim
  needs:
1 change: 1 addition & 0 deletions .vscode/launch.json
@@ -17,6 +17,7 @@
                "-S${input:seq_len}",
                "-E${input:emb_len}",
                "-P${input:prj_len}",
+               "--no-bias"
            ],
        }
    ],
88 changes: 66 additions & 22 deletions PyITA/ITA.py
@@ -68,10 +68,10 @@ def __init__(self,

        self._init_paths(path)

-        self.S_ITA = max(64, S)
-        self.P_ITA = max(64, P)
-        self.E_ITA = max(64, E)
-        self.F_ITA = max(64, F)
+        self.S_ITA = ((S - 1) // self.ITA_M + 1) * self.ITA_M
+        self.P_ITA = ((P - 1) // self.ITA_M + 1) * self.ITA_M
+        self.E_ITA = ((E - 1) // self.ITA_M + 1) * self.ITA_M
+        self.F_ITA = ((F - 1) // self.ITA_M + 1) * self.ITA_M
        self.H_ITA = 4
        self.split = self.ITA_M // self.ITA_N
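The rewritten bounds round each dimension up to the next multiple of the tile size ITA_M instead of clamping at a fixed 64, which is what enables arbitrary shapes. A minimal sketch of the rounding, assuming ITA_M = 64 as implied by the old max(64, ·) bounds:

def pad_to_multiple(x: int, ita_m: int = 64) -> int:
    # Smallest multiple of ita_m that is >= x (ceiling division, scaled back up).
    return ((x - 1) // ita_m + 1) * ita_m

assert pad_to_multiple(1) == 64     # S = 1   -> one full tile
assert pad_to_multiple(63) == 64    # S = 63  -> one full tile
assert pad_to_multiple(64) == 64    # exact multiples stay unchanged
assert pad_to_multiple(65) == 128   # S = 65  -> two tiles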

@@ -109,10 +109,10 @@ def _validate_matrix_constraints(self, K: ArrayLike, V: ArrayLike):
        assert (np.all(K == V))

        # WIESEP: Current restrictions for ITA
-        assert (self.S % self.ITA_M == 0), "Sequence length must be divisible by ITA_M"
-        assert (self.P % self.ITA_M == 0), "Projection space must be divisible by ITA_M"
-        assert (self.E % self.ITA_M == 0), "Embedding size must be divisible by ITA_M"
-        assert (self.F % self.ITA_M == 0), "Feedforward size must be divisible by ITA_M"
+        # assert (self.S % self.ITA_M == 0), "Sequence length must be divisible by ITA_M"
+        # assert (self.P % self.ITA_M == 0), "Projection space must be divisible by ITA_M"
+        # assert (self.E % self.ITA_M == 0), "Embedding size must be divisible by ITA_M"
+        # assert (self.F % self.ITA_M == 0), "Feedforward size must be divisible by ITA_M"

        assert (
            self.E <= 512
@@ -171,46 +171,52 @@ def _initialize_tensors(self, Q, V, Wq, Wk, Wv, Wo, Bq, Bk, Bv, Bo, FF_in, Wff,
        else:
            self.Bq_in = np.zeros((self.H, self.P), dtype = np.int8)
        self.Bq = np.pad(self.Bq_in, ((0, 0), (0, self.P_ITA - self.P)))
-        self.Bq_broadcast = np.reshape(np.repeat(self.Bq, self.S, axis = 0), (self.H, self.S, self.P))
+        self.Bq_broadcast = np.reshape(np.repeat(self.Bq, self.S, axis = 0), (self.H, self.S, self.P_ITA))
+        self.Bq_broadcast = np.pad(self.Bq_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bk_in = random_shuffled_tensor(
                (self.H, self.P), int(np.log2(self.P)) + 8, type = np.int32) if Bk is None else Bk
        else:
            self.Bk_in = np.zeros((self.H, self.P), dtype = np.int8)
        self.Bk = np.pad(self.Bk_in, ((0, 0), (0, self.P_ITA - self.P)))
-        self.Bk_broadcast = np.reshape(np.repeat(self.Bk, self.S, axis = 0), (self.H, self.S, self.P))
+        self.Bk_broadcast = np.reshape(np.repeat(self.Bk, self.S, axis = 0), (self.H, self.S, self.P_ITA))
+        self.Bk_broadcast = np.pad(self.Bk_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bv_in = random_shuffled_tensor(
                (self.H, self.P), int(np.log2(self.P)) + 8, type = np.int32) if Bv is None else Bv
        else:
            self.Bv_in = np.zeros((self.H, self.P), dtype = np.int8)
        self.Bv = np.pad(self.Bv_in, ((0, 0), (0, self.P_ITA - self.P)))
-        self.Bv_broadcast = np.reshape(np.repeat(self.Bv, self.S, axis = 0), (self.H, self.S, self.P))
+        self.Bv_broadcast = np.reshape(np.repeat(self.Bv, self.S, axis = 0), (self.H, self.S, self.P_ITA))
+        self.Bv_broadcast = np.pad(self.Bv_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bo_in = random_shuffled_tensor(
                (self.H, self.E), int(np.log2(self.E)) + 8, type = np.int32) if Bo is None else Bo
        else:
            self.Bo_in = np.zeros((self.H, self.E), dtype = np.int8)
        self.Bo = np.pad(self.Bo_in, ((0, 0), (0, self.E_ITA - self.E)))
-        self.Bo_broadcast = np.reshape(np.repeat(self.Bo, self.S, axis = 0), (self.H, self.S, self.E))
+        self.Bo_broadcast = np.reshape(np.repeat(self.Bo, self.S, axis = 0), (self.H, self.S, self.E_ITA))
+        self.Bo_broadcast = np.pad(self.Bo_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))

        if self.bias:
            self.Bff_in = random_shuffled_tensor(
                (1, self.F), int(np.log2(self.F)) + 8, type = np.int32) if Bff is None else Bff
        else:
            self.Bff_in = np.zeros((1, self.F), dtype = np.int8)
        self.Bff = np.pad(self.Bff_in, ((0, 0), (0, self.F_ITA - self.F)))
-        self.Bff_broadcast = np.reshape(np.repeat(self.Bff, self.S, axis = 0), (1, self.S, self.F))
+        self.Bff_broadcast = np.reshape(np.repeat(self.Bff, self.S, axis = 0), (1, self.S, self.F_ITA))
+        self.Bff_broadcast = np.pad(self.Bff_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))
        if self.bias:
            self.Bff2_in = random_shuffled_tensor(
                (1, self.E), int(np.log2(self.E)) + 8, type = np.int32) if Bff2 is None else Bff2
        else:
            self.Bff2_in = np.zeros((1, self.E), dtype = np.int8)
        self.Bff2 = np.pad(self.Bff2_in, ((0, 0), (0, self.E_ITA - self.E)))
-        self.Bff2_broadcast = np.reshape(np.repeat(self.Bff2, self.S, axis = 0), (1, self.S, self.E))
+        self.Bff2_broadcast = np.reshape(np.repeat(self.Bff2, self.S, axis = 0), (1, self.S, self.E_ITA))
+        self.Bff2_broadcast = np.pad(self.Bff2_broadcast, ((0, 0), (0, self.S_ITA - self.S), (0, 0)))
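Every bias now follows the same pattern: pad the feature axis to its ITA size, broadcast across all S rows at the padded width, then zero-pad the sequence axis up to S_ITA. A self-contained sketch of that pattern (sizes here are illustrative only):

import numpy as np

H, S, P = 1, 3, 5          # illustrative unpadded sizes
S_ITA, P_ITA = 4, 8        # padded sizes (multiples of the tile size)

B_in = np.arange(H * P, dtype = np.int32).reshape(H, P)
B = np.pad(B_in, ((0, 0), (0, P_ITA - P)))                   # H x P_ITA
B_bc = np.reshape(np.repeat(B, S, axis = 0), (H, S, P_ITA))  # H x S x P_ITA
B_bc = np.pad(B_bc, ((0, 0), (0, S_ITA - S), (0, 0)))        # H x S_ITA x P_ITA

assert B_bc.shape == (H, S_ITA, P_ITA)
assert np.all(B_bc[:, S:, :] == 0) and np.all(B_bc[:, :, P:] == 0)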

        #### Intermediate tensors ####

@@ -372,7 +378,7 @@ def tiler_QK(self, qk: np.ndarray, weight: np.ndarray, bias: np.ndarray, output:

        # Bias Bqk is H x P
        # Broadcast Bias Bqk to H x S x P
-        bias = np.tile(bias, [1, self.S, 1])
+        bias = np.tile(bias, [1, self.S_ITA, 1])
        for h in range(self.H):
            Bias = split_matrix(bias[h], (self.ITA_M, self.ITA_N))
            write_matrix(Bias, f"{bias_file}_{h}", self.paths["standalone"])
@@ -415,7 +421,7 @@ def tiler_V(self, v, weight, bias, output, input_file, weight_file, bias_file, o

        # Bias Bv is H x P
        # Broadcast Bias Bv to H x S x P
-        bias = np.tile(bias, [1, self.S, 1])
+        bias = np.tile(bias, [1, self.S_ITA, 1])
        # Transpose Bias Bv to H x P x S
        bias = np.transpose(bias, (0, 2, 1))
        for h in range(self.H):
@@ -496,7 +502,7 @@ def tiler_Out(self, O, weight, bias, output, input_file, weight_file, bias_file,

        # Bias Bo is H x E
        # Broadcast Bias Bo to H x S x E
-        bias = np.tile(bias, [1, self.S, 1])
+        bias = np.tile(bias, [1, self.S_ITA, 1])
        for h in range(self.H):
            Bias = split_matrix(bias[h], (self.ITA_M, self.ITA_N))
            write_matrix(Bias, f"{bias_file}_{h}", self.paths["standalone"])
@@ -512,6 +518,12 @@ def step1_Qp(self):
        self.Qp_requant = requantize(self.Qp, self.requant_eps_mult[0], self.requant_right_shift[0],
                                     self.requant_add[0])

+        # Set padded values to zero
+        if (self.S_ITA - self.S) > 0:
+            self.Qp_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.Qp_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        self.tiler_QK(self.Q, self.Wq, self.Bq, self.Qp_requant, "Q", "Wq", "Bq", "Qp")
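Each projection step now clears the rows and columns that exist only due to padding, so stale requantization output there cannot leak into later matmuls. A standalone sketch of the masking (the helper name is illustrative, not part of the module):

import numpy as np

def zero_padded_region(x: np.ndarray, valid_rows: int, valid_cols: int) -> np.ndarray:
    # x: (H, S_ITA, P_ITA); only the top-left valid_rows x valid_cols block
    # per head carries real data -- everything outside it is forced to zero.
    x = x.copy()
    x[:, valid_rows:, :] = 0
    x[:, :, valid_cols:] = 0
    return x

qp = np.ones((1, 64, 64), dtype = np.int8)                     # S_ITA = P_ITA = 64
qp = zero_padded_region(qp, valid_rows = 63, valid_cols = 61)  # e.g. S = 63, P = 61
assert np.all(qp[:, 63:, :] == 0) and np.all(qp[:, :, 61:] == 0)
assert qp[0, 62, 60] == 1                                      # valid region untouched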

@@ -520,6 +532,11 @@ def step2_Kp(self):
        self.Kp_requant = requantize(self.Kp, self.requant_eps_mult[1], self.requant_right_shift[1],
                                     self.requant_add[1])

+        if (self.S_ITA - self.S) > 0:
+            self.Kp_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.Kp_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        self.tiler_QK(self.K, self.Wk, self.Bk, self.Kp_requant, "K", "Wk", "Bk", "Kp")

@@ -528,6 +545,11 @@ def step3_Vp(self):
        self.Vp_requant = requantize(self.Vp, self.requant_eps_mult[2], self.requant_right_shift[2],
                                     self.requant_add[2])

+        if (self.S_ITA - self.S) > 0:
+            self.Vp_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.Vp_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        # Compute Vp in transposed form
        self.tiler_V(self.V, self.Wv, self.Bv, self.Vp_requant, "V", "Wv", "Bv", "Vp")

@@ -536,16 +558,27 @@ def step4_QK(self, no_partial_softmax):
            [np.matmul(self.Qp_requant[i], np.transpose(self.Kp_requant[i]), dtype = np.int32) for i in range(self.H)])
        self.A = np.clip(self.A, -2**(self.WO - 1), 2**(self.WO - 1) - 1)
        self.A_requant = requantize(self.A, self.requant_eps_mult[3], self.requant_right_shift[3], self.requant_add[3])

+        if (self.S_ITA - self.S) > 0:
+            self.A_requant[:, -(self.S_ITA - self.S):, :] = 0
+            self.A_requant[:, :, -(self.S_ITA - self.S):] = 0
+
        self.soft(no_partial_softmax)

        self.tiler_AV(self.Qp_requant, self.Kp_requant, self.A_requant, "Qp_in", "Kp_in", "A")

    def soft(self, no_partial_softmax = False):
-        self.A_real_softmax = realSoftmax(self.A_requant)
+        self.A_real_softmax = realSoftmax(self.A_requant[:, :self.S, :self.S])
+        self.A_real_softmax = np.pad(self.A_real_softmax, ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S)))

        if no_partial_softmax:
-            self.A_partial_softmax = fastSoftmax(self.A_requant)
+            self.A_partial_softmax = fastSoftmax(self.A_requant[:, :self.S, :self.S])
+            self.A_partial_softmax = np.pad(self.A_partial_softmax,
+                                            ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S)))
        else:
-            self.A_partial_softmax = streamingPartialSoftmax(self.A_requant)
+            self.A_partial_softmax = streamingPartialSoftmax(self.A_requant[:, :self.S, :self.S])
+            self.A_partial_softmax = np.pad(self.A_partial_softmax,
+                                            ((0, 0), (0, self.S_ITA - self.S), (0, self.S_ITA - self.S)))

        if self.H == 1:
            A_save = [np.tile(self.A_partial_softmax[i], [self.split, 1]) for i in range(self.H)]
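Softmax is now evaluated only on the valid S x S block and re-padded afterwards; running it across padded columns would assign probability mass to lanes that do not exist. A sketch of the slice-compute-pad pattern, with a plain float softmax standing in for the quantized fastSoftmax/streamingPartialSoftmax kernels:

import numpy as np

def softmax_on_valid_block(a: np.ndarray, s: int, s_ita: int) -> np.ndarray:
    # a: (H, S_ITA, S_ITA) attention logits; only a[:, :s, :s] is real data.
    valid = a[:, :s, :s].astype(np.float64)
    e = np.exp(valid - valid.max(axis = -1, keepdims = True))   # stable softmax
    sm = e / e.sum(axis = -1, keepdims = True)
    # Re-pad so downstream tiling still sees the full S_ITA x S_ITA shape.
    return np.pad(sm, ((0, 0), (0, s_ita - s), (0, s_ita - s)))

a = np.random.randint(-128, 128, (1, 64, 64))
p = softmax_on_valid_block(a, s = 63, s_ita = 64)
assert np.allclose(p[:, :63, :63].sum(axis = -1), 1.0)   # valid rows sum to 1
assert np.all(p[:, 63:, :] == 0) and np.all(p[:, :, 63:] == 0)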
@@ -561,6 +594,11 @@ def step5_AV(self):
        self.O_soft_requant = requantize(self.O_soft, self.requant_eps_mult[4], self.requant_right_shift[4],
                                         self.requant_add[4])

+        if (self.S_ITA - self.S) > 0:
+            self.O_soft_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.P_ITA - self.P) > 0:
+            self.O_soft_requant[:, :, -(self.P_ITA - self.P):] = 0
+
        self.tiler_AV(self.A_requant, np.transpose(self.Vp_requant, (0, 2, 1)), self.O_soft_requant, "A_stream_soft_in",
                      "Vp_in", "O_soft")

@@ -587,6 +625,12 @@ def step6_O(self):
        self.Out_soft = np.clip(self.Out_soft, -2**(self.WO - 1), 2**(self.WO - 1) - 1)
        self.Out_soft_requant = requantize(self.Out_soft, self.requant_eps_mult[5], self.requant_right_shift[5],
                                           self.requant_add[5])
+
+        if (self.S_ITA - self.S) > 0:
+            self.Out_soft_requant[:, -(self.S_ITA - self.S):, :] = 0
+        if (self.E_ITA - self.E) > 0:
+            self.Out_soft_requant[:, :, -(self.E_ITA - self.E):] = 0
+
        self.tiler_Out(self.O_soft_requant, self.Wo, self.Bo, self.Out_soft_requant, "O_soft_in", "Wo", "Bo",
                       "Out_soft")

@@ -931,8 +975,8 @@ def export_mempool(self, path):

    def export_numpy(self):
        assert np.all(np.equal(self.K, self.V)), "For ITA, keys and values have to be equal"
-        q = self.Q
-        k = self.K
+        q = self.Q_in
+        k = self.K_in
        w1 = self.Wq_in
        b1 = self.Bq_in
        w2 = self.Wk_in
4 changes: 2 additions & 2 deletions PyITA/ITA_onnx.py
@@ -259,8 +259,8 @@ def exportONNX(path, verbose = False, **kwargs):
    # Transform from MUL-DIV-ADD to MUL-ADD-DIV
    RQ_ADD = (RQ_ADD * 2**RQ_SHIFT.astype(np.float32))

-    input0_values = np.expand_dims(inputs['q'][:(S * E // 64), :].reshape(S, E), axis = 0)
-    input1_values = np.expand_dims(inputs['k'][:(S * E // 64), :].reshape(S, E), axis = 0)
+    input0_values = np.expand_dims(inputs['q'].reshape(S, E), axis = 0)
+    input1_values = np.expand_dims(inputs['k'].reshape(S, E), axis = 0)

    np.savez(path + "inputs.npz", input0_values, input1_values)
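The old slicing by (S * E // 64) only recovered the unpadded input when E was a multiple of 64; with arbitrary shapes the padding is interleaved with real data, so the export now uses the unpadded Q_in/K_in and a plain reshape. A small illustration of why flat slicing breaks on a padded buffer (sizes illustrative):

import numpy as np

valid, tile = 3, 4                                  # valid size vs padded tile
buf = np.zeros((tile, tile), dtype = np.int8)       # padded buffer
buf[:valid, :valid] = np.arange(9, dtype = np.int8).reshape(3, 3)

# Taking the first valid*valid flat elements mixes padding into the rows.
flat = buf.reshape(-1)[:valid * valid].reshape(valid, valid)
assert not np.array_equal(flat, buf[:valid, :valid])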
