[software] Add flag to fold q16 MMSE

mbertuletti · mbertuletti · commit dc8ca4928423 · 2025-01-06T11:30:21.000+01:00
diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c
@@ -22,18 +22,29 @@ Parameters and defines
 
 PARALLEL: When defined benchmark parallel MIMO-MMSE.
 SINGLE: When defined benchmark single-core MIMO-MMSE.
+FOLD: When defined 1 fold matrices in memory.
 */
 
-int16_t l1_H[2 * N_TX * N_RX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+#define FOLD (1)
+#define PARALLEL
+
+#if FOLD
+#define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
+#define NUM_COL (NUM_BANKS / N_TX)
+
+int16_t l1_G[2 * N_TX * NUM_BANKS * NUM_ROW]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+int16_t l1_L[2 * N_TX * NUM_BANKS * NUM_ROW]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#else
 int16_t l1_G[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 int16_t l1_L[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#endif
 
+int16_t l1_H[2 * N_TX * N_RX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 int16_t l1_S[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 int16_t l1_y[2 * N_RX * N_ITR]
@@ -51,12 +62,14 @@ int main() {
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier_init(core_id); // Initialize barrier and synchronize
+  uint32_t time_init, time_end;
 
   /* Initialize matrices */
   if (core_id == 0) {
     dma_memcpy_blocking(l1_H, l2_H, N_TX * N_RX * N_ITR * sizeof(int32_t));
     dma_memcpy_blocking(l1_y, l2_y, N_RX * N_ITR * sizeof(int32_t));
     dma_memcpy_blocking(l1_S, l2_S, N_TX * N_ITR * sizeof(int32_t));
+    printf("Data transferred\n");
   }
   mempool_barrier(num_cores);
 
@@ -65,13 +78,18 @@ int main() {
 
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_hermitian_q16vecs((v2s *)l1_H, (v2s *)l1_G, (v2s *)l1_Sigma, N_RX,
-                              N_TX);
-    mempool_MVP_conjtransp_q16vecs((v2s *)l1_H, (v2s *)l1_y, (v2s *)y2, N_RX,
-                                   N_TX, 0);
-    mempool_cholesky_q16vecs(l1_G, l1_L, N_TX);
-    mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX, 0);
-    mempool_Ltrisol_q16vecs(l1_L, y3, l1_x, N_TX, 1);
+    time_init = mempool_get_timer();
+    v2s *PtrH = (v2s *)l1_H;
+    v2s *PtrG = (v2s *)l1_G;
+    v2s *PtrS = (v2s *)l1_Sigma;
+    v2s *Ptry = (v2s *)l1_y;
+    v2s *Ptry2 = (v2s *)y2;
+    mempool_hermitian_q16vecs(PtrH, PtrG, PtrS, N_RX, N_TX);
+    mempool_MVP_conjtransp_q16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, FOLD);
+    mempool_cholesky_q16vecs(l1_G, l1_L, N_TX, FOLD);
+    mempool_Ltrisol_q16vecs(l1_L, y2, y3, N_TX, 0, FOLD);
+    mempool_Ltrisol_q16vecs(l1_L, y3, l1_x, N_TX, 1, FOLD);
+    time_end = mempool_get_timer();
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -81,30 +99,49 @@ int main() {
 #ifdef PARALLEL
 
   mempool_start_benchmark();
+  time_init = mempool_get_timer();
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
 
     int16_t *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     int16_t *Ptry = l1_y + itr * (2 * N_RX);
-    int16_t *PtrSigma = l1_S + itr * (2 * N_TX);
-
+    int16_t *PtrS = l1_S + itr * (2 * N_TX);
+
+#if FOLD
+    int16_t *PtrG = l1_G + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) +
+                    (itr % NUM_COL) * (2 * N_TX);
+    int16_t *PtrL = l1_L + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) +
+                    (itr % NUM_COL) * (2 * N_TX);
+    int16_t *Ptry2 =
+        y2 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX);
+    int16_t *Ptry3 =
+        y3 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX);
+    int16_t *Ptrx = l1_x + itr * (2 * N_TX);
+#else
     int16_t *PtrG = l1_G + itr * (2 * N_TX * N_TX);
     int16_t *PtrL = l1_L + itr * (2 * N_TX * N_TX);
     int16_t *Ptry2 = y2 + itr * (2 * N_TX);
     int16_t *Ptry3 = y3 + itr * (2 * N_TX);
     int16_t *Ptrx = l1_x + itr * (2 * N_TX);
+#endif
 
-    mempool_hermitian_q16vecs((v2s *)PtrH, (v2s *)PtrG, (v2s *)PtrSigma, N_RX,
+    mempool_hermitian_q16vecs((v2s *)PtrH, (v2s *)PtrG, (v2s *)PtrS, N_RX,
                               N_TX);
     mempool_MVP_conjtransp_q16vecs((v2s *)PtrH, (v2s *)Ptry, (v2s *)Ptry2, N_RX,
-                                   N_TX, 0);
-    mempool_cholesky_q16vecs(PtrG, PtrL, N_TX);
-    mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX, 0);
-    mempool_Ltrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX, 1);
+                                   N_TX, FOLD);
+    mempool_cholesky_q16vecs(PtrG, PtrL, N_TX, FOLD);
+    mempool_Ltrisol_q16vecs(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
+    mempool_Ltrisol_q16vecs(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
   }
-  mempool_log_barrier(2, core_id);
+  mempool_barrier(num_cores);
+  time_end = mempool_get_timer();
   mempool_stop_benchmark();
 
 #endif
 
+  if (core_id == 0) {
+    printf("Runtime: %d\n", time_end - time_init);
+  }
+  mempool_barrier(num_cores);
+
   return 0;
 }
diff --git a/software/kernels/baremetal/mempool_cholesky_q16s.h b/software/kernels/baremetal/mempool_cholesky_q16s.h
@@ -16,13 +16,15 @@
   @param[in]     n dimension of the input data
   @return        none
 */
-void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n) {
+void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n,
+                              const uint32_t folded) {
 
-  uint32_t i, j, k;
   int32_t sum;    // Sum for elements on diagonal (real)
   int32_t diag;   // Diagonal element (real)
   int32_t as, bs; // Sum for elements on rows (complex)
   int32_t ap, bp; // Pivot elements (complex)
+  uint32_t i, j, k;
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   v2s ab = (v2s){0, 0};
   v2s cd = (v2s){0, 0};
@@ -33,30 +35,30 @@ void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n) {
 
     // Elements on diagonal (input matrix is positive-definite)
     sum = 0;
-    diag = (int32_t)pSrc[2 * (j * n + j)];
+    diag = (int32_t)pSrc[2 * (j * offset + j)];
     for (k = 0; k < j; k++) {
-      ab = *(v2s *)&pL[2 * (j * n + k)];
+      ab = *(v2s *)&pL[2 * (j * offset + k)];
       asm volatile("pv.dotsp.h %[sum], %[ab],  %[ab];"
                    "srai       %[sum], %[sum],  0x8;"
                    "p.clip     %[sum], %[sum],  0x16;"
                    : [sum] "+&r"(sum)
                    : [ab] "r"(ab)
                    :);
     }
-    pL[2U * (j * n + j)] = (int16_t)mempool_sqrt_q32s(diag - sum, 16);
+    pL[2U * (j * offset + j)] = (int16_t)mempool_sqrt_q32s(diag - sum, 16);
 
     // Elements on rows
     for (i = j + 1; i < n; i++) {
-      ap = (int32_t)pSrc[2 * (i * n + j)];     // Pivot
-      bp = (int32_t)pSrc[2 * (i * n + j) + 1]; // Pivot
-      diag = (int32_t)pL[2 * (j * n + j)];     // Diag
+      ap = (int32_t)pSrc[2 * (i * offset + j)];     // Pivot
+      bp = (int32_t)pSrc[2 * (i * offset + j) + 1]; // Pivot
+      diag = (int32_t)pL[2 * (j * offset + j)];     // Diag
 
       as = 0;
       bs = 0;
       // Sum -> s = s + (ac + bd) + j*(bc - ad)
       for (k = 0; k < j; k++) {
-        ab = *(v2s *)&pL[2U * (i * n + k)];
-        cd = *(v2s *)&pL[2U * (j * n + k)];
+        ab = *(v2s *)&pL[2U * (i * offset + k)];
+        cd = *(v2s *)&pL[2U * (j * offset + k)];
         const uint32_t shuffle_mask = 0x00020003;
         asm volatile(
             // s = s + (ac + bd) + j(bc - ad)
@@ -81,7 +83,7 @@ void mempool_cholesky_q16vecs(int16_t *pSrc, int16_t *pL, const uint32_t n) {
                    : [ap] "+&r"(ap), [bp] "+&r"(bp), [res] "+&r"(res)
                    : [as] "r"(as), [bs] "r"(bs), [diag] "r"(diag)
                    :);
-      (*(v2s *)&pL[2 * (i * n + j)]) = res;
+      (*(v2s *)&pL[2 * (i * offset + j)]) = res;
     }
   }
   return;
diff --git a/software/kernels/baremetal/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h
@@ -17,26 +17,29 @@
 */
 
 void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x,
-                             const uint32_t n, const uint32_t transposed) {
+                             const uint32_t n, const uint32_t transposed,
+                             const uint32_t folded) {
 
   uint32_t i, j;
   int32_t as, bs, diag;
   v2s ab, cd;
   v2s res = (v2s){0, 0};
   v2s ndc = (v2s){0, 0};
+  const uint32_t offset = folded ? NUM_BANKS : n;
 
   // Solve for each variable x[i] in loop
   for (i = 0; i < n; i++) {
     uint32_t ridx = transposed ? (n - i - 1) : i;
-    diag = pL[2U * (ridx + ridx)];
+    diag = pL[2U * (ridx * offset + ridx)];
     // Initialize the sums
     as = 0;
     bs = 0;
     // Use the previously solved variables to compute the sum
     for (j = 0; j < i; j++) {
+
       uint32_t cidx = transposed ? (n - j - 1) : j;
       if (!transposed) {
-        ab = *(v2s *)&pL[2U * (ridx * n + cidx)];
+        ab = *(v2s *)&pL[2U * (ridx * offset + cidx)];
       } else {
         ab = *(v2s *)&pL[2U * (cidx * n + ridx)];
       }