@@ -22,18 +22,29 @@ Parameters and defines
22
22
23
23
PARALLEL: When defined benchmark parallel MIMO-MMSE.
24
24
SINGLE: When defined benchmark single-core MIMO-MMSE.
25
+ FOLD: When defined as 1, fold the matrices in memory.
25
26
*/
26
27
27
- int16_t l1_H [2 * N_TX * N_RX * N_ITR ]
28
- __attribute__((aligned (BANKING_FACTOR * NUM_CORES * sizeof (int32_t )),
29
- section (".l1_prio" )));
28
+ #define FOLD (1)
29
+ #define PARALLEL
30
+
31
+ #if FOLD
32
+ #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
33
+ #define NUM_COL (NUM_BANKS / N_TX)
34
+
35
+ int16_t l1_G [2 * N_TX * NUM_BANKS * NUM_ROW ]
36
+ __attribute__((aligned (sizeof (int32_t )), section (".l1_prio" )));
37
+ int16_t l1_L [2 * N_TX * NUM_BANKS * NUM_ROW ]
38
+ __attribute__((aligned (sizeof (int32_t )), section (".l1_prio" )));
39
+ #else
30
40
int16_t l1_G [2 * N_TX * N_TX * N_ITR ]
31
- __attribute__((aligned (BANKING_FACTOR * NUM_CORES * sizeof (int32_t )),
32
- section (".l1_prio" )));
41
+ __attribute__((aligned (sizeof (int32_t )), section (".l1_prio" )));
33
42
int16_t l1_L [2 * N_TX * N_TX * N_ITR ]
34
- __attribute__((aligned (BANKING_FACTOR * NUM_CORES * sizeof (int32_t )),
35
- section ( ".l1_prio" )));
43
+ __attribute__((aligned (sizeof (int32_t )), section ( ".l1_prio" )));
44
+ #endif
36
45
46
+ int16_t l1_H [2 * N_TX * N_RX * N_ITR ]
47
+ __attribute__((aligned (sizeof (int32_t )), section (".l1_prio" )));
37
48
int16_t l1_S [2 * N_TX * N_ITR ]
38
49
__attribute__((aligned (sizeof (int32_t )), section (".l1_prio" )));
39
50
int16_t l1_y [2 * N_RX * N_ITR ]
@@ -51,12 +62,14 @@ int main() {
51
62
uint32_t core_id = mempool_get_core_id ();
52
63
uint32_t num_cores = mempool_get_core_count ();
53
64
mempool_barrier_init (core_id ); // Initialize barrier and synchronize
65
+ uint32_t time_init , time_end ;
54
66
55
67
/* Initialize matrices */
56
68
if (core_id == 0 ) {
57
69
dma_memcpy_blocking (l1_H , l2_H , N_TX * N_RX * N_ITR * sizeof (int32_t ));
58
70
dma_memcpy_blocking (l1_y , l2_y , N_RX * N_ITR * sizeof (int32_t ));
59
71
dma_memcpy_blocking (l1_S , l2_S , N_TX * N_ITR * sizeof (int32_t ));
72
+ printf ("Data transferred\n" );
60
73
}
61
74
mempool_barrier (num_cores );
62
75
@@ -65,13 +78,18 @@ int main() {
65
78
66
79
if (core_id == 0 ) {
67
80
mempool_start_benchmark ();
68
- mempool_hermitian_q16vecs ((v2s * )l1_H , (v2s * )l1_G , (v2s * )l1_Sigma , N_RX ,
69
- N_TX );
70
- mempool_MVP_conjtransp_q16vecs ((v2s * )l1_H , (v2s * )l1_y , (v2s * )y2 , N_RX ,
71
- N_TX , 0 );
72
- mempool_cholesky_q16vecs (l1_G , l1_L , N_TX );
73
- mempool_Ltrisol_q16vecs (l1_L , y2 , y3 , N_TX , 0 );
74
- mempool_Ltrisol_q16vecs (l1_L , y3 , l1_x , N_TX , 1 );
81
+ time_init = mempool_get_timer ();
82
+ v2s * PtrH = (v2s * )l1_H ;
83
+ v2s * PtrG = (v2s * )l1_G ;
84
+ v2s * PtrS = (v2s * )l1_Sigma ;
85
+ v2s * Ptry = (v2s * )l1_y ;
86
+ v2s * Ptry2 = (v2s * )y2 ;
87
+ mempool_hermitian_q16vecs (PtrH , PtrG , PtrS , N_RX , N_TX );
88
+ mempool_MVP_conjtransp_q16vecs (PtrH , Ptry , Ptry2 , N_RX , N_TX , FOLD );
89
+ mempool_cholesky_q16vecs (l1_G , l1_L , N_TX , FOLD );
90
+ mempool_Ltrisol_q16vecs (l1_L , y2 , y3 , N_TX , 0 , FOLD );
91
+ mempool_Ltrisol_q16vecs (l1_L , y3 , l1_x , N_TX , 1 , FOLD );
92
+ time_end = mempool_get_timer ();
75
93
mempool_stop_benchmark ();
76
94
}
77
95
mempool_barrier (num_cores );
@@ -81,30 +99,49 @@ int main() {
81
99
#ifdef PARALLEL
82
100
83
101
mempool_start_benchmark ();
102
+ time_init = mempool_get_timer ();
84
103
for (uint32_t itr = core_id ; itr < N_ITR ; itr += num_cores ) {
85
104
86
105
int16_t * PtrH = l1_H + itr * (2 * N_TX * N_RX );
87
106
int16_t * Ptry = l1_y + itr * (2 * N_RX );
88
- int16_t * PtrSigma = l1_S + itr * (2 * N_TX );
89
-
107
+ int16_t * PtrS = l1_S + itr * (2 * N_TX );
108
+
109
+ #if FOLD
110
+ int16_t * PtrG = l1_G + (itr / NUM_COL ) * (2 * N_TX * NUM_BANKS ) +
111
+ (itr % NUM_COL ) * (2 * N_TX );
112
+ int16_t * PtrL = l1_L + (itr / NUM_COL ) * (2 * N_TX * NUM_BANKS ) +
113
+ (itr % NUM_COL ) * (2 * N_TX );
114
+ int16_t * Ptry2 =
115
+ y2 + (itr / NUM_COL ) * (2 * NUM_BANKS ) + (itr % NUM_COL ) * (2 * N_TX );
116
+ int16_t * Ptry3 =
117
+ y3 + (itr / NUM_COL ) * (2 * NUM_BANKS ) + (itr % NUM_COL ) * (2 * N_TX );
118
+ int16_t * Ptrx = l1_x + itr * (2 * N_TX );
119
+ #else
90
120
int16_t * PtrG = l1_G + itr * (2 * N_TX * N_TX );
91
121
int16_t * PtrL = l1_L + itr * (2 * N_TX * N_TX );
92
122
int16_t * Ptry2 = y2 + itr * (2 * N_TX );
93
123
int16_t * Ptry3 = y3 + itr * (2 * N_TX );
94
124
int16_t * Ptrx = l1_x + itr * (2 * N_TX );
125
+ #endif
95
126
96
- mempool_hermitian_q16vecs ((v2s * )PtrH , (v2s * )PtrG , (v2s * )PtrSigma , N_RX ,
127
+ mempool_hermitian_q16vecs ((v2s * )PtrH , (v2s * )PtrG , (v2s * )PtrS , N_RX ,
97
128
N_TX );
98
129
mempool_MVP_conjtransp_q16vecs ((v2s * )PtrH , (v2s * )Ptry , (v2s * )Ptry2 , N_RX ,
99
- N_TX , 0 );
100
- mempool_cholesky_q16vecs (PtrG , PtrL , N_TX );
101
- mempool_Ltrisol_q16vecs (PtrL , Ptry2 , Ptry3 , N_TX , 0 );
102
- mempool_Ltrisol_q16vecs (PtrL , Ptry3 , Ptrx , N_TX , 1 );
130
+ N_TX , FOLD );
131
+ mempool_cholesky_q16vecs (PtrG , PtrL , N_TX , FOLD );
132
+ mempool_Ltrisol_q16vecs (PtrL , Ptry2 , Ptry3 , N_TX , 0 , FOLD );
133
+ mempool_Ltrisol_q16vecs (PtrL , Ptry3 , Ptrx , N_TX , 1 , FOLD );
103
134
}
104
- mempool_log_barrier (2 , core_id );
135
+ mempool_barrier (num_cores );
136
+ time_end = mempool_get_timer ();
105
137
mempool_stop_benchmark ();
106
138
107
139
#endif
108
140
141
+ if (core_id == 0 ) {
142
+ printf ("Runtime: %d\n" , time_end - time_init );
143
+ }
144
+ mempool_barrier (num_cores );
145
+
109
146
return 0 ;
110
147
}
0 commit comments