From 4abcd8890c80e0e30add578c4247a0dd208ec42b Mon Sep 17 00:00:00 2001
From: mbertuletti
Date: Thu, 5 Sep 2024 15:36:24 +0200
Subject: [PATCH] [software] Clean-up data transfers in MIMO MMSE

---
 software/apps/baremetal/mimo_mmse_f16/main.c | 239 ++++++++-----------
 1 file changed, 96 insertions(+), 143 deletions(-)

diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
index 4389a0f3e..c5cb20155 100644
--- a/software/apps/baremetal/mimo_mmse_f16/main.c
+++ b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -18,35 +18,34 @@
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 #include "data_mimo_mmse_f16.h"
 
+#define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
+#define DOUBLE_BUFFERING
+
-// #define DOUBLE_BUFFERING
-// #define N_ROUNDS (1)
-// #define DMA_TRANSFER2
-
+/**********************************************************
+ **********************************************************
+  _   _  ___        _     _ _____                       __
+ | \ | |/ _ \      | |   / |_   _| __ __ _ _ __  ___ / _|
+ |  \| | | | |_____| |   | | | || '__/ _` | '_ \/ __| |_
+ | |\  | |_| |_____| |___| | | || | |  (_| | | | \__ \ _|
+ |_| \_|\___/      |_____|_| |_||_|  \__,_|_| |_|___/_|(_)
+
-#ifndef DOUBLE_BUFFERING
-
-/**********************************************/
-/* TEST OF THE KERNELS WITH NO DATA MOVEMENTS */
-/**********************************************/
+***********************************************************
+***********************************************************/
 
-//#define SINGLE
+#ifndef DOUBLE_BUFFERING
 #define PARALLEL
-//#define FOLDED
 
 __fp16 l1_H[2 * N_TX * N_RX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1_G[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1_L[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio")));
 uint32_t l1_beamgroups[N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
-__fp16 l1_Sigma[2 * N_TX * N_ITR]
-    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_S[2 * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp16 l1_y[2 * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp16 y2[2 * N_TX * N_ITR]
@@ -62,21 +61,19 @@ int main() {
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier_init(core_id); // Initialize barrier and synchronize
-  /* Initialize matrices */
   if (core_id == 0) {
     dma_memcpy_blocking(l1_beamgroups, l2_beamgroups, N_ITR * sizeof(int32_t));
     dma_memcpy_blocking(l1_H, l2_H, N_TX * N_RX * N_ITR * sizeof(int32_t));
     dma_memcpy_blocking(l1_y, l2_y, N_RX * N_ITR * sizeof(int32_t));
-    dma_memcpy_blocking(l1_Sigma, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
+    dma_memcpy_blocking(l1_S, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
 
-  /* Benchmark */
 #ifdef SINGLE
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_hermitian_f16s(l1_H, l1_G, l1_Sigma, N_RX, N_TX, 0, 0);
+    mempool_hermitian_f16s(l1_H, l1_G, l1_S, N_RX, N_TX, 0, 0);
     mempool_MVP_conjtransp_f16s(l1_H, l1_y, y2, N_RX, N_TX, 0);
     mempool_cholesky_f16vecs(l1_G, l1_L, N_TX);
     mempool_Ltrisol_f16s(l1_L, y2, y3, N_TX);
@@ -87,30 +84,27 @@ int main() {
 #endif
 
 #ifdef PARALLEL
-  // Each iteration is assigned to a processor
   mempool_start_benchmark();
+  // Parallel subcarrier loop
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
-
-    uint32_t N_bg = l1_beamgroups[itr];
-    uint32_t N_TX_bg = N_TX / N_bg;
-
     __fp16 *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     __fp16 *Ptry = l1_y + itr * (2 * N_RX);
-    __fp16 *PtrSigma = l1_Sigma + itr * (2 * N_TX);
-
+    __fp16 *PtrSigma = l1_S + itr * (2 * N_TX);
+    // Auxiliary vectors
     __fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX);
     __fp16 *PtrL = l1_L + itr * (2 * N_TX * N_TX);
     __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
     __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
     __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
-
+    // Serial beamgroups loop
+    uint32_t N_bg = l1_beamgroups[itr];
+    uint32_t N_TX_bg = N_TX / N_bg;
     for (uint32_t itr_bg = 0; itr_bg < N_bg; itr_bg++) {
       mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX_bg);
       mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX_bg);
       mempool_cholesky_f16vecs(PtrG, PtrL, N_TX_bg);
       mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX_bg);
       mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX_bg);
-
       // Shift over the subsequent beamgroup
       PtrH += 2 * itr_bg * N_TX_bg * N_RX;
       PtrSigma += 2 * itr_bg * N_TX_bg;
@@ -121,76 +115,53 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-#ifdef FOLDED
-  // Each iteration is assigned to a processor
-  mempool_start_benchmark();
-  for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
-
-    __fp16 *PtrH = l1_H + itr * (2 * N_TX * N_RX);
-    __fp16 *Ptry = l1_y + itr * (2 * N_RX);
-    __fp16 *PtrSigma = l1_Sigma + itr * (2 * N_TX);
-
-    __fp16 *PtrG = l1_G + (itr % num_cores) * (2 * N_TX) +
-                   (itr / num_cores) * (N_TX * N_BANKS);
-    __fp16 *PtrL = l1_L + (itr % num_cores) * (2 * N_TX) +
-                   (itr / num_cores) * (N_TX * N_BANKS);
-    __fp16 *Ptry2 =
-        y2 + (itr % num_cores) * (2 * N_TX) + (itr / num_cores) * (N_BANKS);
-    __fp16 *Ptry3 =
-        y3 + (itr % num_cores) * (2 * N_TX) + (itr / num_cores) * (N_BANKS);
-    __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
-
-    mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 1, 0);
-    mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX, 1);
-    mempool_cholesky_folded_f16s(PtrG, PtrL, N_TX);
-    mempool_Ltrisol_folded_f16s(PtrL, Ptry2, Ptry3, N_TX);
-    mempool_Lttrisol_folded_f16s(PtrL, Ptry3, Ptrx, N_TX);
-  }
-  mempool_barrier(num_cores);
-  mempool_stop_benchmark();
-#endif
-
   // Check the result
   mempool_check_f16(l1_x, l2_x, 2 * N_TX, 0.01f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
 
-#else
+/**********************************************************
+ **********************************************************
+  ____  __  __    _       _____                       __
+ |  _ \|  \/  |  / \     |_   _| __ __ _ _ __  ___ / _|
+ | | | | |\/| | / _ \ _____| || '__/ _` | '_ \/ __| |_
+ | |_| | |  | |/ ___ \_____| || | |  (_| | | | \__ \ _|
+ |____/|_|  |_/_/   \_\    |_||_|  \__,_|_| |_|___/_|(_)
 
-/*******************************************/
-/* TEST OF THE KERNELS WITH DATA MOVEMENTS */
-/*******************************************/
+***********************************************************
+***********************************************************/
+
+#else
+#define N_ROUNDS (1)
+#define DMA_TRANSFER1
 
 // Inputs-Outputs even double-buffering rounds
 __fp16 l1A_H[2 * N_TX * N_RX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
-__fp16 l1A_Sigma[2 * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1A_S[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1A_y[2 * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp16 l1A_x[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
-
 // Inputs-Outputs odd double-buffering rounds
 __fp16 l1B_H[2 * N_TX * N_RX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
-__fp16 l1B_Sigma[2 * N_TX * N_ITR]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1B_S[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1B_y[2 * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp16 l1B_x[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
+/* Barrier for double buffering */
+uint32_t volatile dma_barrier __attribute__((section(".l1")));
 
 // Auxiliary vectors
 __fp16 G[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio")));
 __fp16 L[2 * N_TX * N_TX * N_ITR]
-    __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
-                   section(".l1_prio")));
+    __attribute__((aligned(NUM_BANKS * sizeof(int32_t)), section(".l1_prio")));
 __fp16 y2[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
 __fp16 y3[2 * N_TX * N_ITR]
@@ -198,128 +169,106 @@ __fp16 y3[2 * N_TX * N_ITR]
 
 // Driver program
 int main() {
-
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
-  mempool_barrier_init(core_id); // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
 
-#ifdef DMA_TRANSFER1
-
-  // INITIALIZATION
-  mempool_start_benchmark();
   if (core_id == 0) {
     dma_memcpy_blocking(l1A_H, l2_H, N_TX * N_RX * N_ITR * sizeof(int32_t));
     dma_memcpy_blocking(l1A_y, l2_y, N_RX * N_ITR * sizeof(int32_t));
-    dma_memcpy_blocking(l1A_Sigma, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
+    dma_memcpy_blocking(l1A_S, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
 
   for (uint32_t round = 0; round < N_ROUNDS; round++) {
-
-    // TRANSFER
+    /* Start DMA-transfer round */
     mempool_start_benchmark();
     // Transfer vectors
     __fp16 *trsf_H = ((round % 2) == 1) ? l1A_H : l1B_H;
     __fp16 *trsf_y = ((round % 2) == 1) ? l1A_y : l1B_y;
-    __fp16 *trsf_Sigma = ((round % 2) == 1) ? l1A_Sigma : l1B_Sigma;
+    __fp16 *trsf_S = ((round % 2) == 1) ? l1A_S : l1B_S;
     // Compute vectors
     __fp16 *cmpt_H = ((round % 2) == 0) ? l1A_H : l1B_H;
     __fp16 *cmpt_y = ((round % 2) == 0) ? l1A_y : l1B_y;
-    __fp16 *cmpt_Sigma = ((round % 2) == 0) ? l1A_Sigma : l1B_Sigma;
-    // On even rounds we transfer the result of odd computation and viceversa
+    __fp16 *cmpt_S = ((round % 2) == 0) ? l1A_S : l1B_S;
+    // On even rounds transfer the result of odd computation and vice versa
     __fp16 *trsf_x = ((round % 2) == 0) ? l1A_x : l1B_x;
     __fp16 *cmpt_x = ((round % 2) == 1) ? l1A_x : l1B_x;
+
+    /*****************************
+     *****************************
+      ___   _   ___ ___    _   _
+     / __| /_\ / __| __|  / | (_)
+    | (__ / _ \\__ \ _|   | |   _
+     \___/_/ \_\___/___|  |_|  (_)
+
+     ******************************
+     ******************************/
+
+#ifdef DMA_TRANSFER1
+    // TRANSFER
     if (core_id == 0) {
+      dma_barrier = 0;
       dma_memcpy_nonblocking(trsf_H, l2_H,
                              N_TX * N_RX * N_ITR * sizeof(int32_t));
       dma_memcpy_nonblocking(trsf_y, l2_y, N_RX * N_ITR * sizeof(int32_t));
-      dma_memcpy_nonblocking(trsf_Sigma, l2_Sigma,
-                             N_TX * N_ITR * sizeof(int32_t));
-    if (round >= 1) // Transfer to L2 is done only if not the
+      dma_memcpy_nonblocking(trsf_S, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
+    if (round >= 1) // Transfer to L2 is done only if not the first round
       dma_memcpy_nonblocking(l2_x, trsf_x, (N_TX * N_ITR) * sizeof(int32_t));
     }
-
-    // COMPUTATION
-    // Each iteration is assigned to a processor
+    // COMPUTE
     mempool_start_benchmark();
     for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
       __fp16 *PtrH = cmpt_H + itr * (2 * N_TX * N_RX);
       __fp16 *Ptry = cmpt_y + itr * (2 * N_RX);
       __fp16 *Ptrx = cmpt_x + itr * (2 * N_TX);
-      __fp16 *PtrSigma = cmpt_Sigma + itr * (2 * N_TX);
+      __fp16 *PtrS = cmpt_S + itr * (2 * N_TX);
       __fp16 *PtrG = G + itr * (2 * N_TX * N_TX);
       __fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
       __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
-      mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX);
       mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
       mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
       mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
       mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
     }
-    mempool_log_barrier(2, core_id);
-
-    // WAIT FOR DMA
-    mempool_start_benchmark();
-    dma_wait(); // Wait for the end of the dma transfer
-    mempool_stop_benchmark();
-  }
-
 #endif
 
-#ifdef DMA_TRANSFER2
+    /*****************************
+     *****************************
+      ___   _   ___ ___   ___   _
+     / __| /_\ / __| __| |_  ) (_)
+    | (__ / _ \\__ \ _|   / /    _
+     \___/_/ \_\___/___| /___|  (_)
+
+     ******************************
+     ******************************/
 
-  // INITIALIZATION
-  mempool_start_benchmark();
-  if (core_id == 0) {
-    dma_memcpy_blocking(l1A_H, l2_H, N_TX * N_RX * N_ITR * sizeof(int32_t));
-    dma_memcpy_blocking(l1A_y, l2_y, N_RX * N_ITR * sizeof(int32_t));
-    dma_memcpy_blocking(l1A_Sigma, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
-  }
-  mempool_barrier(num_cores);
-
-  for (uint32_t round = 0; round < N_ROUNDS; round++) {
-
-    // Transfer vectors
-    __fp16 *trsf_H = ((round % 2) == 1) ? l1A_H : l1B_H;
-    __fp16 *trsf_y = ((round % 2) == 1) ? l1A_y : l1B_y;
-    __fp16 *trsf_Sigma = ((round % 2) == 1) ? l1A_Sigma : l1B_Sigma;
-    // Compute vectors
-    __fp16 *cmpt_H = ((round % 2) == 0) ? l1A_H : l1B_H;
-    __fp16 *cmpt_y = ((round % 2) == 0) ? l1A_y : l1B_y;
-    __fp16 *cmpt_Sigma = ((round % 2) == 0) ? l1A_Sigma : l1B_Sigma;
-    // On even rounds we transfer the result of odd computation and viceversa
-    __fp16 *trsf_x = ((round % 2) == 0) ? l1A_x : l1B_x;
-    __fp16 *cmpt_x = ((round % 2) == 1) ? l1A_x : l1B_x;
-
-    // COMPUTATION
-    // Each iteration is assigned to a processor
+#ifdef DMA_TRANSFER2
+    // COMPUTE
     mempool_start_benchmark();
     for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
       __fp16 *PtrH = cmpt_H + itr * (2 * N_TX * N_RX);
       __fp16 *Ptry = cmpt_y + itr * (2 * N_RX);
-      __fp16 *PtrSigma = cmpt_Sigma + itr * (2 * N_TX);
+      __fp16 *PtrS = cmpt_S + itr * (2 * N_TX);
       __fp16 *PtrG = G + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
-      mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX);
       mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
     }
     mempool_log_barrier(2, core_id);
-
     // TRANSFER
     mempool_start_benchmark();
     if (core_id == 0) {
       dma_memcpy_nonblocking(trsf_H, l2_H,
                              N_TX * N_RX * N_ITR * sizeof(int32_t));
       dma_memcpy_nonblocking(trsf_y, l2_y, N_RX * N_ITR * sizeof(int32_t));
-      dma_memcpy_nonblocking(trsf_Sigma, l2_Sigma,
-                             N_TX * N_ITR * sizeof(int32_t));
+      dma_memcpy_nonblocking(trsf_S, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
       if (round >= 1) // Transfer to L2 is done only if not the first round
         dma_memcpy_nonblocking(l2_x, trsf_x, (N_TX * N_ITR) * sizeof(int32_t));
     }
-
-    // COMPUTATION
-    // Each iteration is assigned to a processor
+    // COMPUTE
     mempool_start_benchmark();
     for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
       __fp16 *Ptrx = cmpt_x + itr * (2 * N_TX);
@@ -331,16 +280,20 @@ int main() {
       mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
       mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
     }
-    mempool_log_barrier(2, core_id);
+#endif
 
-    // WAIT FOR DMA
-    mempool_start_benchmark();
-    dma_wait(); // Wait for the end of the dma transfer
+    // Synchronize and wait DMA
+    if ((num_cores - 1) ==
+        __atomic_fetch_add(&dma_barrier, 1, __ATOMIC_RELAXED)) {
+      __atomic_store_n(&dma_barrier, 0, __ATOMIC_RELAXED);
+      __sync_synchronize();
+      dma_wait();
+      wake_up_all();
+    }
+    mempool_wfi();
     mempool_stop_benchmark();
+    /* End DMA-transfer round */
   }
-
-#endif
-
   mempool_barrier(num_cores);
   return 0;
 }
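
Note on the double-buffering scheme this patch converges on: in every round, one set of L1 buffers (l1A_*) is computed on while the DMA refills the other set (l1B_*), and the roles swap on the next round. The end-of-round synchronization replaces the old mempool_log_barrier plus a dma_wait() on every core: each core increments dma_barrier, and only the last arrival resets the counter, waits for the outstanding DMA transfer, and wakes the sleeping cores. The sketch below illustrates that pattern in isolation; it is not part of the patch. The names bufA, bufB, ITEMS, N_ROUNDS, l2_src and the placeholder computation are hypothetical, and the extern declarations only assume the signatures implied by how the patch itself calls dma_memcpy_nonblocking, dma_wait, wake_up_all, and mempool_wfi.

    #include <stdint.h>

    /* MemPool runtime helpers, as used in the patch above; the exact
     * signatures are assumed here to keep the sketch self-contained. */
    extern void dma_memcpy_nonblocking(void *dst, void *src, uint32_t size);
    extern void dma_wait(void);
    extern void wake_up_all(void);
    extern void mempool_wfi(void);

    #define N_ROUNDS 4 /* hypothetical number of rounds */
    #define ITEMS 1024 /* hypothetical per-round workload */

    /* Ping-pong L1 buffers: one is computed on while the other is being
     * refilled. Assumes bufA already holds round 0's input, filled by a
     * blocking DMA copy beforehand (the patch's initialization phase). */
    static int32_t bufA[ITEMS] __attribute__((section(".l1")));
    static int32_t bufB[ITEMS] __attribute__((section(".l1")));
    static volatile uint32_t dma_barrier __attribute__((section(".l1")));

    void double_buffered_rounds(uint32_t core_id, uint32_t num_cores,
                                int32_t *l2_src) {
      for (uint32_t round = 0; round < N_ROUNDS; round++) {
        /* Even rounds compute on bufA while bufB is in flight; odd rounds
         * swap roles (the trsf_*/cmpt_* selection in the patch). */
        int32_t *cmpt = ((round % 2) == 0) ? bufA : bufB;
        int32_t *trsf = ((round % 2) == 0) ? bufB : bufA;
        /* One core programs the DMA with the next round's input. */
        if ((core_id == 0) && (round + 1 < N_ROUNDS))
          dma_memcpy_nonblocking(trsf, l2_src + (round + 1) * ITEMS,
                                 ITEMS * sizeof(int32_t));
        /* All cores work on the buffer filled during the previous round. */
        for (uint32_t i = core_id; i < ITEMS; i += num_cores)
          cmpt[i] += 1; /* placeholder computation */
        /* The last core through the barrier resets the counter, waits for
         * the DMA, and wakes the others, so no core touches trsf before
         * the transfer has completed; the remaining cores sleep in wfi
         * instead of all polling the DMA. */
        if ((num_cores - 1) ==
            __atomic_fetch_add(&dma_barrier, 1, __ATOMIC_RELAXED)) {
          __atomic_store_n(&dma_barrier, 0, __ATOMIC_RELAXED);
          __sync_synchronize();
          dma_wait();
          wake_up_all();
        }
        mempool_wfi();
      }
    }

The same structure explains the two cases kept in the patch: DMA_TRANSFER1 issues the next round's transfer before the computation, while DMA_TRANSFER2 splits the computation and issues the transfer between the Hermitian/matrix-vector products and the Cholesky/triangular solves, so the transfer overlaps with the second half of the work.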