[software] Adapt to new folder structure

pulp-platform · Jul 5, 2024 · f3f9212 · f3f9212
1 parent d3f5650
commit f3f9212
Show file tree

Hide file tree

Showing 69 changed files with 1,102 additions and 4,496 deletions.
diff --git a/Makefile b/Makefile
@@ -183,7 +183,7 @@ toolchain/riscv-opcodes/*:
 
 format:
 	$(ROOT_DIR)/scripts/run_clang_format.py --clang-format-executable=$(LLVM_INSTALL_DIR)/bin/clang-format -i -r $(ROOT_DIR)
-	find ./software/runtime/data -name '*.py' -exec autopep8 --in-place --aggressive {} +
+	find ./software/data -name '*.py' -exec autopep8 --in-place --aggressive {} +
 
 clean: clean-riscv-tests
 	rm -rf $(INSTALL_DIR)
diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
@@ -22,8 +22,8 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-ALL_GCC := $(filter-out matmul_f16 matmul_f32, $(ALL))
-ALL_LLVM := $(filter-out synth_i32 chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL))
+ALL_GCC := $(filter-out cfft_radix4_f16 chest_f16 cholesky_f16 cmatmul_f16 matmul_f16 matmul_f32 mimo_mmse_f32 mimo_mmse_f16 ofdm, $(ALL))
+ALL_LLVM := $(filter-out synth_i32 cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 cmatmul_q16 mimo_mmse_q16, $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)

diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -10,35 +10,53 @@
 #include <string.h>
 
 /* Mempool runtime libraries */
+#include "builtins_v2.h"
 #include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "xpulp/builtins_v2.h"
 
 /* CFFT data libraries */
-#include "data/data_cfft_radix4_f16.h"
-
-/*
-   - FOLDED:    Parallel FFT with "memory-aware" load/store scheme
-   - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
-   load/store scheme
-      - N_FFTs_COL: Independent FFTs scheduled on one row (default 1)
-      - N_FFTs_ROW: Independent FFTs scheduled on columns (default 1)
-      - FOLDED_TWIDDLES: Also the twiddles have "memory-aware" load/stores
-*/
-
-#define FOLDED
+#include "data_cfft_radix4_f16.h"
+
+/* CHOOSE ONE */
+//#define SINGLE // Single core FFT.
+//#define PARALLEL // Parallel FFT not "memory-aware".
+//#define FOLDED // Parallel FFT with "memory-aware" load/store.
+#define SCHEDULED // Folded FFTs arranged in rows and cols.
+
+// Bitreversal index from table.
+#define BITREVERSETABLE
+// Independent FFTs scheduled on one row (default 1).
+#define N_FFTs_ROW 2
+// Independent FFTs scheduled on columns (default 1).
+#define N_FFTs_COL 2
+#if (N_FFTs_COL > MAX_COL)
+#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#endif
+// Also the twiddles have "memory-aware" load/stores.
 #define FOLDED_TWIDDLES
-#define N_FFTs_ROW 1
-#define N_FFTs_COL 1
 
-#include "kernel/mempool_checks.h"
-#include "kernel/mempool_radix4_cfft_butterfly_f16.h"
-#include "kernel/mempool_radix4_cfft_f16p.h"
-#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
+#include "baremetal/mempool_cfft_q16_bitreversal.h"
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_radix4_cfft_butterfly_f16.h"
+#include "baremetal/mempool_radix4_cfft_f16p.h"
+
+#if (defined(SINGLE) || defined(PARALLEL))
+__fp16 l1_pSrc[2 * N_CSAMPLES]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_pDst[2 * N_CSAMPLES]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#endif
 
+#if (defined(SCHEDULED) || defined(FOLDED))
 __fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
 __fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
@@ -49,49 +67,44 @@ __fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+#endif
 
 int main() {
-
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
-  mempool_barrier_init(core_id);
   __fp16 *pRes = (__fp16 *)0;
+  mempool_barrier_init(core_id);
+
+  /* INITIALIZATION */
 
   if (core_id == 0) {
-    // Each FFT is folded over 4 memory rows
-    // Each memory row is 2 * N_BANKS (real-imag) samples
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
-      dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
-                          (N_CSAMPLES * N_FFTs_COL) * sizeof(int32_t));
+      for (uint32_t i = 0; i < N_FFTs_COL; i++) {
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+                            l2_pSrc, N_CSAMPLES * sizeof(int32_t));
+      }
     }
-    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
-                        BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
-    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
-                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+                        BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
   }
-// Initialize the Twiddles folded
-#ifdef FOLDED_TWIDDLES
+  mempool_barrier(num_cores);
   for (uint32_t j = 0; j < N_FFTs_COL; j++) {
-    uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
+    uint32_t N_WORDS_COL = N_CSAMPLES >> 2;
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
-      *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * N_WORDS_COL)] =
-          *(v2h *)&l2_twiddleCoef_f16[2U * i];
-      *(v2h *)&l1_twiddleCoef_f16_src[2U *
-                                      (i + j * N_WORDS_COL + 1 * N_BANKS)] =
-          *(v2h *)&l2_twiddleCoef_f16[2U * (i * 2U)];
-      *(v2h *)&l1_twiddleCoef_f16_src[2U *
-                                      (i + j * N_WORDS_COL + 2 * N_BANKS)] =
-          *(v2h *)&l2_twiddleCoef_f16[2U * (i * 3U)];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * i];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }
-#endif
   if (core_id == 0) {
     printf("01: END INITIALIZATION\n");
   }
   mempool_barrier(num_cores);
 
-#if (defined(FOLDED) && defined(FOLDED_TWIDDLES))
+#ifdef FOLDED
   if (core_id < (N_CSAMPLES / 16)) {
     mempool_start_benchmark();
     mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, N_CSAMPLES,
@@ -105,26 +118,28 @@ int main() {
 #endif
 
 #ifdef SCHEDULED
-  uint32_t nPE = (N_CSAMPLES / 16);
-  if (core_id < N_FFTs_COL * nPE) {
+  uint32_t CORES_USED = (N_CSAMPLES / 4) / BANKING_FACTOR;
+  if (core_id < N_FFTs_COL * CORES_USED) {
     mempool_start_benchmark();
-    uint32_t N_WORDS_COL = N_CSAMPLES / 4;
-    uint32_t col_id = core_id / nPE;
     mempool_radix4_cfft_f16p_scheduler(
         l1_pSrc, l1_pDst, N_CSAMPLES, N_FFTs_ROW, N_FFTs_COL,
-        l1_twiddleCoef_f16_src + 2 * col_id * N_WORDS_COL,
-        l1_twiddleCoef_f16_dst + 2 * col_id * N_WORDS_COL, l1_BitRevIndexTable,
-        BITREVINDEXTABLE_LENGTH, 1, nPE);
-    pRes = l1_pDst;
-    mempool_log_partial_barrier(2, core_id, N_FFTs_COL * nPE);
+        l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, l1_BitRevIndexTable,
+        BITREVINDEXTABLE_LENGTH, 1, CORES_USED);
+    mempool_log_partial_barrier(2, core_id, N_FFTs_COL * CORES_USED);
     mempool_stop_benchmark();
   }
+#ifdef BITREVERSETABLE
+  pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
+#else
+  pRes = ((LOG2 / 2) % 2) == 0 ? l1_pDst : l1_pSrc;
+#endif
 #endif
 
   mempool_barrier(num_cores);
   if (core_id == 0) {
     printf("02: END COMPUTATION\n");
   }
+
   mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0);
   mempool_barrier(num_cores);
   return 0;

diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c
@@ -8,17 +8,16 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "builtins_v2.h"
 #include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "builtins_v2.h"
 
-#include "data_chest_f16.h"
 #include "baremetal/mempool_checks.h"
-#include "baremetal/mempool_chest_f16p.h"
-#include "baremetal/mempool_chest_f16s.h"
+#include "baremetal/mempool_chest_f16.h"
+#include "data_chest_f16.h"
 
 //#define SINGLE
 #define PARALLEL

diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
@@ -55,8 +55,8 @@ int main() {
 #endif
 #ifdef PARALLEL
   mempool_start_benchmark();
-  mempool_chest_q16p_unrolled4_local(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX,
-                                     N_TX, N_SAMPLES, core_id, num_cores);
+  mempool_chest_q16p_unrolled4(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX, N_TX,
+                               N_SAMPLES, core_id, num_cores);
   mempool_stop_benchmark();
   mempool_barrier(num_cores);
 #endif

diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
@@ -11,7 +11,6 @@
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "builtins_v2.h"
 
 #include "data_cholesky_f16.h"
 

diff --git a/software/apps/baremetal/cholesky_q16/main.c b/software/apps/baremetal/cholesky_q16/main.c
@@ -9,11 +9,10 @@
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "xpulp/builtins_v2.h"
 
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_cholesky_q16s.h"
 #include "data_cholesky_q16.h"
-#include "kernel/mempool_checks.h"
-#include "kernel/mempool_cholesky_q16s.h"
 
 #define SINGLE
 

diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
@@ -13,9 +13,11 @@
 #include "synchronization.h"
 
 #include "data_cmatmul_f16.h"
+
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
-#define PARALLEL_2x2
+#define PARALLEL_2x4
+#define TEST
 
 __fp16 matrix_a[2 * dim_M * dim_N]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
@@ -26,7 +28,7 @@ __fp16 matrix_b[2 * dim_N * dim_P]
 __fp16 matrix_c[2 * dim_M * dim_P]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
                    section(".l1_prio")));
-__fp16 matrix_a_folded[2 * dim_M * (4 * NUM_CORES)]
+__fp16 matrix_a_folded[2 * (BANKING_FACTOR * NUM_CORES)]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
                    section(".l1_prio")));
 
@@ -73,7 +75,6 @@ int main() {
     mempool_start_benchmark();
     cmatmul_2x4_f16p(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P, core_id,
                      nPE);
-    mempool_log_partial_barrier(2, core_id, nPE);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);

diff --git a/software/apps/cmatmul_q16/main.c → software/apps/baremetal/cmatmul_q16/main.c b/software/apps/cmatmul_q16/main.c → software/apps/baremetal/cmatmul_q16/main.c
@@ -12,9 +12,9 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "data/data_cmatmul_q16.h"
-#include "kernel/mempool_checks.h"
-#include "kernel/mempool_cmatmul_q16.h"
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_cmatmul_q16.h"
+#include "data_cmatmul_q16.h"
 
 #define PARALLEL
 

diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -11,19 +11,26 @@
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "builtins_v2.h"
 
-#include "data_mimo_mmse_f16.h"
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
 #include "baremetal/mempool_linearsolver_f16s.h"
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 
-//#define DOUBLE_BUFFERING
+#include "data_mimo_mmse_f16.h"
+
+// #define DOUBLE_BUFFERING
+// #define N_ROUNDS (1)
+// #define DMA_TRANSFER2
+
 #ifndef DOUBLE_BUFFERING
 
-#define SINGLE
-//#define PARALLEL
+/**********************************************/
+/* TEST OF THE KERNELS WITH NO DATA MOVEMENTS */
+/**********************************************/
+
+//#define SINGLE
+#define PARALLEL
 //#define FOLDED
 
 __fp16 l1_H[2 * N_TX * N_RX * N_ITR]
@@ -110,7 +117,7 @@ int main() {
       Ptrx += 2 * itr_bg * N_TX_bg;
     }
   }
-  mempool_log_barrier(2, core_id);
+  mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
@@ -139,7 +146,7 @@ int main() {
     mempool_Ltrisol_folded_f16s(PtrL, Ptry2, Ptry3, N_TX);
     mempool_Lttrisol_folded_f16s(PtrL, Ptry3, Ptrx, N_TX);
   }
-  mempool_log_barrier(2, core_id);
+  mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
@@ -244,8 +251,8 @@ int main() {
       __fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
       __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
-      mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
-      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
+      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
       mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
       mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
       mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
@@ -294,8 +301,8 @@ int main() {
       __fp16 *PtrSigma = cmpt_Sigma + itr * (2 * N_TX);
       __fp16 *PtrG = G + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
-      mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
-      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
+      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
     }
     mempool_log_barrier(2, core_id);