[software] Change Data Generation

pulp-platform · Dec 6, 2024 · e10df9f
1 parent 92c681c
commit e10df9f
Show file tree

Hide file tree

Showing 49 changed files with 875 additions and 2,146 deletions.
diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
@@ -18,8 +18,8 @@
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
-__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
-__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f16.h"
 #include "baremetal/mempool_checks.h"
@@ -34,27 +34,27 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t));
-    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int16_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int16_t));
   }
-  uint32_t register volatile a = *(uint32_t *)&(A)&0x0000FFFF;
+  uint32_t register volatile a = *(uint32_t *)&(l2_A)&0x0000FFFF;
   mempool_barrier(num_cores);
 
   //  // SINGLE
   //  time_init = mempool_get_timer();
-  //  axpy_f16s(A, l1_X, l1_Y, LEN);
+  //  axpy_f16s(l2_A, l1_X, l1_Y, array_N);
   //  time_end = mempool_get_timer();
 
   //  // PARALLEL
   //  time_init = mempool_get_timer();
-  //  axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores);
+  //  axpy_f16vecp_unrolled4(l2_A, l1_X, l1_Y, array_N, num_cores);
   //  time_end = mempool_get_timer();
 
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N);
   mempool_start_benchmark();
-  axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N);
   mempool_stop_benchmark();
   time_end = mempool_get_timer();
 
@@ -64,7 +64,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0);
+  mempool_check_f16(l1_Y, l2_Z, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;

diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
@@ -18,8 +18,8 @@
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
-float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
-float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f32.h"
 #include "baremetal/mempool_checks.h"
@@ -34,25 +34,25 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t));
-    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
   }
-  float register volatile a = A;
+  float register volatile a = l2_A;
   mempool_barrier(num_cores);
 
   // PARALLEL
   time_init = mempool_get_timer();
-  // axpy_f32p(a, l1_X, l1_Y, LEN, num_cores);
-  // axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores);
-  axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN);
+  // axpy_f32p(a, l1_X, l1_Y, array_N, num_cores);
+  // axpy_f32p_unrolled4(a, l1_X, l1_Y, array_N, num_cores);
+  axpy_f32p_local_unrolled4(a, l1_X, l1_Y, array_N);
   time_end = mempool_get_timer();
 
   // Check results
   if (core_id == 0) {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0);
+  mempool_check_f32(l1_Y, l2_Z, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;

diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c
@@ -16,7 +16,7 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "baremetal/mempool_axpy_i32p.h"
+#include "baremetal/mempool_axpy_i32.h"
 #include "baremetal/mempool_checks.h"
 #include "data_axpy_i32.h"
 
@@ -38,11 +38,12 @@ int main() {
     dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
     error = 0;
   }
+  register volatile int32_t a = l2_A;
   mempool_barrier(num_cores);
 
   // Benchmark
   mempool_start_benchmark();
-  calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores);
+  calc_axpy_unloop_x4_localbank(l1_X, l1_Y, a, array_N, core_id, num_cores);
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
 

diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -19,23 +19,26 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_f16.h"
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
 
 /* CHOOSE ONE */
-//#define PARALLEL // Parallel FFT not "memory-aware".
-//#define FOLDED // Parallel FFT with "memory-aware" load/store.
-#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+#define PARALLEL // Parallel FFT not "memory-aware".
+// #define FOLDED // Parallel FFT with "memory-aware" load/store.
+// #define SCHEDULED // Folded FFTs arranged in rows and cols.
 
 // Bitreversal index from table.
 #define BITREVERSETABLE
+// Also the twiddles have "memory-aware" load/stores.
+// #define FOLDED_TWIDDLES
+
 // Independent FFTs scheduled on one row (default 1).
 #define N_FFTs_ROW 1
 // Independent FFTs scheduled on columns (default 1).
 #define N_FFTs_COL 1
 #if (N_FFTs_COL > MAX_COL)
 #error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
 #endif
-// Also the twiddles have "memory-aware" load/stores.
-#define FOLDED_TWIDDLES
 
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
 #include "baremetal/mempool_checks.h"
@@ -47,9 +50,9 @@ __fp16 l1_pSrc[2 * N_CSAMPLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1_pDst[2 * N_CSAMPLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4]
+__fp16 l1_twiddleCoef_f16_src[2 * N_TWIDDLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4]
+__fp16 l1_twiddleCoef_f16_dst[2 * N_TWIDDLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
@@ -80,7 +83,7 @@ int main() {
   if (core_id == 0) {
     dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
-                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+                        N_TWIDDLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                         BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
     printf("01: END INITIALIZATION\n");
@@ -97,6 +100,8 @@ int main() {
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        N_TWIDDLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                         BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
   }
@@ -114,13 +119,8 @@ int main() {
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }
-#else
-  if (core_id == 0) {
-    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
-                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
-  }
-#endif
   mempool_barrier(num_cores);
+#endif
 
   if (core_id == 0) {
     printf("01: END INITIALIZATION\n");
@@ -132,7 +132,7 @@ int main() {
 
 #ifdef PARALLEL
   mempool_start_benchmark();
-  mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1,
+  mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src,
                            num_cores);
   mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH,
                                     l1_BitRevIndexTable, num_cores);
@@ -176,7 +176,7 @@ int main() {
     printf("02: END COMPUTATION\n");
   }
 
-  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0);
+  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, (float)TOLERANCE, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
@@ -18,10 +18,11 @@
 #include "baremetal/mempool_cholesky_f16s.h"
 
 #define SINGLE
+#define FOLDED (0)
 
-__fp16 l1_GIn[2 * dim_N * dim_N * N_SAMPLES]
+__fp16 l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
-__fp16 l1_LOut[2 * dim_N * dim_N * N_SAMPLES]
+__fp16 l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
 
 int main() {
@@ -32,9 +33,9 @@ int main() {
   /* Initialize matrices */
   if (core_id == 0) {
     dma_memcpy_blocking(l1_GIn, l2_GIn,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_LOut, l2_LOut,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
   }
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
@@ -43,7 +44,7 @@ int main() {
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_cholesky_f16vecs(l1_GIn, l1_LOut, dim_N);
+    mempool_cholesky_f16vecs(l1_GIn, l1_LOut, matrix_N, FOLDED);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -52,15 +53,15 @@ int main() {
 #ifdef PARALLEL
   for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) {
     mempool_start_benchmark();
-    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N;
-    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N;
-    mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, dim_N);
+    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N;
+    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N;
+    mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, matrix_N, FOLDED);
   }
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_f16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 0.01f, 0);
+  mempool_check_f16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 0.01f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cholesky_q16/main.c b/software/apps/baremetal/cholesky_q16/main.c
@@ -16,9 +16,9 @@
 
 #define SINGLE
 
-int16_t l1_GIn[2 * dim_N * dim_N * N_SAMPLES]
+int16_t l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
-int16_t l1_LOut[2 * dim_N * dim_N * N_SAMPLES]
+int16_t l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
 
 int main() {
@@ -29,9 +29,9 @@ int main() {
   /* Initialize matrices */
   if (core_id == 0) {
     dma_memcpy_blocking(l1_GIn, l2_GIn,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_LOut, l2_LOut,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
   }
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
@@ -40,7 +40,7 @@ int main() {
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_cholesky_q16vecs(l1_GIn, l1_LOut, dim_N);
+    mempool_cholesky_q16vecs(l1_GIn, l1_LOut, matrix_N);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -49,15 +49,15 @@ int main() {
 #ifdef PARALLEL
   for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) {
     mempool_start_benchmark();
-    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N;
-    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N;
-    mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, dim_N);
+    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N;
+    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N;
+    mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, matrix_N);
   }
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_q16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 16, 0);
+  mempool_check_i16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 16, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
@@ -13,11 +13,13 @@
 #include "synchronization.h"
 
 #include "data_cmatmul_f16.h"
+#define dim_M (matrix_M)
+#define dim_N (matrix_N)
+#define dim_P (matrix_P)
 
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
-#define PARALLEL_2x4
-#define TEST
+#define PARALLEL_4x4
 
 #if defined(PARALLEL_4x4_COPIES_A)
 __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
@@ -43,8 +45,8 @@ int main() {
 
   // Initialize Matrices
   if (core_id == 0) {
-    dma_memcpy_blocking(matrix_a, A, 2 * dim_M * dim_N * sizeof(int16_t));
-    dma_memcpy_blocking(matrix_b, B, 2 * dim_N * dim_P * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_a, l2_A, 2 * dim_M * dim_N * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_b, l2_B, 2 * dim_N * dim_P * sizeof(int16_t));
   }
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
@@ -104,10 +106,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-#if defined(TEST)
-  mempool_check_f16(matrix_c, C, 2 * dim_M * dim_P, 0.1f, 0);
+  mempool_check_f16(matrix_c, l2_C, 10, 0.1f, 0);
   mempool_barrier(num_cores);
-#endif
-
   return 0;
 }