From e10df9f8f0ae7c9d671648a7ff2a73469acb024e Mon Sep 17 00:00:00 2001
From: mbertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Tue, 26 Nov 2024 16:49:51 +0100
Subject: [PATCH] [software] Change Data Generation

---
 software/apps/baremetal/axpy_f16/main.c       |  20 +-
 software/apps/baremetal/axpy_f32/main.c       |  18 +-
 software/apps/baremetal/axpy_i32/main.c       |   5 +-
 .../apps/baremetal/cfft_radix4_f16/main.c     |  32 +-
 software/apps/baremetal/cholesky_f16/main.c   |  19 +-
 software/apps/baremetal/cholesky_q16/main.c   |  18 +-
 software/apps/baremetal/cmatmul_f16/main.c    |  15 +-
 software/apps/baremetal/cmatmul_q16/main.c    |  11 +-
 software/apps/baremetal/dotp_f16/main.c       |  20 +-
 software/apps/baremetal/dotp_f32/main.c       |  16 +-
 software/apps/baremetal/mimo_mmse_f16/main.c  |  30 +-
 software/apps/baremetal/mimo_mmse_f32/main.c  |  37 +-
 software/apps/baremetal/mimo_mmse_q16/main.c  |   6 +-
 .../apps/baremetal/{ofdm => ofdm_f16}/main.c  |   3 +-
 software/data/README.md                       |   4 +-
 software/data/data_axpy_f16.h.tpl             |  26 --
 software/data/data_axpy_f32.h.tpl             |  27 --
 software/data/data_cfft_f16.h.tpl             |  48 --
 software/data/data_cfft_q16.h.tpl             |  51 ---
 software/data/data_chest_f16.h.tpl            |  30 --
 software/data/data_cholesky_f16.h.tpl         |  21 -
 software/data/data_cholesky_q16.h.tpl         |  21 -
 software/data/data_cholesky_q32.h.tpl         |  22 -
 software/data/data_cmatmul_f16.h.tpl          |  26 --
 software/data/data_cmatmul_q16.h.tpl          |  26 --
 software/data/data_dotp_f16.h.tpl             |  24 -
 software/data/data_dotp_f32.h.tpl             |  24 -
 software/data/data_dotp_i32.h.tpl             |  24 -
 software/data/data_mimo_mmse_f16.h.tpl        |  47 --
 software/data/data_mimo_mmse_f32.h.tpl        |  34 --
 software/data/data_mimo_mmse_f8.h.tpl         |  47 --
 software/data/data_mimo_mmse_q16.h.tpl        |  34 --
 software/data/data_ofdm.h.tpl                 |  48 --
 software/data/gendata_header.py               |  27 +-
 software/data/gendata_params.hjson            | 256 ++++++++++-
 software/data/gendatalib.py                   | 432 +++++++++++++++++-
 software/data/generate_cfft.py                | 220 ---------
 software/data/generate_chest.py               | 210 ---------
 software/data/generate_cholesky.py            | 179 --------
 software/data/generate_dotp.py                | 157 -------
 software/data/generate_matmul.py              | 204 ---------
 software/data/generate_mimo_mmse.py           | 232 ----------
 software/data/generate_ofdm.py                | 137 ------
 .../kernels/baremetal/mempool_cholesky_f16s.h |   3 +
 .../baremetal/mempool_linearsolver_q16s.h     |   2 +-
 .../baremetal/mempool_mimo_mmse_f16s.h        |  64 +--
 .../baremetal/mempool_mimo_mmse_f32s.h        |   3 +-
 .../mempool_radix4_cfft_butterfly_f16.h       |  39 +-
 .../baremetal/mempool_radix4_cfft_f16p.h      |  22 +-
 49 files changed, 875 insertions(+), 2146 deletions(-)
 rename software/apps/baremetal/{ofdm => ofdm_f16}/main.c (98%)
 delete mode 100644 software/data/data_axpy_f16.h.tpl
 delete mode 100644 software/data/data_axpy_f32.h.tpl
 delete mode 100644 software/data/data_cfft_f16.h.tpl
 delete mode 100644 software/data/data_cfft_q16.h.tpl
 delete mode 100644 software/data/data_chest_f16.h.tpl
 delete mode 100644 software/data/data_cholesky_f16.h.tpl
 delete mode 100644 software/data/data_cholesky_q16.h.tpl
 delete mode 100644 software/data/data_cholesky_q32.h.tpl
 delete mode 100644 software/data/data_cmatmul_f16.h.tpl
 delete mode 100644 software/data/data_cmatmul_q16.h.tpl
 delete mode 100644 software/data/data_dotp_f16.h.tpl
 delete mode 100644 software/data/data_dotp_f32.h.tpl
 delete mode 100644 software/data/data_dotp_i32.h.tpl
 delete mode 100644 software/data/data_mimo_mmse_f16.h.tpl
 delete mode 100644 software/data/data_mimo_mmse_f32.h.tpl
 delete mode 100644 software/data/data_mimo_mmse_f8.h.tpl
 delete mode 100644 software/data/data_mimo_mmse_q16.h.tpl
 delete mode 100644 software/data/data_ofdm.h.tpl
 delete mode 100755 software/data/generate_cfft.py
 delete mode 100755 software/data/generate_chest.py
 delete mode 100644 software/data/generate_cholesky.py
 delete mode 100644 software/data/generate_dotp.py
 delete mode 100644 software/data/generate_matmul.py
 delete mode 100644 software/data/generate_mimo_mmse.py
 delete mode 100644 software/data/generate_ofdm.py

diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
index ff13cb879..46119009a 100644
--- a/software/apps/baremetal/axpy_f16/main.c
+++ b/software/apps/baremetal/axpy_f16/main.c
@@ -18,8 +18,8 @@
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
-__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
-__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f16.h"
 #include "baremetal/mempool_checks.h"
@@ -34,27 +34,27 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t));
-    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int16_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int16_t));
   }
-  uint32_t register volatile a = *(uint32_t *)&(A)&0x0000FFFF;
+  uint32_t register volatile a = *(uint32_t *)&(l2_A)&0x0000FFFF;
   mempool_barrier(num_cores);
 
   //  // SINGLE
   //  time_init = mempool_get_timer();
-  //  axpy_f16s(A, l1_X, l1_Y, LEN);
+  //  axpy_f16s(l2_A, l1_X, l1_Y, array_N);
   //  time_end = mempool_get_timer();
 
   //  // PARALLEL
   //  time_init = mempool_get_timer();
-  //  axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores);
+  //  axpy_f16vecp_unrolled4(l2_A, l1_X, l1_Y, array_N, num_cores);
   //  time_end = mempool_get_timer();
 
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N);
   mempool_start_benchmark();
-  axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, array_N);
   mempool_stop_benchmark();
   time_end = mempool_get_timer();
 
@@ -64,7 +64,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0);
+  mempool_check_f16(l1_Y, l2_Z, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;
diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
index 1b1bef859..da34c5fcd 100644
--- a/software/apps/baremetal/axpy_f32/main.c
+++ b/software/apps/baremetal/axpy_f32/main.c
@@ -18,8 +18,8 @@
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
-float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
-float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f32.h"
 #include "baremetal/mempool_checks.h"
@@ -34,17 +34,17 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t));
-    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
   }
-  float register volatile a = A;
+  float register volatile a = l2_A;
   mempool_barrier(num_cores);
 
   // PARALLEL
   time_init = mempool_get_timer();
-  // axpy_f32p(a, l1_X, l1_Y, LEN, num_cores);
-  // axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores);
-  axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN);
+  // axpy_f32p(a, l1_X, l1_Y, array_N, num_cores);
+  // axpy_f32p_unrolled4(a, l1_X, l1_Y, array_N, num_cores);
+  axpy_f32p_local_unrolled4(a, l1_X, l1_Y, array_N);
   time_end = mempool_get_timer();
 
   // Check results
@@ -52,7 +52,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0);
+  mempool_check_f32(l1_Y, l2_Z, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;
diff --git a/software/apps/baremetal/axpy_i32/main.c b/software/apps/baremetal/axpy_i32/main.c
index c391ba040..e5590c124 100644
--- a/software/apps/baremetal/axpy_i32/main.c
+++ b/software/apps/baremetal/axpy_i32/main.c
@@ -16,7 +16,7 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "baremetal/mempool_axpy_i32p.h"
+#include "baremetal/mempool_axpy_i32.h"
 #include "baremetal/mempool_checks.h"
 #include "data_axpy_i32.h"
 
@@ -38,11 +38,12 @@ int main() {
     dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
     error = 0;
   }
+  register volatile int32_t a = l2_A;
   mempool_barrier(num_cores);
 
   // Benchmark
   mempool_start_benchmark();
-  calc_axpy_unloop_x4_localbank(l1_X, l1_Y, ALPHA, array_N, core_id, num_cores);
+  calc_axpy_unloop_x4_localbank(l1_X, l1_Y, a, array_N, core_id, num_cores);
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
 
diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
index 30341f46d..b06ae3189 100644
--- a/software/apps/baremetal/cfft_radix4_f16/main.c
+++ b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -19,14 +19,19 @@
 
 /* CFFT data libraries */
 #include "data_cfft_radix4_f16.h"
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
 
 /* CHOOSE ONE */
-//#define PARALLEL // Parallel FFT not "memory-aware".
-//#define FOLDED // Parallel FFT with "memory-aware" load/store.
-#define SCHEDULED // Folded FFTs arranged in rows and cols.'''
+#define PARALLEL // Parallel FFT not "memory-aware".
+// #define FOLDED // Parallel FFT with "memory-aware" load/store.
+//#define SCHEDULED // Folded FFTs arranged in rows and cols.
 
 // Bitreversal index from table.
 #define BITREVERSETABLE
+// Also the twiddles have "memory-aware" load/stores.
+// #define FOLDED_TWIDDLES
+
 // Independent FFTs scheduled on one row (default 1).
 #define N_FFTs_ROW 1
 // Independent FFTs scheduled on columns (default 1).
@@ -34,8 +39,6 @@
 #if (N_FFTs_COL > MAX_COL)
 #error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
 #endif
-// Also the twiddles have "memory-aware" load/stores.
-#define FOLDED_TWIDDLES
 
 #include "baremetal/mempool_cfft_q16_bitreversal.h"
 #include "baremetal/mempool_checks.h"
@@ -47,9 +50,9 @@ __fp16 l1_pSrc[2 * N_CSAMPLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1_pDst[2 * N_CSAMPLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4]
+__fp16 l1_twiddleCoef_f16_src[2 * N_TWIDDLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4]
+__fp16 l1_twiddleCoef_f16_dst[2 * N_TWIDDLES]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
@@ -80,7 +83,7 @@ int main() {
   if (core_id == 0) {
     dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
-                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+                        N_TWIDDLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                         BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
     printf("01: END INITIALIZATION\n");
@@ -97,6 +100,8 @@ int main() {
                             l2_pSrc, N_CSAMPLES * sizeof(int32_t));
       }
     }
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        N_TWIDDLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                         BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
   }
@@ -114,13 +119,8 @@ int main() {
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }
-#else
-  if (core_id == 0) {
-    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
-                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
-  }
-#endif
   mempool_barrier(num_cores);
+#endif
 
   if (core_id == 0) {
     printf("01: END INITIALIZATION\n");
@@ -132,7 +132,7 @@ int main() {
 
 #ifdef PARALLEL
   mempool_start_benchmark();
-  mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1,
+  mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src,
                            num_cores);
   mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH,
                                     l1_BitRevIndexTable, num_cores);
@@ -176,7 +176,7 @@ int main() {
     printf("02: END COMPUTATION\n");
   }
 
-  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0);
+  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, (float)TOLERANCE, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
index 908ca99fa..6d1c26ff2 100644
--- a/software/apps/baremetal/cholesky_f16/main.c
+++ b/software/apps/baremetal/cholesky_f16/main.c
@@ -18,10 +18,11 @@
 #include "baremetal/mempool_cholesky_f16s.h"
 
 #define SINGLE
+#define FOLDED (0)
 
-__fp16 l1_GIn[2 * dim_N * dim_N * N_SAMPLES]
+__fp16 l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
-__fp16 l1_LOut[2 * dim_N * dim_N * N_SAMPLES]
+__fp16 l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
 
 int main() {
@@ -32,9 +33,9 @@ int main() {
   /* Initialize matrices */
   if (core_id == 0) {
     dma_memcpy_blocking(l1_GIn, l2_GIn,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_LOut, l2_LOut,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
   }
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
@@ -43,7 +44,7 @@ int main() {
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_cholesky_f16vecs(l1_GIn, l1_LOut, dim_N);
+    mempool_cholesky_f16vecs(l1_GIn, l1_LOut, matrix_N, FOLDED);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -52,15 +53,15 @@ int main() {
 #ifdef PARALLEL
   for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) {
     mempool_start_benchmark();
-    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N;
-    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N;
-    mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, dim_N);
+    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N;
+    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N;
+    mempool_cholesky_f16vecs(ptr_in_matrix, ptr_out_matrix, matrix_N, FOLDED);
   }
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_f16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 0.01f, 0);
+  mempool_check_f16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 0.01f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cholesky_q16/main.c b/software/apps/baremetal/cholesky_q16/main.c
index 3c382c500..b0168614c 100644
--- a/software/apps/baremetal/cholesky_q16/main.c
+++ b/software/apps/baremetal/cholesky_q16/main.c
@@ -16,9 +16,9 @@
 
 #define SINGLE
 
-int16_t l1_GIn[2 * dim_N * dim_N * N_SAMPLES]
+int16_t l1_GIn[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
-int16_t l1_LOut[2 * dim_N * dim_N * N_SAMPLES]
+int16_t l1_LOut[2 * matrix_N * matrix_N * N_SAMPLES]
     __attribute__((section(".l1_prio")));
 
 int main() {
@@ -29,9 +29,9 @@ int main() {
   /* Initialize matrices */
   if (core_id == 0) {
     dma_memcpy_blocking(l1_GIn, l2_GIn,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_LOut, l2_LOut,
-                        dim_N * dim_N * N_SAMPLES * sizeof(int32_t));
+                        matrix_N * matrix_N * N_SAMPLES * sizeof(int32_t));
   }
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
@@ -40,7 +40,7 @@ int main() {
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_cholesky_q16vecs(l1_GIn, l1_LOut, dim_N);
+    mempool_cholesky_q16vecs(l1_GIn, l1_LOut, matrix_N);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -49,15 +49,15 @@ int main() {
 #ifdef PARALLEL
   for (uint32_t i = core_id; i < N_SAMPLES; i += num_cores) {
     mempool_start_benchmark();
-    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * dim_N * dim_N;
-    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * dim_N * dim_N;
-    mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, dim_N);
+    __fp16 *ptr_in_matrix = l1_GIn + i * 2 * matrix_N * matrix_N;
+    __fp16 *ptr_out_matrix = l1_LOut + i * 2 * matrix_N * matrix_N;
+    mempool_cholesky_q16s(ptr_in_matrix, ptr_out_matrix, matrix_N);
   }
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
-  mempool_check_q16(l1_LOut, l2_LOut, 2 * dim_N * dim_N, 16, 0);
+  mempool_check_i16(l1_LOut, l2_LOut, 2 * matrix_N * matrix_N, 16, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
index be80f8c8b..aa2ed55a6 100644
--- a/software/apps/baremetal/cmatmul_f16/main.c
+++ b/software/apps/baremetal/cmatmul_f16/main.c
@@ -13,11 +13,13 @@
 #include "synchronization.h"
 
 #include "data_cmatmul_f16.h"
+#define dim_M (matrix_M)
+#define dim_N (matrix_N)
+#define dim_P (matrix_P)
 
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
-#define PARALLEL_2x4
-#define TEST
+#define PARALLEL_4x4
 
 #if defined(PARALLEL_4x4_COPIES_A)
 __fp16 matrix_a[2 * (BANKING_FACTOR * NUM_CORES)]
@@ -43,8 +45,8 @@ int main() {
 
   // Initialize Matrices
   if (core_id == 0) {
-    dma_memcpy_blocking(matrix_a, A, 2 * dim_M * dim_N * sizeof(int16_t));
-    dma_memcpy_blocking(matrix_b, B, 2 * dim_N * dim_P * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_a, l2_A, 2 * dim_M * dim_N * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_b, l2_B, 2 * dim_N * dim_P * sizeof(int16_t));
   }
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
@@ -104,10 +106,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-#if defined(TEST)
-  mempool_check_f16(matrix_c, C, 2 * dim_M * dim_P, 0.1f, 0);
+  mempool_check_f16(matrix_c, l2_C, 10, 0.1f, 0);
   mempool_barrier(num_cores);
-#endif
-
   return 0;
 }
diff --git a/software/apps/baremetal/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c
index f7a6bd31d..0dcffbfc7 100644
--- a/software/apps/baremetal/cmatmul_q16/main.c
+++ b/software/apps/baremetal/cmatmul_q16/main.c
@@ -17,6 +17,9 @@
 #include "data_cmatmul_q16.h"
 
 #define PARALLEL
+#define dim_M (matrix_M)
+#define dim_N (matrix_N)
+#define dim_P (matrix_P)
 
 int16_t matrix_a[2 * dim_M * dim_N]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
@@ -33,8 +36,8 @@ int main() {
 
   // Initialize Matrices
   if (core_id == 0) {
-    dma_memcpy_blocking(matrix_a, A, 2 * dim_M * dim_N * sizeof(int16_t));
-    dma_memcpy_blocking(matrix_b, B, 2 * dim_N * dim_P * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_a, l2_A, 2 * dim_M * dim_N * sizeof(int16_t));
+    dma_memcpy_blocking(matrix_b, l2_B, 2 * dim_N * dim_P * sizeof(int16_t));
   }
   // Wait at barrier until everyone is ready
   mempool_barrier(num_cores);
@@ -42,7 +45,7 @@ int main() {
 #ifdef SINGLE
   if (core_id == 0) {
     mempool_start_benchmark();
-    cmatmul_2x4_q16s(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P);
+    cmatmul_2x2_q16s(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
@@ -56,7 +59,7 @@ int main() {
   mempool_barrier(num_cores);
 #endif
 
-  mempool_check_q16(matrix_c, C, 2 * dim_M * dim_P, 16, 0);
+  mempool_check_i16(matrix_c, l2_C, 2 * dim_M * dim_P, 16, 0);
   mempool_barrier(num_cores);
 
   return 0;
diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
index c579c8151..2091f0336 100644
--- a/software/apps/baremetal/dotp_f16/main.c
+++ b/software/apps/baremetal/dotp_f16/main.c
@@ -19,8 +19,8 @@
 #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-__fp16 l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
-__fp16 l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 __fp16 sum[2 * NUM_BANKS]
@@ -38,8 +38,8 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t));
-    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int16_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int16_t));
   }
   for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
     sum[k] = 0;
@@ -49,19 +49,19 @@ int main() {
 
   //  // SINGLE-CORE
   //  time_init = mempool_get_timer();
-  //  dotp_f16s(l1_A, l1_B, sum, LEN);
-  //  // dotp_f16s_unrolled4(l1_A, l1_B, sum, LEN);
+  //  dotp_f16s(l1_X, l1_Y, sum, array_N);
+  //  // dotp_f16s_unrolled4(l1_X, l1_Y, sum, array_N);
   //  time_end = mempool_get_timer();
 
   //  // PARALLEL
   //  time_init = mempool_get_timer();
-  //  dotp_f16vecp_unrolled4(l1_A, l1_B, sum, LEN, num_cores);
-  //  // dotp_f16p(l1_A, l1_B, sum, LEN, num_cores);
+  //  dotp_f16vecp_unrolled4(l1_X, l1_Y, sum, array_N, num_cores);
+  //  // dotp_f16p(l1_X, l1_Y, sum, array_N, num_cores);
   //  time_end = mempool_get_timer();
 
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  dotp_f16vecp_local_unrolled4(l1_A, l1_B, sum, LEN);
+  dotp_f16vecp_local_unrolled4(l1_X, l1_Y, sum, array_N);
   time_end = mempool_get_timer();
 
   // Check results
@@ -70,7 +70,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
     printf("Result ==> %x\n", *(uint32_t *)&sum[0]);
-    printf("Check  ==> %x\n\n", *(uint32_t *)&l2_C);
+    printf("Check  ==> %x\n\n", *(uint32_t *)&l2_Z);
   }
   mempool_barrier(num_cores);
 
diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
index 731942eb7..3507795b1 100644
--- a/software/apps/baremetal/dotp_f32/main.c
+++ b/software/apps/baremetal/dotp_f32/main.c
@@ -20,8 +20,8 @@
 #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-float l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
-float l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_X[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_Y[array_N] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
@@ -38,8 +38,8 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t));
-    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_X, l2_X, array_N * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, array_N * sizeof(int32_t));
   }
   for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
     sum[k] = 0;
@@ -49,17 +49,17 @@ int main() {
 
   //    // SINGLE-CORE
   //    time_init = mempool_get_timer();
-  //    dotp_f32s_unrolled4(l1_A, l1_B, sum, LEN);
+  //    dotp_f32s_unrolled4(l1_X, l1_Y, sum, array_N);
   //    time_end = mempool_get_timer();
 
   //   // PARALLEL
   //   time_init = mempool_get_timer();
-  //   dotp_f32p(l1_A, l1_B, sum, LEN, num_cores);
+  //   dotp_f32p(l1_X, l1_Y, sum, array_N, num_cores);
   //   time_end = mempool_get_timer();
 
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  dotp_f32p_local_unrolled4(l1_A, l1_B, sum, LEN);
+  dotp_f32p_local_unrolled4(l1_X, l1_Y, sum, array_N);
   time_end = mempool_get_timer();
 
   // Check results
@@ -68,7 +68,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
     printf("Result ==> %d\n", sum[0]);
-    printf("Check  ==> %d\n\n", l2_C);
+    printf("Check  ==> %d\n\n", l2_Z);
   }
   mempool_barrier(num_cores);
 
diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
index ffd879c91..80309a1e0 100644
--- a/software/apps/baremetal/mimo_mmse_f16/main.c
+++ b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -19,8 +19,10 @@
 
 #include "data_mimo_mmse_f16.h"
 #define ZF (0)   // When asserted use zero-forcing
-#define FOLD (0) // When asserted fold matrices in memory
+#define FOLD (1) // When asserted fold matrices in memory
 #define NUM_BANKS (BANKING_FACTOR * NUM_CORES)
+#define PARALLEL
+#define VEC
 
 /**********************************************************
  **********************************************************
@@ -37,6 +39,8 @@
 
 #if FOLD
 #define NUM_ROW (1 + ((N_ITR * N_TX - 1) / NUM_BANKS))
+#define NUM_COL (NUM_BANKS / N_TX)
+
 __fp16 l1_G[2 * N_TX * NUM_BANKS * NUM_ROW]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 __fp16 l1_L[2 * N_TX * NUM_BANKS * NUM_ROW]
@@ -68,6 +72,7 @@ int main() {
 #ifndef BANSHEE
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier_init(core_id); // Initialize barrier and synchronize
+  uint32_t time_init, time_end;
 #endif
 
   /* Initialize matrices */
@@ -97,6 +102,7 @@ int main() {
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
+    time_init = mempool_get_timer();
     for (uint32_t itr = 0; itr < N_ITR; itr++) {
       __fp16 *PtrH = l1_H + itr * (2 * N_TX * N_RX);
       __fp16 *Ptry = l1_y + itr * (2 * N_RX);
@@ -107,24 +113,25 @@ int main() {
       __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
       __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
 #ifdef VEC
-      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD);
       mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
       mempool_cholesky_f16vecs(PtrG, PtrL, N_TX, 0);
 #else
-      mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, FOLD, ZF);
+      mempool_hermitian_f16s(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD);
       mempool_MVP_conjtransp_f16s(PtrH, Ptry, Ptry2, N_RX, N_TX);
       mempool_cholesky_f16s(PtrG, PtrL, N_TX, 0);
 #endif
       mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
       mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
     }
+    time_end = mempool_get_timer();
     mempool_stop_benchmark();
   }
 #endif
 
 #ifdef PARALLEL
   mempool_start_benchmark();
-  uint32_t time_init = mempool_get_timer();
+  time_init = mempool_get_timer();
   // Parallel subcarrier loop
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
 
@@ -133,14 +140,14 @@ int main() {
     __fp16 *PtrS = l1_S + itr * (2 * N_TX);
     // Auxiliary vectors
 #if FOLD
-    __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
-                   (itr / NUM_ROW) * (2 * N_TX);
-    __fp16 *PtrL = l1_L + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
-                   (itr / NUM_ROW) * (2 * N_TX);
+    __fp16 *PtrG = l1_G + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) +
+                   (itr % NUM_COL) * (2 * N_TX);
+    __fp16 *PtrL = l1_L + (itr / NUM_COL) * (2 * N_TX * NUM_BANKS) +
+                   (itr % NUM_COL) * (2 * N_TX);
     __fp16 *Ptry2 =
-        y2 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+        y2 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX);
     __fp16 *Ptry3 =
-        y3 + (itr % NUM_ROW) * NUM_BANKS + (itr / NUM_ROW) * (2 * N_TX);
+        y3 + (itr / NUM_COL) * (2 * NUM_BANKS) + (itr % NUM_COL) * (2 * N_TX);
     __fp16 *Ptrx = l1_x + itr * (2 * N_TX);
 #else
     __fp16 *PtrG = l1_G + itr * (2 * N_TX * N_TX);
@@ -163,7 +170,7 @@ int main() {
     mempool_Ltrisol_f16s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
   }
   mempool_barrier(num_cores);
-  uint32_t time_end = mempool_get_timer();
+  time_end = mempool_get_timer();
   mempool_stop_benchmark();
 #endif
 
@@ -179,6 +186,7 @@ int main() {
   if (core_id == 0) {
     printf("Runtime: %d\n", time_end - time_init);
   }
+  mempool_check_f16(l1_x, l2_x, 2 * N_RX * N_TX, 0.01f, 0);
   mempool_barrier(num_cores);
 #endif
 
diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c
index 194c4c71c..d243754fc 100644
--- a/software/apps/baremetal/mimo_mmse_f32/main.c
+++ b/software/apps/baremetal/mimo_mmse_f32/main.c
@@ -22,6 +22,8 @@
 #include "data_mimo_mmse_f32.h"
 
 #define SINGLE
+#define ZF (0)
+#define FOLD (0)
 
 float l1_H[2 * N_TX * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
@@ -60,14 +62,14 @@ int main() {
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
-    mempool_hermitian_f32s(l1_H, l1_G, l1_S, N_RX, N_TX, 0, 0);
-    mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX, 0);
+    mempool_hermitian_f32s(l1_H, l1_G, l1_S, N_RX, N_TX, ZF, FOLD);
+    mempool_MVP_conjtransp_f32s(l1_H, l1_y, y2, N_RX, N_TX);
 #ifdef JACOBI
     mempool_jacobi_f32s(l1_G, y2, l1_x, N_TX, 0.005f, 20U);
 #else
-    mempool_cholesky_f32s(l1_G, l1_L, N_TX, 0);
-    mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX, 0, 0);
-    mempool_Ltrisol_f32s(l1_L, y3, l1_x, N_TX, 1, 0);
+    mempool_cholesky_f32s(l1_G, l1_L, N_TX, FOLD);
+    mempool_Ltrisol_f32s(l1_L, y2, y3, N_TX, 0, FOLD);
+    mempool_Ltrisol_f32s(l1_L, y3, l1_x, N_TX, 1, FOLD);
 #endif
     mempool_stop_benchmark();
   }
@@ -75,7 +77,9 @@ int main() {
 #endif
 
 #if defined(PARALLEL) && defined(__XDIVSQRT)
+
   // Each iteration is assigned to a processor
+
   mempool_start_benchmark();
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
 
@@ -83,7 +87,9 @@ int main() {
     float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     float *PtrS = l1_S + itr * (2 * N_TX);
     float *Ptry = l1_y + itr * (2 * N_RX);
+
     // Intermediate results and outputs
+
 #if FOLD
     __fp16 *PtrG = l1_G + (itr % NUM_ROW) * (2 * N_TX * NUM_BANKS) +
                    (itr / NUM_ROW) * (2 * N_TX);
@@ -102,7 +108,7 @@ int main() {
     float *Ptrx = l1_x + itr * (2 * N_TX);
 #endif
 
-    mempool_hermitian_f32s(PtrH, PtrG, PtrS, N_RX, N_TX, 0, FOLD);
+    mempool_hermitian_f32s(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD);
     mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX);
     mempool_cholesky_f32s(PtrG, PtrL, N_TX, FOLD);
     mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
@@ -114,19 +120,24 @@ int main() {
 
 #if defined(PARALLEL_HERMITIAN) && defined(__XDIVSQRT)
   mempool_start_benchmark();
+
   // Each iteration is assigned to a pool of processors
-  // In a pool each PE gets a column of the H matrix, accumulating a row of the
-  // output matrix
+  // In a pool each PE gets a column of the H matrix, accumulating
+  // a row of the output matrix
+
   uint32_t pool_id = core_id / N_TX;
   uint32_t num_pools = num_cores / N_TX;
   for (uint32_t itr = pool_id; itr < N_ITR; itr += num_pools) {
     float *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     float *PtrG = l1_G + itr * (2 * N_TX * N_TX);
     float *PtrS = l1_S + itr * N_TX;
-    mempool_hermitian_f32p(PtrH, PtrG, PtrS, N_RX, N_TX, 0, 0, core_id % N_TX,
-                           N_TX);
+    mempool_hermitian_f32p(PtrH, PtrG, PtrS, N_RX, N_TX, ZF, FOLD,
+                           core_id % N_TX, N_TX);
   }
   mempool_stop_benchmark();
+
+  // Each iteration is assigned to a processor
+
   mempool_start_benchmark();
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
     // Inputs
@@ -138,10 +149,11 @@ int main() {
     float *Ptry2 = y2 + itr * (2 * N_TX);
     float *Ptry3 = y3 + itr * (2 * N_TX);
     float *Ptrx = l1_x + itr * (2 * N_TX);
-    mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
+    mempool_MVP_conjtransp_f32s(PtrH, Ptry, Ptry2, N_RX, N_TX);
-    mempool_cholesky_f32s(PtrG, PtrL, N_TX, 0);
-    mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, 0);
-    mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, 0);
+    // Same FOLD flag as passed to mempool_hermitian_f32p above
+    mempool_cholesky_f32s(PtrG, PtrL, N_TX, FOLD);
+    mempool_Ltrisol_f32s(PtrL, Ptry2, Ptry3, N_TX, 0, FOLD);
+    mempool_Ltrisol_f32s(PtrL, Ptry3, Ptrx, N_TX, 1, FOLD);
   }
   mempool_log_barrier(2, core_id);
   mempool_stop_benchmark();
diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c
index 24fd9e44d..9bcb5e9db 100644
--- a/software/apps/baremetal/mimo_mmse_q16/main.c
+++ b/software/apps/baremetal/mimo_mmse_q16/main.c
@@ -28,7 +28,7 @@ int16_t l1_L[2 * N_TX * N_TX * N_ITR]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
                    section(".l1_prio")));
 
-int16_t l1_Sigma[2 * N_TX * N_ITR]
+int16_t l1_S[2 * N_TX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 int16_t l1_y[2 * N_RX * N_ITR]
     __attribute__((aligned(sizeof(int32_t)), section(".l1")));
@@ -50,7 +50,7 @@ int main() {
   if (core_id == 0) {
     dma_memcpy_blocking(l1_H, l2_H, N_TX * N_RX * N_ITR * sizeof(int32_t));
     dma_memcpy_blocking(l1_y, l2_y, N_RX * N_ITR * sizeof(int32_t));
-    dma_memcpy_blocking(l1_Sigma, l2_Sigma, N_TX * N_ITR * sizeof(int32_t));
+    dma_memcpy_blocking(l1_S, l2_S, N_TX * N_ITR * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
 
@@ -79,7 +79,7 @@ int main() {
 
     int16_t *PtrH = l1_H + itr * (2 * N_TX * N_RX);
     int16_t *Ptry = l1_y + itr * (2 * N_RX);
-    int16_t *PtrSigma = l1_Sigma + itr * (2 * N_TX);
+    int16_t *PtrSigma = l1_S + itr * (2 * N_TX);
 
     int16_t *PtrG = l1_G + itr * (2 * N_TX * N_TX);
     int16_t *PtrL = l1_L + itr * (2 * N_TX * N_TX);
diff --git a/software/apps/baremetal/ofdm/main.c b/software/apps/baremetal/ofdm_f16/main.c
similarity index 98%
rename from software/apps/baremetal/ofdm/main.c
rename to software/apps/baremetal/ofdm_f16/main.c
index 210501cad..264768199 100644
--- a/software/apps/baremetal/ofdm/main.c
+++ b/software/apps/baremetal/ofdm_f16/main.c
@@ -17,7 +17,8 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "data_ofdm.h"
+#include "data_ofdm_f16.h"
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // CFFT Parameters
 #define SCHEDULED
diff --git a/software/data/README.md b/software/data/README.md
index 9fdab87cf..066280965 100644
--- a/software/data/README.md
+++ b/software/data/README.md
@@ -6,7 +6,7 @@ The application parameters are passed to the script with the `gendata_params.hjs
 
 An example entry follows: `matmul_f32` is the name of MemPool application under test, the `type` refers to numpy precision, the `defines` are application parameters, turned into C constant declarations in the form `#define matrix_M (16)`, the `arrays` encode the C-type and name of input vectors for the application under test.
 
-`
+```
   "matmul_f32": {
     "type": "float32",
     "defines": [
@@ -20,7 +20,7 @@ An example entry follows: `matmul_f32` is the name of MemPool application under
       ("float", "l2_C")
     ]
   }
-`
+```
 
 ## To test a new application:
 If a new application requires to be tested with data generated from a reference golden model:
diff --git a/software/data/data_axpy_f16.h.tpl b/software/data/data_axpy_f16.h.tpl
deleted file mode 100644
index 4c6034baf..000000000
--- a/software/data/data_axpy_f16.h.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:.4f}, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LEN (${Len})
-
-__fp16 __attribute__((section(".l2"))) A = ${'(__fp16){:.4f}'.format(A)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/data_axpy_f32.h.tpl b/software/data/data_axpy_f32.h.tpl
deleted file mode 100644
index f3fdc8b6a..000000000
--- a/software/data/data_axpy_f32.h.tpl
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-
-#define LEN (${Len})
-
-float __attribute__((section(".l2"))) A = ${'(float){:.8f}'.format(A)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/data_cfft_f16.h.tpl b/software/data/data_cfft_f16.h.tpl
deleted file mode 100644
index d21829e88..000000000
--- a/software/data/data_cfft_f16.h.tpl
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:.4}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LOG2 (${Log2Len})
-#define N_CSAMPLES (${Len})
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-// Maximum number of independent FFT columns allowed
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
-
-// Data arrays
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pSrc[${2 * Len}] = ${array_to_cstr(vector_inp)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pRes[${2 * Len}] = ${array_to_cstr(vector_res)};
-
-// Twiddles
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_f16[${2 * Len}] = ${array_to_cstr(vector_twi)};
-
-// Bitreversal
-uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/data/data_cfft_q16.h.tpl b/software/data/data_cfft_q16.h.tpl
deleted file mode 100644
index fb1ba908a..000000000
--- a/software/data/data_cfft_q16.h.tpl
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Automatically generated by:
-// data/data_cfft_q16.py
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-#define LOG2 (${Log2Len})
-#define N_CSAMPLES (${Len})
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-// Maximum number of independent FFT columns allowed
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
-// Tolerance for correctness check
-#define TOLERANCE (${tolerance})
-
-// Data arrays
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pSrc[${2 * Len}] = ${array_to_cstr(vector_inp)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pRes[${2 * Len}] = ${array_to_cstr(vector_res)};
-
-// Twiddles
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
-
-// Bitreversal
-uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/data/data_chest_f16.h.tpl b/software/data/data_chest_f16.h.tpl
deleted file mode 100644
index 25d9e420f..000000000
--- a/software/data/data_chest_f16.h.tpl
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Automatically generated by:
-// data/data_chest_q16.py
-
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:.5}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define N_TX (${nb_tx})
-#define N_RX (${nb_rx})
-#define N_SAMPLES (${nb_samples})
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotRX[${2*nb_rx*nb_samples}] = ${array_to_cstr(pilot_rx)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_PilotTX[${2*nb_tx*nb_samples}] = ${array_to_cstr(pilot_tx)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_HEST[${2*nb_rx*nb_tx*nb_samples}] = ${array_to_cstr(Hest)};
diff --git a/software/data/data_cholesky_f16.h.tpl b/software/data/data_cholesky_f16.h.tpl
deleted file mode 100644
index 32ad3e2fe..000000000
--- a/software/data/data_cholesky_f16.h.tpl
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:0.5f}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define dim_N (${n_matrix})
-#define N_SAMPLES (${n_samples})
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_GIn[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(G)};
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_LOut[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(L)};
diff --git a/software/data/data_cholesky_q16.h.tpl b/software/data/data_cholesky_q16.h.tpl
deleted file mode 100644
index 0ba9cf5f0..000000000
--- a/software/data/data_cholesky_q16.h.tpl
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define dim_N (${n_matrix})
-#define N_SAMPLES (${n_samples})
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_GIn[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(G)};
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_LOut[2 * ${n_samples * n_matrix * n_matrix}] = ${array_to_cstr(L)};
diff --git a/software/data/data_cholesky_q32.h.tpl b/software/data/data_cholesky_q32.h.tpl
deleted file mode 100644
index 0042f54c8..000000000
--- a/software/data/data_cholesky_q32.h.tpl
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int32_t) 0X{:08X}, '.format(a&0xffffffff)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define N (${n_matrix})
-
-int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_GIn[${n_matrix * n_matrix}] = ${array_to_cstr(G)};
-int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_LOut[${n_matrix * n_matrix}] = ${array_to_cstr(L)};
-int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${n_matrix}] = ${array_to_cstr(y)};
diff --git a/software/data/data_cmatmul_f16.h.tpl b/software/data/data_cmatmul_f16.h.tpl
deleted file mode 100644
index 15ed570a1..000000000
--- a/software/data/data_cmatmul_f16.h.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:.4f}, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define dim_M (${matrix_M})
-#define dim_N (${matrix_N})
-#define dim_P (${matrix_P})
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${2 * matrix_M * matrix_N}] = ${array_to_cstr(A)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${2 * matrix_N * matrix_P}] = ${array_to_cstr(B)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${2 * matrix_M * matrix_P}] = ${array_to_cstr(C)};
diff --git a/software/data/data_cmatmul_q16.h.tpl b/software/data/data_cmatmul_q16.h.tpl
deleted file mode 100644
index b42c55f88..000000000
--- a/software/data/data_cmatmul_q16.h.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define dim_M (${matrix_M})
-#define dim_N (${matrix_N})
-#define dim_P (${matrix_P})
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) A[${2 * matrix_M * matrix_N}] = ${array_to_cstr(A)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) B[${2 * matrix_N * matrix_P}] = ${array_to_cstr(B)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) C[${2 * matrix_M * matrix_P}] = ${array_to_cstr(C)};
diff --git a/software/data/data_dotp_f16.h.tpl b/software/data/data_dotp_f16.h.tpl
deleted file mode 100644
index f7cacaed3..000000000
--- a/software/data/data_dotp_f16.h.tpl
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:.4f}, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LEN (${Len})
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = (__fp16)${C}f;
diff --git a/software/data/data_dotp_f32.h.tpl b/software/data/data_dotp_f32.h.tpl
deleted file mode 100644
index 3af0fbe66..000000000
--- a/software/data/data_dotp_f32.h.tpl
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LEN (${Len})
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C}f;
diff --git a/software/data/data_dotp_i32.h.tpl b/software/data/data_dotp_i32.h.tpl
deleted file mode 100644
index d76d92a24..000000000
--- a/software/data/data_dotp_i32.h.tpl
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LEN (${Len})
-
-int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
-
-int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
-
-int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C};
diff --git a/software/data/data_mimo_mmse_f16.h.tpl b/software/data/data_mimo_mmse_f16.h.tpl
deleted file mode 100644
index e6109b7f6..000000000
--- a/software/data/data_mimo_mmse_f16.h.tpl
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:0.5f}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define N_TX (${N_tx})
-#define N_RX (${N_rx})
-#define N_ITR (${N_itr})
-
-// Inputs
-
-__fp16 l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr(G)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)};
-
-// Outputs
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr(x)};
diff --git a/software/data/data_mimo_mmse_f32.h.tpl b/software/data/data_mimo_mmse_f32.h.tpl
deleted file mode 100644
index c7bed1889..000000000
--- a/software/data/data_mimo_mmse_f32.h.tpl
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define N_TX (${N_tx})
-#define N_RX (${N_rx})
-#define N_ITR (${N_itr})
-
-// Inputs
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr(G)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)};
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)};
-
-// Outputs
-
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr(x)};
diff --git a/software/data/data_mimo_mmse_f8.h.tpl b/software/data/data_mimo_mmse_f8.h.tpl
deleted file mode 100644
index 780bcc041..000000000
--- a/software/data/data_mimo_mmse_f8.h.tpl
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp8)' + f'{hex(a.bits())}' +', '
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_cstr16(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:0.5f}f, '.format(a)
-        i += 1
-        if i % 5 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define N_TX (${N_tx})
-#define N_RX (${N_rx})
-#define N_ITR (${N_itr})
-
-// Inputs
-
-__fp8 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)};
-
-__fp8 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)};
-
-__fp8 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_S[${2 * N_tx * N_itr}] = ${array_to_cstr(N)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr16(G)};
-
-// Outputs
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr16(x)};
diff --git a/software/data/data_mimo_mmse_q16.h.tpl b/software/data/data_mimo_mmse_q16.h.tpl
deleted file mode 100644
index ca2ed0193..000000000
--- a/software/data/data_mimo_mmse_q16.h.tpl
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) {}, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define N_TX (${N_tx})
-#define N_RX (${N_rx})
-#define N_ITR (${N_itr})
-
-// Inputs
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_H[${2 * N_tx * N_rx * N_itr}] = ${array_to_cstr(H)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_G[${2 * N_tx * N_tx * N_itr}] = ${array_to_cstr(G)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_y[${2 * N_rx * N_itr}] = ${array_to_cstr(y)};
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Sigma[${2 * N_tx * N_itr}] = ${array_to_cstr(N)};
-
-// Outputs
-
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_x[${2 * N_tx * N_itr}] = ${array_to_cstr(x)};
diff --git a/software/data/data_ofdm.h.tpl b/software/data/data_ofdm.h.tpl
deleted file mode 100644
index 06da2c045..000000000
--- a/software/data/data_ofdm.h.tpl
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:0.5}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LOG2 (${Log2Len})
-#define N_RX (${N_rx})
-#define N_BEAMS (${N_bs})
-#define N_SC (${N_sc})
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pFFT_Src[${2 * N_sc * N_rx}] = ${array_to_cstr(pFFT_src)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_f16[${2 * N_sc}] = ${array_to_cstr(pTw_coef)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pBF_Coef[${2 * N_bs * N_rx}] = ${array_to_cstr(pBF_coef)};
-
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_pBF_Dst[${2 * N_bs * N_sc}] = ${array_to_cstr(pBF_dst)};
-
-// Bitreversal
-uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(bitrev)};
diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py
index 44749a4a0..c8fd8c3f8 100644
--- a/software/data/gendata_header.py
+++ b/software/data/gendata_header.py
@@ -14,6 +14,7 @@
 import numpy
 
 import gendatalib as datalib
+import pyflexfloat as ff
 
 
 header = """\
@@ -46,6 +47,10 @@ def format_type(typ, value):
         stringyfied_val = '({}) {:+.8f}'.format(typ, value)
     elif typ == '__fp16':
         stringyfied_val = '({}) {:+.4f}'.format(typ, value)
+    elif typ == '__fp8':
+        # Emit the raw e5m2 bit pattern as a two-digit hex literal.
+        bits = ff.FlexFloat("e5m2", value.astype(numpy.double)).bits()
+        stringyfied_val = '({}) 0X{:02X}'.format(typ, bits)
     else:
         raise Exception("ERROR: Unsupported data type!!!")
 
@@ -75,7 +80,7 @@ def print_array(arr, typ, name):
         output_string += "};\n\n"
     else:
         output_string += attr
-        output_string += (name + ' = ' + format_type(typ, arr))
+        output_string += (name + ' = ' + format_type(typ, arr[0]))
         output_string += ";\n\n"
 
     return output_string
@@ -125,6 +130,8 @@ def get_type(type_string):
         return numpy.float32
     elif type_string == "float16":
         return numpy.float16
+    elif type_string == "float8":
+        return numpy.float16
     else:
         raise Exception("Input type is not valid")
 
@@ -156,16 +163,32 @@ def get_type(type_string):
     # Define function mappings for each app_name
     function_map = {
         "axpy_i32": {"func": datalib.generate_iaxpy},
-        "cfft_radix4_q16": {"func": datalib.generate_cfft_q16},
+        "axpy_f16": {"func": datalib.generate_faxpy},
+        "axpy_f32": {"func": datalib.generate_faxpy},
         "cfft_radix2_q16": {"func": datalib.generate_cfft_q16},
+        "cfft_radix4_f16": {"func": datalib.generate_fcfft},
+        "cfft_radix4_q16": {"func": datalib.generate_cfft_q16},
+        "chest_f16": {"func": datalib.generate_fchest},
         "chest_q16": {"func": datalib.generate_qchest},
+        "cholesky_f16": {"func": datalib.generate_fccholesky},
+        "cholesky_q16": {"func": datalib.generate_qccholesky},
         "cholesky_q32": {"func": datalib.generate_qcholesky},
+        "cmatmul_f16": {"func": datalib.generate_fcmatmul},
+        "cmatmul_q16": {"func": datalib.generate_qcmatmul},
+        "dotp_f16": {"func": datalib.generate_fdotp},
+        "dotp_f32": {"func": datalib.generate_fdotp},
         "dotp_i32": {"func": datalib.generate_idotp},
         "matmul_f16": {"func": datalib.generate_fmatmul},
+        "matmul_f8": {"func": datalib.generate_fmatmul},
         "matmul_f32": {"func": datalib.generate_fmatmul},
         "matmul_i32": {"func": datalib.generate_imatmul},
         "matmul_i16": {"func": datalib.generate_imatmul},
         "matmul_i8": {"func": datalib.generate_imatmul},
+        "mimo_mmse_q16": {"func": datalib.generate_qmmse},
+        "mimo_mmse_f16": {"func": datalib.generate_fmmse},
+        "mimo_mmse_f32": {"func": datalib.generate_fmmse},
+        "mimo_mmse_f8": {"func": datalib.generate_fmmse},
+        "ofdm_f16": {"func": datalib.generate_fofdm},
         "fence": {"func": datalib.generate_iarray},
         "memcpy": {"func": datalib.generate_iarray},
     }
diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson
index 3a1de010e..e343a42be 100644
--- a/software/data/gendata_params.hjson
+++ b/software/data/gendata_params.hjson
@@ -9,32 +9,58 @@
   "axpy_i32": {
     "type": "int32",
     "defines": [
-      ("ALPHA",      6)
       ("array_N", 1024)
     ]
     "arrays": [
+      ("int32_t", "l2_A")
       ("int32_t", "l2_X")
       ("int32_t", "l2_Y")
       ("int32_t", "l2_Z")
     ]
   },
 
-  "dotp_i32": {
-    "type": "int32",
+  "axpy_f32": {
+    "type": "float32",
     "defines": [
       ("array_N", 1024)
     ]
     "arrays": [
-      ("int32_t", "l2_X")
-      ("int32_t", "l2_Y")
-      ("int32_t", "l2_Z")
+      ("float", "l2_A")
+      ("float", "l2_X")
+      ("float", "l2_Y")
+      ("float", "l2_Z")
     ]
   },
 
-  "cfft_radix4_q16": {
+  "axpy_f16": {
+    "type": "float16",
+    "defines": [
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("__fp16", "l2_A")
+      ("__fp16", "l2_X")
+      ("__fp16", "l2_Y")
+      ("__fp16", "l2_Z")
+    ]
+  },
+
+  "dotp_f16": {
+    "type": "float16",
+    "defines": [
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("__fp16", "l2_X")
+      ("__fp16", "l2_Y")
+      ("__fp16", "l2_Z")
+    ]
+  },
+
+  "cfft_radix2_q16": {
     "type": "int16",
     "defines": [
-      ("N_CSAMPLES", 64)
+      ("N_CSAMPLES", 256)
     ]
     "arrays": [
       ("int16_t", "l2_pSrc")
@@ -44,10 +70,23 @@
     ]
   },
 
-  "cfft_radix2_q16": {
+  "cfft_radix4_f16": {
+    "type": "float16",
+    "defines": [
+      ("N_CSAMPLES", 64)
+    ]
+    "arrays": [
+      ("__fp16", "l2_pSrc")
+      ("__fp16", "l2_pRes")
+      ("__fp16", "l2_twiddleCoef_f16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
+  "cfft_radix4_q16": {
     "type": "int16",
     "defines": [
-      ("N_CSAMPLES", 256)
+      ("N_CSAMPLES", 64)
     ]
     "arrays": [
       ("int16_t", "l2_pSrc")
@@ -57,6 +96,20 @@
     ]
   },
 
+  "chest_f16": {
+    "type": "float16",
+    "defines": [
+      ("N_TX",        4)
+      ("N_RX",        4)
+      ("N_SAMPLES", 512)
+    ]
+    "arrays": [
+      ("__fp16", "l2_PilotTX")
+      ("__fp16", "l2_PilotRX")
+      ("__fp16", "l2_HEST")
+    ]
+  },
+
   "chest_q16": {
     "type": "int32",
     "defines": [
@@ -71,11 +124,37 @@
     ]
   },
 
+  "cholesky_q16": {
+    "type": "int16",
+    "defines": [
+      ("matrix_N",    4)
+      ("FIXED_POINT", 8)
+      ("N_SAMPLES", 32)
+    ]
+    "arrays": [
+      ("int16_t", "l2_GIn")
+      ("int16_t", "l2_LOut")
+    ]
+  },
+
+  "cholesky_f16": {
+    "type": "float16",
+    "defines": [
+      ("matrix_N",    4)
+      ("N_SAMPLES", 1024)
+    ]
+    "arrays": [
+      ("__fp16", "l2_GIn")
+      ("__fp16", "l2_LOut")
+    ]
+  },
+
   "cholesky_q32": {
     "type": "int32",
     "defines": [
       ("matrix_N",    32)
       ("FIXED_POINT", 10)
+      ("N_SAMPLES", 1)
     ]
     "arrays": [
       ("int32_t", "l2_A")
@@ -84,6 +163,58 @@
     ]
   },
 
+  "cmatmul_f16": {
+    "type": "float16",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("__fp16", "l2_A")
+      ("__fp16", "l2_B")
+      ("__fp16", "l2_C")
+    ]
+  },
+
+  "cmatmul_q16": {
+    "type": "int16",
+    "defines": [
+      ("matrix_M", 32)
+      ("matrix_N", 32)
+      ("matrix_P", 32)
+    ]
+    "arrays": [
+      ("int16_t", "l2_A")
+      ("int16_t", "l2_B")
+      ("int16_t", "l2_C")
+    ]
+  },
+
+  "dotp_f32": {
+    "type": "float32",
+    "defines": [
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("float", "l2_X")
+      ("float", "l2_Y")
+      ("float", "l2_Z")
+    ]
+  },
+
+  "dotp_i32": {
+    "type": "int32",
+    "defines": [
+      ("array_N", 1024)
+    ]
+    "arrays": [
+      ("int32_t", "l2_X")
+      ("int32_t", "l2_Y")
+      ("int32_t", "l2_Z")
+    ]
+  },
+
   "matmul_f16": {
     "type": "float16",
     "defines": [
@@ -112,6 +243,20 @@
     ]
   }
 
+  "matmul_i16": {
+    "type": "int16",
+    "defines": [
+      ("matrix_M", 64)
+      ("matrix_N", 64)
+      ("matrix_P", 64)
+    ]
+    "arrays": [
+      ("int16_t", "l2_A")
+      ("int16_t", "l2_B")
+      ("int32_t", "l2_C")
+    ]
+  }
+
   "matmul_i32": {
     "type": "int32",
     "defines": [
@@ -126,34 +271,101 @@
     ]
   }
 
-  "matmul_i16": {
-    "type": "int16",
+  "matmul_i8": {
+    "type": "int8",
     "defines": [
       ("matrix_M", 64)
       ("matrix_N", 64)
       ("matrix_P", 64)
     ]
     "arrays": [
-      ("int16_t", "l2_A")
-      ("int16_t", "l2_B")
+      ("int8_t", "l2_A")
+      ("int8_t", "l2_B")
       ("int32_t", "l2_C")
     ]
   }
 
-  "matmul_i8": {
-    "type": "int8",
+  "mimo_mmse_f16": {
+    "type": "float16",
     "defines": [
-      ("matrix_M", 64)
-      ("matrix_N", 64)
-      ("matrix_P", 64)
+      ("N_TX", 4)
+      ("N_RX", 4)
+      ("N_ITR", 32)
     ]
     "arrays": [
-      ("int8_t", "l2_A")
-      ("int8_t", "l2_B")
-      ("int32_t", "l2_C")
+      ("__fp16", "l2_H")
+      ("__fp16", "l2_G")
+      ("__fp16", "l2_y")
+      ("__fp16", "l2_S")
+      ("__fp16", "l2_x")
+    ]
+  }
+
+  "mimo_mmse_f32": {
+    "type": "float32",
+    "defines": [
+      ("N_TX", 4)
+      ("N_RX", 4)
+      ("N_ITR", 32)
+    ]
+    "arrays": [
+      ("float", "l2_H")
+      ("float", "l2_G")
+      ("float", "l2_y")
+      ("float", "l2_S")
+      ("float", "l2_x")
     ]
   }
 
+  "mimo_mmse_f8": {
+    "type": "float8",
+    "defines": [
+      ("N_TX", 4)
+      ("N_RX", 4)
+      ("N_ITR", 32)
+    ]
+    "arrays": [
+      ("__fp8", "l2_H")
+      ("__fp16", "l2_G")
+      ("__fp8", "l2_y")
+      ("__fp8", "l2_S")
+      ("__fp16", "l2_x")
+    ]
+  }
+
+  "mimo_mmse_q16": {
+    "type": "int16",
+    "defines": [
+      ("N_TX", 4)
+      ("N_RX", 4)
+      ("N_ITR", 32)
+      ("FIXED_POINT", 8)
+    ]
+    "arrays": [
+      ("int16_t", "l2_H")
+      ("int16_t", "l2_G")
+      ("int16_t", "l2_y")
+      ("int16_t", "l2_S")
+      ("int16_t", "l2_x")
+    ]
+  }
+
+  "ofdm_f16": {
+    "type": "float16",
+    "defines": [
+      ("N_SC", 4096)
+      ("N_RX", 64)
+      ("N_BEAMS", 32)
+    ]
+    "arrays": [
+      ("__fp16", "l2_pFFT_Src")
+      ("__fp16", "l2_pBF_Coef")
+      ("__fp16", "l2_pBF_Dst")
+      ("__fp16", "l2_twiddleCoef_f16")
+      ("int16_t", "l2_BitRevIndexTable")
+    ]
+  },
+
   "fence": {
     "type": "int32",
     "defines": [
diff --git a/software/data/gendatalib.py b/software/data/gendatalib.py
index c017415bf..ac051fd27 100644
--- a/software/data/gendatalib.py
+++ b/software/data/gendatalib.py
@@ -14,7 +14,9 @@
 import numpy as np
 import math
 import qmath
+
 from scipy import signal
+from scipy.linalg import solve_triangular
 
 
 def select_maxval(my_type=np.int32):
@@ -58,52 +60,274 @@ def generate_iarray(my_type=np.float32, defines={}):
     return A, defines
 
 
-def generate_fmatmul(my_type=np.float32, defines={}):
+##############################################################################
+
+
+def generate_faxpy(my_type=np.float32, defines={}):
+    """Return ([A, X, Y, Z], defines) with Z = Y + A * X, all as my_type."""
+    # A is rounded to my_type first so Z is built from the returned alpha
+    array_N = defines['array_N']
+    A = (np.random.rand(1) - 0.5).astype(my_type)
+    X = (np.random.rand(array_N) - 0.5).astype(my_type)
+    Y = (np.random.rand(array_N) - 0.5).astype(my_type)
+    Z = (Y + X * A.astype(np.float64)).astype(my_type)
+
+    return [A, X, Y, Z], defines
+
+
+def generate_fdotp(my_type=np.float32, defines={}):
+    """Return ([X, Y, Z], defines); Z holds the dot product of X and Y."""
+    # Uniform samples shifted by -0.5 and cast to my_type
+    array_N = defines['array_N']
+    X = (np.random.rand(array_N) - 0.5).astype(my_type)
+    Y = (np.random.rand(array_N) - 0.5).astype(my_type)
+    Z = np.dot(X, Y).astype(my_type)
+    Z = np.array(Z).astype(my_type)
+    Z = np.resize(Z, 1)
+
+    return [X, Y, Z], defines
+
+
+def ftwiddleCoef(N, my_type=np.float32):
+    PI = np.pi
+    twiddleCoeff16 = np.zeros((int)(2 * 3 * N / 4), my_type)
+    for i in range(0, int(3 * N / 4)):
+        twiddleCoeff16_sin = np.sin(i * 2 * PI / N).astype(my_type)
+        twiddleCoeff16_cos = np.cos(i * 2 * PI / N).astype(my_type)
+        twiddleCoeff16[2 * i] = twiddleCoeff16_sin
+        twiddleCoeff16[2 * i + 1] = twiddleCoeff16_cos
+    return twiddleCoeff16
+
+
+def generate_fcfft(my_type=np.float32, defines={}):
+    """FFT data: src = IFFT(random samples), dst = FFT(src), plus tables."""
+    N_CSAMPLES = defines['N_CSAMPLES']
+    src_r = np.random.normal(0, 5, N_CSAMPLES).astype(np.float16)
+    src_i = np.random.normal(0, 5, N_CSAMPLES).astype(np.float16)
+    src = src_r + 1.j * src_i
+    src = np.fft.ifft(src)
+    dst = np.fft.fft(src)
+    src = np.column_stack((src.imag, src.real)).astype(my_type).flatten()
+    dst = np.column_stack((dst.imag, dst.real)).astype(my_type).flatten()
+
+    twiddles = ftwiddleCoef(N_CSAMPLES, my_type)
+    bitrever = qmath.bitreversal(N_CSAMPLES, 2)
+    # Extra entries added to defines, derived from the FFT size and dst
+    defines['LOG2'] = int(math.log2(N_CSAMPLES))
+    defines['N_TWIDDLES'] = 3 * N_CSAMPLES // 4
+    defines['BITREVINDEXTABLE_LENGTH'] = len(bitrever)
+    defines['TOLERANCE'] = 0.1 * np.max(dst)
+    # NOTE(review): TOLERANCE uses max(dst), not max(abs(dst)) - confirm
+    return [src, dst, twiddles, bitrever], defines
+
+
+def generate_fchest(my_type=np.float32, defines={}, division=False):
+    """Pilot TX/RX vectors and channel estimates as (imag, real) pairs."""
+    nb_tx = defines['N_TX']
+    nb_rx = defines['N_RX']
+    nb_samples = defines['N_SAMPLES']
+
+    H = np.random.randn(nb_rx, nb_tx)
+    H = H + 1j * np.random.randn(nb_rx, nb_tx)
+
+    vpilot_tx = []
+    vpilot_rx = []
+    vHest = []
+    for k in range(nb_samples):
+        if (division):
+            # division=True: estimate is pilot_rx / pilot_tx
+            pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx))
+            pilot_rx = np.dot(H, pilot_tx)
+            Hest = pilot_rx[:, np.newaxis] / pilot_tx[np.newaxis, :]
+        else:
+            # division=False: output 1 / pilot_tx, estimate by multiplying
+            pilot_tx = np.exp(1j * np.random.randn(nb_tx))
+            pilot_rx = np.dot(H, pilot_tx)
+            pilot_tx = np.reciprocal(pilot_tx)
+            Hest = pilot_rx[:, np.newaxis] * pilot_tx[np.newaxis, :]
+        # Flatten in both modes so column_stack interleaves per element
+        Hest = Hest.flatten()
+        # Interleaved real and imaginary parts
+        pilot_tx = np.column_stack((pilot_tx.imag, pilot_tx.real))
+        pilot_rx = np.column_stack((pilot_rx.imag, pilot_rx.real))
+        Hest = np.column_stack((Hest.imag, Hest.real))
+        # Flatten arrays
+        pilot_tx = pilot_tx.astype(my_type).flatten()
+        pilot_rx = pilot_rx.astype(my_type).flatten()
+        Hest = Hest.astype(my_type).flatten()
+        # Output vectors
+        vpilot_tx.append(pilot_tx)
+        vpilot_rx.append(pilot_rx)
+        vHest.append(Hest)
+
+    vpilot_rx = np.concatenate(vpilot_rx, axis=0)
+    vpilot_tx = np.concatenate(vpilot_tx, axis=0)
+    vHest = np.concatenate(vHest, axis=0)
+
+    return [vpilot_tx, vpilot_rx, vHest], defines
+
+
+def generate_fccholesky(my_type=np.float32, defines={}):
+    """Return ([G, L], defines): N_SAMPLES matrices G and their factors L."""
+    n_matrix = defines['matrix_N']
+    n_samples = defines['N_SAMPLES']
+
+    vector_G = []
+    vector_L = []
+    for k in range(n_samples):
+        # Random complex square matrix
+        H = np.random.rand(n_matrix, n_matrix) + 1.j * \
+            np.random.rand(n_matrix, n_matrix)
+        # Hermitian input of the decomposition:
+        # G = H * H^H
+        G = np.matmul(H, np.asmatrix(H).H)
+        # Cholesky decomposition
+        L = np.linalg.cholesky(G)
+        # Row-major flatten, then interleave (real, imag) per element
+        G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C')
+        L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C')
+        G = np.column_stack((G.real, G.imag)).astype(my_type).flatten()
+        L = np.column_stack((L.real, L.imag)).astype(my_type).flatten()
+        # Output vectors
+        vector_G.append(G)
+        vector_L.append(L)
+
+    vector_G = np.concatenate(vector_G, axis=0)
+    vector_L = np.concatenate(vector_L, axis=0)
+    return [vector_G, vector_L], defines
+
+
+def generate_fcmatmul(my_type=np.float32, defines={}):
 
     # Create matrix
     matrix_M = defines['matrix_M']
     matrix_N = defines['matrix_N']
     matrix_P = defines['matrix_P']
-    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type)
-    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type)
+    A = np.random.rand(matrix_M, matrix_N) + 1j * \
+        np.random.rand(matrix_M, matrix_N)
+    B = np.random.rand(matrix_N, matrix_P) + 1j * \
+        np.random.rand(matrix_N, matrix_P)
     C = np.matmul(A, B)
 
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(my_type)
+    A = np.reshape(A, (matrix_M * matrix_N), order='C')
+    B = np.reshape(B, (matrix_N * matrix_P), order='C')
+    C = np.reshape(C, (matrix_M * matrix_P), order='C')
+
+    A = np.column_stack((A.imag, A.real)).astype(my_type).flatten()
+    B = np.column_stack((B.imag, B.real)).astype(my_type).flatten()
+    C = np.column_stack((C.imag, C.real)).astype(my_type).flatten()
 
     return [A, B, C], defines
 
 
-def generate_imatmul(my_type=np.int32, defines={}):
+def generate_fmatmul(my_type=np.float32, defines={}):
 
     # Create matrix
     matrix_M = defines['matrix_M']
     matrix_N = defines['matrix_N']
     matrix_P = defines['matrix_P']
-    MAX = select_maxval(my_type)
-    A = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type)
-    B = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type)
+    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(my_type)
+    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(my_type)
     C = np.matmul(A, B)
 
     A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
     B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32)
+    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(my_type)
 
     return [A, B, C], defines
 
 
+def generate_fmmse(my_type=np.float16, defines={}):
+    """Return ([vH, vG, vy, vN, vx], defines) for N_ITR MMSE problems."""
+    N_tx = defines['N_TX']
+    N_rx = defines['N_RX']
+    N_itr = defines['N_ITR']
+    vH = np.zeros([N_itr, N_tx * 2 * N_rx], dtype=my_type)
+    vG = np.zeros([N_itr, N_tx * 2 * N_tx], dtype=my_type)
+    vy = np.zeros([N_itr, 2 * N_rx], dtype=my_type)
+    vN = np.zeros([N_itr, 2 * N_tx], dtype=my_type)
+    vx = np.zeros([N_itr, 2 * N_tx], dtype=my_type)
+
+    for k in range(N_itr):
+
+        # Create input vector
+        y = np.random.rand(N_rx).astype(my_type) + 1.j * \
+            np.random.rand(N_rx).astype(my_type)
+
+        # Create channel matrix
+        H = np.random.rand(N_rx, N_tx).astype(my_type) + 1.j * \
+            np.random.rand(N_rx, N_tx).astype(my_type)
+        # Generate noise variance
+        N = np.random.rand(1).astype(my_type)
+
+        # G = H^H * H + N * I, then N is replicated once per TX entry
+        H_h = np.asmatrix(H).H
+        G = np.matmul(H_h, H) + N * np.eye(H.shape[1])
+        N = N * np.ones(N_tx)
+
+        # Cholesky decomposition
+        L = np.linalg.cholesky(G)
+        # Solve L * y2 = H^H * y, then L^H * x = y2
+        y1 = np.transpose(np.dot(H_h, y))
+        y2 = solve_triangular(L, y1, lower=True)
+        x = solve_triangular(np.asmatrix(L).H, y2)
+        # Row-major flatten, then interleave (real, imag) per element
+        H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C')
+        G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C')
+        N = np.column_stack((N.real, N.imag)).astype(my_type).flatten()
+        H = np.column_stack((H.real, H.imag)).astype(my_type).flatten()
+        G = np.column_stack((G.real, G.imag)).astype(my_type).flatten()
+        x = np.column_stack((x.real, x.imag)).astype(my_type).flatten()
+        y = np.column_stack((y.real, y.imag)).astype(my_type).flatten()
+
+        vH[k, :] = H
+        vG[k, :] = G
+        vy[k, :] = y
+        vN[k, :] = N
+        vx[k, :] = x
+
+    vN = np.reshape(vN, (2 * N_tx * N_itr)).astype(my_type)
+    vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(my_type)
+    vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(my_type)
+    vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(my_type)
+    vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(my_type)
+
+    return [vH, vG, vy, vN, vx], defines
+
+
+def generate_fofdm(my_type=np.float32, defines={}):
+    """Return random OFDM buffers, FFT twiddles and the bitreversal table."""
+    N_sc = defines['N_SC']
+    N_rx = defines['N_RX']
+    N_bs = defines['N_BEAMS']
+    # All three buffers are random float16 data; pBF_dst is not computed
+    pFFT_src = (np.random.rand(2 * N_rx * N_sc)).astype(np.float16)
+    pBF_coef = (np.random.rand(2 * N_rx * N_bs)).astype(np.float16)
+    pBF_dst = (np.random.rand(2 * N_bs * N_sc)).astype(np.float16)
+    twiddles = ftwiddleCoef(N_sc, my_type)
+    bitrever = qmath.bitreversal(N_sc, 2)
+    # Extra entries added to defines, derived from the FFT size
+    defines['LOG2'] = int(math.log2(N_sc))
+    defines['N_TWIDDLES'] = 3 * N_sc // 4
+    defines['BITREVINDEXTABLE_LENGTH'] = len(bitrever)
+
+    return [pFFT_src, pBF_coef, pBF_dst, twiddles, bitrever], defines
+
+
+##############################################################################
+
+
 def generate_iaxpy(my_type=np.int32, defines={}):
 
     # Create matrix
-    ALPHA = defines['ALPHA']
     array_N = defines['array_N']
     MAX = select_maxval(my_type)
+    A = np.random.randint(-MAX, MAX - 1, size=1, dtype=my_type)
     X = irandom(MAX=MAX, size=(array_N), my_type=my_type)
     Y = irandom(MAX=MAX, size=(array_N), my_type=my_type)
-    Z = (Y + X * ALPHA).astype(my_type)
+    Z = (Y + X * A).astype(my_type)
 
-    return [X, Y, Z], defines
+    return [A, X, Y, Z], defines
 
 
 def generate_idotp(my_type=np.int32, defines={}):
@@ -113,7 +337,9 @@ def generate_idotp(my_type=np.int32, defines={}):
     MAX = select_maxval(my_type)
     X = irandom(MAX=MAX, size=(array_N), my_type=my_type)
     Y = irandom(MAX=MAX, size=(array_N), my_type=my_type)
-    Z = np.array((np.dot(X, Y))).astype(my_type)
+    Z = np.dot(X, Y)
+    Z = np.array(Z).astype(my_type)
+    Z = np.resize(Z, 1)
 
     return [X, Y, Z], defines
 
@@ -136,6 +362,51 @@ def generate_iconv(my_type=np.int32, defines={}):
     return [X, K, Y], defines
 
 
+def generate_imatmul(my_type=np.int32, defines={}):
+    """Random A (MxN), B (NxP) of my_type and their product C as int32."""
+    # B must be N x P, otherwise A @ B only works when M == N == P
+    matrix_M = defines['matrix_M']
+    matrix_N = defines['matrix_N']
+    matrix_P = defines['matrix_P']
+    MAX = select_maxval(my_type)
+    A = irandom(MAX=MAX, size=(matrix_M, matrix_N), my_type=my_type)
+    B = irandom(MAX=MAX, size=(matrix_N, matrix_P), my_type=my_type)
+    C = np.matmul(A, B)
+
+    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(my_type)
+    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(my_type)
+    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.int32)
+
+    return [A, B, C], defines
+
+
+##############################################################################
+
+
+def generate_qcmatmul(my_type=np.int32, defines={}):
+    MAX = 2**15
+    FIXED_POINT = 15
+
+    # Create matrix
+    matrix_M = defines['matrix_M']
+    matrix_N = defines['matrix_N']
+    matrix_P = defines['matrix_P']
+    A = np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N)) + 1j * \
+        np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N))
+    B = np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P)) + 1j * \
+        np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P))
+    [Cr, Ci] = qmath.qcmatmul(A.real, A.imag, B.real,
+                              B.imag, FIXED_POINT, my_type)
+
+    A = np.reshape(A, (matrix_M * matrix_N), order='C')
+    B = np.reshape(B, (matrix_N * matrix_P), order='C')
+    A = np.column_stack((A.imag, A.real)).astype(my_type).flatten()
+    B = np.column_stack((B.imag, B.real)).astype(my_type).flatten()
+    C = np.column_stack((Ci, Cr)).astype(my_type).flatten()
+
+    return [A, B, C], defines
+
+
 def generate_qchest(defines={}, fixed_point=15, my_type=np.int16):
 
     N_TX = defines['N_TX']
@@ -164,19 +435,136 @@ def generate_qchest(defines={}, fixed_point=15, my_type=np.int16):
     return [qvector_pilot_tx, qvector_pilot_rx, qvector_Hest], defines
 
 
+def generate_qccholesky(defines={}, fixed_point=15, my_type=np.int32):
+    """Return ([vA, vL], defines): N_SAMPLES fixed-point inputs and factors."""
+    matrix_N = defines['matrix_N']
+    FIXED_POINT = defines['FIXED_POINT']
+    N_SAMPLES = defines['N_SAMPLES']
+
+    vA = np.zeros([N_SAMPLES, 2 * matrix_N * matrix_N], dtype=my_type)
+    vL = np.zeros([N_SAMPLES, 2 * matrix_N * matrix_N], dtype=my_type)
+    for k in range(N_SAMPLES):
+        # Float A and G = A^H * A; G is only used to derive the scale MAX
+        Ar = np.random.normal(0, 1, [matrix_N, matrix_N]).astype(np.float32)
+        Ai = np.random.normal(0, 1, [matrix_N, matrix_N]).astype(np.float32)
+        A = Ar + 1.j * Ai
+        G = np.matmul(A.conj().T, A)
+        MAX_A = max(np.abs(A.real).max(), np.abs(A.imag).max())
+        MAX_G = max(np.abs(G.real).max(), np.abs(G.imag).max())
+        MAX = max(MAX_A, MAX_G)
+        # Quantize with 2**FIXED_POINT, then add 256 on both diagonals
+        Ar = np.round((Ar / MAX) * 2**FIXED_POINT).astype(int)
+        Ai = np.round((Ai / MAX) * 2**FIXED_POINT).astype(int)
+        Ar = Ar + np.eye(matrix_N, dtype=int) * 256
+        Ai = Ai + np.eye(matrix_N, dtype=int) * 256
+
+        Ar, Ai = qmath.qcmatmul(Ar.T, -Ai.T, Ar, Ai, FIXED_POINT, my_type)
+        Lr, Li = qmath.qccholesky(
+            Ar, Ai, fixed_point=FIXED_POINT, mytype=my_type)
+
+        A = np.column_stack((Ar, Ai)).astype(my_type).flatten()
+        L = np.column_stack((Lr, Li)).astype(my_type).flatten()
+        vA[k, :] = np.reshape(A, (2 * matrix_N * matrix_N),
+                              order='C').astype(my_type)
+        vL[k, :] = np.reshape(L, (2 * matrix_N * matrix_N),
+                              order='C').astype(my_type)
+
+    vA = np.reshape(vA, (2 * matrix_N * matrix_N * N_SAMPLES)).astype(my_type)
+    vL = np.reshape(vL, (2 * matrix_N * matrix_N * N_SAMPLES)).astype(my_type)
+    return [vA, vL], defines
+
+
 def generate_qcholesky(defines={}, fixed_point=15, my_type=np.int32):
 
     matrix_N = defines['matrix_N']
     FIXED_POINT = defines['FIXED_POINT']
+    N_SAMPLES = defines['N_SAMPLES']
 
-    A = irandom(size=(matrix_N, matrix_N), MAX=2**14, my_type=my_type)
-    y = irandom(size=matrix_N, MAX=2**14, my_type=my_type)
-    A = qmath.qmatmul(A.T, A, FIXED_POINT, my_type)
-    L = qmath.qcholesky(A, fixed_point=FIXED_POINT, mytype=my_type)
+    vA = np.zeros([N_SAMPLES, matrix_N * matrix_N], dtype=my_type)
+    vL = np.zeros([N_SAMPLES, matrix_N * matrix_N], dtype=my_type)
+    vy = np.zeros([N_SAMPLES, matrix_N], dtype=my_type)
+    for k in range(N_SAMPLES):
+        A = irandom(size=(matrix_N, matrix_N), MAX=2**14, my_type=my_type)
+        y = irandom(size=matrix_N, MAX=2**14, my_type=my_type)
+        A = qmath.qmatmul(A.T, A, FIXED_POINT, my_type)
+        L = qmath.qcholesky(A, FIXED_POINT, my_type)
 
-    A = np.reshape(A, (matrix_N * matrix_N), order='C').astype(my_type)
-    L = np.reshape(L, (matrix_N * matrix_N), order='C').astype(my_type)
-    return [A, L, y], defines
+        vA[k, :] = np.reshape(A, (matrix_N * matrix_N),
+                              order='C').astype(my_type)
+        vL[k, :] = np.reshape(L, (matrix_N * matrix_N),
+                              order='C').astype(my_type)
+        vy[k, :] = np.reshape(y, matrix_N, order='C').astype(my_type)
+
+    vA = np.reshape(vA, (matrix_N * matrix_N * N_SAMPLES)).astype(my_type)
+    vL = np.reshape(vL, (matrix_N * matrix_N * N_SAMPLES)).astype(my_type)
+    vy = np.reshape(vy, (matrix_N * N_SAMPLES)).astype(my_type)
+
+    return [vA, vL, vy], defines
+
+
+def generate_qmmse(defines={}, fixed_point=15, my_type=np.int32):
+    """Q-format MMSE data in hjson array order: [vH, vG, vy, vN, vx]."""
+    FIXED_POINT = defines['FIXED_POINT']
+    N_tx = defines['N_TX']
+    N_rx = defines['N_RX']
+    N_itr = defines['N_ITR']
+
+    vN = np.zeros([N_itr, 2 * N_tx], dtype=np.int16)
+    vH = np.zeros([N_itr, 2 * N_tx * N_rx], dtype=np.int16)
+    vG = np.zeros([N_itr, 2 * N_tx * N_tx], dtype=np.int16)
+    vy = np.zeros([N_itr, 2 * N_rx], dtype=np.int16)
+    vx = np.zeros([N_itr, 2 * N_tx], dtype=np.int16)
+
+    for k in range(N_itr):
+
+        # Floating point inputs (noise has one entry per diagonal of G)
+        rH = np.random.normal(0, 1, [N_rx, N_tx]).astype(np.float32)
+        iH = np.random.normal(0, 1, [N_rx, N_tx]).astype(np.float32)
+        rN = np.random.normal(0, 1, [N_tx]).astype(np.float32)
+        ry = np.random.normal(0, 1, [N_rx]).astype(np.float32)
+        iy = np.random.normal(0, 1, [N_rx]).astype(np.float32)
+        H = rH + 1j * iH
+        y = ry + 1j * iy
+        G = np.matmul(H.conj().T, H) + rN * np.eye(H.shape[1])
+        y1 = np.dot(H.conj().T, y)
+
+        # Rescale inputs
+        H_max = max(np.abs(H.real).max(), np.abs(H.imag).max())
+        G_max = max(np.abs(G.real).max(), np.abs(G.imag).max())
+        y_max = max(np.abs(y.real).max(), np.abs(y.imag).max())
+        y1_max = max(np.abs(y1.real).max(), np.abs(y1.imag).max())
+        N_max = np.abs(rN).max()
+        MAX = max(H_max, G_max, N_max, y_max, y1_max)
+        SCALE_FACTOR = 2**FIXED_POINT
+        rH = np.round((H.real / MAX) * SCALE_FACTOR).astype(int)
+        iH = np.round((H.imag / MAX) * SCALE_FACTOR).astype(int)
+        ry = np.round((y.real / MAX) * SCALE_FACTOR).astype(int)
+        iy = np.round((y.imag / MAX) * SCALE_FACTOR).astype(int)
+        rN = np.round((rN / MAX) * SCALE_FACTOR).astype(int) + 1024
+
+        # Hermitian
+        rG, iG = qmath.qcmatmul(rH.T, -iH.T, rH, iH, FIXED_POINT, my_type)
+        ry1, iy1 = qmath.qcmvmul(rH.T, -iH.T, ry, iy, FIXED_POINT, my_type)
+        np.fill_diagonal(rG, rG.diagonal() + rN)
+
+        # Solve linear system
+        rL, iL = qmath.qccholesky(rG, iG, FIXED_POINT, my_type)
+        ry2, iy2 = qmath.qinvertLt(rL, iL, ry1, iy1, FIXED_POINT, my_type)
+        rx, ix = qmath.qinvertUt(rL.T, -iL.T, ry2, iy2, FIXED_POINT, my_type)
+
+        vN[k, :] = np.column_stack(
+            (rN, np.zeros(np.size(rN)))).astype(my_type).flatten()
+        vH[k, :] = np.column_stack((rH, iH)).astype(my_type).flatten()
+        vG[k, :] = np.column_stack((rG, iG)).astype(my_type).flatten()
+        vy[k, :] = np.column_stack((ry, iy)).astype(my_type).flatten()
+        vx[k, :] = np.column_stack((rx, ix)).astype(my_type).flatten()
+
+    vN = np.reshape(vN, (2 * N_tx * N_itr)).astype(my_type)
+    vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(my_type)
+    vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(my_type)
+    vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(my_type)
+    vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(my_type)
+    return [vH, vG, vy, vN, vx], defines
 
 
 def generate_cfft_q16(defines={}, fixed_point=15, my_type=np.int16):
diff --git a/software/data/generate_cfft.py b/software/data/generate_cfft.py
deleted file mode 100755
index 2412c278d..000000000
--- a/software/data/generate_cfft.py
+++ /dev/null
@@ -1,220 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the cfft kernel.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-# Function to generate the expected result of the testcase.
-
-
-def generate_cfft_q16(N):
-    # Q16:
-    # len=16:    Q1.15 -> Q5.11
-    # len=32:    Q1.15 -> Q6.10
-    # len=64:    Q1.15 -> Q7.9
-    # len=128:   Q1.15 -> Q8.8
-    # len=256:   Q1.15 -> Q9.7
-    # len=512:   Q1.15 -> Q10.6
-    # len=1024:  Q1.15 -> Q11.5
-    # len=2048:  Q1.15 -> Q12.4
-    # len=4096:  Q1.15 -> Q13.3
-    src = (np.random.randint(-2**(15), 2**(15) - 1,
-           2 * N, dtype=np.int16)).astype(np.int16)
-
-    bit_shift_dict_q16 = {
-        16: 11,
-        32: 10,
-        64: 9,
-        128: 8,
-        256: 7,
-        512: 6,
-        1024: 5,
-        2048: 4,
-        4096: 3}
-    my_fixpoint = 15
-    dst = np.zeros(2 * N, dtype=np.int16)
-    complex_src = np.zeros(N, dtype=np.csingle)
-    complex_dst = np.zeros(N, dtype=np.csingle)
-    for i in range(N):
-        shift = 2**(my_fixpoint)
-        complex_src[i] = (src[2 * i].astype(np.csingle) / shift) + \
-            1j * (src[2 * i + 1].astype(np.csingle) / shift)
-    complex_dst = np.fft.fft(complex_src)
-    for i in range(N):
-        shift = 2**(bit_shift_dict_q16[N])
-        dst[2 * i] = (np.real(complex_dst[i]) * shift).astype(np.int16)
-        dst[2 * i + 1] = (np.imag(complex_dst[i]) * shift).astype(np.int16)
-    return src, dst
-
-
-def generate_cfft_f16(N):
-    # src = np.random.rand(N).astype(np.float16)
-    # src = src + 1.j * np.random.rand(N).astype(np.float16)
-    src = np.cos(np.linspace(0, N / 4, num=N)).astype(np.float16)
-    src = src + 1.j * np.sin(np.linspace(0, N / 4, num=N)).astype(np.float16)
-    dst = np.fft.fft(src)
-    src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten()
-    dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten()
-    return src, dst
-
-
-def generate_twiddleCoefq15(N):
-    PI = 3.14159265358979
-    twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16)
-    for i in range(0, (int)(3 * N / 4)):
-        twiddleCoefq15_cos = M.cos(i * 2 * PI / N)
-        twiddleCoefq15_sin = M.sin(i * 2 * PI / N)
-        twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1)))
-        twiddleCoefq15[2 * i +
-                       1] = int(round(twiddleCoefq15_sin * (2**15 - 1)))
-    return twiddleCoefq15
-
-
-def generate_twiddleCoeff16(N):
-    PI = np.pi
-    twiddleCoeff16 = np.zeros((int)(2 * 3 * N / 4), np.float16)
-    for i in range(0, int(3 * N / 4)):
-        twiddleCoeff16_sin = np.sin(i * 2 * PI / N).astype(np.float16)
-        twiddleCoeff16_cos = np.cos(i * 2 * PI / N).astype(np.float16)
-        twiddleCoeff16[2 * i] = twiddleCoeff16_sin
-        twiddleCoeff16[2 * i + 1] = twiddleCoeff16_cos
-    return twiddleCoeff16
-
-
-def generate_bitreversal(N, R):
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-    return np.ndarray.flatten(np.array(tps))
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-    file = outdir / f"{kwargs['name']}.h"
-    print(tpl, outdir, kwargs['name'])
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-d",
-        "--dimension",
-        type=int,
-        required=False,
-        default=64,
-        help='Input dimension'
-    )
-
-    args = parser.parse_args()
-
-    # Create inputs cfft_q16
-    Len = args.dimension
-    src_cfft_q16, dst_cfft_q16 = generate_cfft_q16(Len)
-    twi_cfft_q16 = generate_twiddleCoefq15(Len)
-    brv_cfft_q16 = generate_bitreversal(Len, 2)
-    tolerance = {
-        16: 16,
-        32: 20,
-        64: 24,
-        128: 28,
-        256: 32,
-        512: 48,
-        1024: 64,
-        2048: 96,
-        4096: 128}
-
-    kwargs = {'name': 'data_cfft_radix4_q16',
-              'vector_inp': src_cfft_q16,
-              'vector_res': dst_cfft_q16,
-              'vector_twi': twi_cfft_q16,
-              'vector_bitrev': brv_cfft_q16,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': len(brv_cfft_q16),
-              'tolerance': tolerance[int(Len)]}
-    gen_data_header_file(
-        args.outdir,
-        pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_q16.h.tpl",
-        **kwargs)
-
-    kwargs = {'name': 'data_cfft_radix2_q16',
-              'vector_inp': src_cfft_q16,
-              'vector_res': dst_cfft_q16,
-              'vector_twi': twi_cfft_q16,
-              'vector_bitrev': brv_cfft_q16,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': int(2 * len(brv_cfft_q16)),
-              'tolerance': tolerance[int(Len)]}
-    gen_data_header_file(
-        args.outdir,
-        pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_q16.h.tpl",
-        **kwargs)
-
-    # Create inputs cfft_f16
-    Len = args.dimension
-    src_cfft_f16, dst_cfft_f16 = generate_cfft_f16(Len)
-    twi_cfft_f16 = generate_twiddleCoeff16(Len)
-
-    kwargs = {'name': 'data_cfft_radix4_f16',
-              'vector_inp': src_cfft_f16,
-              'vector_res': dst_cfft_f16,
-              'vector_twi': twi_cfft_f16,
-              'vector_bitrev': brv_cfft_q16,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': len(brv_cfft_q16)}
-    gen_data_header_file(
-        args.outdir,
-        pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_f16.h.tpl",
-        **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/generate_chest.py b/software/data/generate_chest.py
deleted file mode 100755
index e11eb8b62..000000000
--- a/software/data/generate_chest.py
+++ /dev/null
@@ -1,210 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the Channel estimation.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-
-from mako.template import Template
-
-##################
-#  write_result  #
-##################
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def q_sat(x):
-    if x > 2**15 - 1:
-        return x - 2**16
-    elif x < -2**15:
-        return x + 2**16
-    else:
-        return x
-
-
-def generate_chest_f16(nb_tx, nb_rx, nb_samples):
-    H = np.random.randn(nb_rx, nb_tx) + 1j * np.random.randn(nb_rx, nb_tx)
-    vector_pilot_tx = []
-    vector_pilot_rx = []
-    vector_Hest = []
-    for k in range(nb_samples):
-
-        # Compute data division
-        #        pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx))
-        #        pilot_rx = np.dot(H, pilot_tx)
-        #        Hest = pilot_rx[:, np.newaxis] / pilot_tx[np.newaxis, :]
-
-        # Compute data multiplication
-        pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx))
-        pilot_rx = np.dot(H, pilot_tx)
-        pilot_tx = np.reciprocal(pilot_tx)
-        Hest = pilot_rx[:, np.newaxis] * pilot_tx[np.newaxis, :]
-
-        # Interleaved real and imaginary parts
-        pilot_tx = np.column_stack(
-            (pilot_tx.imag, pilot_tx.real)).astype(
-            np.float16).flatten()
-        pilot_rx = np.column_stack(
-            (pilot_rx.imag, pilot_rx.real)).astype(
-            np.float16).flatten()
-        Hest = Hest.flatten()
-        Hest = np.column_stack(
-            (Hest.imag, Hest.real)).astype(
-            np.float16).flatten()
-
-        # Output vectors
-        vector_pilot_tx.append(pilot_tx)
-        vector_pilot_rx.append(pilot_rx)
-        vector_Hest.append(Hest)
-
-    vector_pilot_rx = np.concatenate(vector_pilot_rx, axis=0)
-    vector_pilot_tx = np.concatenate(vector_pilot_tx, axis=0)
-    vector_Hest = np.concatenate(vector_Hest, axis=0)
-    return vector_pilot_tx, vector_pilot_rx, vector_Hest
-
-# Compute the channel estimate
-
-
-def compute_chest_q16(in_rx, in_tx, p):
-    n_rx = in_rx.size
-    n_tx = in_tx.size
-    result = np.zeros(2 * (n_tx * n_rx), dtype=np.int16)
-    for i in range(n_rx):
-        a_r = in_rx[i].real
-        a_i = in_rx[i].imag
-        for j in range(n_tx):
-            b_r = in_tx[j].real
-            b_i = in_tx[j].imag
-
-#            # Compute data division
-#            den = (2**16) // (b_r * b_r + b_i * b_i)
-#            num_r = (a_r * b_r) + (a_i * b_i)
-#            num_i = (a_i * b_r) - (a_r * b_i)
-#            result[2 * (i * n_tx + j)] = q_sat((num_r * den) // 2**p)
-#            result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p)
-
-            # Compute data multiplication
-            num_r = (a_r * b_r) - (a_i * b_i)
-            num_i = (a_i * b_r) + (a_r * b_i)
-            result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p)
-            result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p)
-    return result
-
-
-def generate_chest_q16(nb_tx, nb_rx, nb_samples):
-    FIXED_POINT = 8
-    MAX = 2**7
-
-    qvector_pilot_tx = []
-    qvector_pilot_rx = []
-    qvector_Hest = []
-    for k in range(nb_samples):
-        # Create pilots
-        pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_rx)
-        pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_tx)
-        # Compute Hest
-        Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT)
-
-        pilot_tx = np.column_stack(
-            (pilot_tx.imag, pilot_tx.real)).astype(
-            np.int16).flatten()
-        pilot_rx = np.column_stack(
-            (pilot_rx.imag, pilot_rx.real)).astype(
-            np.int16).flatten()
-        qvector_pilot_tx.append(pilot_tx)
-        qvector_pilot_rx.append(pilot_rx)
-        qvector_Hest.append(Hest)
-
-    qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples])
-    qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples])
-    qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples])
-    return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-b",
-        "--num_rx",
-        type=int,
-        required=False,
-        default=32,
-        help='Number beams'
-    )
-    parser.add_argument(
-        "-l",
-        "--num_tx",
-        type=int,
-        required=False,
-        default=4,
-        help='Number layers'
-    )
-    parser.add_argument(
-        "-s",
-        "--num_samples",
-        type=int,
-        required=False,
-        default=32,
-        help='Number samples'
-    )
-
-    args = parser.parse_args()
-    nb_tx = args.num_tx
-    nb_rx = args.num_rx
-    nb_samples = args.num_samples
-
-    pilot_tx, pilot_rx, Hest = generate_chest_q16(nb_tx, nb_rx, nb_samples)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_q16.h.tpl"
-    kwargs = {'name': 'data_chest_q16',
-              'pilot_tx': pilot_tx,
-              'pilot_rx': pilot_rx,
-              'Hest': Hest,
-              'nb_tx': nb_tx,
-              'nb_rx': nb_rx,
-              'nb_samples': nb_samples}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    pilot_tx, pilot_rx, Hest = generate_chest_f16(nb_tx, nb_rx, nb_samples)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_f16.h.tpl"
-    kwargs = {'name': 'data_chest_f16',
-              'pilot_rx': pilot_rx,
-              'pilot_tx': pilot_tx,
-              'Hest': Hest,
-              'nb_tx': nb_tx,
-              'nb_rx': nb_rx,
-              'nb_samples': nb_samples}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/generate_cholesky.py b/software/data/generate_cholesky.py
deleted file mode 100644
index 1a25c4206..000000000
--- a/software/data/generate_cholesky.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 cholesky.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from scipy.linalg import solve_triangular
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def generate_cholesky_q32(n_matrix):
-    # Create hermitian matrix
-    L = np.random.randint(-2**(15), 2**(15) - 1,
-                          size=(n_matrix, n_matrix), dtype=np.int32)
-    L = np.tril(L).astype(np.int32)
-    G = np.dot(np.asmatrix(L), np.asmatrix(L).transpose())
-
-    y = np.random.randint(-2**(15), 2**(15) - 1, n_matrix, dtype=np.int32)
-
-    # Linear system solution
-    y = solve_triangular(L, y, lower=True)
-    # x = solve_triangular(np.asmatrix(L).T, y)
-
-    # Reshape
-    G = np.reshape(
-        np.asarray(G),
-        (n_matrix * n_matrix),
-        order='C').astype(
-        np.int32)
-    L = np.reshape(
-        np.asarray(L),
-        (n_matrix * n_matrix),
-        order='C').astype(
-        np.int32)
-    y = np.reshape(np.asarray(y), (n_matrix), order='C').astype(np.int32)
-
-    return G, L, y
-
-
-def generate_cholesky_q16(n_matrix, n_samples):
-    vector_G = []
-    vector_L = []
-    for k in range(n_samples):
-        # Create hermitian matrix
-        H = np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix,
-                              dtype=np.int16) + \
-            1.j * np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix,
-                                    dtype=np.int16)
-        H = H.reshape(n_matrix, n_matrix)
-        # Matrix to be inverted
-        H_h = (np.asmatrix(H).H)
-        # H_H = np.asmatrix(H).H
-        G = H_h * H
-        # Cholesky decomposition
-        L = np.linalg.cholesky(G)
-        # Reshape
-        G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C')
-        L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C')
-        G = np.column_stack((G.real, G.imag)).astype(np.int16).flatten()
-        L = np.column_stack((L.real, L.imag)).astype(np.int16).flatten()
-        # Output vectors
-        vector_G.append(G)
-        vector_L.append(L)
-
-    vector_G = np.concatenate(vector_G, axis=0)
-    vector_L = np.concatenate(vector_L, axis=0)
-    return vector_G, vector_L
-
-
-def generate_cholesky_f16(n_matrix, n_samples):
-    vector_G = []
-    vector_L = []
-    for k in range(n_samples):
-        # Create hermitian matrix
-        H = np.random.rand(n_matrix, n_matrix) + 1.j * \
-            np.random.rand(n_matrix, n_matrix)
-        # Matrix to be inverted
-        # H_H = np.asmatrix(H).H
-        G = np.matmul(H, np.asmatrix(H).H)
-        # Cholesky decomposition
-        L = np.linalg.cholesky(G)
-        # Reshape
-        G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C')
-        L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C')
-        G = np.column_stack((G.real, G.imag)).astype(np.float16).flatten()
-        L = np.column_stack((L.real, L.imag)).astype(np.float16).flatten()
-        # Output vectors
-        vector_G.append(G)
-        vector_L.append(L)
-
-    vector_G = np.concatenate(vector_G, axis=0)
-    vector_L = np.concatenate(vector_L, axis=0)
-    return vector_G, vector_L
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-n",
-        "--dimension",
-        type=int,
-        required=False,
-        default=4,
-        help='Matrix dimension'
-    )
-    parser.add_argument(
-        "-s",
-        "--num_samples",
-        type=int,
-        required=False,
-        default=256,
-        help='Number samples'
-    )
-
-    args = parser.parse_args()
-    n_matrix = args.dimension
-    n_samples = args.num_samples
-
-    G, L, y = generate_cholesky_q32(n_matrix)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_cholesky_q32.h.tpl"
-    kwargs = {'name': 'data_cholesky_q32',
-              'G': G,
-              'L': L,
-              'y': y,
-              'n_matrix': n_matrix}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    vector_G, vector_L = generate_cholesky_q16(n_matrix, n_samples)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_cholesky_q16.h.tpl"
-    kwargs = {'name': 'data_cholesky_q16',
-              'G': vector_G,
-              'L': vector_L,
-              'n_matrix': n_matrix,
-              'n_samples': n_samples}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    vector_G, vector_L = generate_cholesky_f16(n_matrix, n_samples)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_cholesky_f16.h.tpl"
-    kwargs = {'name': 'data_cholesky_f16',
-              'G': vector_G,
-              'L': vector_L,
-              'n_matrix': n_matrix,
-              'n_samples': n_samples}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py
deleted file mode 100644
index b5e7410af..000000000
--- a/software/data/generate_dotp.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-def generate_dotp_i32(Len):
-
-    # Create matrix
-    MAX = 2**7 - 1
-    A = np.random.randint(-MAX, MAX - 1, size=Len)
-    B = np.random.randint(-MAX, MAX - 1, size=Len)
-    C = np.dot(A, B)
-    return A, B, C
-
-
-def generate_dotp_f32(Len):
-
-    # Create matrix
-    A = np.random.randn(Len).astype(np.float32)
-    B = np.random.randn(Len).astype(np.float32)
-    C = (np.dot(A, B)).astype(np.float32)
-    return A, B, C
-
-
-def generate_dotp_f16(Len):
-
-    # Create matrix
-    A = np.random.randn(Len).astype(np.float16)
-    B = np.random.randn(Len).astype(np.float16)
-    C = (np.dot(A, B)).astype(np.float16)
-    return A, B, C
-
-
-def generate_axpy_f32(Len):
-
-    # Create matrix
-    X = np.random.rand(Len).astype(np.float32)
-    Y = np.random.rand(Len).astype(np.float32)
-    A = np.float32(3.14)
-    out = Y + A * X
-    return A, X, Y, out
-
-
-def generate_axpy_f16(Len):
-
-    # Create matrix
-    X = np.random.rand(Len).astype(np.float16)
-    Y = np.random.rand(Len).astype(np.float16)
-    A = np.float16(3.14)
-    out = Y + A * X
-    return A, X, Y, out
-
-##################
-# compute_result #
-##################
-
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-n",
-        "--length",
-        type=int,
-        required=False,
-        default=1024,
-        help='First dimension.'
-    )
-
-    args = parser.parse_args()
-    Len = args.length
-
-    A, B, C = generate_dotp_i32(Len)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_i32.h.tpl"
-    kwargs = {
-        'name': 'data_dotp_i32',
-        'A': A,
-        'B': B,
-        'C': C,
-        'Len': Len}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    A, B, C = generate_dotp_f32(Len)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f32.h.tpl"
-    kwargs = {
-        'name': 'data_dotp_f32',
-        'A': A,
-        'B': B,
-        'C': C,
-        'Len': Len}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    A, B, C = generate_dotp_f16(Len)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_f16.h.tpl"
-    kwargs = {
-        'name': 'data_dotp_f16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'Len': Len}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    A, X, Y, out = generate_axpy_f32(Len)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl"
-    kwargs = {
-        'name': 'data_axpy_f32',
-        'A': A,
-        'X': X,
-        'Y': Y,
-        'out': out,
-        'Len': Len}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    A, X, Y, out = generate_axpy_f16(Len)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl"
-    kwargs = {
-        'name': 'data_axpy_f16',
-        'A': A,
-        'X': X,
-        'Y': Y,
-        'out': out,
-        'Len': Len}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/generate_matmul.py b/software/data/generate_matmul.py
deleted file mode 100644
index 1b2edc9bc..000000000
--- a/software/data/generate_matmul.py
+++ /dev/null
@@ -1,204 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-def generate_cmatmul_f16(matrix_M, matrix_N, matrix_P):
-
-    # Create matrix
-    A = np.random.rand(matrix_M, matrix_N) + 1j * \
-        np.random.rand(matrix_M, matrix_N)
-    B = np.random.rand(matrix_N, matrix_P) + 1j * \
-        np.random.rand(matrix_N, matrix_P)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C')
-    B = np.reshape(B, (matrix_N * matrix_P), order='C')
-    C = np.reshape(C, (matrix_M * matrix_P), order='C')
-
-    A = np.column_stack((A.imag, A.real)).astype(np.float16).flatten()
-    B = np.column_stack((B.imag, B.real)).astype(np.float16).flatten()
-    C = np.column_stack((C.imag, C.real)).astype(np.float16).flatten()
-
-    return A, B, C
-
-
-def generate_cmatmul_q16(matrix_M, matrix_N, matrix_P):
-    MAX = 2**15
-    FIXED_POINT = 15
-
-    # Create matrix
-    A = np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N)) + 1j * \
-        np.random.randint(-MAX, MAX - 1, size=(matrix_M, matrix_N))
-    B = np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P)) + 1j * \
-        np.random.randint(-MAX, MAX - 1, size=(matrix_N, matrix_P))
-
-    C = np.zeros((matrix_M, matrix_P), dtype=complex)
-    for k in range(matrix_P):
-        for i in range(matrix_M):
-            for j in range(matrix_N):
-                a = A[i][j].real
-                b = A[i][j].imag
-                c = B[j][k].real
-                d = B[j][k].imag
-                C[i][k] += (a * c - b * d) // (1 << FIXED_POINT)
-                C[i][k] += (b * c + a * d) // (1 << FIXED_POINT) * 1j
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C')
-    B = np.reshape(B, (matrix_N * matrix_P), order='C')
-    C = np.reshape(C, (matrix_M * matrix_P), order='C')
-
-    A = np.column_stack((A.imag, A.real)).astype(np.int16).flatten()
-    B = np.column_stack((B.imag, B.real)).astype(np.int16).flatten()
-    C = np.column_stack((C.imag, C.real)).astype(np.int16).flatten()
-
-    return A, B, C
-
-
-def generate_matmul_f16(matrix_M, matrix_N, matrix_P):
-
-    # Create matrix
-    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16)
-    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16)
-
-    return A, B, C
-
-
-def generate_matmul_f32(matrix_M, matrix_N, matrix_P):
-
-    # Create matrix
-    A = np.random.rand(matrix_M, matrix_N)
-    B = np.random.rand(matrix_N, matrix_P)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32)
-
-    return A, B, C
-
-##################
-# compute_result #
-##################
-
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    A, B, C = generate_cmatmul_f16(matrix_M, matrix_N, matrix_P)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_cmatmul_f16.h.tpl"
-    kwargs = {
-        'name': 'data_cmatmul_f16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    A, B, C = generate_cmatmul_q16(matrix_M, matrix_N, matrix_P)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_cmatmul_q16.h.tpl"
-    kwargs = {
-        'name': 'data_cmatmul_q16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    A, B, C = generate_matmul_f16(matrix_M, matrix_N, matrix_P)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_matmul_f16.h.tpl"
-    kwargs = {
-        'name': 'data_matmul_f16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    A, B, C = generate_matmul_f32(matrix_M, matrix_N, matrix_P)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_matmul_f32.h.tpl"
-    kwargs = {
-        'name': 'data_matmul_f32',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/generate_mimo_mmse.py b/software/data/generate_mimo_mmse.py
deleted file mode 100644
index f8918f561..000000000
--- a/software/data/generate_mimo_mmse.py
+++ /dev/null
@@ -1,232 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 mmse.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-import pyflexfloat as ff
-from scipy.linalg import solve_triangular
-
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def generate_fmmse(N_tx, N_rx, N_itr, my_type):
-
-    vH = np.zeros([N_itr, N_tx * 2 * N_rx], dtype=my_type)
-    vG = np.zeros([N_itr, N_tx * 2 * N_tx], dtype=my_type)
-    vy = np.zeros([N_itr, 2 * N_rx], dtype=my_type)
-    vN = np.zeros([N_itr, 2 * N_tx], dtype=my_type)
-    vx = np.zeros([N_itr, 2 * N_tx], dtype=my_type)
-
-    for k in range(N_itr):
-
-        # Create input vector
-        y = np.random.rand(N_rx).astype(my_type) + 1.j * \
-            np.random.rand(N_rx).astype(my_type)
-
-        # Create channel matrix
-        H = np.random.rand(N_rx, N_tx).astype(my_type) + 1.j * \
-            np.random.rand(N_rx, N_tx).astype(my_type)
-        # Generate noise variance
-        N = np.random.rand(1).astype(my_type)
-
-        # Matrix to be inverted in MMSE estimator
-        H_h = np.asmatrix(H).H
-        G = np.matmul(H_h, H) + N * np.eye(H.shape[1])
-        N = N * np.ones(N_tx)
-
-        # Cholesky decomposition
-        L = np.linalg.cholesky(G)
-        # Linear system solution
-        y1 = np.transpose(np.dot(H_h, y))
-        y2 = solve_triangular(L, y1, lower=True)
-        x = solve_triangular(np.asmatrix(L).H, y2)
-
-        H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C')
-        G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C')
-        N = np.column_stack((N.real, N.imag)).astype(my_type).flatten()
-        H = np.column_stack((H.real, H.imag)).astype(my_type).flatten()
-        G = np.column_stack((G.real, G.imag)).astype(my_type).flatten()
-        x = np.column_stack((x.real, x.imag)).astype(my_type).flatten()
-        y = np.column_stack((y.real, y.imag)).astype(my_type).flatten()
-
-        vH[k, :] = H
-        vG[k, :] = G
-        vy[k, :] = y
-        vN[k, :] = N
-        vx[k, :] = x
-
-    vN = np.reshape(vN, (2 * N_tx * N_itr)).astype(my_type)
-    vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(my_type)
-    vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(my_type)
-    vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(my_type)
-    vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(my_type)
-
-    return vN, vH, vG, vy, vx
-
-
-def generate_mimo_mmse_q16(N_tx, N_rx, N_itr):
-
-    vN = np.zeros([N_itr, 2 * N_tx], dtype=np.int16)
-    vH = np.zeros([N_itr, 2 * N_tx * N_rx], dtype=np.int16)
-    vG = np.zeros([N_itr, 2 * N_tx * N_tx], dtype=np.int16)
-    vy = np.zeros([N_itr, 2 * N_rx], dtype=np.int16)
-    vx = np.zeros([N_itr, 2 * N_tx], dtype=np.int16)
-    MAX = 2**15
-    for k in range(N_itr):
-        # Create channel matrix
-        rH = np.random.randint(-MAX, MAX - 1, N_rx * N_tx, dtype=np.int16)
-        iH = np.random.randint(-MAX, MAX - 1, N_rx * N_tx, dtype=np.int16)
-        H = rH + 1.j * iH
-        # Create input vector
-        y = np.random.randint(-MAX, MAX - 1, N_rx, dtype=np.int16) + 1.j * \
-            np.random.randint(-MAX, MAX - 1, N_rx, dtype=np.int16)
-        # Generate noise variance
-        N = np.random.randint(-MAX, MAX - 1, N_tx, dtype=np.int16)
-
-        H = H.reshape(N_rx, N_tx)
-        # Matrix to be inverted in MMSE estimator
-        H_h = (np.asmatrix(H).H)
-        # Hermitian
-        G = np.matmul(H_h, H) + N
-
-        # Matrix vector product
-        y1 = np.transpose(np.dot(H_h, y))
-        # Cholesky decomposition
-        # L = np.linalg.cholesky(G)
-        L = G
-        # Linear system solution
-        y2 = solve_triangular(L, y1, lower=True)
-        x = solve_triangular(np.asmatrix(L).H, y2)
-
-        vN[k, :] = np.column_stack((N.real, N.imag)).astype(np.int16).flatten()
-        vH[k, :] = np.column_stack((H.real, H.imag)).astype(np.int16).flatten()
-        vG[k, :] = np.column_stack((G.real, G.imag)).astype(np.int16).flatten()
-        vy[k, :] = np.column_stack((y.real, y.imag)).astype(np.int16).flatten()
-        vx[k, :] = np.column_stack((x.real, x.imag)).astype(np.int16).flatten()
-
-    vN = np.reshape(vN, (2 * N_tx * N_itr)).astype(np.int16)
-    vH = np.reshape(vH, (2 * N_rx * N_tx * N_itr)).astype(np.int16)
-    vG = np.reshape(vG, (2 * N_tx * N_tx * N_itr)).astype(np.int16)
-    vy = np.reshape(vy, (2 * N_rx * N_itr)).astype(np.int16)
-    vx = np.reshape(vx, (2 * N_tx * N_itr)).astype(np.int16)
-
-    return vN, vH, vG, vy, vx
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-n",
-        "--transmitters",
-        type=int,
-        required=False,
-        default=4,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-m",
-        "--receivers",
-        type=int,
-        required=False,
-        default=32,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-k",
-        "--iterations",
-        type=int,
-        required=False,
-        default=1,
-        help='Iterations.'
-    )
-
-    args = parser.parse_args()
-    N_tx = args.transmitters
-    N_rx = args.receivers
-    N_itr = args.iterations
-
-    vN, vH, vG, vy, vx = generate_fmmse(
-        N_tx, N_rx, N_itr, np.float32)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f32.h.tpl"
-    kwargs = {'name': 'data_mimo_mmse_f32',
-              'H': vH,
-              'G': vG,
-              'N': vN,
-              'y': vy,
-              'x': vx,
-              'N_tx': N_tx,
-              'N_rx': N_rx,
-              'N_itr': N_itr}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    vN, vH, vG, vy, vx = generate_fmmse(
-        N_tx, N_rx, N_itr, np.float16)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f16.h.tpl"
-    kwargs = {'name': 'data_mimo_mmse_f16',
-              'H': vH,
-              'G': vG,
-              'N': vN,
-              'y': vy,
-              'x': vx,
-              'N_tx': N_tx,
-              'N_rx': N_rx,
-              'N_itr': N_itr}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    vN, vH, vG, vy, vx = generate_fmmse(
-        N_tx, N_rx, N_itr, np.float16)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f8.h.tpl"
-    kwargs = {'name': 'data_mimo_mmse_f8',
-              'H': ff.array(vH, "e5m2"),
-              'G': vG,
-              'N': ff.array(vN, "e5m2"),
-              'y': ff.array(vy, "e5m2"),
-              'x': vx,
-              'N_tx': N_tx,
-              'N_rx': N_rx,
-              'N_itr': N_itr}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-    vN, vH, vG, vy, vx = generate_mimo_mmse_q16(N_tx, N_rx, N_itr)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_q16.h.tpl"
-    kwargs = {'name': 'data_mimo_mmse_q16',
-              'H': vH,
-              'G': vG,
-              'N': vN,
-              'y': vy,
-              'x': vx,
-              'N_tx': N_tx,
-              'N_rx': N_rx,
-              'N_itr': N_itr}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/generate_ofdm.py b/software/data/generate_ofdm.py
deleted file mode 100644
index 64b0a7ca6..000000000
--- a/software/data/generate_ofdm.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-
-# Author: Marco Bertuletti, ETH Zurich
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-##################
-# compute_result #
-##################
-
-
-def compute_bitreversal(N, R):
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-    return tps
-
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"data_{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() / "data_ofdm.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-rx",
-        "--receivers",
-        type=int,
-        required=False,
-        default=64,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-bs",
-        "--beams",
-        type=int,
-        required=False,
-        default=32,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-sc",
-        "--subcarriers",
-        type=int,
-        required=False,
-        default=4096,
-        help='Iterations.'
-    )
-
-    args = parser.parse_args()
-    N_rx = args.receivers
-    N_bs = args.beams
-    N_sc = args.subcarriers
-
-    pFFT_src = (np.random.rand(2 * N_rx * N_sc)).astype(np.float16)
-    pTw_coef = (np.random.rand(int(3 * N_sc / 4))).astype(np.float16)
-    pBF_coef = (np.random.rand(2 * N_rx * N_bs)).astype(np.float16)
-    pBF_dst = (np.random.rand(2 * N_bs * N_sc)).astype(np.float16)
-
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(N_sc, 2)))
-
-    kwargs = {'name': 'ofdm',
-              'pFFT_src': pFFT_src,
-              'pTw_coef': pTw_coef,
-              'pBF_coef': pBF_coef,
-              'pBF_dst': pBF_dst,
-              'bitrev': Bitreversal,
-              'N_rx': N_rx,
-              'N_bs': N_bs,
-              'N_sc': N_sc,
-              'Log2Len': int(np.log2(N_sc)),
-              'BitrevLen': len(Bitreversal)}
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h
index 121267545..3b42bdb80 100644
--- a/software/kernels/baremetal/mempool_cholesky_f16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f16s.h
@@ -135,6 +135,9 @@ void mempool_cholesky_f16vecs(__fp16 *pSrc, __fp16 *pL, const uint32_t n,
         asm volatile("fccdotpex.s.h  %0, %1, %2;"
                      : "+&r"(asbs)
                      : "r"(cd), "r"(ab));
+        //        asm volatile("fcndotpex.s.h  %0, %1, %2;"
+        //                     : "+&r"(asbs)
+        //                     : "r"(cd), "r"(ab));
       }
       asm volatile("pv.shuffle2.h %0, %0, %[mask];"
                    : "+&r"(asbs)
diff --git a/software/kernels/baremetal/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h
index cd9134968..39bf46394 100644
--- a/software/kernels/baremetal/mempool_linearsolver_q16s.h
+++ b/software/kernels/baremetal/mempool_linearsolver_q16s.h
@@ -28,7 +28,7 @@ void mempool_Ltrisol_q16vecs(int16_t *pL, int16_t *y, int16_t *x,
   // Solve for each variable x[i] in loop
   for (i = 0; i < n; i++) {
     uint32_t ridx = transposed ? (n - i - 1) : i;
-    diag = pL[2U * (ridx * offset + ridx)];
+    diag = pL[2U * (ridx * offset + ridx)]; // diagonal entry (ridx, ridx)
     // Initialize the sums
     as = 0;
     bs = 0;
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
index 45076f9fe..91e3aa789 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
@@ -254,38 +254,40 @@ void mempool_hermitian_f16vecs(__fp16 *pH, __fp16 *pG, __fp16 *pS,
   for (i = 0; i < n_tx; i++) {
 
     if (n_tx % 4 != 0) {
-      as0 = 0.0f; // Initialize the real part of sums
-      bs0 = 0.0f; // Initialize the imag part of sums
-      // Inner Loop
-      for (k = 0; k < n_rx; k++) {
-        ab = (*(v2h *)&pH[2U * (k * n_tx + i)]);
-        cd0 = (*(v2h *)&pH[2U * (k * n_tx + j)]);
-        // dotproducts (ac + bd) + j (ad - bc)
-        asm volatile(
-            // a * c + b * d
-            "vfdotpex.s.h  %[as0], %[ab], %[cd0];"
-            "pv.shuffle2.h  %[cd0], %[cd0], %[shuffle_mask];"
-            "xor %[cd0], %[neg_mask], %[cd0];"
-            // a * d - b * c
-            "vfdotpex.s.h  %[bs0], %[ab], %[cd0];"
-            : [cd0] "+&r"(cd0), [as0] "+&r"(as0), [bs0] "+&r"(bs0)
-            : [ab] "r"(ab), [neg_mask] "r"(neg_mask),
-              [shuffle_mask] "r"(shuffle_mask)
-            :);
-      }
-      // Store
-      v2h res0;
-      asm volatile("vfcpka.h.s %0, %1, %2;"
-                   : "=&r"(res0)
-                   : "r"(as0), "r"(bs0)
-                   :);
-      if (zf == 0) {
-        asm volatile("and     %0, %0, %1;" : "+&r"(res0) : "r"(0x0000FFFF));
-        asm volatile("fadd.h  %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i]));
+      for (j = 0; j < n_tx; j++) {
+        as0 = 0.0f; // Accumulator for the real part of entry (i, j)
+        bs0 = 0.0f; // Accumulator for the imaginary part of entry (i, j)
+        // Inner loop: walk the n_rx rows of pH, pairing columns i and j
+        for (k = 0; k < n_rx; k++) {
+          ab = (*(v2h *)&pH[2U * (k * n_tx + i)]);
+          cd0 = (*(v2h *)&pH[2U * (k * n_tx + j)]);
+          // dot products: real = a*c + b*d, imaginary = a*d - b*c
+          asm volatile(
+              // a * c + b * d
+              "vfdotpex.s.h  %[as0], %[ab], %[cd0];"
+              "pv.shuffle2.h  %[cd0], %[cd0], %[shuffle_mask];"
+              "xor %[cd0], %[neg_mask], %[cd0];"
+              // a * d - b * c
+              "vfdotpex.s.h  %[bs0], %[ab], %[cd0];"
+              : [cd0] "+&r"(cd0), [as0] "+&r"(as0), [bs0] "+&r"(bs0)
+              : [ab] "r"(ab), [neg_mask] "r"(neg_mask),
+                [shuffle_mask] "r"(shuffle_mask)
+              :);
+        }
+        // Pack the two accumulated sums (as0, bs0) into one v2h result
+        v2h res0;
+        asm volatile("vfcpka.h.s %0, %1, %2;"
+                     : "=&r"(res0)
+                     : "r"(as0), "r"(bs0)
+                     :);
+        if (zf == 0) {
+          asm volatile("and     %0, %0, %1;" : "+&r"(res0) : "r"(0x0000FFFF));
+          asm volatile("fadd.h  %0, %0, %1;" : "+&r"(res0) : "r"(pS[2 * i]));
+        }
+        // Store entry (i, j) of pG; row stride is N_BANKS if folded, else n_tx
+        uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j);
+        (*(v2h *)&pG[addr]) = res0;
+      }
-      // Store
-      uint32_t addr = folded ? 2 * (i * N_BANKS + j) : 2 * (i * n_tx + j);
-      (*(v2h *)&pG[addr]) = res0;
 
     } else {
       // UNROLL_4
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
index baad28e0d..70d77b82d 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f32s.h
@@ -127,8 +127,7 @@ void mempool_hermitian_f32s(float *pH, float *pG, float *pS,
   @return        none
 */
 void mempool_MVP_conjtransp_f32s(float *pH, float *px, float *py,
-                                 const uint32_t n_rx, const uint32_t n_tx,
-                                 const uint32_t folded) {
+                                 const uint32_t n_rx, const uint32_t n_tx) {
 
   uint32_t i, j;
   float a0, a1, a2, a3;
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
index 0d68e3d9d..3ce36f3b6 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
@@ -99,25 +99,22 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
 
       // s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
       // s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
-      "vfdotpex.s.h  %[s0],%[CoSi1],%[D];"
-      "vfdotpex.s.h  %[s1],%[C1],%[D];"
-
+      "vfdotpex.s.r.h  %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.r.h  %[s1],%[C1],%[D];"
       // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
       // s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h  %[s2],%[CoSi2],%[B];"
-      "vfdotpex.s.h  %[s3],%[C2],%[B];"
-
+      "vfdotpex.s.r.h  %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.r.h  %[s3],%[C2],%[B];"
       // s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
       // s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
-      "vfdotpex.s.h  %[s4],%[CoSi3],%[C];"
-      "vfdotpex.s.h  %[s5],%[C3],%[C];"
-
+      "vfdotpex.s.r.h  %[s4],%[CoSi3],%[C];"
+      "vfdotpex.s.r.h  %[s5],%[C3],%[C];"
       // xb', yb'
-      "vfcpka.h.s %[B], %[s1], %[s0];"
+      "vfcpka.h.s %[D], %[s1], %[s0];"
       // xc', yc'
-      "vfcpka.h.s %[C], %[s3], %[s2];"
+      "vfcpka.h.s %[B], %[s3], %[s2];"
       // xd', yd'
-      "vfcpka.h.s %[D], %[s5], %[s4];"
+      "vfcpka.h.s %[C], %[s5], %[s4];"
       : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
         [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
         [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
@@ -127,9 +124,9 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
         [neg_mask] "r"(0x3C00BC00)
       :);
   *((v2h *)&pOut[i0_store * 2U]) = A;
-  *((v2h *)&pOut[i1_store * 2U]) = C;
-  *((v2h *)&pOut[i2_store * 2U]) = B;
-  *((v2h *)&pOut[i3_store * 2U]) = D;
+  *((v2h *)&pOut[i1_store * 2U]) = B;
+  *((v2h *)&pOut[i2_store * 2U]) = D;
+  *((v2h *)&pOut[i3_store * 2U]) = C;
 }
 
 /**
@@ -227,18 +224,18 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
 
       // s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
       // s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
-      "vfdotpex.s.h  %[s0],%[CoSi1],%[D];"
-      "vfdotpex.s.h  %[s1],%[C1],%[D];"
+      "vfdotpex.s.r.h  %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.r.h  %[s1],%[C1],%[D];"
 
       // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
       // s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h  %[s2],%[CoSi2],%[B];"
-      "vfdotpex.s.h  %[s3],%[C2],%[B];"
+      "vfdotpex.s.r.h  %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.r.h  %[s3],%[C2],%[B];"
 
       // s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
       // s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
-      "vfdotpex.s.h  %[s4],%[CoSi3],%[C];"
-      "vfdotpex.s.h  %[s5],%[C3],%[C];"
+      "vfdotpex.s.r.h  %[s4],%[CoSi3],%[C];"
+      "vfdotpex.s.r.h  %[s5],%[C3],%[C];"
 
       // xb', yb'
       "vfcpka.h.s %[B], %[s1], %[s0];"
diff --git a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
index e7bd7edc5..c6b4acf6b 100644
--- a/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
@@ -60,14 +60,14 @@
 #endif
 
 void mempool_radix4_cfft_f16p(__fp16 *pSrc16, uint32_t fftLen,
-                              const __fp16 *pCoef16, uint32_t twidCoefModifier,
-                              uint32_t nPE) {
+                              const __fp16 *pCoef16, uint32_t nPE) {
   uint32_t absolute_core_id = mempool_get_core_id();
   uint32_t core_id = absolute_core_id % nPE;
   __fp16 t0, t1, t2, t3, t4, t5;
   v2h CoSi1, CoSi2, CoSi3;
   v2h C1, C2, C3;
   uint32_t n1, n2, ic, i0, j, k;
+  uint32_t twidCoefModifier = 1;
   uint32_t step, steps;
 
   /* START OF FIRST STAGE PROCESSING */
@@ -165,17 +165,17 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
   uint32_t n1, n2;
   uint32_t i0, k, ic;
   __fp16 *pTmp;
-  uint32_t twidCoefModifier = 1U;
+  uint32_t twidCoefModifier = 1;
 #endif
 
   /* START OF FIRST STAGE PROCESSING */
   n1 = fftLen;
-  n2 = n1 >> 2U;
+  n2 = n1 >> 2;
   for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
 #ifdef FOLDED_TWIDDLES
     ic = i0;
-    ic_store = ic >> 2U;
-    n2_store = n2 >> 2U;
+    ic_store = ic >> 2;
+    n2_store = n2 >> 2;
 #else
     ic = i0;
 #endif
@@ -192,22 +192,22 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
   pCoef_src = pCoef_dst;
   pCoef_dst = pTmp;
 #else
-  twidCoefModifier <<= 2U;
+  twidCoefModifier <<= 2;
 #endif
   mempool_log_partial_barrier(2, absolute_core_id, nPE);
   /* END OF FIRST STAGE PROCESSING */
 
   /* START OF MIDDLE STAGE PROCESSING */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+  for (k = fftLen / 4U; k > 4; k >>= 2) {
     n1 = n2;
-    n2 >>= 2U;
+    n2 >>= 2;
     for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) {
 #ifdef FOLDED_TWIDDLES
       ic = i0;
       // (ic % n2) / 4 take only every 4th index in the wing
       // (ic / n2) * n2 shift of the wing size
       ic_store = ((ic % n2) >> 2) + (ic / n2) * n2;
-      n2_store = n2 >> 2U;
+      n2_store = n2 >> 2;
 #else
       ic = (i0 % n2) * twidCoefModifier;
 #endif
@@ -224,7 +224,7 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
     pCoef_src = pCoef_dst;
     pCoef_dst = pTmp;
 #else
-    twidCoefModifier <<= 2U;
+    twidCoefModifier <<= 2;
 #endif
     mempool_log_partial_barrier(2, absolute_core_id, nPE);
   }