diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
index 28fcd7871..2f4270c80 100644
--- a/software/apps/baremetal/cfft_radix4_f16/main.c
+++ b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -21,29 +21,18 @@
 #include "data/data_cfft_radix4_f16.h"
 
 /*
-  CHOOSE ONE
-   - SINGLE:    Single core FFT
-   - PARALLEL:  Parallel FFT not "memory-aware"
    - FOLDED:    Parallel FFT with "memory-aware" load/store scheme
    - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
-  load/store scheme
+   load/store scheme
       - N_FFTs_COL: Independent FFTs scheduled on one row (default 1)
       - N_FFTs_ROW: Independent FFTs scheduled on columns (default 1)
-      (OPTIONALLY ENABLE)
       - FOLDED_TWIDDLES: Also the twiddles have "memory-aware" load/stores
-      - BITREVERSETABLE: The bitreversal indeces are loaded from a table
-      - ASM:             Use asm_volatile statements
 */
 
 #define FOLDED
 #define FOLDED_TWIDDLES
-#define BITREVERSETABLE
-#define ASM
-
-#if !(defined(N_FFT_ROW) && defined(N_FFTs_COL))
 #define N_FFTs_ROW 1
 #define N_FFTs_COL 1
-#endif
 
 #include "kernel/mempool_checks.h"
 #include "kernel/mempool_radix4_cfft_butterfly_f16.h"
@@ -61,98 +50,82 @@ __fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/* MAIN */
 int main() {
+  // Core 0 performs the DMA transfers; twiddle folding is split across cores.
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   mempool_barrier_init(core_id);
+  __fp16 *pRes = (__fp16 *)0;
 
-  ///////////////////////////////////////////////////////////////////////////////////////////////////
-  /* INITIALIZATION */
   if (core_id == 0) {
     // Each FFT is folded over 4 memory rows
-    // Each memory row is 2 * N_BANKS samples
+    // Each memory row is 2 * N_BANKS (real-imag) samples
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
-                          (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t));
+                          (N_CSAMPLES * N_FFTs_COL) * sizeof(int32_t));
     }
+    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                         BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
     dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
                         3 * (N_CSAMPLES / 4) * sizeof(int32_t));
   }
-  // Initialize the Twiddles folded
+// Fold the twiddles into L1: three groups spaced N_BANKS v2h entries apart
 #ifdef FOLDED_TWIDDLES
   for (uint32_t j = 0; j < N_FFTs_COL; j++) {
     uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
-      *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4))] =
+      *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * N_WORDS_COL)] =
           *(v2h *)&l2_twiddleCoef_f16[2U * i];
-      *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4) +
-                                            1 * N_BANKS)] =
+      *(v2h *)&l1_twiddleCoef_f16_src[2U *
+                                      (i + j * N_WORDS_COL + 1 * N_BANKS)] =
           *(v2h *)&l2_twiddleCoef_f16[2U * (i * 2U)];
-      *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * (N_CSAMPLES / 4) +
-                                            2 * N_BANKS)] =
+      *(v2h *)&l1_twiddleCoef_f16_src[2U *
+                                      (i + j * N_WORDS_COL + 2 * N_BANKS)] =
           *(v2h *)&l2_twiddleCoef_f16[2U * (i * 3U)];
     }
   }
 #endif
-  mempool_barrier(num_cores);
-
   if (core_id == 0) {
-    printf("On the run...\n");
+    printf("01: END INITIALIZATION\n");
   }
   mempool_barrier(num_cores);
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/* MULTI-CORE FOLDED */
-#ifdef FOLDED
-  __fp16 *pRes = NULL;
+#if (defined(FOLDED) && defined(FOLDED_TWIDDLES))
   if (core_id < (N_CSAMPLES / 16)) {
     mempool_start_benchmark();
-#ifdef FOLDED_TWIDDLES
-    mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
+    mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, N_CSAMPLES,
                                     l1_twiddleCoef_f16_src,
                                     l1_twiddleCoef_f16_dst, (N_CSAMPLES / 16));
-#else
-    mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
-                                    l1_twiddleCoef_f16_src, (N_CSAMPLES / 16));
-#endif
     pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
     mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
                                       l1_BitRevIndexTable, (N_CSAMPLES / 16));
     mempool_stop_benchmark();
   }
-  mempool_barrier(num_cores);
 #endif
 
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/* MULTI-CORE SCHEDULED */
 #ifdef SCHEDULED
-  __fp16 *pRes = NULL;
-  if (core_id < N_FFTs_COL * (N_CSAMPLES / 16)) {
+  uint32_t nPE = (N_CSAMPLES / 16);
+  if (core_id < N_FFTs_COL * nPE) {
     mempool_start_benchmark();
-    uint32_t col_fftLen = N_CSAMPLES / 4;
-    uint32_t col_id = core_id / (N_CSAMPLES / 16);
-    // Distribute FFTs over columns
-    if (col_id < N_FFTs_COL) {
-      mempool_radix4_cfft_f16p_scheduler(
-          l1_pSrc, l1_pDst, N_CSAMPLES, l1_pCoef_src + 2 * col_id * col_fftLen,
-          l1_pCoef_dst + 2 * col_id * col_fftLen, l1_pRevT16,
-          BITREVINDEXTABLE_FIXED_TABLE_LENGTH, 1, (N_CSAMPLES / 16));
-    }
-    pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
-    mempool_log_partial_barrier(2, core_id, N_FFTs_COL * (N_CSAMPLES / 16));
+    uint32_t N_WORDS_COL = N_CSAMPLES / 4;
+    uint32_t col_id = core_id / nPE;
+    mempool_radix4_cfft_f16p_scheduler(
+        l1_pSrc, l1_pDst, N_CSAMPLES, N_FFTs_ROW, N_FFTs_COL,
+        l1_twiddleCoef_f16_src + 2 * col_id * N_WORDS_COL,
+        l1_twiddleCoef_f16_dst + 2 * col_id * N_WORDS_COL, l1_BitRevIndexTable,
+        BITREVINDEXTABLE_LENGTH, 1, nPE);
+    pRes = l1_pDst;
+    mempool_log_partial_barrier(2, core_id, N_FFTs_COL * nPE);
     mempool_stop_benchmark();
   }
-  mempool_barrier(num_cores);
 #endif
 
-  ///////////////////////////////////////////////////////////////////////////////////////////////////
-  /* CHECK */
-  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5f, 0);
   mempool_barrier(num_cores);
-
+  if (core_id == 0) {
+    printf("02: END COMPUTATION\n");
+  }
+  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0);
+  mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/apps/cfft_radix4_q16/main.c b/software/apps/cfft_radix4_q16/main.c
new file mode 100644
index 000000000..e69de29bb
diff --git a/software/runtime/data/data_cfft_f16.h.tpl b/software/runtime/data/data_cfft_f16.h.tpl
index 7cfcda159..d21829e88 100644
--- a/software/runtime/data/data_cfft_f16.h.tpl
+++ b/software/runtime/data/data_cfft_f16.h.tpl
@@ -30,8 +30,6 @@
 
 #define LOG2 (${Log2Len})
 #define N_CSAMPLES (${Len})
-#define N_RSAMPLES (2 * N_CSAMPLES)
-#define N_TWIDDLES (3 * N_CSAMPLES / 4)
 #define N_BANKS (NUM_CORES * BANKING_FACTOR)
 #define BITREVINDEXTABLE_LENGTH (${BitrevLen})
 
diff --git a/software/runtime/data/data_cfft_q16.h.tpl b/software/runtime/data/data_cfft_q16.h.tpl
index fde7f2b71..fb1ba908a 100644
--- a/software/runtime/data/data_cfft_q16.h.tpl
+++ b/software/runtime/data/data_cfft_q16.h.tpl
@@ -31,8 +31,6 @@
 %> \
 #define LOG2 (${Log2Len})
 #define N_CSAMPLES (${Len})
-#define N_RSAMPLES (2 * N_CSAMPLES)
-#define N_TWIDDLES (3 * N_CSAMPLES / 4)
 #define N_BANKS (NUM_CORES * BANKING_FACTOR)
 #define BITREVINDEXTABLE_LENGTH (${BitrevLen})
 
diff --git a/software/runtime/kernel/mempool_radix2_cfft_q16s.h b/software/runtime/kernel/mempool_radix2_cfft_q16s.h
new file mode 100644
index 000000000..e69de29bb
diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h
index 3e7a28245..7c305b222 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h
+++ b/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h
@@ -4,6 +4,7 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
+#pragma once
 #include "xpulp/builtins_v2.h"
 
 /**
diff --git a/software/runtime/kernel/mempool_radix4_cfft_f16p.h b/software/runtime/kernel/mempool_radix4_cfft_f16p.h
index 8682e6553..5699480fb 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_f16p.h
+++ b/software/runtime/kernel/mempool_radix4_cfft_f16p.h
@@ -4,6 +4,8 @@
 
 // Author: Marco Bertuletti, ETH Zurich
 
+#pragma once
+#define BITREVERSETABLE
 #include "xpulp/builtins_v2.h"
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
@@ -58,7 +60,6 @@
   CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)];
 #endif
 
-#ifdef FOLDED_TWIDDLES
 /**
   @brief         Full FFT butterfly
   @param[in]     pSrc16  points to input buffer of 16b data, Re and Im parts are
@@ -73,25 +74,8 @@
 */
 void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
                                      uint32_t fftLen, __fp16 *pCoef_src,
-                                     __fp16 *pCoef_dst, uint32_t nPE)
-#else
-/**
-  Twiddles are not folded in memory
-  @brief         Full FFT butterfly
-  @param[in]     pSrc16  points to input buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[out]    pDst16  points to output buffer of 16b data, Re and Im parts
-  are interleaved
-  @param[in]     fftLen  Length of the complex input vector
-  @param[in]     pCoef_src Twiddle coefficients vector
-  @param[in]     nPE Number of PE
-  @return        pointer to output vector
-*/
-void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
-                                     uint32_t fftLen, __fp16 *pCoef_src,
-                                     uint32_t nPE)
-#endif
-{
+                                     __fp16 __attribute__((unused)) * pCoef_dst,
+                                     uint32_t nPE) {
 
   uint32_t absolute_core_id = mempool_get_core_id();
   uint32_t core_id = absolute_core_id;
@@ -218,8 +202,9 @@ void mempool_radix4_cfft_f16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
 */
 
 void mempool_radix4_cfft_f16p_scheduler(
-    __fp16 *pSrc16, __fp16 *pDst16, uint32_t fftLen, __fp16 *pCoef_src,
-    __fp16 *pCoef_dst, __attribute__((unused)) uint16_t *pBitRevTable,
+    __fp16 *pSrc16, __fp16 *pDst16, uint32_t fftLen, uint32_t n_FFTs_ROW,
+    uint32_t n_FFTs_COL, __fp16 *pCoef_src, __fp16 *pCoef_dst,
+    __attribute__((unused)) uint16_t *pBitRevTable,
     __attribute__((unused)) uint16_t bitReverseLen, uint8_t bitReverseFlag,
     uint32_t nPE) {
 
@@ -251,7 +236,7 @@ void mempool_radix4_cfft_f16p_scheduler(
 #endif
     LOAD_STORE_TWIDDLEFACT;
     SHUFFLE_TWIDDLEFACT;
-    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+    for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
       __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen;
       __fp16 *pOut =
           pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
@@ -282,7 +267,7 @@ void mempool_radix4_cfft_f16p_scheduler(
       LOAD_STORE_TWIDDLEFACT;
       SHUFFLE_TWIDDLEFACT;
 
-      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
         __fp16 *pIn =
             pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
         __fp16 *pOut =
@@ -297,12 +282,12 @@ void mempool_radix4_cfft_f16p_scheduler(
     pTmp = pCoef_src;
     pCoef_src = pCoef_dst;
     pCoef_dst = pTmp;
-    mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE);
+    mempool_log_partial_barrier(2, absolute_core_id, n_FFTs_COL * nPE);
   }
 
   /*  LAST STAGE */
   for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) {
-    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+    for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
       __fp16 *pIn =
           pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
       __fp16 *pOut =
@@ -313,7 +298,7 @@ void mempool_radix4_cfft_f16p_scheduler(
   pTmp = pSrc16;
   pSrc16 = pDst16;
   pDst16 = pTmp;
-  mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE);
+  mempool_log_partial_barrier(2, absolute_core_id, n_FFTs_COL * nPE);
   mempool_stop_benchmark();
   mempool_start_benchmark();
   /* BITREVERSAL */
@@ -362,7 +347,7 @@ void mempool_radix4_cfft_f16p_scheduler(
       b2_load = (b2 % 4) * 2 * N_BANKS + 2 * (b2 / 4);
       b3_load = (b3 % 4) * 2 * N_BANKS + 2 * (b3 / 4);
       b4_load = (b4 % 4) * 2 * N_BANKS + 2 * (b4 / 4);
-      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
         uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8));
         uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8));
         // Load at address a
@@ -410,7 +395,7 @@ void mempool_radix4_cfft_f16p_scheduler(
         idx2 = idx2 >> 1U;
         idx3 = idx3 >> 1U;
       }
-      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
         uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS;
         uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS;
         uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS;