diff --git a/software/apps/baremetal/dotp_i32/define.h b/software/apps/baremetal/dotp_i32/define.h
deleted file mode 100644
index d2b069d21..000000000
--- a/software/apps/baremetal/dotp_i32/define.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#define LEN (1024)
-#define N_PE (NUM_CORES)
-#define N_BANK (NUM_CORES * 4)
-#define N_BANK_PE (N_PE * 4)
-
-/* Enable log barriers */
-#define LOG_BARRIERS
-
-/* STEP core 0 reduction */
-#define STEP (256)
-#define STEP_CORES (STEP / 4)
-
-//////////////////////////////////
-/* SELECT ONE */
-
-// #define SINGLE
-// #define SINGLE_UNROLLED
-
-// #define PARALLEL
-// #define PARALLEL_UNROLLED
-
-// #define PARALLEL_LOCAL
-// #define LOCAL_UNROLLED
-
-// #define PARALLEL_RED0
-// #define PARALLEL_UNROLLED_RED0
-
-// #define PARALLEL_REDTREE
-// #define PARALLEL_UNROLLED_REDTREE
-
-//////////////////////////////////
-
-// Vectors for kernel computation
-int32_t vector_a[LEN] __attribute__((aligned(LEN), section(".l1")));
-int32_t vector_b[LEN] __attribute__((aligned(LEN), section(".l1")));
-
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-int32_t sum[N_BANK] __attribute__((aligned(N_BANK), section(".l1")));
-#else
-int32_t sum __attribute__((section(".l1")));
-#endif
-
-// Vectors for performance metrics
-uint32_t volatile red_barrier[NUM_CORES * 4]
-    __attribute__((aligned(NUM_CORES * 4), section(".l1")));
-int32_t result __attribute__((section(".l1")));
-int32_t check __attribute__((section(".l1")));
-int volatile error __attribute__((section(".l1")));
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel.h b/software/apps/baremetal/dotp_i32/dotp_parallel.h
deleted file mode 100644
index b765f6987..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/* Parallel dot-product */
-void dotp_parallel(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
-                   uint32_t nPE) {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t step = Len / nPE;
-
-  register int32_t local_sum = 0;
-  register int32_t a, b;
-  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
-    a = in_a[i];
-    b = in_b[i];
-    local_sum += a * b;
-  }
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-  mempool_log_barrier(2, core_id);
-  (void)num_cores;
-#else
-  mempool_barrier(num_cores);
-#endif
-}
-
-/* Parallel dot-product */
-void dotp_parallel_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
-                             uint32_t Len, uint32_t nPE) {
-
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  uint32_t step = Len / nPE;
-  uint32_t reminder = step % 4;
-  uint32_t i;
-
-  register int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0,
-                   b3 = 0;
-  register int32_t local_sum0 = 0;
-  register int32_t local_sum1 = 0;
-  register int32_t local_sum2 = 0;
-  register int32_t local_sum3 = 0;
-  for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
-    a0 = in_a[i];
-    b0 = in_b[i];
-    a1 = in_a[i + 1];
-    b1 = in_b[i + 1];
-    a2 = in_a[i + 2];
-    b2 = in_b[i + 2];
-    a3 = in_a[i + 3];
-    b3 = in_b[i + 3];
-    local_sum0 += a0 * b0;
-    local_sum1 += a1 * b1;
-    local_sum2 += a2 * b2;
-    local_sum3 += a3 * b3;
-  }
-  i = core_id * step + step - reminder;
-  while (i < step) {
-    a0 = in_a[i];
-    b0 = in_b[i];
-    local_sum0 += a0 * b0;
-    i++;
-  }
-  local_sum0 += local_sum1;
-  local_sum2 += local_sum3;
-  local_sum0 += local_sum2;
-  mempool_barrier(num_cores);
-
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-  mempool_log_barrier(2, core_id);
-#else
-  mempool_barrier(num_cores);
-#endif
-}
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h b/software/apps/baremetal/dotp_i32/dotp_parallel_local.h
deleted file mode 100644
index 950955832..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel_local.h
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/*
-  Parallel dot-product with final reduction performed by multiple cores
-  using atomic-fetch and adds to a single memory location.
-  A) Parallelized workload
-  B) Atomic fetch and add to a single memory location
-  C) Barrier */
-
-/*******************************************************/
-/**                    MULTI-CORE                     **/
-/*******************************************************/
-
-/* Parallel dot-product */
-void dotp_parallel_local(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
-                         uint32_t nPE) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-
-  if (nPE == num_cores) {
-    register int32_t local_sum = 0;
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      local_sum += in_a[idx] * in_b[idx];
-      local_sum += in_a[idx + 1] * in_b[idx + 1];
-      local_sum += in_a[idx + 2] * in_b[idx + 2];
-      local_sum += in_a[idx + 3] * in_b[idx + 3];
-      idx += N_BANK;
-    }
-    if (core_id == (Len % N_BANK) / 4) {
-      while (idx < Len) {
-        local_sum += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    mempool_stop_benchmark();
-    mempool_start_benchmark();
-    __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_barrier(2, core_id);
-#else
-    mempool_barrier(num_cores);
-#endif
-  } else {
-    register int32_t local_sum = 0;
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      local_sum += in_a[idx] * in_b[idx];
-      local_sum += in_a[idx + 1] * in_b[idx + 1];
-      local_sum += in_a[idx + 2] * in_b[idx + 2];
-      local_sum += in_a[idx + 3] * in_b[idx + 3];
-      idx += N_BANK_PE;
-    }
-    if (core_id == (Len % N_BANK_PE) / 4) {
-      while (idx < Len) {
-        local_sum += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    if (core_id < nPE) {
-      mempool_stop_benchmark();
-      mempool_start_benchmark();
-    }
-    __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_partial_barrier(2, core_id, nPE);
-#else
-    mempool_barrier(num_cores);
-#endif
-  }
-}
-
-/* Parallel dot-product with loop unrolling */
-void dotp_parallel_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
-                                   uint32_t Len, uint32_t nPE) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  register int32_t local_sum_1 = 0;
-  register int32_t local_sum_2 = 0;
-  register int32_t local_sum_3 = 0;
-  register int32_t local_sum_4 = 0;
-
-  if (nPE == num_cores) {
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      int32_t in_a1 = in_a[idx];
-      int32_t in_b1 = in_b[idx];
-      int32_t in_a2 = in_a[idx + 1];
-      int32_t in_b2 = in_b[idx + 1];
-      int32_t in_a3 = in_a[idx + 2];
-      int32_t in_b3 = in_b[idx + 2];
-      int32_t in_a4 = in_a[idx + 3];
-      int32_t in_b4 = in_b[idx + 3];
-      local_sum_1 += in_a1 * in_b1;
-      local_sum_2 += in_a2 * in_b2;
-      local_sum_3 += in_a3 * in_b3;
-      local_sum_4 += in_a4 * in_b4;
-      idx += N_BANK;
-    }
-    if (core_id == ((Len % N_BANK) / 4)) {
-      while (idx < Len) {
-        local_sum_1 += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    local_sum_1 += local_sum_2;
-    local_sum_3 += local_sum_4;
-    local_sum_1 += local_sum_3;
-    mempool_stop_benchmark();
-    mempool_start_benchmark();
-    __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_barrier(2, core_id);
-#else
-    mempool_barrier(num_cores);
-#endif
-  } else {
-    uint32_t idx = core_id * 4;
-    while (idx < idx_stop) {
-      int32_t in_a1 = in_a[idx];
-      int32_t in_b1 = in_b[idx];
-      int32_t in_a2 = in_a[idx + 1];
-      int32_t in_b2 = in_b[idx + 1];
-      int32_t in_a3 = in_a[idx + 2];
-      int32_t in_b3 = in_b[idx + 2];
-      int32_t in_a4 = in_a[idx + 3];
-      int32_t in_b4 = in_b[idx + 3];
-      local_sum_1 += in_a1 * in_b1;
-      local_sum_2 += in_a2 * in_b2;
-      local_sum_3 += in_a3 * in_b3;
-      local_sum_4 += in_a4 * in_b4;
-      idx += N_BANK_PE;
-    }
-    if (core_id == ((Len % N_BANK_PE) / 4)) {
-      while (idx < Len) {
-        local_sum_1 += in_a[idx] * in_b[idx];
-        idx++;
-      }
-    }
-    local_sum_1 += local_sum_2;
-    local_sum_3 += local_sum_4;
-    local_sum_1 += local_sum_3;
-    mempool_stop_benchmark();
-    mempool_start_benchmark();
-    __atomic_fetch_add(&s[0], local_sum_1, __ATOMIC_RELAXED);
-#ifdef LOG_BARRIERS
-    mempool_log_partial_barrier(2, core_id, nPE);
-#else
-    mempool_barrier(num_cores);
-#endif
-  }
-}
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h b/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h
deleted file mode 100644
index 0ad166d41..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel_red0.h
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/*
-  Parallel dot-product with atomic fetch and add towards local memory
-  locations and final reduction by a single core. The cores write in
-  memory banks separated by a "step".
-  A) Parallelized workload
-  B) Atomic fetch and add to local memory banks
-  C) Barrier
-  D) Final reduction by core 0 incorporated in a barrier */
-
-/*******************************************************/
-/**                    MULTI-CORE                     **/
-/*******************************************************/
-
-/* Parallel dot-product */
-void dotp_parallel_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
-                        uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  int32_t local_sum = 0;
-
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    local_sum += in_a[idx] * in_b[idx];
-    local_sum += in_a[idx + 1] * in_b[idx + 1];
-    local_sum += in_a[idx + 2] * in_b[idx + 2];
-    local_sum += in_a[idx + 3] * in_b[idx + 3];
-    idx += N_BANK;
-  }
-  if (core_id == (Len % N_BANK) / 4) {
-    while (idx < Len) {
-      local_sum += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum,
-                     __ATOMIC_RELAXED);
-  mempool_stop_benchmark();
-
-  mempool_start_benchmark();
-  if ((num_cores - 1) ==
-      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
-    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
-    __sync_synchronize(); // Full memory barrier
-    uint32_t idx_red = 0;
-    local_sum = 0;
-    while (idx_red < N_BANK) {
-      local_sum += s[idx_red];
-      idx_red += STEP;
-    }
-    s[0] = local_sum;
-    wake_up_all();
-  }
-  mempool_wfi();
-}
-
-/* Parallel dot-product with loop unrolling */
-void dotp_parallel_unrolled4_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
-                                  uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  uint32_t num_cores = mempool_get_core_count();
-  int32_t local_sum_1 = 0;
-  int32_t local_sum_2 = 0;
-  int32_t local_sum_3 = 0;
-  int32_t local_sum_4 = 0;
-
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    int32_t in_a1 = in_a[idx];
-    int32_t in_b1 = in_b[idx];
-    int32_t in_a2 = in_a[idx + 1];
-    int32_t in_b2 = in_b[idx + 1];
-    int32_t in_a3 = in_a[idx + 2];
-    int32_t in_b3 = in_b[idx + 2];
-    int32_t in_a4 = in_a[idx + 3];
-    int32_t in_b4 = in_b[idx + 3];
-    local_sum_1 += in_a1 * in_b1;
-    local_sum_2 += in_a2 * in_b2;
-    local_sum_3 += in_a3 * in_b3;
-    local_sum_4 += in_a4 * in_b4;
-    idx += N_BANK;
-  }
-  if (core_id == ((Len % N_BANK) / 4)) {
-    while (idx < Len) {
-      local_sum_1 += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  local_sum_1 += local_sum_2;
-  local_sum_3 += local_sum_4;
-  local_sum_1 += local_sum_3;
-  __atomic_fetch_add(&s[(core_id / STEP_CORES) * STEP], local_sum_1,
-                     __ATOMIC_RELAXED);
-  mempool_stop_benchmark();
-
-  mempool_start_benchmark();
-  if ((num_cores - 1) ==
-      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
-    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
-    __sync_synchronize(); // Full memory barrier
-    uint32_t idx_red = 0;
-    local_sum_1 = 0;
-    while (idx_red < N_BANK) {
-      local_sum_1 += s[idx_red];
-      idx_red += STEP;
-    }
-    s[0] = local_sum_1;
-    wake_up_all();
-  }
-  mempool_wfi();
-}
diff --git a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h b/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h
deleted file mode 100644
index 3659de0a3..000000000
--- a/software/apps/baremetal/dotp_i32/dotp_parallel_redtree.h
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright 2021 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-/*
-  Parallel dot-product with atomic fetch and add towards local memory
-  locations and final reduction by a single core. The cores write in
-  memory banks separated by a "step".
-  A) Parallelized workload
-  B) Atomic fetch and add to local memory banks
-  C) Barrier
-  D) Final reduction by core 0 incorporated in a barrier */
-
-/*******************************************************/
-/**                    MULTI-CORE                     **/
-/*******************************************************/
-
-void mempool_log_reduction(int32_t *sum, uint32_t volatile step,
-                           uint32_t core_id);
-
-/* Parallel dot-product */
-void dotp_parallel_redtree(int32_t *in_a, int32_t *in_b, int32_t *s,
-                           uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-
-  register int32_t local_sum = 0;
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    local_sum += in_a[idx] * in_b[idx];
-    local_sum += in_a[idx + 1] * in_b[idx + 1];
-    local_sum += in_a[idx + 2] * in_b[idx + 2];
-    local_sum += in_a[idx + 3] * in_b[idx + 3];
-    idx += N_BANK;
-  }
-  if (core_id == (Len % N_BANK) / 4) {
-    while (idx < Len) {
-      local_sum += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  s[core_id * 4] = local_sum; // Each core is storing locally
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  mempool_log_reduction(s, 2, core_id);
-}
-
-void dotp_parallel_redtree_unrolled(int32_t *in_a, int32_t *in_b, int32_t *s,
-                                    uint32_t Len) {
-
-  uint32_t const remainder = Len % 4;
-  uint32_t const idx_stop = Len - remainder;
-  uint32_t core_id = mempool_get_core_id();
-  register int32_t local_sum_1 = 0;
-  register int32_t local_sum_2 = 0;
-  register int32_t local_sum_3 = 0;
-  register int32_t local_sum_4 = 0;
-
-  uint32_t idx = core_id * 4;
-  while (idx < idx_stop) {
-    int32_t in_a1 = in_a[idx];
-    int32_t in_b1 = in_b[idx];
-    int32_t in_a2 = in_a[idx + 1];
-    int32_t in_b2 = in_b[idx + 1];
-    int32_t in_a3 = in_a[idx + 2];
-    int32_t in_b3 = in_b[idx + 2];
-    int32_t in_a4 = in_a[idx + 3];
-    int32_t in_b4 = in_b[idx + 3];
-    local_sum_1 += in_a1 * in_b1;
-    local_sum_2 += in_a2 * in_b2;
-    local_sum_3 += in_a3 * in_b3;
-    local_sum_4 += in_a4 * in_b4;
-    idx += N_BANK;
-  }
-  if (core_id == ((Len % N_BANK) / 4)) {
-    while (idx < Len) {
-      local_sum_1 += in_a[idx] * in_b[idx];
-      idx++;
-    }
-  }
-  local_sum_1 += local_sum_2;
-  local_sum_3 += local_sum_4;
-  local_sum_1 += local_sum_3;
-  s[core_id * 4] = local_sum_1; // Each core is storing locally
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-  mempool_log_reduction(s, 2, core_id);
-}
-
-void mempool_log_reduction(int32_t *sum, uint32_t volatile step,
-                           uint32_t core_id) {
-
-  uint32_t idx_sum, idx = (step * (core_id / step)) * 4;
-  uint32_t next_step, previous_step;
-  register int32_t local_sum;
-  uint32_t num_cores = mempool_get_core_count();
-
-  previous_step = step >> 1;
-  if ((step - previous_step) ==
-      __atomic_fetch_add(&red_barrier[idx + previous_step - 1], previous_step,
-                         __ATOMIC_RELAXED)) {
-
-    local_sum = 0;
-    idx_sum = idx;
-    while (idx_sum < idx + step * 4) {
-      local_sum += sum[idx_sum];
-      idx_sum += previous_step * 4;
-    }
-    sum[idx] = local_sum;
-
-    next_step = step << 1;
-    __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
-                     __ATOMIC_RELAXED);
-    if (num_cores == step) {
-      sum[0] = sum[idx];
-      __sync_synchronize(); // Full memory barrier
-      wake_up_all();
-      mempool_wfi();
-    } else {
-      mempool_log_reduction(sum, next_step, core_id);
-    }
-
-  } else
-    mempool_wfi();
-}
diff --git a/software/apps/baremetal/dotp_i32/main.c b/software/apps/baremetal/dotp_i32/main.c
index f7cf7508f..da00a937e 100644
--- a/software/apps/baremetal/dotp_i32/main.c
+++ b/software/apps/baremetal/dotp_i32/main.c
@@ -8,132 +8,72 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "define.h"
+#include "data_dotp_i32.h"
+
+#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
+#define LOG_BARRIERS
+// #define ATOMIC_REDUCTION
+// #define SINGLE_CORE_REDUCTION
+#define BINARY_REDUCTION
 
-#include "dotp_parallel.h"
-#include "dotp_parallel_local.h"
-#include "dotp_parallel_red0.h"
-#include "dotp_parallel_redtree.h"
-#include "dotp_single.h"
+// Vectors for kernel computation
+int32_t l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+int32_t l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+uint32_t red_barrier[NUM_BANKS]
+    __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+int32_t sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
-void init_vectors(int32_t *in_a, int32_t *in_b, int32_t *s, int32_t *p_result,
-                  int32_t *p_check, uint32_t Len) {
-  *p_result = 0;
-  *p_check = 0;
-  uint32_t j = 0;
-  uint32_t num_cores = mempool_get_core_count();
-  while (j < Len) {
-    int32_t a = (int32_t)(j % num_cores);
-    int32_t b = (int32_t)(j % 4 + 3);
-    in_a[j] = a;
-    in_b[j] = b;
-    *p_check = *p_check + (int32_t)(a * b);
-    j++;
-  }
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-  for (uint32_t k = 0; k < N_BANK; k++) {
-    s[k] = 0;
-    red_barrier[k] = 0;
-  }
-#else
-  *s = 0;
-#endif
-}
+#include "baremetal/mempool_dotp_i32p.h"
+#include "baremetal/mempool_dotp_i32s.h"
 
 int main() {
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   uint32_t time_init, time_end;
-  // initialize synchronization variables
   mempool_barrier_init(core_id);
+  time_init = 0;
+  time_end = 0;
 
   if (core_id == 0) {
-    error = 0;
-    time_init = 0;
-    time_end = 0;
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-    init_vectors(vector_a, vector_b, sum, &result, &check, LEN);
-#else
-    init_vectors(vector_a, vector_b, &sum, &result, &check, LEN);
-#endif
+    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t));
+  }
+  for (uint32_t k = core_id; k < NUM_BANKS; k += num_cores) {
+    sum[k] = 0;
+    red_barrier[k] = 0;
   }
-  mempool_barrier(num_cores); // wait until all cores have finished
+  mempool_barrier(num_cores);
 
-  // Kernel execution
+  // // SINGLE-CORE
+  // time_init = mempool_get_timer();
+  // dotp_i32s_unrolled4(l1_A, l1_B, sum, LEN);
+  // time_end = mempool_get_timer();
 
-  time_init = mempool_get_timer();
-#ifdef SINGLE
-  dotp_single(vector_a, vector_b, &sum, LEN);
-#elif defined(SINGLE_UNROLLED)
-  dotp_single_unrolled4(vector_a, vector_b, &sum, LEN);
-#endif
-  time_end = mempool_get_timer();
+  // // PARALLEL
+  // time_init = mempool_get_timer();
+  // dotp_i32p(l1_A, l1_B, sum, LEN, num_cores);
+  // time_end = mempool_get_timer();
 
+  // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  mempool_start_benchmark();
-/* A) Parallelized workload
-   B) Atomic fetch and add to a single memory location
-   C) Barrier */
-#ifdef PARALLEL
-  dotp_parallel(vector_a, vector_b, &sum, LEN, N_PE);
-#elif defined(PARALLEL_UNROLLED)
-  dotp_parallel_unrolled4(vector_a, vector_b, &sum, LEN, N_PE);
-/* A) Parallelized workload
-   B) Atomic fetch and add to local memory banks
-   C) Barrier
-   D) Final reduction by core 0 incorporated in a barrier */
-#elif defined(PARALLEL_RED0)
-  dotp_parallel_red0(vector_a, vector_b, sum, LEN, N_PE);
-#elif defined(PARALLEL_UNROLLED_RED0)
-  dotp_parallel_unrolled4_red0(vector_a, vector_b, sum, LEN, N_PE);
-/* A) Parallelized workload
-   B) Nested set of barriers: reduction is performed in a logarithmic tree. */
-#elif defined(PARALLEL_REDTREE)
-  dotp_parallel_redtree(vector_a, vector_b, sum, LEN, N_PE);
-#elif defined(PARALLEL_UNROLLED_REDTREE)
-  dotp_parallel_redtree_unrolled(vector_a, vector_b, sum, LEN, N_PE);
-#endif
-  mempool_stop_benchmark();
+  dotp_i32p_local_unrolled4(l1_A, l1_B, sum, LEN);
   time_end = mempool_get_timer();
 
-  /* A) Parallelized workload
-     B) Atomic fetch and add to a single memory location
-     C) Barrier */
-  if (core_id < N_PE) {
-    time_init = mempool_get_timer();
-    mempool_start_benchmark();
-#ifdef PARALLEL_LOCAL
-    dotp_parallel_local(vector_a, vector_b, &sum, LEN, N_PE);
-#elif defined(LOCAL_UNROLLED)
-    dotp_parallel_local_unrolled4(vector_a, vector_b, &sum, LEN, N_PE);
-#endif
-    mempool_stop_benchmark();
-    time_end = mempool_get_timer();
-  }
-
-  mempool_barrier(num_cores);
 
   // Check results
+  mempool_barrier(num_cores);
   if (core_id == 0) {
     uint32_t clock_cycles = (time_end - time_init);
-#if defined(PARALLEL_RED0) || defined(PARALLEL_UNROLLED_RED0) || \
-    defined(PARALLEL_REDTREE) || defined(PARALLEL_UNROLLED_REDTREE)
-    result = sum[0];
-#else
-    result = sum;
-#endif
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
-    printf("Result ==> %d\n", result);
-    printf("Check ==> %d\n\n", check);
+    printf("Result ==> %d\n", sum[0]);
+    printf("Check ==> %d\n\n", l2_C);
  }
   mempool_barrier(num_cores);
-  return error;
+  return 0;
 }
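Note on verification: the refactored main.c prints the computed sum[0] next to the golden value l2_C but, unlike the old version, no longer returns an error code. A minimal self-check that could be appended before the final barrier is sketched below; it is purely illustrative and not part of this patch, reusing only the symbols declared above (sum, l2_C, core_id, num_cores):

    // Hypothetical self-check (not in this patch): compare the reduced result
    // against the golden value generated offline by generate_dotp.py.
    int32_t error = 0;
    if (core_id == 0) {
      error = (sum[0] != l2_C) ? 1 : 0;
    }
    mempool_barrier(num_cores);
    return error;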
diff --git a/software/data/data_dotp_i32.h.tpl b/software/data/data_dotp_i32.h.tpl
new file mode 100644
index 000000000..d76d92a24
--- /dev/null
+++ b/software/data/data_dotp_i32.h.tpl
@@ -0,0 +1,24 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}, '.format(a)
+        i += 1
+        if i % 8 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LEN (${Len})
+
+int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+
+int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+
+int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = ${C};
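For reference, the mako template above renders to a plain C header placed in L2 memory. A hypothetical output for Len = 4 (the values are illustrative, not produced by the generator) would look roughly like:

    #define LEN (4)

    int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[4] = {
    12, -7, 3, 100};

    int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[4] = {
    5, 2, -9, 1};

    int32_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C = 119;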
diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py
new file mode 100644
index 000000000..6bacf2488
--- /dev/null
+++ b/software/data/generate_dotp.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data for the i32 dot-product.
+# Author: Marco Bertuletti
+
+import numpy as np
+import argparse
+import pathlib
+from mako.template import Template
+
+
+def generate_dotp_i32(Len):
+
+    # Create input vectors
+    MAX = 2**7 - 1
+    A = np.random.randint(-MAX, MAX - 1, size=Len)
+    B = np.random.randint(-MAX, MAX - 1, size=Len)
+    C = np.dot(A, B)
+    return A, B, C
+
+##################
+# compute_result #
+##################
+
+
+def gen_data_header_file(outdir: pathlib.Path,
+                         tpl: pathlib.Path, **kwargs):
+
+    file = outdir / f"{kwargs['name']}.h"
+
+    print(tpl, outdir, kwargs['name'])
+
+    template = Template(filename=str(tpl))
+    with file.open('w') as f:
+        f.write(template.render(**kwargs))
+
+
+def main():
+
+    parser = argparse.ArgumentParser(description='Generate data for kernels')
+    parser.add_argument(
+        "-o",
+        "--outdir",
+        type=pathlib.Path,
+        default=pathlib.Path(__file__).parent.absolute(),
+        required=False,
+        help='Select out directory of generated data files'
+    )
+    parser.add_argument(
+        "-n",
+        "--length",
+        type=int,
+        required=False,
+        default=4096,
+        help='Vector length.'
+    )
+
+    args = parser.parse_args()
+    Len = args.length
+
+    A, B, C = generate_dotp_i32(Len)
+    tpl = pathlib.Path(__file__).parent.absolute() / "data_dotp_i32.h.tpl"
+    kwargs = {
+        'name': 'data_dotp_i32',
+        'A': A,
+        'B': B,
+        'C': C,
+        'Len': Len}
+    gen_data_header_file(args.outdir, tpl, **kwargs)
+
+
+if __name__ == "__main__":
+    main()
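A note on value ranges: with the default Len = 4096 and input magnitudes bounded by 127, the dot product satisfies

    |C| <= Len * MAX^2 = 4096 * 127 * 127 = 66,064,384 < 2,147,483,647 = 2^31 - 1

so the golden value and the int32_t accumulations in the kernels below cannot overflow.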
diff --git a/software/kernels/baremetal/mempool_dotp_i32p.h b/software/kernels/baremetal/mempool_dotp_i32p.h
new file mode 100644
index 000000000..26fbe03e9
--- /dev/null
+++ b/software/kernels/baremetal/mempool_dotp_i32p.h
@@ -0,0 +1,196 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+/* Parallel dot-product */
+void dotp_i32p(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
+               uint32_t nPE) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  register int32_t local_sum = 0;
+  register int32_t a, b;
+  for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
+    a = in_a[i];
+    b = in_b[i];
+    local_sum += a * b;
+  }
+  __atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
+#ifdef LOG_BARRIERS
+  mempool_log_barrier(2, core_id);
+#else
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier(num_cores);
+#endif
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+void dotp_i32p_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
+                         uint32_t nPE) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t step = Len / nPE;
+  uint32_t remainder = step % 4;
+  uint32_t i;
+
+  register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;
+  register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0;
+  register int32_t local_sum0 = 0;
+  register int32_t local_sum1 = 0;
+  register int32_t local_sum2 = 0;
+  register int32_t local_sum3 = 0;
+  for (i = core_id * step; i < (core_id * step + step) - remainder; i += 4) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    a1 = in_a[i + 1];
+    b1 = in_b[i + 1];
+    a2 = in_a[i + 2];
+    b2 = in_b[i + 2];
+    a3 = in_a[i + 3];
+    b3 = in_b[i + 3];
+    local_sum0 += a0 * b0;
+    local_sum1 += a1 * b1;
+    local_sum2 += a2 * b2;
+    local_sum3 += a3 * b3;
+  }
+  // Handle the remainder elements of this core's chunk
+  i = core_id * step + step - remainder;
+  while (i < core_id * step + step) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    local_sum0 += a0 * b0;
+    i++;
+  }
+  local_sum0 += local_sum1;
+  local_sum2 += local_sum3;
+  local_sum0 += local_sum2;
+  __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
+#ifdef LOG_BARRIERS
+  mempool_log_barrier(2, core_id);
+#else
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier(num_cores);
+#endif
+  return;
+}
+
+/* Binary tree reduction */
+void mempool_binary_reduction(int32_t *sum, uint32_t core_id,
+                              uint32_t num_cores) {
+
+  uint32_t idx, step = 2, previous_step = 1;
+  while (num_cores > 1) {
+    idx = (step * (core_id / step)) * BANKING_FACTOR;
+    if (__atomic_fetch_add(&red_barrier[idx + previous_step - 1], 1,
+                           __ATOMIC_RELAXED)) {
+
+      // Reduction
+      sum[idx] += sum[idx + previous_step * BANKING_FACTOR];
+
+      // Next level of binary tree
+      __atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
+                       __ATOMIC_RELAXED);
+      num_cores = num_cores / 2;
+      previous_step = step;
+      step = step * 2;
+
+    } else {
+      // Goes to sleep
+      break;
+    }
+  }
+
+  // Last core wakes everyone
+  if (num_cores == 1) {
+    __sync_synchronize();
+    wake_up_all();
+  }
+  mempool_wfi();
+
+  return;
+}
+
+/* Parallel dot-product with loop unrolling */
+/* Load and stores only in local memory */
+#define NUM_CORES_RED (16)
+void dotp_i32p_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
+                               uint32_t Len) {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t const remainder = Len % 4;
+  uint32_t const idx_stop = Len - remainder;
+
+  register int32_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;
+  register int32_t b2 = 0, b1 = 0, b0 = 0, b3 = 0;
+  register int32_t local_sum0 = 0;
+  register int32_t local_sum1 = 0;
+  register int32_t local_sum2 = 0;
+  register int32_t local_sum3 = 0;
+
+  for (uint32_t i = core_id * 4; i < idx_stop; i += NUM_BANKS) {
+    a0 = in_a[i];
+    b0 = in_b[i];
+    a1 = in_a[i + 1];
+    b1 = in_b[i + 1];
+    a2 = in_a[i + 2];
+    b2 = in_b[i + 2];
+    a3 = in_a[i + 3];
+    b3 = in_b[i + 3];
+    local_sum0 += a0 * b0;
+    local_sum1 += a1 * b1;
+    local_sum2 += a2 * b2;
+    local_sum3 += a3 * b3;
+  }
+  if (core_id == ((Len % NUM_BANKS) / 4)) {
+    for (uint32_t i = Len - remainder; i < Len; i++) {
+      a0 = in_a[i];
+      b0 = in_b[i];
+      local_sum0 += a0 * b0;
+    }
+  }
+  local_sum0 += local_sum1;
+  local_sum2 += local_sum3;
+  local_sum0 += local_sum2;
+
+// A) Cores atomically fetch and add in sum variable
+// B) A global barrier synchronizes all of them
+#if defined(ATOMIC_REDUCTION)
+  __atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
+  mempool_log_barrier(2, core_id);
+
+// A) Groups of NUM_CORES_RED cores atomically fetch and add in sum array
+// B) The last core to the reduction barrier sums the partial reductions
+#elif defined(SINGLE_CORE_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  __atomic_fetch_add(
+      &s[BANKING_FACTOR * NUM_CORES_RED * (core_id / NUM_CORES_RED)],
+      local_sum0, __ATOMIC_RELAXED);
+  if ((num_cores - 1) ==
+      __atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
+    __atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
+    __sync_synchronize(); // Full memory barrier
+    uint32_t idx_red = 0;
+    local_sum0 = 0;
+    while (idx_red < NUM_BANKS) {
+      local_sum0 += s[idx_red];
+      idx_red += BANKING_FACTOR * NUM_CORES_RED;
+    }
+    s[0] = local_sum0;
+    wake_up_all();
+  }
+  mempool_wfi();
+
+// A) Cores store locally in sum array
+// B) Partial sums are reduced logarithmically
+#elif defined(BINARY_REDUCTION)
+  uint32_t num_cores = mempool_get_core_count();
+  s[core_id * 4] = local_sum0;
+  mempool_binary_reduction(s, core_id, num_cores);
+
+#endif
+
+  return;
+}
diff --git a/software/apps/baremetal/dotp_i32/dotp_single.h b/software/kernels/baremetal/mempool_dotp_i32s.h
similarity index 88%
rename from software/apps/baremetal/dotp_i32/dotp_single.h
rename to software/kernels/baremetal/mempool_dotp_i32s.h
index 58797ee80..dd562debb 100644
--- a/software/apps/baremetal/dotp_i32/dotp_single.h
+++ b/software/kernels/baremetal/mempool_dotp_i32s.h
@@ -5,12 +5,11 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 /* Single-core dot-product */
-void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
+void dotp_i32s(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   if (core_id == 0) {
-
     mempool_start_benchmark();
     // Kernel execution
     register int32_t local_sum = 0;
@@ -18,7 +17,6 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
     do {
       local_sum += ((*in_a++) * (*in_b++));
     } while (in_a < end);
-
     *s = local_sum;
     mempool_stop_benchmark();
   }
@@ -26,17 +24,15 @@
 }
 
 /* Single-core dot-product unrolled4 */
-void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
-                           uint32_t Len) {
+void dotp_i32s_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
+                         uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
   if (core_id == 0) {
-
    mempool_start_benchmark();
     uint32_t reminder = Len % 4;
     uint32_t i = 0;
-
     int32_t a0 = 0, b0 = 0, a1 = 0, b1 = 0, a2 = 0, b2 = 0, a3 = 0, b3 = 0;
     register int32_t local_sum_1 = 0;
     register int32_t local_sum_2 = 0;
@@ -70,5 +66,4 @@ void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
-  // mempool_log_barrier(2, core_id);
 }
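To clarify the indexing used by mempool_binary_reduction above: each core first stores its partial sum at sum[core_id * 4] (one word per group of BANKING_FACTOR banks), and at every tree level the partial sums that sit previous_step * BANKING_FACTOR words apart are folded together, so the total ends up in sum[0] after log2(num_cores) levels. The following single-threaded sketch reproduces the same index arithmetic; it is purely illustrative (assuming BANKING_FACTOR = 4 and a power-of-two core count) and is not the MemPool runtime code:

    #include <stdint.h>
    #include <stdio.h>

    #define BANKING_FACTOR 4

    // Fold partial sums stored at sum[core * BANKING_FACTOR] pairwise,
    // doubling the stride at every level, until the total is in sum[0].
    static int32_t tree_reduce(int32_t *sum, uint32_t num_cores) {
      for (uint32_t step = 1; step < num_cores; step *= 2) {
        for (uint32_t core = 0; core < num_cores; core += 2 * step) {
          uint32_t idx = core * BANKING_FACTOR;
          sum[idx] += sum[idx + step * BANKING_FACTOR];
        }
      }
      return sum[0];
    }

    int main(void) {
      // Partial sums of 8 hypothetical cores: 1 + 2 + ... + 8 = 36.
      int32_t sum[8 * BANKING_FACTOR] = {0};
      for (uint32_t c = 0; c < 8; c++)
        sum[c * BANKING_FACTOR] = (int32_t)(c + 1);
      printf("%d\n", tree_reduce(sum, 8)); // prints 36
      return 0;
    }

In the parallel version the inner loop disappears: the per-level red_barrier counters decide which core of each pair performs the addition and which one goes back to sleep, which is why only the last arriving core walks the whole tree.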